In [1]:
import pandas as pd                         #Data manipulation library
import re  # re is used to match the known brand names in the item names and extract them. Specifically, the re.search() method is used to search for a regular expression pattern within a string.
from sklearn.feature_extraction.text import CountVectorizer#CountVectorizer is used to convert text data into a matrix of token counts.
from sklearn.model_selection import train_test_split  # is a module that provides various tools for model selection, including splitting a dataset into a training set and a test set.
from sklearn.naive_bayes import MultinomialNB # module that provides implementation of the Naive Bayes algorithm for classification

In [3]:
# Load the data
df = pd.read_csv("carbonated_soft_drinks.csv")


In [34]:
# Define the list of known brand names
brands = ["Coca-Cola", "Pepsi", "Mountain Dew", "Dr. Pepper", "Sprite", "Fanta"]

In [50]:
df.head(10)

Unnamed: 0,item_name,brand_name
3,Bottle Pepsi 20oz,Pepsi
5,Bottle Mountain Dew 20oz,Mountain Dew
7,Bottle Dr. Pepper 20oz,Dr. Pepper
8,Bottle Dr. Pepper Diet 20oz,Dr. Pepper
9,Diet Pepsi 20oz,Pepsi
10,Bottle Mountain Dew Diet 20oz,Mountain Dew
11,Bottle Sprite 20oz,Sprite
17,Bottle Sprite Zero 20oz,Sprite
24,Soda Pepsi 20oz Bottle,Pepsi
26,BVC - Bottle Dr. Pepper 20oz,Dr. Pepper


In [51]:
df.describe()

Unnamed: 0,item_name,brand_name
count,4119,4119
unique,3558,7
top,Bottle Pepsi 20oz,Pepsi
freq,4,1573


In [35]:
# Define a function to extract the brand name from the item name
def extract_brand_name(item_name):
    for brand in brands:
        # Use a regular expression to match the brand name
        match = re.search(r"\b{}\b".format(brand), item_name) #"\b{}\b" finds the begining and ending words in sentence
        if match:
            # Extract the brand name from the item name
            extracted_brand = match.group(0)
            return extracted_brand
    # If no brand is found, return None
    return None

In [36]:
# Add a new column to the DataFrame to store the extracted brand names
df["brand_name"] = df["item_name"].apply(extract_brand_name)


In [37]:
# Remove rows with missing brand names, dropna drops all the missing values 
df.dropna(subset=["brand_name"], inplace=True)


In [38]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df["item_name"], df["brand_name"], test_size=0.5, random_state=30)


In [39]:
X_train

12538                   Soda Mountain Dew Flamin Hot 20o
5448                          20oz Dr. Pepper Zero Sugar
13054                                  BEV - Sprite 20oz
8068                                Soda Diet Pepsi 20Oz
1918                            BVC - Dr. Pepper - 20oz.
                              ...                       
9964                               Bev Pepsi diet cherry
4508                                          Sprite Can
3463     Mountain Dew Rise Strawberry Melon Spark 16 oz.
1777                                          20oz Pepsi
15974                                Mountain Dew VooDew
Name: item_name, Length: 2059, dtype: object

In [40]:
X_test

8640        Mountain Dew Baja Blast 20oz
8983     BVC - Bottle Pepsi Vanilla 20oz
965           2 for $2.22 Pepsi Products
9356                     Pepsi Zero 20oz
9708                    1L Pl Pepsi 1/15
                      ...               
4528                   Mountain Dew Diet
2501               Mountain Dew btl 20oz
12500         BVC - Mountain Dew 16.9 oz
8647           Sprite Glass Bottle 355ml
13173      Diet Pepsi - Classic Sweetner
Name: item_name, Length: 2060, dtype: object

In [41]:
y_train

12538    Mountain Dew
5448       Dr. Pepper
13054          Sprite
8068            Pepsi
1918       Dr. Pepper
             ...     
9964            Pepsi
4508           Sprite
3463     Mountain Dew
1777            Pepsi
15974    Mountain Dew
Name: brand_name, Length: 2059, dtype: object

In [42]:
y_test

8640     Mountain Dew
8983            Pepsi
965             Pepsi
9356            Pepsi
9708            Pepsi
             ...     
4528     Mountain Dew
2501     Mountain Dew
12500    Mountain Dew
8647           Sprite
13173           Pepsi
Name: brand_name, Length: 2060, dtype: object

In [46]:
# Vectorize the data , This creates an instance of the CountVectorizer class. By default, this will tokenize the text, lowercase it, and count the number of times each word appears in the text.
vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)


In [47]:
# Train a Naive Bayes classifier , basically it is used to train a supervised machine learning model for text classification
clf = MultinomialNB() #clf is an instance of the MultinomialNB class
clf.fit(X_train_vect, y_train) #t is trained on the training data using the fit() method which takes in the vectorized input X_train_vect and the corresponding output y_train. Once the classifier is trained, it can be used to predict the class labels for new data points.

MultinomialNB()

In [48]:
# Evaluate the classifier
accuracy = clf.score(X_test_vect, y_test)
print("Accuracy: {:.2f}%".format(accuracy * 100)) #The "{:.2f}%" format string specifies that the accuracy value should be displayed with two decimal places and a percentage sign.

Accuracy: 99.66%


In [49]:
df.to_csv('Carbonated_soft_drinks_with_brands.csv', index=False)