In [3]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
import string

In [4]:
data = pd.read_csv('carbonated_soft_drinks.csv');

In [6]:
data.item_name.nunique()
data.head(10)

Unnamed: 0,item_name
0,Bottle Coke Classic 20oz
1,Bottle Coke Diet 20oz
2,20oz Fountain Beverage
3,Bottle Pepsi 20oz
4,32oz Fountain Beverage
5,Bottle Mountain Dew 20oz
6,Bottle Coke Zero 20oz
7,Bottle Dr. Pepper 20oz
8,Bottle Dr. Pepper Diet 20oz
9,Diet Pepsi 20oz


In [7]:
def cleaning(sentence):
    
    # Basic cleaning
    sentence = sentence.strip() ## remove whitespaces
    sentence = sentence.lower() ## lowercase 
    sentence = ''.join(char for char in sentence if not char.isdigit()) ## remove numbers
    
    removables = ['bottle','bvc' ,'can', 'oz', 'ml', 'large', 'diet', 'zero', 'sugar', 'original', 'medium', 'lt', 'fl',
                  'soft', 'drink', 'blend', 'canned', 'bl', 'glass', 'soda', 'bvg']
    alphabet = list(string.ascii_lowercase)
    
    for punctuation in string.punctuation:
        sentence = sentence.replace(punctuation, '')

    word_tokens = word_tokenize(sentence)
    
    # Advanced cleaning
    word_tokens = [w for w in word_tokens if w not in removables] ## remove punctuation
    word_tokens = [w for w in word_tokens if w not in alphabet]
        
    cleaned_sentence = ' '.join(word for word in word_tokens)
    
    return cleaned_sentence

In [10]:
data['item_name_clean'] = data.item_name.apply(cleaning)

In [11]:
#This function returns a list of brand name subset found in item_name
#If it find nothing it returns an empty string

def identify_brand(text):   
    tokens = word_tokenize(text)
    
    brands = ['schweppes', 'sunkist', 'coke', 'coca', 'cola', 'cocacola', 'pepsi', 'topo' 'chico', 'crush', 
              'fanta', 'mountain' , 'dew', 'canada' ,'dry', 'dr', 'pepper', 'sprite', 'aw', 'stewarts']
    brand = [w for w in tokens if w in brands]
    return brand

#This function returns a string containing identiable brand text found in item_name
#If it does not identify then it returns the input itself, namely item_name_clean

def identify_brand_str(text):   
    tokens = word_tokenize(text)
    
    brands = ['schweppes', 'sunkist', 'coke', 'coca', 'cola', 'cocacola', 'pepsi', 'topo' 'chico', 'crush', 
              'fanta', 'mountain' , 'dew', 'canada' ,'dry', 'dr', 'pepper', 'sprite', 'aw', 'stewarts']
    brand = [w for w in tokens if w in brands]
    
    if brand:
        return ' '.join(w for w in brand)
    else:
        return text
    
def preliminary_label(text):   
    tokens = word_tokenize(text)
    
    brands = ['schweppes', 'sunkist', 'coke', 'coca', 'cola', 'cocacola', 'pepsi', 'topo' 'chico', 'crush', 
              'fanta', 'mountain' , 'dew', 'canada' ,'dry', 'dr', 'pepper', 'sprite', 'aw', 'stewarts']
    brand = [w for w in tokens if w in brands]
    coke = ['coke', 'coca', 'cola', 'cocacola']
    
    if brand:
        return ' '.join(w for w in brand)
    else:
        return text

In [12]:
data['brand'] = data.item_name_clean.apply(identify_brand)
data['brands'] = data.item_name_clean.apply(identify_brand_str)


In [13]:
#62 unique values when leaving unidentifiable text empty
pd.set_option('display.min_rows', 100)
data.brand.value_counts()


[]                          5768
[coke]                      2331
[pepsi]                     1611
[dew]                       1050
[mountain, dew]             1027
[dr, pepper]                 821
[sprite]                     762
[fanta]                      618
[canada, dry]                586
[coca, cola]                 376
[crush]                      369
[sunkist]                    329
[aw]                         229
[schweppes]                  205
[cocacola]                   166
[stewarts]                   135
[cola]                       116
[dry]                         77
[dr]                          69
[pepsi, cola]                 67
[mountain]                    34
[dew, dew]                    30
[coca, cola, coke]            21
[mountain, dew, dew]          20
[coke, coke]                  17
[topochico]                   15
[coke, cola]                  12
[pepper]                      12
[pepsi, dew]                  11
[coca]                         9
          

In [14]:
#2560 values when keeping item_name_clean
data.brands.nunique()
data.brands.value_counts()

coke                             2331
pepsi                            1611
dew                              1050
mountain dew                     1027
dr pepper                         821
sprite                            762
fanta                             618
canada dry                        586
coca cola                         376
crush                             369
fountain                          343
sunkist                           329
                                  258
aw                                229
schweppes                         205
cocacola                          166
stewarts                          135
up                                128
cola                              116
seagrams ginger ale                99
sierra mist                        79
dry                                77
fountain beverage                  75
dr                                 69
squirt                             68
pepsi cola                         67
barqs root b

In [15]:
#NER model considered (which does not always work very well in this case)

import spacy
nlp = spacy.load("en_core_web_sm")

doc = nlp("Coke Classic")
for ent in doc.ents:
    print(ent.text, ent.label_)

2023-02-01 05:59:43.149832: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Coke ORG
Classic PRODUCT


In [18]:
#applying the model to a sample from data.brands 

sample = data.brands.sample(100) 
for sm in sample:
    doc = nlp(sm)
    for ent in doc.ents:
        print(ent.text, ent.label_)
    

pepsi ORG
nsm squirt btl ORG
coke ORG
coke ORG
pepsi ORG
pepsi ORG
bundaberg GPE
coke ORG
sodas ORG
barqs ORG
pepsi ORG
canada GPE
fanta GPE
coca cola ORG
pepsi ORG
pepsi ORG
pepsi ORG
pibb xtra PERSON
coke ORG
coke ORG
sierra mist ORG
cola ORG
coke ORG
fanta GPE
pepsi ORG
cola ORG
coke ORG
coke ORG
coke ORG
canada GPE
coke ORG
coke ORG
canada GPE
canada GPE
coke ORG
coke ORG
pepsi ORG
coke ORG
coca cola ORG
pomegrana ORG


In [34]:
training_data = pd.read_csv('supervised_tagged_dataset.csv')
#format required:

# Create a training data set
TRAIN_DATA = [
    ("Who is Shaka Khan?", {"entities": [(7, 17, "PERSON")]}),
    ("I love London and Berlin.", {"entities": [(7, 13, "GPE"), (18, 24, "GPE")]}),
    ("Apple is looking at buying U.K. startup for $1 billion", 
     {"entities": [(0, 5, "PRODUCT"), (44, 54, "GPE"), (67, 76, "MONEY")]})
]

import csv

TRAIN_DATA = []
with open("supervised_tagged_dataset.csv") as csvfile:
    reader = csv.reader(csvfile)
    for row in reader:
        string, tags = row
        words = string.split()
        word_tags = tags.split()
        TRAIN_DATA.append(list(zip(words, word_tags)))

NameError: name 'csv' is not defined

In [None]:
import spacy
from spacy.util import minibatch, compounding

# Load a pre-trained model
nlp = spacy.load('en_core_web_sm')

# Add a custom NER component to the pipeline
ner = nlp.get_pipe("ner")

# Define the labels you want to predict
LABELS = ["ORG", "PERSON", "GPE", "PRODUCT", "EVENT"]

# Add the labels to the NER component
for label in LABELS:
    ner.add_label(label)

# Train the model
optimizer = nlp.begin_training()
for i in range(20):
    random.shuffle(TRAIN_DATA)
    losses = {}
    for text, annotations in TRAIN_DATA:
        nlp.update(
            [text],  # batch of texts
            [annotations],  # batch of annotations
            drop=0.5,  # dropout - make it harder to memorise data
            sgd=optimizer,  # callable to update weights
            losses=losses)
    print(losses)

# Save the updated model to disk
nlp.to_disk('/path/to/model')

# Test the model on some new text
test_text = "Apple is going to build a factory in Mexico City."
doc = nlp(test_text)
for ent in doc.ents:
    print(ent.text, ent.label_)