# Libraries

In [45]:
import pandas as pd
from tqdm import tqdm
from sklearn.cluster import KMeans
from collections import defaultdict
from collections import Counter
import math

# Cleaning the Data

In [2]:
dataset = pd.read_csv('Reviews.csv')

In [3]:
dataset.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


Trasform the time in DateTime variable

In [3]:
dataset['Datetime'] = dataset.Time.apply(lambda value: pd.to_datetime(value, unit = 's'))

In [8]:
dataset['Datetime'].head()

0   2011-04-27
1   2012-09-07
2   2008-08-18
3   2011-06-13
4   2012-10-21
Name: Datetime, dtype: datetime64[ns]

In [4]:
#The time column is no longer usefull
dataset.drop('Time', inplace = True, axis = 1)

In [5]:
#How many null value do we have?
dataset.isnull().sum()

Id                         0
ProductId                  0
UserId                     0
ProfileName               16
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Summary                   27
Text                       0
Datetime                   0
dtype: int64

In [17]:
#Replacing null values with empty strings
dataset['ProfileName'].fillna('', inplace=True)
dataset['Summary'].fillna('', inplace=True)

# Text Mining
We want to cluster the products using the reviews that we can find in the column 'Text' of our dataset. In order to do this we need to represent the reviews as TF-IDF score used in the previous homework.

In [6]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
import string
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\giorg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\giorg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
stop_words = set(stopwords.words('english'))

In [8]:
ps = PorterStemmer()

In [10]:
#Realizing the cleaned token 
def clean_text(text):
    words = word_tokenize(text)
    good_words = []
    for word in words:
        if word.lower() not in stop_words and word.isalpha() and word.lower() not in string.punctuation:
            good_words.append(ps.stem(word).lower())
    return good_words

In [27]:
#test
print(dataset['Text'][0])
print(clean_text(dataset['Text'][0]))

I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.
['bought', 'several', 'vitality', 'canned', 'dog', 'food', 'products', 'found', 'good', 'quality', 'product', 'looks', 'like', 'stew', 'processed', 'meat', 'smells', 'better', 'labrador', 'finicky', 'appreciates', 'product', 'better']


In [11]:
#Applying the clean_text function to each element of the column 'Text'
dataset['Text_Words'] = dataset.Text.apply(lambda x: clean_text(x))

In [30]:
dataset.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Summary,Text,Datetime,Text_Words
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,2011-04-27,"[bought, several, vitality, canned, dog, food,..."
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,2012-09-07,"[product, arrived, labeled, jumbo, salted, pea..."
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,"""Delight"" says it all",This is a confection that has been around a fe...,2008-08-18,"[confection, around, centuries, light, pillowy..."
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,Cough Medicine,If you are looking for the secret ingredient i...,2011-06-13,"[looking, secret, ingredient, robitussin, beli..."
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,Great taffy,Great taffy at a great price. There was a wid...,2012-10-21,"[great, taffy, great, price, wide, assortment,..."


In [30]:
def vocabulary(df):
    words = defaultdict(list)
    for i,el in tqdm(enumerate(df['Text_Words'])):
        for w in el:
            words[w].append(i)
    return words
voc = vocabulary(dataset)

568454it [00:06, 84543.47it/s] 


In [36]:
#counting the ripetition of each token for each plot
dataset["Text_Ripetition"] = dataset["Text_Words"].apply(lambda x : Counter(x))

In [37]:
dataset.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Summary,Text,Datetime,Text_Words,Text_Ripetition
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,2011-04-27,"[bought, sever, vital, can, dog, food, product...","{'bought': 1, 'sever': 1, 'vital': 1, 'can': 1..."
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,2012-09-07,"[product, arriv, label, jumbo, salt, peanut, p...","{'product': 2, 'arriv': 1, 'label': 1, 'jumbo'..."
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,"""Delight"" says it all",This is a confection that has been around a fe...,2008-08-18,"[confect, around, centuri, light, pillowi, cit...","{'confect': 1, 'around': 1, 'centuri': 1, 'lig..."
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,Cough Medicine,If you are looking for the secret ingredient i...,2011-06-13,"[look, secret, ingredi, robitussin, believ, fo...","{'look': 1, 'secret': 1, 'ingredi': 1, 'robitu..."
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,Great taffy,Great taffy at a great price. There was a wid...,2012-10-21,"[great, taffi, great, price, wide, assort, yum...","{'great': 2, 'taffi': 3, 'price': 1, 'wide': 1..."


In [41]:
def tf_i(text):
    tot_token = 0
    for k, v in text.items():
        tot_token += v #numbers of total tokens in the plot
    tf = {}
    for token, rip in text.items():#the values in this dictionary are the ripetition of that specific token
        tf[token] = rip / tot_token
    return tf     

In [43]:
tf = []
for i in tqdm(range(len(dataset['Text_Ripetition']))):
    tf.append(tf_i(dataset["Text_Ripetition"][i])) #applying the previous function to each plot
dataset["tf"] = tf #adding the column tf to the dataset

100%|███████████████████████████████████████████████████████████████████████| 568454/568454 [00:11<00:00, 50714.60it/s]


In [44]:
dataset.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Summary,Text,Datetime,Text_Words,Text_Ripetition,tf
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,Good Quality Dog Food,I have bought several of the Vitality canned d...,2011-04-27,"[bought, sever, vital, can, dog, food, product...","{'bought': 1, 'sever': 1, 'vital': 1, 'can': 1...","{'bought': 0.043478260869565216, 'sever': 0.04..."
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...,2012-09-07,"[product, arriv, label, jumbo, salt, peanut, p...","{'product': 2, 'arriv': 1, 'label': 1, 'jumbo'...","{'product': 0.1111111111111111, 'arriv': 0.055..."
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,"""Delight"" says it all",This is a confection that has been around a fe...,2008-08-18,"[confect, around, centuri, light, pillowi, cit...","{'confect': 1, 'around': 1, 'centuri': 1, 'lig...","{'confect': 0.02564102564102564, 'around': 0.0..."
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,Cough Medicine,If you are looking for the secret ingredient i...,2011-06-13,"[look, secret, ingredi, robitussin, believ, fo...","{'look': 1, 'secret': 1, 'ingredi': 1, 'robitu...","{'look': 0.05555555555555555, 'secret': 0.0555..."
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,Great taffy,Great taffy at a great price. There was a wid...,2012-10-21,"[great, taffi, great, price, wide, assort, yum...","{'great': 2, 'taffi': 3, 'price': 1, 'wide': 1...","{'great': 0.15384615384615385, 'taffi': 0.2307..."


In [49]:
idf = {}
for el in tqdm(dataset['Text_Words']):   
    for token in el:
         idf.update({ token : math.log(len(dataset)/ len(voc[token]))})  #applying the formula for each token 

100%|███████████████████████████████████████████████████████████████████████| 568454/568454 [00:43<00:00, 12958.30it/s]


In [59]:
idf

{'bought': 2.4348853310787604,
 'sever': 3.2369445475762957,
 'vital': 6.644025488012968,
 'can': 3.1550812641463035,
 'dog': 1.6755738508142788,
 'food': 1.3370765618689209,
 'product': 1.149202722725902,
 'found': 2.3529738490532064,
 'good': 1.036005097056079,
 'qualiti': 2.805910890453236,
 'look': 2.2260807551795745,
 'like': 0.6899611500196362,
 'stew': 5.803507314611143,
 'process': 4.1546244486371275,
 'meat': 3.653781219018596,
 'smell': 2.7782445445744157,
 'better': 2.1026733620361826,
 'labrador': 7.241862488768588,
 'finicki': 5.616822114529414,
 'appreci': 4.762911293485757,
 'arriv': 3.2757050200531905,
 'label': 3.928346800286953,
 'jumbo': 7.536942868701814,
 'salt': 2.87400206173802,
 'peanut': 3.275053523915856,
 'actual': 2.8992384019607353,
 'small': 2.8312555605409195,
 'size': 2.8735037745897465,
 'unsalt': 6.703890263450659,
 'sure': 2.873784030558698,
 'error': 6.514895659968857,
 'vendor': 5.212809439501565,
 'intend': 5.5440627602469865,
 'repres': 6.73300440

In [56]:
# Creating a vocabulary that as keys as the word and as values its index
def vocabulary_words(df):
    words = dict()
    i = 0
    for el in tqdm(df['Text_Words']):
        for w in el:
            if w not in words.keys():
                words[w] = i
                i += 1
    return words

In [57]:
voc_words = vocabulary_words(dataset)

100%|███████████████████████████████████████████████████████████████████████| 568454/568454 [00:05<00:00, 98517.65it/s]


In [None]:
def score(tf):
    v = [0]*len(voc_words)
    for k in tf.keys():
        v[voc_words[k]] = tf[k]*idf[k]

In [64]:
tf[0]['bought']

0.043478260869565216

In [None]:
#lamda
score(dataset['tf'])

In [None]:
#TF-IDF score or binary representation to represent Text_Words ?

## Implement KMeans from scratch
### Step 0 
Elbow method to choose the number of clusters k 
### Step 1
Random inizialization of the k representative points
### Step 2
Find the cluster $C_i$ defined as $\{x : \parallel x-\mu_i\parallel \leq \parallel x-\mu_j\parallel \forall j\ne i\}  \forall i=1..k$
### Step 3 
Find $\mu_i$ which is defined as $\frac{1}{|C_i|} \sum_{x\in C_i} x$ $\forall i=1..k$
<hr>
Repeat step 2 and step 3 until <b>convergence</b>.

### Convergence
We stop when the clusters don't change from the previous iteration.

In [None]:
#slice should be filled with the point representative of text_words

In [12]:
#step 0
elbow = {}
for k in tqdm(range(3,20)):
    elbow_model = KMeans (n_clusters = k)
    elbow_model.fit_predict(slice)
    elbow[k] = elbow_model.inertia_

  0%|                                                                                           | 0/17 [00:00<?, ?it/s]


NameError: name 'KMeans' is not defined