# Libraries

In [1]:
import pandas as pd
from tqdm import tqdm
from sklearn.cluster import KMeans
from collections import defaultdict
from collections import Counter
import math

# Cleaning the Data

In [2]:
path = '/Users/domenicomattiacinque/Documents/Università/ADM2020/HW4/archive'
dataset = pd.read_csv(path + '/Reviews.csv', nrows = 10000)

In [3]:
dataset.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [4]:
dataset.isnull().sum()

Id                        0
ProductId                 0
UserId                    0
ProfileName               0
HelpfulnessNumerator      0
HelpfulnessDenominator    0
Score                     0
Time                      0
Summary                   0
Text                      0
dtype: int64

In [5]:
dataset['Summary'].fillna('', inplace=True)

In [6]:
dataset['Text'] = dataset['Summary'] + ' ' + dataset['Text'] 

In [7]:
dataset.drop(['ProductId','UserId','ProfileName','HelpfulnessNumerator',
              'HelpfulnessDenominator','Score','Time','Summary'],axis=1, inplace=True)

In [8]:
dataset.head()

Unnamed: 0,Id,Text
0,1,Good Quality Dog Food I have bought several of...
1,2,Not as Advertised Product arrived labeled as J...
2,3,"""Delight"" says it all This is a confection tha..."
3,4,Cough Medicine If you are looking for the secr...
4,5,Great taffy Great taffy at a great price. The...


# Text Mining
We want to cluster the products using the reviews that we can find in the column 'Text' of our dataset. In order to do this we need to represent the reviews as TF-IDF score used in the previous homework.

In [9]:
import nltk
from nltk.corpus import stopwords
import string
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

In [10]:
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

In [11]:
#Realizing the cleaned token 
def clean_text(text):
    words = word_tokenize(text)
    
    good_words = []
    for word in words:
        if word.lower() not in stop_words and word.isalpha() and word.lower() not in string.punctuation:
            good_words.append(ps.stem(word).lower())
    return good_words

In [12]:
#test
print(dataset['Text'][0])
print(clean_text(dataset['Text'][0]))

Good Quality Dog Food I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.
['good', 'qualiti', 'dog', 'food', 'bought', 'sever', 'vital', 'can', 'dog', 'food', 'product', 'found', 'good', 'qualiti', 'product', 'look', 'like', 'stew', 'process', 'meat', 'smell', 'better', 'labrador', 'finicki', 'appreci', 'product', 'better']


In [13]:
#Applying the clean_text function to each element of the column 'Text'
dataset['Text_Words'] = dataset.Text.apply(lambda x: clean_text(x))

In [14]:
dataset.head()

Unnamed: 0,Id,Text,Text_Words
0,1,Good Quality Dog Food I have bought several of...,"[good, qualiti, dog, food, bought, sever, vita..."
1,2,Not as Advertised Product arrived labeled as J...,"[advertis, product, arriv, label, jumbo, salt,..."
2,3,"""Delight"" says it all This is a confection tha...","[delight, say, confect, around, centuri, light..."
3,4,Cough Medicine If you are looking for the secr...,"[cough, medicin, look, secret, ingredi, robitu..."
4,5,Great taffy Great taffy at a great price. The...,"[great, taffi, great, taffi, great, price, wid..."


In [15]:
def vocabulary(df):
    words = defaultdict(list)
    for i,el in enumerate(df['Text_Words']):
        for w in el:
            words[w].append(i)
    return words
voc = vocabulary(dataset)

In [16]:
#counting the ripetition of each token for each plot
dataset["Text_Ripetition"] = dataset["Text_Words"].apply(lambda x : Counter(x))

In [17]:
dataset.head()

Unnamed: 0,Id,Text,Text_Words,Text_Ripetition
0,1,Good Quality Dog Food I have bought several of...,"[good, qualiti, dog, food, bought, sever, vita...","{'good': 2, 'qualiti': 2, 'dog': 2, 'food': 2,..."
1,2,Not as Advertised Product arrived labeled as J...,"[advertis, product, arriv, label, jumbo, salt,...","{'advertis': 1, 'product': 2, 'arriv': 1, 'lab..."
2,3,"""Delight"" says it all This is a confection tha...","[delight, say, confect, around, centuri, light...","{'delight': 1, 'say': 1, 'confect': 1, 'around..."
3,4,Cough Medicine If you are looking for the secr...,"[cough, medicin, look, secret, ingredi, robitu...","{'cough': 1, 'medicin': 2, 'look': 1, 'secret'..."
4,5,Great taffy Great taffy at a great price. The...,"[great, taffi, great, taffi, great, price, wid...","{'great': 3, 'taffi': 4, 'price': 1, 'wide': 1..."


In [18]:
def tf_i(text):
    tot_token = 0
    for k, v in text.items():
        tot_token += v #numbers of total tokens in the plot
    tf = {}
    for token, rip in text.items():#the values in this dictionary are the ripetition of that specific token
        tf[token] = rip / tot_token
    return tf     

In [19]:
tf = []
for i in range(len(dataset['Text_Ripetition'])):
    tf.append(tf_i(dataset["Text_Ripetition"][i])) #applying the previous function to each plot
dataset["Tf"] = tf #adding the column tf to the dataset

In [20]:
dataset.head()

Unnamed: 0,Id,Text,Text_Words,Text_Ripetition,Tf
0,1,Good Quality Dog Food I have bought several of...,"[good, qualiti, dog, food, bought, sever, vita...","{'good': 2, 'qualiti': 2, 'dog': 2, 'food': 2,...","{'good': 0.07407407407407407, 'qualiti': 0.074..."
1,2,Not as Advertised Product arrived labeled as J...,"[advertis, product, arriv, label, jumbo, salt,...","{'advertis': 1, 'product': 2, 'arriv': 1, 'lab...","{'advertis': 0.05263157894736842, 'product': 0..."
2,3,"""Delight"" says it all This is a confection tha...","[delight, say, confect, around, centuri, light...","{'delight': 1, 'say': 1, 'confect': 1, 'around...","{'delight': 0.024390243902439025, 'say': 0.024..."
3,4,Cough Medicine If you are looking for the secr...,"[cough, medicin, look, secret, ingredi, robitu...","{'cough': 1, 'medicin': 2, 'look': 1, 'secret'...","{'cough': 0.05, 'medicin': 0.1, 'look': 0.05, ..."
4,5,Great taffy Great taffy at a great price. The...,"[great, taffi, great, taffi, great, price, wid...","{'great': 3, 'taffi': 4, 'price': 1, 'wide': 1...","{'great': 0.2, 'taffi': 0.26666666666666666, '..."


In [21]:
idf = {}
for el in dataset['Text_Words']:   
    for token in el:
         idf.update({ token : math.log(len(dataset)/ len(voc[token]))})  #applying the formula for each token 

In [22]:
# Creating a vocabulary that as keys as the word and as values its index
def vocabulary_words(df):
    words = dict()
    i = 0
    for el in tqdm(df['Text_Words']):
        for w in el:
            if w not in words.keys():
                words[w] = i
                i += 1
    return words

In [23]:
voc_words = vocabulary_words(dataset)

100%|██████████| 10000/10000 [00:00<00:00, 149697.66it/s]


In [24]:
def score(dictionary):
    v = [0]*len(voc_words)
    for word in dictionary.keys():
        v[voc_words[word]] = dictionary[word]*idf[word]
    return v

In [26]:
# lamda
dataset['Tf-Idf'] = dataset.Tf.apply(lambda x: score(x))

In [27]:
dataset.head()

Unnamed: 0,Id,Text,Text_Words,Text_Ripetition,Tf,Tf-Idf
0,1,Good Quality Dog Food I have bought several of...,"[good, qualiti, dog, food, bought, sever, vita...","{'good': 2, 'qualiti': 2, 'dog': 2, 'food': 2,...","{'good': 0.07407407407407407, 'qualiti': 0.074...","[0.058967869227243466, 0.1996761245261965, 0.1..."
1,2,Not as Advertised Product arrived labeled as J...,"[advertis, product, arriv, label, jumbo, salt,...","{'advertis': 1, 'product': 2, 'arriv': 1, 'lab...","{'advertis': 0.05263157894736842, 'product': 0...","[0, 0, 0, 0, 0, 0, 0, 0, 0.11483597709093576, ..."
2,3,"""Delight"" says it all This is a confection tha...","[delight, say, confect, around, centuri, light...","{'delight': 1, 'say': 1, 'confect': 1, 'around...","{'delight': 0.024390243902439025, 'say': 0.024...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,4,Cough Medicine If you are looking for the secr...,"[cough, medicin, look, secret, ingredi, robitu...","{'cough': 1, 'medicin': 2, 'look': 1, 'secret'...","{'cough': 0.05, 'medicin': 0.1, 'look': 0.05, ...","[0.03980331172838934, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,5,Great taffy Great taffy at a great price. The...,"[great, taffi, great, taffi, great, price, wid...","{'great': 3, 'taffi': 4, 'price': 1, 'wide': 1...","{'great': 0.2, 'taffi': 0.26666666666666666, '...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
#TF-IDF score or binary representation to represent Text_Words ?

## Implement KMeans from scratch
### Step 0 
Elbow method to choose the number of clusters k 
### Step 1
Random inizialization of the k representative points
### Step 2
Find the cluster $C_i$ defined as $\{x : \parallel x-\mu_i\parallel \leq \parallel x-\mu_j\parallel \forall j\ne i\}  \forall i=1..k$
### Step 3 
Find $\mu_i$ which is defined as $\frac{1}{|C_i|} \sum_{x\in C_i} x$ $\forall i=1..k$
<hr>
Repeat step 2 and step 3 until <b>convergence</b>.

### Convergence
We stop when the clusters don't change from the previous iteration.

In [None]:
#slice should be filled with the point representative of text_words

In [12]:
#step 0
elbow = {}
for k in tqdm(range(3,20)):
    elbow_model = KMeans (n_clusters = k)
    elbow_model.fit_predict(slice)
    elbow[k] = elbow_model.inertia_

  0%|                                                                                           | 0/17 [00:00<?, ?it/s]


NameError: name 'KMeans' is not defined