# Libraries

In [1]:
import pandas as pd
from tqdm import tqdm
from sklearn.cluster import KMeans
from collections import defaultdict
from collections import Counter
import math

# Cleaning the Data

In [2]:
path = '/Users/domenicomattiacinque/Documents/Università/ADM2020/HW4/archive'
dataset = pd.read_csv(path + '/Reviews.csv', nrows = 10000)

In [3]:
dataset.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [4]:
dataset.isnull().sum()

Id                        0
ProductId                 0
UserId                    0
ProfileName               0
HelpfulnessNumerator      0
HelpfulnessDenominator    0
Score                     0
Time                      0
Summary                   0
Text                      0
dtype: int64

In [5]:
dataset['Summary'].fillna('', inplace=True)

In [6]:
dataset['Text'] = dataset['Summary'] + ' ' + dataset['Text'] 

In [7]:
dataset.drop(['UserId','ProfileName','HelpfulnessNumerator',
              'HelpfulnessDenominator','Score','Time','Summary'],axis=1, inplace=True)

In [8]:
dataset.head()

Unnamed: 0,Id,ProductId,Text
0,1,B001E4KFG0,Good Quality Dog Food I have bought several of...
1,2,B00813GRG4,Not as Advertised Product arrived labeled as J...
2,3,B000LQOCH0,"""Delight"" says it all This is a confection tha..."
3,4,B000UA0QIQ,Cough Medicine If you are looking for the secr...
4,5,B006K2ZZ7K,Great taffy Great taffy at a great price. The...


In [10]:
dataset['ProductId'].nunique()

1422

In [18]:
df = dataset.groupby('ProductId')['Text'].apply(lambda x: ' '.join(x)).reset_index()

In [21]:
df.head()

Unnamed: 0,ProductId,Text
0,B00002NCJC,thirty bucks? Why is this $[...] when the same...
1,B00002Z754,WOW Make your own 'slickers' ! I just received...
2,B00005V3DC,Best herbal tea for digestion If you're new to...
3,B000084DVR,Premium Quality Dog Food!!! We have been using...
4,B000084E1U,Cats love it! I have nine cats and they are cr...


# Text Mining
We want to cluster the products using the reviews that we can find in the column 'Text' of our dataset. In order to do this we need to represent the reviews as TF-IDF score used in the previous homework.

In [22]:
import nltk
from nltk.corpus import stopwords
import string
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

In [23]:
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

In [24]:
#Realizing the cleaned token 
def clean_text(text):
    words = word_tokenize(text)
    tagged = nltk.pos_tag(words)
    
    good_words = []
    for word,tag in tagged:
        if tag == 'NN' or tag == 'NNS' or tag =='NNPS' or tag == 'NNP':
            if word.lower() not in stop_words and word.isalpha() and word.lower() not in string.punctuation:
                good_words.append(ps.stem(word).lower())
    return good_words

In [25]:
#test
print(df['Text'][0])
print(clean_text(df['Text'][0]))

thirty bucks? Why is this $[...] when the same product is available for $[...] here?<br />http://www.amazon.com/VICTOR-FLY-MAGNET-BAIT-REFILL/dp/B00004RBDY<br /><br />The Victor M380 and M502 traps are unreal, of course -- total fly genocide. Pretty stinky, but only right nearby. Flies Begone We have used the Victor fly bait for 3 seasons.  Can't beat it.  Great product!
['thirti', 'buck', 'product', 'br', 'http', 'br', 'br', 'victor', 'trap', 'cours', 'genocid', 'pretti', 'stinki', 'fli', 'victor', 'fli', 'bait', 'season', 'great', 'product']


In [27]:
#Applying the clean_text function to each element of the column 'Text'
df['Text_Words'] = df.Text.apply(lambda x: clean_text(x))

In [28]:
df.head(10)

Unnamed: 0,ProductId,Text,Text_Words
0,B00002NCJC,thirty bucks? Why is this $[...] when the same...,"[thirti, buck, product, br, http, br, br, vict..."
1,B00002Z754,WOW Make your own 'slickers' ! I just received...,"[wow, make, shipment, product, slicker, quot, ..."
2,B00005V3DC,Best herbal tea for digestion If you're new to...,"[tea, digest, product, dosag, batch, other, pr..."
3,B000084DVR,Premium Quality Dog Food!!! We have been using...,"[premium, qualiti, dog, food, food, month, fac..."
4,B000084E1U,Cats love it! I have nine cats and they are cr...,"[cat, cat, kibbl, thing, cat, food, cat, hate]"
5,B000084EK4,"Great beef look This food variety is ground, t...","[great, beef, look, food, varieti, cat, variet..."
6,B000084EK5,Family favorite - looks like steak! This is my...,"[steak, cat, food, gravi, food, bit, steak, fr..."
7,B000084EK6,Great food! This is another favorite in our ho...,"[food, favorit, hous, cat, time, month, bowl, ..."
8,B000084EK7,What's in this? This one is a great basic food...,"[one, food, mine, except, cat, ground, varieti..."
9,B000084EK8,Not the favorite in our house Once or twice a ...,"[favorit, hous, year, varieti, cat, varieti, c..."


In [38]:
def vocabulary(df):
    words = defaultdict(list)
    for i,el in enumerate(df['Text_Words']):
        for w in el:
            words[w].append(i)
    return words
voc = vocabulary(df)

In [39]:
#counting the ripetition of each token for each plot
df["Text_Ripetition"] = df["Text_Words"].apply(lambda x : Counter(x))

In [40]:
df.head()

Unnamed: 0,ProductId,Text,Text_Words,Text_Ripetition,Tf
0,B00002NCJC,thirty bucks? Why is this $[...] when the same...,"[thirti, buck, product, br, http, br, br, vict...","{'thirti': 1, 'buck': 1, 'product': 2, 'br': 3...","{'thirti': 0.05, 'buck': 0.05, 'product': 0.1,..."
1,B00002Z754,WOW Make your own 'slickers' ! I just received...,"[wow, make, shipment, product, slicker, quot, ...","{'wow': 1, 'make': 1, 'shipment': 1, 'product'...","{'wow': 0.030303030303030304, 'make': 0.030303..."
2,B00005V3DC,Best herbal tea for digestion If you're new to...,"[tea, digest, product, dosag, batch, other, pr...","{'tea': 5, 'digest': 1, 'product': 2, 'dosag':...","{'tea': 0.16129032258064516, 'digest': 0.03225..."
3,B000084DVR,Premium Quality Dog Food!!! We have been using...,"[premium, qualiti, dog, food, food, month, fac...","{'premium': 1, 'qualiti': 1, 'dog': 3, 'food':...","{'premium': 0.02857142857142857, 'qualiti': 0...."
4,B000084E1U,Cats love it! I have nine cats and they are cr...,"[cat, cat, kibbl, thing, cat, food, cat, hate]","{'cat': 4, 'kibbl': 1, 'thing': 1, 'food': 1, ...","{'cat': 0.5, 'kibbl': 0.125, 'thing': 0.125, '..."


In [41]:
def tf_i(text):
    tot_token = 0
    for k, v in text.items():
        tot_token += v #numbers of total tokens in the plot
    tf = {}
    for token, rip in text.items():#the values in this dictionary are the ripetition of that specific token
        tf[token] = rip / tot_token
    return tf     

In [42]:
tf = []
for i in range(len(df['Text_Ripetition'])):
    tf.append(tf_i(df["Text_Ripetition"][i])) #applying the previous function to each plot
df["Tf"] = tf #adding the column tf to the dataset

In [43]:
df.head()

Unnamed: 0,ProductId,Text,Text_Words,Text_Ripetition,Tf
0,B00002NCJC,thirty bucks? Why is this $[...] when the same...,"[thirti, buck, product, br, http, br, br, vict...","{'thirti': 1, 'buck': 1, 'product': 2, 'br': 3...","{'thirti': 0.05, 'buck': 0.05, 'product': 0.1,..."
1,B00002Z754,WOW Make your own 'slickers' ! I just received...,"[wow, make, shipment, product, slicker, quot, ...","{'wow': 1, 'make': 1, 'shipment': 1, 'product'...","{'wow': 0.030303030303030304, 'make': 0.030303..."
2,B00005V3DC,Best herbal tea for digestion If you're new to...,"[tea, digest, product, dosag, batch, other, pr...","{'tea': 5, 'digest': 1, 'product': 2, 'dosag':...","{'tea': 0.16129032258064516, 'digest': 0.03225..."
3,B000084DVR,Premium Quality Dog Food!!! We have been using...,"[premium, qualiti, dog, food, food, month, fac...","{'premium': 1, 'qualiti': 1, 'dog': 3, 'food':...","{'premium': 0.02857142857142857, 'qualiti': 0...."
4,B000084E1U,Cats love it! I have nine cats and they are cr...,"[cat, cat, kibbl, thing, cat, food, cat, hate]","{'cat': 4, 'kibbl': 1, 'thing': 1, 'food': 1, ...","{'cat': 0.5, 'kibbl': 0.125, 'thing': 0.125, '..."


In [44]:
idf = {}
for el in df['Text_Words']:   
    for token in el:
         idf.update({ token : math.log(len(df)/ len(voc[token]))})  #applying the formula for each token 

In [45]:
# Creating a vocabulary that as keys as the word and as values its index
def vocabulary_words(df):
    words = dict()
    i = 0
    for el in tqdm(df['Text_Words']):
        for w in el:
            if w not in words.keys():
                words[w] = i
                i += 1
    return words

In [46]:
voc_words = vocabulary_words(df)

100%|██████████| 1422/1422 [00:00<00:00, 36305.92it/s]


In [47]:
def score(dictionary):
    v = [0]*(len(voc_words))
    for word in dictionary.keys():
        v[voc_words[word]] = dictionary[word]*idf[word]
    return v

In [48]:
# lambda
df['Tf-Idf'] = df.Tf.apply(lambda x: score(x))

In [49]:
df.head()

Unnamed: 0,ProductId,Text,Text_Words,Text_Ripetition,Tf,Tf-Idf
0,B00002NCJC,thirty bucks? Why is this $[...] when the same...,"[thirti, buck, product, br, http, br, br, vict...","{'thirti': 1, 'buck': 1, 'product': 2, 'br': 3...","{'thirti': 0.05, 'buck': 0.05, 'product': 0.1,...","[0.26569547306539365, 0.15427161702337747, -0...."
1,B00002Z754,WOW Make your own 'slickers' ! I just received...,"[wow, make, shipment, product, slicker, quot, ...","{'wow': 1, 'make': 1, 'shipment': 1, 'product'...","{'wow': 0.030303030303030304, 'make': 0.030303...","[0, 0, -0.10408308572411039, 0, 0, 0, 0, 0, 0,..."
2,B00005V3DC,Best herbal tea for digestion If you're new to...,"[tea, digest, product, dosag, batch, other, pr...","{'tea': 5, 'digest': 1, 'product': 2, 'dosag':...","{'tea': 0.16129032258064516, 'digest': 0.03225...","[0, 0, -0.05539906175638133, -0.12771574252249..."
3,B000084DVR,Premium Quality Dog Food!!! We have been using...,"[premium, qualiti, dog, food, food, month, fac...","{'premium': 1, 'qualiti': 1, 'dog': 3, 'food':...","{'premium': 0.02857142857142857, 'qualiti': 0....","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,B000084E1U,Cats love it! I have nine cats and they are cr...,"[cat, cat, kibbl, thing, cat, food, cat, hate]","{'cat': 4, 'kibbl': 1, 'thing': 1, 'food': 1, ...","{'cat': 0.5, 'kibbl': 0.125, 'thing': 0.125, '...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


## Dimensionality Reduction

In [50]:
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.decomposition import TruncatedSVD

In [64]:
mlb = MultiLabelBinarizer(sparse_output=True)
a = mlb.fit_transform(df['Tf-Idf'])

In [74]:
a.shape

(1422, 51650)

In [75]:
svd = TruncatedSVD(n_components=500, n_iter=30, random_state=42)
svd.fit(a)

TruncatedSVD(n_components=500, n_iter=30, random_state=42)

In [76]:
print(svd.explained_variance_ratio_.sum())

0.7779640338936401


In [77]:
svd_a = svd.transform(a)

In [78]:
svd_a.shape

(1422, 500)

## Implement KMeans from scratch
### Step 0 
Elbow method to choose the number of clusters k 
### Step 1
Random inizialization of the k representative points
### Step 2
Find the cluster $C_i$ defined as $\{x : \parallel x-\mu_i\parallel \leq \parallel x-\mu_j\parallel \forall j\ne i\}  \forall i=1..k$
### Step 3 
Find $\mu_i$ which is defined as $\frac{1}{|C_i|} \sum_{x\in C_i} x$ $\forall i=1..k$
<hr>
Repeat step 2 and step 3 until <b>convergence</b>.

### Convergence
We stop when the clusters don't change from the previous iteration.

In [85]:
from sklearn.metrics import pairwise_distances
import random

In [86]:
def euclidian_dist (x, y):
    dist = 0
    for i in range(len(x)):    #iterating over the number of components
        dist += (x[i]-y[i])**2  
    return (math.sqrt(dist))

In [87]:
#K-means Algorithm
def KMeans(k, matrix):
     #Step 1
    representatives_idx = random.sample(range(1, len(matrix)), k)  #Chosing the initial representatives
    new_representatives =  []
    
    for i in range(k):
        new_representatives.append(matrix[representatives_idx[i]])
        
    representatives = []
    
    run = True
    limit = 0
    
    while run:
        representatives = new_representatives
        #Creating the clusters
        clusters = {i: [] for i in range(k)}  # empty dictionary for each cluster with list as value 
        #Step 2
        for point in matrix:
            dist_xmu = []      #This variable contains the distance between the point and the representatives
            for i in range(k): #for each representative
                dist_xmu.append(euclidian_dist(representatives[i], point))
            nearest_representative = dist_xmu.index(min(dist_xmu))
            clusters[nearest_representative].append(point) #Put the point in the cluster with the minimum distance
        
        #Step 3
        new_representatives = []
        for i in range(k):
            run = False
            new_val = np.sum(clusters[i],axis=0)/len(clusters[i])
            new_representatives.append(new_val)
            if representatives[i][0] != new_representatives[i][0] or representatives[i][1] != new_representatives[i][1]:
                run = True
                limit += 1
        
        if limit == 15:
            return clusters, new_representatives
                
    return clusters, new_representatives

In [89]:
clusters, new_representatives = KMeans(10, svd_a)

1
2
3
4
5
6
7


In [12]:
#step 0
elbow = {}
for k in tqdm(range(3,20)):
    elbow_model = KMeans (n_clusters = k)
    elbow_model.fit_predict(slice)
    elbow[k] = elbow_model.inertia_

  0%|                                                                                           | 0/17 [00:00<?, ?it/s]


NameError: name 'KMeans' is not defined