# Libraries

In [3]:
import pandas as pd
from tqdm import tqdm
from sklearn.cluster import KMeans
from collections import defaultdict
from collections import Counter
import math
from ast import literal_eval
import numpy as np

# Cleaning the Data

In [2]:
path = '/Users/domenicomattiacinque/Documents/Università/ADM2020/HW4/archive'
dataset = pd.read_csv(path + '/Reviews.csv', nrows = 10000)

In [3]:
dataset.head()

Unnamed: 0,Id,ProductId,UserId,ProfileName,HelpfulnessNumerator,HelpfulnessDenominator,Score,Time,Summary,Text
0,1,B001E4KFG0,A3SGXH7AUHU8GW,delmartian,1,1,5,1303862400,Good Quality Dog Food,I have bought several of the Vitality canned d...
1,2,B00813GRG4,A1D87F6ZCVE5NK,dll pa,0,0,1,1346976000,Not as Advertised,Product arrived labeled as Jumbo Salted Peanut...
2,3,B000LQOCH0,ABXLMWJIXXAIN,"Natalia Corres ""Natalia Corres""",1,1,4,1219017600,"""Delight"" says it all",This is a confection that has been around a fe...
3,4,B000UA0QIQ,A395BORC6FGVXV,Karl,3,3,2,1307923200,Cough Medicine,If you are looking for the secret ingredient i...
4,5,B006K2ZZ7K,A1UQRSCLF8GW1T,"Michael D. Bigham ""M. Wassir""",0,0,5,1350777600,Great taffy,Great taffy at a great price. There was a wid...


In [4]:
dataset.isnull().sum()

Id                        0
ProductId                 0
UserId                    0
ProfileName               0
HelpfulnessNumerator      0
HelpfulnessDenominator    0
Score                     0
Time                      0
Summary                   0
Text                      0
dtype: int64

In [5]:
dataset['Summary'].fillna('', inplace=True)

In [6]:
dataset['Text'] = dataset['Summary'] + ' ' + dataset['Text'] 

In [7]:
dataset.drop(['UserId','ProfileName','HelpfulnessNumerator',
              'HelpfulnessDenominator','Score','Time','Summary'],axis=1, inplace=True)

In [8]:
dataset.head()

Unnamed: 0,Id,ProductId,Text
0,1,B001E4KFG0,Good Quality Dog Food I have bought several of...
1,2,B00813GRG4,Not as Advertised Product arrived labeled as J...
2,3,B000LQOCH0,"""Delight"" says it all This is a confection tha..."
3,4,B000UA0QIQ,Cough Medicine If you are looking for the secr...
4,5,B006K2ZZ7K,Great taffy Great taffy at a great price. The...


In [10]:
dataset['ProductId'].nunique()

1422

In [18]:
df = dataset.groupby('ProductId')['Text'].apply(lambda x: ' '.join(x)).reset_index()

In [21]:
df.head()

Unnamed: 0,ProductId,Text
0,B00002NCJC,thirty bucks? Why is this $[...] when the same...
1,B00002Z754,WOW Make your own 'slickers' ! I just received...
2,B00005V3DC,Best herbal tea for digestion If you're new to...
3,B000084DVR,Premium Quality Dog Food!!! We have been using...
4,B000084E1U,Cats love it! I have nine cats and they are cr...


# Text Mining
We want to cluster the products using the reviews that we can find in the column 'Text' of our dataset. In order to do this we need to represent the reviews as TF-IDF score used in the previous homework.

In [1]:
import nltk
from nltk.corpus import stopwords
import string
import re
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize

In [23]:
stop_words = set(stopwords.words('english'))
ps = PorterStemmer()

In [24]:
#Realizing the cleaned token 
def clean_text(text):
    words = word_tokenize(text)
    tagged = nltk.pos_tag(words)
    
    good_words = []
    for word,tag in tagged:
        if tag == 'NN' or tag == 'NNS' or tag =='NNPS' or tag == 'NNP':
            if word.lower() not in stop_words and word.isalpha() and word.lower() not in string.punctuation:
                good_words.append(ps.stem(word).lower())
    return good_words

In [25]:
#test
print(df['Text'][0])
print(clean_text(df['Text'][0]))

thirty bucks? Why is this $[...] when the same product is available for $[...] here?<br />http://www.amazon.com/VICTOR-FLY-MAGNET-BAIT-REFILL/dp/B00004RBDY<br /><br />The Victor M380 and M502 traps are unreal, of course -- total fly genocide. Pretty stinky, but only right nearby. Flies Begone We have used the Victor fly bait for 3 seasons.  Can't beat it.  Great product!
['thirti', 'buck', 'product', 'br', 'http', 'br', 'br', 'victor', 'trap', 'cours', 'genocid', 'pretti', 'stinki', 'fli', 'victor', 'fli', 'bait', 'season', 'great', 'product']


In [1]:
#Applying the clean_text function to each element of the column 'Text'
df['Text_Words'] = df.Text.apply(lambda x: clean_text(x))

NameError: name 'df' is not defined

In [28]:
df.head(5)

Unnamed: 0,ProductId,Text,Text_Words
0,B00002NCJC,thirty bucks? Why is this $[...] when the same...,"[thirti, buck, product, br, http, br, br, vict..."
1,B00002Z754,WOW Make your own 'slickers' ! I just received...,"[wow, make, shipment, product, slicker, quot, ..."
2,B00005V3DC,Best herbal tea for digestion If you're new to...,"[tea, digest, product, dosag, batch, other, pr..."
3,B000084DVR,Premium Quality Dog Food!!! We have been using...,"[premium, qualiti, dog, food, food, month, fac..."
4,B000084E1U,Cats love it! I have nine cats and they are cr...,"[cat, cat, kibbl, thing, cat, food, cat, hate]"
5,B000084EK4,"Great beef look This food variety is ground, t...","[great, beef, look, food, varieti, cat, variet..."
6,B000084EK5,Family favorite - looks like steak! This is my...,"[steak, cat, food, gravi, food, bit, steak, fr..."
7,B000084EK6,Great food! This is another favorite in our ho...,"[food, favorit, hous, cat, time, month, bowl, ..."
8,B000084EK7,What's in this? This one is a great basic food...,"[one, food, mine, except, cat, ground, varieti..."
9,B000084EK8,Not the favorite in our house Once or twice a ...,"[favorit, hous, year, varieti, cat, varieti, c..."


In [4]:
df = pd.read_csv('new_reviews.csv')

In [5]:
df.Text_Words = df.Text_Words.apply(literal_eval)
df.head()

Unnamed: 0,ProductId,Text,Text_Words
0,0006641040,Read it once. Read it twice. Reading Chicken S...,"[chicken, soup, rice, day, person, chicken, so..."
1,141278509X,The best drink mix This product by Archer Farm...,"[drink, mix, product, archer, farm, drink, mix..."
2,2734888454,made in china My dogs loves this chicken but i...,"[dog, chicken, product, china, chicken, produc..."
3,2841233731,Great recipe book for my babycook This book is...,"[great, book, babycook, book, ingredi, store, ..."
4,7310172001,very good This product is a very health snack ...,"[product, health, snack, pup, beef, liver, tri..."


In [6]:
def vocabulary(df):
    words = defaultdict(list)
    for i,el in enumerate(df['Text_Words']):
        for w in el:
            words[w].append(i)
    return words
voc = vocabulary(df)

In [7]:
#counting the ripetition of each token for each plot
df["Text_Ripetition"] = df["Text_Words"].apply(lambda x : Counter(x))

In [8]:
df.head()

Unnamed: 0,ProductId,Text,Text_Words,Text_Ripetition
0,0006641040,Read it once. Read it twice. Reading Chicken S...,"[chicken, soup, rice, day, person, chicken, so...","{'chicken': 19, 'soup': 35, 'rice': 21, 'day':..."
1,141278509X,The best drink mix This product by Archer Farm...,"[drink, mix, product, archer, farm, drink, mix...","{'drink': 2, 'mix': 2, 'product': 1, 'archer':..."
2,2734888454,made in china My dogs loves this chicken but i...,"[dog, chicken, product, china, chicken, produc...","{'dog': 3, 'chicken': 2, 'product': 3, 'china'..."
3,2841233731,Great recipe book for my babycook This book is...,"[great, book, babycook, book, ingredi, store, ...","{'great': 1, 'book': 3, 'babycook': 1, 'ingred..."
4,7310172001,very good This product is a very health snack ...,"[product, health, snack, pup, beef, liver, tri...","{'product': 78, 'health': 1, 'snack': 11, 'pup..."


In [9]:
def tf_i(text):
    tot_token = 0
    for k, v in text.items():
        tot_token += v #numbers of total tokens in the plot
    tf = {}
    for token, rip in text.items():#the values in this dictionary are the ripetition of that specific token
        tf[token] = round(rip / tot_token,4)
    return tf 

In [10]:
tf = []
for i in range(len(df['Text_Ripetition'])):
    tf.append(tf_i(df["Text_Ripetition"][i])) #applying the previous function to each plot
df["Tf"] = tf #adding the column tf to the dataset

In [11]:
df.head()

Unnamed: 0,ProductId,Text,Text_Words,Text_Ripetition,Tf
0,0006641040,Read it once. Read it twice. Reading Chicken S...,"[chicken, soup, rice, day, person, chicken, so...","{'chicken': 19, 'soup': 35, 'rice': 21, 'day':...","{'chicken': 0.0239, 'soup': 0.0441, 'rice': 0...."
1,141278509X,The best drink mix This product by Archer Farm...,"[drink, mix, product, archer, farm, drink, mix...","{'drink': 2, 'mix': 2, 'product': 1, 'archer':...","{'drink': 0.08, 'mix': 0.08, 'product': 0.04, ..."
2,2734888454,made in china My dogs loves this chicken but i...,"[dog, chicken, product, china, chicken, produc...","{'dog': 3, 'chicken': 2, 'product': 3, 'china'...","{'dog': 0.1579, 'chicken': 0.1053, 'product': ..."
3,2841233731,Great recipe book for my babycook This book is...,"[great, book, babycook, book, ingredi, store, ...","{'great': 1, 'book': 3, 'babycook': 1, 'ingred...","{'great': 0.0667, 'book': 0.2, 'babycook': 0.0..."
4,7310172001,very good This product is a very health snack ...,"[product, health, snack, pup, beef, liver, tri...","{'product': 78, 'health': 1, 'snack': 11, 'pup...","{'product': 0.024, 'health': 0.0003, 'snack': ..."


In [12]:
idf = {}
for el in df['Text_Words']:   
    for token in el:
         idf.update({ token : round(math.log(len(df)/ len(voc[token]),3))})  #applying the formula for each token 

In [13]:
# Creating a vocabulary that as keys as the word and as values its index
def vocabulary_words(df):
    words = dict()
    i = 0
    for el in tqdm(df['Text_Words']):
        for w in el:
            if w not in words.keys():
                words[w] = i
                i += 1
    return words

In [14]:
voc_words = vocabulary_words(df)

100%|██████████| 74258/74258 [00:01<00:00, 45387.42it/s]


In [15]:
def score(dictionary):
    v = [0]*(len(voc_words))
    for word in dictionary.keys():
        v[voc_words[word]] = dictionary[word]*idf[word]
    return v

In [18]:
from scipy.sparse import lil_matrix

matrix = lil_matrix((len(df), len(voc_words)), dtype=np.float)
for i in tqdm(range(len(df["Tf"]))):
    matrix[i] = np.array(score(df["Tf"][i]))

100%|██████████| 74258/74258 [09:47<00:00, 126.34it/s]


In [19]:
from scipy.sparse import csr_matrix

matrix = csr_matrix(matrix)

## Dimensionality Reduction

In [21]:
from sklearn.decomposition import TruncatedSVD

In [26]:
matrix.shape

(74258, 62787)

In [27]:
svd = TruncatedSVD(n_components=2000, random_state=42)
svd.fit(matrix)

TruncatedSVD(n_components=2000, random_state=42)

In [28]:
print(svd.explained_variance_ratio_.sum())

0.6081094855890445


In [29]:
svd_matrix = svd.transform(matrix)

In [30]:
svd_matrix.shape

(74258, 2000)

## Implement KMeans from scratch
### Step 0 
Elbow method to choose the number of clusters k 
### Step 1
Random inizialization of the k representative points
### Step 2
Find the cluster $C_i$ defined as $\{x : \parallel x-\mu_i\parallel \leq \parallel x-\mu_j\parallel \forall j\ne i\}  \forall i=1..k$
### Step 3 
Find $\mu_i$ which is defined as $\frac{1}{|C_i|} \sum_{x\in C_i} x \;\; \forall i=1..k$
<hr>
Repeat step 2 and step 3 until <b>convergence</b>.

### Convergence
We stop when the clusters don't change from the previous iteration.

In [34]:
import random

In [86]:
def euclidian_dist(x, y):
    dist = 0
    for i in range(len(x)):    #iterating over the number of components
        dist += (x[i]-y[i])**2  
    return (math.sqrt(dist))

In [31]:
def euclidian_dist(x,y):
    return np.linalg.norm(x-y)

In [37]:
def KMeans(k, matrix):
    # 1. Choosing k random representatives
    representatives_idx = random.sample(range(1, len(matrix)), k)  #Chosing the initial representatives
    new_representatives =  []
    for i in range(k):
        new_representatives.append(matrix[representatives_idx[i]])
    representatives = []
    
    run = True
    limit = 0
    while run:
        representatives = new_representatives
        
        # Creating the clusters
        clusters = {i: [] for i in range(k)}  # Empty dictionary for each cluster with list as value 
        i_clusters = {i: [] for i in range(k)}
        
        # 2. Computing distances and choosing 
        for ind, point in enumerate(matrix):
            dist_xmu = []      # Distance between the point and the representatives for each representative
            for i in range(k): 
                dist_xmu.append(euclidian_dist(representatives[i], point))
            nearest_representative = dist_xmu.index(min(dist_xmu))
            clusters[nearest_representative].append(point) #Put the point in the cluster with the minimum distance
            i_clusters[nearest_representative].append(ind)
            
        # 3. Selecting new representatives
        new_representatives = []
        for i in range(k):
            run = False
            new_val = np.sum(clusters[i],axis=0)/len(clusters[i])
            new_representatives.append(new_val)
            if representatives[i][0] != new_representatives[i][0] or representatives[i][1] != new_representatives[i][1]:
                run = True
                limit += 1
        
        if limit == 15:
            return clusters, new_representatives, i_clusters
                
    return clusters, new_representatives, i_clusters

In [38]:
clusters, new_representatives, i_clusters = KMeans(10, svd_matrix)

## Cluster Analysis 

In [39]:
df.iloc[i_clusters[0]]

Unnamed: 0,ProductId,Text,Text_Words,Text_Ripetition,Tf
401,B0000CNU2H,Anything else is just... disappointing. Curry ...,"[anyth, curri, powder, blend, spice, orient, c...","{'anyth': 2, 'curri': 10, 'powder': 5, 'blend'...","{'anyth': 0.0526, 'curri': 0.2632, 'powder': 0..."
424,B0000CNU78,"Spicy curry is very easy to use, but too hot f...","[spici, curri, rme, love, chicken, curri, vaca...","{'spici': 2, 'curri': 5, 'rme': 1, 'love': 1, ...","{'spici': 0.0741, 'curri': 0.1852, 'rme': 0.03..."
425,B0000CNU7C,It's curry-tastic!!! I have been a fan of S&am...,"[fan, amp, b, golden, curri, box, sf, japantow...","{'fan': 1, 'amp': 1, 'b': 1, 'golden': 3, 'cur...","{'fan': 0.0123, 'amp': 0.0123, 'b': 0.0123, 'g..."
1699,B00012OI64,1st try This was delightful. I made 2 catfish ...,"[tri, thick, steak, onion, cilantro, tomato, c...","{'tri': 1, 'thick': 1, 'steak': 1, 'onion': 1,...","{'tri': 0.0526, 'thick': 0.0526, 'steak': 0.05..."
1757,B00013YMVY,Really Great Tasting Green Curry! Highly Recom...,"[great, tast, green, curri, highli, thai, food...","{'great': 1, 'tast': 2, 'green': 2, 'curri': 3...","{'great': 0.0455, 'tast': 0.0909, 'green': 0.0..."
...,...,...,...,...,...
73204,B008EDKDDE,Good mix if you don't have time or ingredients...,"[mix, time, ingredi, homemad, mix, time, every...","{'mix': 7, 'time': 3, 'ingredi': 1, 'homemad':...","{'mix': 0.2593, 'time': 0.1111, 'ingredi': 0.0..."
73217,B008EE64UY,"Easy, Excellent, Curry I love this curry powde...","[easi, excel, curri, curri, powder, mix, varie...","{'easi': 1, 'excel': 1, 'curri': 7, 'powder': ...","{'easi': 0.0278, 'excel': 0.0278, 'curri': 0.1..."
73886,B00914BS6G,Aromatic I purchased these fresh curry leaves ...,"[curri, south, stew, addit, tarka, spice, leav...","{'curri': 4, 'south': 1, 'stew': 1, 'addit': 1...","{'curri': 0.0816, 'south': 0.0204, 'stew': 0.0..."
74041,B0096BRED0,Great Curry for Chicken Salad This is the perf...,"[great, curri, chicken, salad, curri, curri, c...","{'great': 1, 'curri': 7, 'chicken': 4, 'salad'...","{'great': 0.0175, 'curri': 0.1228, 'chicken': ..."
