# Unsupervised Clustering using PCA and K-Means for Products


In [None]:
#Import libraries
import pandas as pd
import seaborn as sns
import numpy as np
import string

from tqdm import tqdm_notebook as tqdm
from keras.utils import np_utils
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.cluster import KMeans, DBSCAN
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn import metrics
from sklearn.metrics import silhouette_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import PCA
from sklearn.feature_extraction import text 


In [None]:
#Import data
# Load the raw meals table; the path is relative to the current working directory.
df = pd.read_csv('../raw/meals.csv')

In [None]:
#Inspect data
# Uncomment the next line to make pandas display every column.
# pd.options.display.max_columns = None
# Show three randomly chosen rows of the loaded table.
df.sample(3)

In [None]:
# Record the initial table dimensions and report how many distinct item ids occur.
ini_rows, ini_columns = df.shape
unique_items = df.id.nunique()
print('We have {} records of transactions for {} unique items.'.format(ini_rows, unique_items))

In [None]:
# Count missing values per column, listing the columns with the most gaps first.
df.isnull().sum().sort_values(ascending=False)

In [None]:
# Keep only the columns used to build the item features. An explicit copy is
# taken because this frame is modified later (rows dropped in place, columns
# added/overwritten); without it pandas treats it as a slice of df and emits
# SettingWithCopyWarning on those writes.
menu_features = df[['ingredients','temperature','macros','price','category','name','id']].copy()

In [None]:
# Drop every row that has a missing value in any of the selected columns.
# The result is rebound to a fresh copy instead of using inplace=True, so the
# later column assignments on menu_features do not act on a slice of df.
menu_features = menu_features.dropna(how='any').copy()

In [None]:
# (rows, columns) of the feature table after rows with missing values were dropped.
menu_features.shape

### Ingredients

In [None]:
#Build the corpus: a one-column frame holding each row's ingredients text
texts = menu_features.ingredients.to_frame()

In [None]:
#Inspect corpus
# Show five randomly chosen rows of the ingredients corpus.
texts.sample(5)

In [None]:
#Delete every ASCII punctuation character from each ingredients string
punct_table = str.maketrans('', '', string.punctuation)
texts['clean_text'] = texts.ingredients.map(lambda raw: str(raw).translate(punct_table))

In [None]:
# Extra digit-only tokens to exclude from the vocabulary, in addition to the
# English stop words they are combined with when the vectorizer is built.
add_stopwords = [
    '001', '003',
    '01', '05', '07',
    '11', '13', '18',
    '6070', '74',
]

In [None]:
# Learn the ingredient vocabulary (at most 500 terms) from the cleaned corpus.
# CountVectorizer documents stop_words as 'english', a list, or None, so the
# combined stop-word set is converted to a list instead of passing a frozenset.
all_stop_words = sorted(text.ENGLISH_STOP_WORDS.union(add_stopwords))
cvec_full_list = CountVectorizer(stop_words=all_stop_words, max_features=500)
cvec_full_list.fit(texts.clean_text)

In [None]:
# Bag-of-words counts: one row per ingredients text, one column per vocabulary term.
# Column labels are taken from vocabulary_ (term -> column index) ordered by
# index, which is the same ordering get_feature_names() produced but does not
# rely on that method, which was deprecated and later removed from scikit-learn.
vocab_terms = sorted(cvec_full_list.vocabulary_, key=cvec_full_list.vocabulary_.get)
# toarray() yields a plain ndarray, whereas todense() yields the legacy np.matrix.
bow = pd.DataFrame(cvec_full_list.transform(texts.clean_text).toarray(), columns=vocab_terms)
word_count = bow.sum(axis=0)

print("Most common ingredients:")
word_count.sort_values(ascending = False).head(20)

In [None]:
# Number of terms kept in the vocabulary (at most max_features). Read from the
# fitted vocabulary_ mapping so it does not depend on get_feature_names(),
# which was deprecated and later removed from scikit-learn.
len(cvec_full_list.vocabulary_)

In [None]:
# ingre_dum = np.zeros((menu_features.id.nunique(),500)) #row=items, col=ingredients

In [None]:
# Item x ingredient indicator table, initialised to 0: one row per unique item
# id and one column per vocabulary term.
# The width is taken from the fitted vocabulary because max_features=500 is only
# an upper bound; a hard-coded 500 raises a shape error when fewer terms exist.
item_ids = menu_features.id.unique()
vocab_terms = sorted(cvec_full_list.vocabulary_, key=cvec_full_list.vocabulary_.get)
matrix = pd.DataFrame(np.zeros((len(item_ids), len(vocab_terms))), index=item_ids, columns=vocab_terms)

In [None]:
# Punctuation-free copy of the ingredients text, stored alongside the other features.
ingre_punct_table = str.maketrans('', '', string.punctuation)
menu_features['clean_ingre'] = menu_features.ingredients.map(lambda raw: str(raw).translate(ingre_punct_table))

In [None]:
# Flag, for every item id, which vocabulary ingredients occur in its text.
#
# * The punctuation-stripped 'clean_ingre' text is analysed, because the
#   vocabulary was learnt from text cleaned the same way; analysing the raw
#   'ingredients' text tokenises differently and misses vocabulary terms.
# * The already-fitted vectorizer's analyzer is reused (same lower-casing,
#   tokenisation and stop words) instead of fitting a new CountVectorizer per
#   item, which is slow and raises ValueError when an item yields no tokens.
# * vocabulary_ is a dict, so each membership test is O(1).
analyze = cvec_full_list.build_analyzer()
vocab = cvec_full_list.vocabulary_
for item, item_texts in menu_features.groupby('id')['clean_ingre']:
    hits = sorted({word for doc in item_texts for word in analyze(doc) if word in vocab})
    if hits:
        matrix.loc[item, hits] = 1
            

In [None]:
# One-hot encode the temperature column; drop_first=True omits the first level,
# which therefore acts as the implicit baseline.
df_temp = pd.get_dummies(menu_features.temperature, drop_first=True)

In [None]:
# Attach each item's price, taken from the first menu_features row with that id.
# A single index-aligned assignment replaces the per-id boolean scan and avoids
# writing non-integer prices into a column initialised with the integer 0.
first_price = menu_features.drop_duplicates(subset='id').set_index('id')['price']
matrix['price'] = first_price.reindex(matrix.index)

In [None]:
# Show five randomly chosen rows of the item feature matrix.
matrix.sample(5)

In [None]:
#Check possible values
# Frequency of each distinct category label, most common first.
menu_features.category.value_counts()

There are duplicate categories, such as 'main' and 'Main Course', which can be consolidated.

In [None]:
#Consolidating similar categories
def consolidate_cat(x):
    """Return the canonical category label for *x*.

    Known spelling variants are mapped onto their canonical name; any other
    value is returned unchanged.
    """
    canonical = {
        'Main Course': 'main',
        'Appetiser': 'side',
        'Dessert': 'dessert',
        'partnership': 'off_menu',
    }
    return canonical.get(x, x)

# Replace every category label with its consolidated form.
consolidated = menu_features.category.apply(consolidate_cat)
menu_features.category = consolidated

#Print new categories
menu_features.category.value_counts()

In [None]:
# One-hot encode the category column; drop_first=True omits the first level,
# which therefore acts as the implicit baseline.
df_cat = pd.get_dummies(menu_features.category, drop_first=True)

In [None]:
# Split each macros string on ', ' into separate columns (at most 5 splits,
# i.e. up to 6 columns labelled 0..5); shorter rows are padded with missing values.
new = menu_features.macros.str.split(', ', n=5, expand=True)

In [None]:
def split_values_into_name_and_val(col, frame=None):
    """Split the 'name:value' strings of one column into two new columns.

    Adds '<col>_macros_name' (lower-cased text before the first ':', or
    'none' where the entry is missing) and '<col>_macros_val' (text after the
    first ':') to the frame, then prints the distinct names found.

    Parameters
    ----------
    col : column label of the frame holding the 'name:value' strings.
    frame : DataFrame to read from and write to. Defaults to the
        notebook-level ``new`` frame, which preserves the original behaviour.
    """
    if frame is None:
        frame = new
    # Split once and reuse both halves. reindex guarantees that column 1 exists
    # (filled with NaN) even when no entry in the column contains a ':'.
    parts = frame[col].str.split(':', n=2, expand=True).reindex(columns=[0, 1])
    frame[str(col)+'_macros_name'] = parts[0].fillna('None').str.lower()
    frame[str(col)+'_macros_val'] = parts[1]
    print(frame[str(col)+'_macros_name'].unique())

# Split every column of `new` into name/value columns. The column labels are
# snapshotted first because split_values_into_name_and_val adds columns to
# `new`, and the loop must not depend on iterating a frame while it grows.
for col in list(new.columns):
    split_values_into_name_and_val(col)

# Show five randomly chosen rows of the expanded table.
new.sample(5)

In [None]:
# Create the four macro columns with 0 as the placeholder value.
for macro_col in ('calories', 'fat', 'protein', 'carb'):
    new[macro_col] = 0

In [None]:
# Copy each parsed macro value into its dedicated column (calories / fat /
# protein / carb).
#
# Every '<n>_macros_name' column is scanned in positional order, so a value
# found at a later position overwrites one found earlier, as before. Labels are
# compared after stripping surrounding whitespace, and the 'carbs' spelling is
# folded into 'carb'. This covers all position/spelling combinations uniformly
# instead of only the hand-listed ones, so a macro at an unlisted position no
# longer silently keeps the 0 placeholder.
macro_targets = {
    'calories': 'calories',
    'fat': 'fat',
    'protein': 'protein',
    'carb': 'carb',
    'carbs': 'carb',
}
for target in sorted(set(macro_targets.values())):
    # object dtype lets the string values replace the integer placeholders
    # without an implicit dtype upcast on assignment.
    new[target] = new[target].astype(object)

name_cols = [c for c in new.columns if str(c).endswith('_macros_name')]
for name_col in name_cols:
    val_col = str(name_col)[:-len('_macros_name')] + '_macros_val'
    labels = new[name_col].str.strip()
    for label, target in macro_targets.items():
        hit = labels == label
        new.loc[hit, target] = new.loc[hit, val_col]

In [None]:
#Keep only the text that precedes the first 'g' in each gram-based macro value
cols = ['fat','protein','carb']
for macro_col in cols:
    as_text = new[macro_col].astype(str)
    new[macro_col] = as_text.str.split('g').str[0]

In [None]:
# Convert the four macro columns from text to floating point.
numeric_macros = ['calories', 'fat','protein','carb']
new[numeric_macros] = new[numeric_macros].astype(float)

In [None]:
# Keep only the four numeric macro columns.
df_macros = new[['calories', 'fat','protein','carb']]

In [None]:
# Join the feature table with the category dummies, macro values and temperature
# dummies column-wise (rows are aligned on the index).
# Note: this rebinds `df`, so the raw table loaded at the top is no longer reachable under that name.
df = pd.concat([menu_features,df_cat,df_macros,df_temp], axis=1)

In [None]:
# Keep the id plus the modelling columns: category dummies, macros and temperature dummies.
# The column names are hard-coded, so a KeyError is raised if any of them is absent
# (e.g. a dummy level that does not occur in the data).
df = df[['id','drink', 'favorite', 'fresh', 'guilt-free','highlight', 'local', 'main', 
   'off_menu', 'regular', 'side', 'calories','fat', 'protein', 'carb', 'room', 'warm']]

In [None]:
# Left-join the item matrix (keyed by its index) with df on 'id', then drop the key column.
# Overlapping column names receive the '_mat' / '_features' suffixes.
# NOTE(review): matrix has one row per unique id; if df holds several rows for an id,
# the join repeats that item once per matching row — confirm this weighting is intended.
X = matrix.merge(df,how='left',left_on=matrix.index, right_on='id', suffixes=('_mat','_features')).drop('id', axis=1)

## Modelling

In [None]:
# Show five randomly chosen rows of the modelling matrix.
X.sample(5)

In [None]:
# Standardise every feature column: learn the scaling on X, then apply it to X.
ss = StandardScaler()
Xs = ss.fit(X).transform(X)

### PCA

In [None]:
# Project the standardised features onto the first 20 principal components.
pca = PCA(n_components=20)
principalComponents = pca.fit_transform(Xs)

# Plot the explained variances
# explained_variance_ratio_ holds fractions; multiply by 100 so the bars match
# the 'variance %' axis label.
features = range(pca.n_components_)
plt.bar(features, pca.explained_variance_ratio_ * 100)
plt.xlabel('PCA features')
plt.ylabel('variance %')
plt.xticks(features)

# Save components to a DataFrame
PCA_components = pd.DataFrame(principalComponents)

### KMeans

In [None]:
# Elbow analysis: fit KMeans for k = 1..9 on the first nine principal
# components and record each model's inertia.
ks = range(1, 10)
inertias = []
silhouette_scores = []
elbow_features = PCA_components.iloc[:, :9]
for k in ks:
    # Despite its name, knn_model is a KMeans estimator with k clusters.
    knn_model = KMeans(n_clusters=k).fit(elbow_features)
    inertias.append(knn_model.inertia_)

# Inertia as a function of the number of clusters.
plt.plot(ks, inertias, '-o', color='black')
plt.xlabel('number of clusters, k')
plt.ylabel('inertia')
plt.xticks(ks)
plt.show()

In [None]:
# Scatter of the first two principal components (columns 0 and 1), the same two
# columns the final KMeans model is fitted on. Columns 3 and 2 were plotted
# before, which contradicted the 'PCA 1' / 'PCA 2' axis labels.
plt.scatter(PCA_components[0], PCA_components[1], alpha=.05, color='brown')
plt.xlabel('PCA 1')
plt.ylabel('PCA 2')

In [None]:
# Final model: two clusters fitted on the first two principal components.
first_two_components = PCA_components.iloc[:, :2]
model = KMeans(n_clusters=2)
model.fit(first_two_components)

In [None]:
# Cluster label assigned to each row the model was fitted on.
pred = model.labels_

In [None]:
# Store the cluster labels next to the features (assigned positionally, row by row).
X['cluster'] = pred

In [None]:
# Number of rows assigned to each cluster.
X.cluster.value_counts()

In [None]:
# Silhouette score of the cluster labels, measured with Euclidean distance on the
# standardised feature matrix Xs.
# NOTE(review): the labels come from a model fitted on the first two principal
# components, while the score is computed in the full feature space — confirm this is intended.
silhouette_score(Xs, pred, metric='euclidean')

### Interpretation