In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# import sklearn
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF
from sklearn.preprocessing import scale
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

import nltk
import spacy #for faster tokenization and lemmatization

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import re
import string

import project4_functions

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
cdata = pd.read_json (r'/Users/xinrucheng/Documents/GitHub/metisproject04/data/raw/whats-cooking/train.json')

In [3]:
cdata.head()

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."


In [4]:
cdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39774 entries, 0 to 39773
Data columns (total 3 columns):
id             39774 non-null int64
cuisine        39774 non-null object
ingredients    39774 non-null object
dtypes: int64(1), object(2)
memory usage: 932.3+ KB


In [5]:
cdata['cuisine'].nunique()

20

In [6]:
cdata['cuisine'].unique()

array(['greek', 'southern_us', 'filipino', 'indian', 'jamaican',
       'spanish', 'italian', 'mexican', 'chinese', 'british', 'thai',
       'vietnamese', 'cajun_creole', 'brazilian', 'french', 'japanese',
       'irish', 'korean', 'moroccan', 'russian'], dtype=object)

-----

In [7]:
#put ADVERTISEMENT in regex function instead of adding to nltk stopwords list

In [8]:
cmod=cdata['ingredients'].apply(project4_functions.regex_nodigits_new)
cmod

0        ['romaine lettuce', 'black olives', 'grape tom...
1        ['plain flour', 'ground pepper', 'salt', 'toma...
2        ['eggs', 'pepper', 'salt', 'mayonaise', 'cooki...
3              ['water', 'vegetable oil', 'wheat', 'salt']
4        ['black pepper', 'shallots', 'cornflour', 'cay...
                               ...                        
39769    ['light brown sugar', 'granulated sugar', 'but...
39770    ['KRAFT Zesty Italian Dressing', 'purple onion...
39771    ['eggs', 'citrus fruit', 'raisins', 'sourdough...
39772    ['boneless chicken skinless thigh', 'minced ga...
39773    ['green chile', 'jalapeno chilies', 'onions', ...
Name: ingredients, Length: 39774, dtype: object

In [9]:
#removed digits successfully, need to remove [] and \n also, should be included in vectorizer
#-- but why is regex ftn not doing that?


In [11]:
len(cmod) #length correct but all 1st recipe
#if return s in nodigits ftn, len=661 much shorter than expected, why?

39774

In [13]:
test=pd.Series(cmod)
vectorizer = CountVectorizer(stop_words='english')
doc_word = vectorizer.fit_transform(test)
doc_word.shape

(39774, 2958)

In [16]:
type(doc_word)

scipy.sparse.csr.csr_matrix

In [17]:
#dim reduction (LSA) on wsc data, ingredients column only:

#from topic modelling LSA/NMF notebook:
# Acronynms: Latent Semantic Analysis (LSA) is just another name for 
#  Signular Value Decomposition (SVD) applied to Natural Language Processing (NLP)
lsa = TruncatedSVD(10)  
#TruncatedSVD--cuts off 0s (singular values) in decomposition result, just keeps important ones
#docstring for TruncatedSVD says this does not center the data!
doc_topic = lsa.fit_transform(doc_word)
lsa.explained_variance_ratio_  #variance for topics!

array([0.05469238, 0.03920768, 0.03985984, 0.03390693, 0.03084454,
       0.02539423, 0.02314709, 0.02131655, 0.01859653, 0.01582786])

In [18]:
topic_word = pd.DataFrame(lsa.components_.round(5),
             columns = vectorizer.get_feature_names()) #index = ["component_1","component_2"],
topic_word

Unnamed: 0,abalone,abbamele,absinthe,abura,acai,accent,accompaniment,achiote,acid,acini,...,za,zatarain,zatarains,zero,zest,zesty,zinfandel,ziti,zucchini,épices
0,1e-05,1e-05,0.0,2e-05,1e-05,9e-05,1e-05,0.00023,7e-05,3e-05,...,3e-05,0.0,1e-05,2e-05,0.00991,0.00018,0.0001,0.00046,0.01247,3e-05
1,3e-05,0.0,0.0,0.0001,8e-05,9e-05,5e-05,-5e-05,4e-05,-4e-05,...,-2e-05,0.0,0.0,6e-05,0.0124,-0.00027,0.0,-0.00052,-0.01105,7e-05
2,5e-05,0.0,-0.0,6e-05,-3e-05,0.00022,-3e-05,0.00011,3e-05,-3e-05,...,-3e-05,1e-05,1e-05,1e-05,-0.00938,-0.00015,-1e-05,-0.00052,0.00191,-6e-05
3,-3e-05,5e-05,0.0,-4e-05,3e-05,-0.00025,5e-05,0.00019,-6e-05,8e-05,...,-0.0,-1e-05,1e-05,-8e-05,0.02575,-5e-05,0.00032,0.00042,0.0062,4e-05
4,-1e-05,-3e-05,0.0,0.0,1e-05,8e-05,2e-05,0.00017,4e-05,-8e-05,...,4e-05,-1e-05,-2e-05,2e-05,0.00724,-0.00064,-6e-05,-0.00152,-0.01279,5e-05
5,-3e-05,1e-05,-0.0,-1e-05,-3e-05,6e-05,-3e-05,-7e-05,-0.00011,3e-05,...,-7e-05,3e-05,-5e-05,3e-05,-0.01176,0.00084,3e-05,0.00083,-0.00012,-8e-05
6,-3e-05,-2e-05,0.0,0.0,1e-05,-0.00024,4e-05,0.00013,5e-05,-4e-05,...,4e-05,1e-05,-3e-05,6e-05,0.00719,0.00049,0.0001,-0.00129,-0.00574,0.00011
7,0.0,5e-05,-0.0,-4e-05,4e-05,-3e-05,-0.0,0.00062,5e-05,-4e-05,...,2e-05,5e-05,3e-05,3e-05,-0.00695,-0.00011,0.00013,-0.00156,-0.0033,-4e-05
8,6e-05,8e-05,0.0,3e-05,-4e-05,0.00022,2e-05,-0.00022,-0.00023,9e-05,...,-0.0001,3e-05,-1e-05,-0.00019,0.01479,2e-05,0.00035,-0.00022,-0.00978,-3e-05
9,-2e-05,-1e-05,0.0,9e-05,2e-05,0.00023,3e-05,-0.00065,1e-05,0.0,...,-9e-05,4e-05,0.0,-9e-05,-0.00094,-7e-05,0.00031,0.00044,0.00464,-7e-05


In [19]:
project4_functions.display_topics(lsa, vectorizer.get_feature_names(), 10)


Topic  0
pepper, oil, salt, ground, garlic, fresh, sauce, onions, black, chicken

Topic  1
sugar, flour, sauce, butter, eggs, purpose, water, baking, milk, large

Topic  2
sauce, oil, soy, sesame, rice, garlic, onions, ginger, chicken, green

Topic  3
fresh, juice, olive, lemon, chopped, cheese, lime, cloves, oil, parsley

Topic  4
ground, fresh, cumin, ginger, cilantro, juice, coriander, cinnamon, lime, turmeric

Topic  5
cheese, chicken, sauce, cream, shredded, ground, cilantro, onions, green, chopped

Topic  6
pepper, fresh, bell, green, juice, lime, red, chopped, cilantro, chicken

Topic  7
chicken, powder, oil, salt, broth, onions, tomatoes, olive, garlic, cumin

Topic  8
chicken, broth, ground, boneless, skinless, white, sodium, butter, wine, breasts

Topic  9
onions, green, water, fresh, butter, parsley, ground, carrots, tomatoes, celery


In [20]:
Vt = pd.DataFrame(doc_topic.round(5))
Vt

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.30098,-0.84611,-0.30520,-0.20134,-0.82252,0.41176,0.00801,-0.22633,-0.47333,-0.58897
1,2.86948,-0.35954,-1.30734,-1.20643,0.29855,-0.50109,-0.05482,-0.14437,-0.26696,0.21000
2,2.38582,0.66374,0.62131,-0.82327,-0.60374,0.40380,0.37355,0.99668,0.61217,-0.84870
3,0.78745,0.42599,0.08999,0.07075,-0.06057,-0.51023,-0.51727,0.46079,-0.26791,-0.01874
4,3.35583,-0.02866,-0.68800,-0.99715,0.39514,0.00457,0.55794,0.99688,0.26718,0.04977
...,...,...,...,...,...,...,...,...,...,...
39769,1.32952,2.94078,-1.33541,0.30657,-0.38653,-0.80232,0.33582,-0.05934,0.06211,0.45389
39770,1.06608,-0.74327,-0.43469,-0.16136,-1.03135,0.19963,0.08155,-0.79191,-0.08478,-0.51031
39771,1.32729,1.21220,-1.74574,-0.42690,1.06688,-0.06569,-0.35239,-0.46338,0.44793,0.36327
39772,2.94254,2.55587,1.70625,-0.01684,0.13151,0.26706,-0.09524,0.98885,1.64533,-1.02152


In [21]:
from sklearn.metrics.pairwise import cosine_similarity

In [23]:
cosine_similarity(doc_word).round(3) #faster then prev, but still almost crashed laptop

array([[1.   , 0.25 , 0.16 , ..., 0.   , 0.035, 0.241],
       [0.25 , 1.   , 0.298, ..., 0.367, 0.112, 0.462],
       [0.16 , 0.298, 1.   , ..., 0.147, 0.299, 0.287],
       ...,
       [0.   , 0.367, 0.147, ..., 1.   , 0.096, 0.177],
       [0.035, 0.112, 0.299, ..., 0.096, 1.   , 0.135],
       [0.241, 0.462, 0.287, ..., 0.177, 0.135, 1.   ]])

In [24]:
#try clustering?
from sklearn.preprocessing import scale
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.datasets import fetch_mldata
from sklearn.utils import shuffle

In [28]:
num_clusters = 10
km = KMeans(n_clusters=num_clusters)
km.fit(topic_word)
project4_functions.display_cluster(X,km,num_clusters)

AttributeError: module 'project4_functions' has no attribute 'display_cluster'