In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# import sklearn
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer  # TF-IDF
from sklearn.preprocessing import scale
from sklearn.cluster import KMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

import nltk
import spacy #for faster tokenization and lemmatization

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import re
import string

import project4_functions

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
import os

# Directory holding the raw "whats-cooking" files. The default is the original
# hard-coded location, so existing runs behave exactly as before; set the
# WHATS_COOKING_DATA_DIR environment variable to point at another checkout
# instead of editing this cell.
DATA_DIR = os.environ.get(
    'WHATS_COOKING_DATA_DIR',
    r'/Users/xinrucheng/Documents/GitHub/metisproject04/data/raw/whats-cooking',
)

# Load the training recipes into a DataFrame.
cdata = pd.read_json(os.path.join(DATA_DIR, 'train.json'))

In [3]:
# Preview the first five recipes (id, cuisine label, ingredient list).
cdata.head(n=5)

Unnamed: 0,id,cuisine,ingredients
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes..."
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g..."
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,22213,indian,"[water, vegetable oil, wheat, salt]"
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe..."


In [4]:
# Print column dtypes and non-null counts to check for missing values.
cdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39774 entries, 0 to 39773
Data columns (total 3 columns):
id             39774 non-null int64
cuisine        39774 non-null object
ingredients    39774 non-null object
dtypes: int64(1), object(2)
memory usage: 932.3+ KB


In [5]:
# Number of distinct cuisine labels in the training data.
cdata.loc[:, 'cuisine'].nunique()

20

In [6]:
# Distinct cuisine labels, in order of first appearance.
pd.unique(cdata['cuisine'])

array(['greek', 'southern_us', 'filipino', 'indian', 'jamaican',
       'spanish', 'italian', 'mexican', 'chinese', 'british', 'thai',
       'vietnamese', 'cajun_creole', 'brazilian', 'french', 'japanese',
       'irish', 'korean', 'moroccan', 'russian'], dtype=object)

-----

In [7]:
#put ADVERTISEMENT in regex function instead of adding to nltk stopwords list

In [8]:
# Run the project's cleaning helper over every recipe's ingredient list.
# NOTE(review): regex_nodigits_new is defined in project4_functions (not shown
# here); per the notes in the following cell it strips digits -- confirm
# against the module.
cmod = (
    cdata['ingredients']
    .apply(func=project4_functions.regex_nodigits_new)
)
cmod

0        ['romaine lettuce', 'black olives', 'grape tom...
1        ['plain flour', 'ground pepper', 'salt', 'toma...
2        ['eggs', 'pepper', 'salt', 'mayonaise', 'cooki...
3              ['water', 'vegetable oil', 'wheat', 'salt']
4        ['black pepper', 'shallots', 'cornflour', 'cay...
                               ...                        
39769    ['light brown sugar', 'granulated sugar', 'but...
39770    ['KRAFT Zesty Italian Dressing', 'purple onion...
39771    ['eggs', 'citrus fruit', 'raisins', 'sourdough...
39772    ['boneless chicken skinless thigh', 'minced ga...
39773    ['green chile', 'jalapeno chilies', 'onions', ...
Name: ingredients, Length: 39774, dtype: object

In [9]:
# Digits were removed successfully; the [] brackets and \n still need to be removed as well (this should be handled in the vectorizer)
# -- but why is the regex function not doing that?


In [10]:
# Row count of the cleaned series. Debugging notes:
#  - the length is correct, but the entries all looked like the 1st recipe
#  - if the nodigits function returns `s`, len=661, much shorter than
#    expected -- why?
cmod.shape[0]

39774

In [11]:
# Build a bag-of-words document-term matrix from the cleaned ingredient text,
# dropping scikit-learn's built-in English stop words.
vectorizer = CountVectorizer(stop_words='english')

test = pd.Series(cmod)
doc_word = vectorizer.fit_transform(test)

# (n_documents, n_vocabulary_terms)
doc_word.shape

(39774, 2958)

In [12]:
# Check which container type the vectorizer returned for the document-term matrix.
type(doc_word)

scipy.sparse.csr.csr_matrix

In [13]:
# Dimensionality reduction (LSA) on wsc data, ingredients column only.
#
# From the topic-modelling LSA/NMF notebook:
# Acronyms: Latent Semantic Analysis (LSA) is just another name for
#   Singular Value Decomposition (SVD) applied to Natural Language Processing (NLP).
#
# TruncatedSVD cuts off the negligible singular values in the decomposition and
# keeps only the `n_components` most important ones. Its docstring notes that
# it does not center the data.
#
# random_state is pinned because the default 'randomized' solver is stochastic;
# without a seed the components and variance ratios differ between runs.
lsa = TruncatedSVD(n_components=10, random_state=42)
doc_topic = lsa.fit_transform(doc_word)
lsa.explained_variance_ratio_  # share of variance explained by each topic

array([0.05469238, 0.03920751, 0.03985984, 0.03390681, 0.03084459,
       0.02539434, 0.0231474 , 0.02131703, 0.01859195, 0.01583114])

In [14]:
# Vocabulary terms in column order. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2 in favour of get_feature_names_out();
# support both so the cell runs on old and new versions.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Topic-term loadings: one row per LSA component, one column per vocabulary
# term, rounded to 5 decimals for display.
topic_word = pd.DataFrame(lsa.components_.round(5), columns=feature_names)
topic_word

Unnamed: 0,abalone,abbamele,absinthe,abura,acai,accent,accompaniment,achiote,acid,acini,...,za,zatarain,zatarains,zero,zest,zesty,zinfandel,ziti,zucchini,épices
0,1e-05,1e-05,0.0,2e-05,1e-05,9e-05,1e-05,0.00023,7e-05,3e-05,...,3e-05,0.0,1e-05,2e-05,0.00991,0.00018,0.0001,0.00046,0.01247,3e-05
1,3e-05,0.0,0.0,0.0001,8e-05,9e-05,5e-05,-5e-05,4e-05,-4e-05,...,-2e-05,0.0,0.0,6e-05,0.0124,-0.00027,0.0,-0.00052,-0.01105,7e-05
2,5e-05,0.0,-0.0,6e-05,-3e-05,0.00022,-3e-05,0.00011,3e-05,-3e-05,...,-3e-05,1e-05,1e-05,1e-05,-0.00937,-0.00015,-1e-05,-0.00051,0.00191,-6e-05
3,-3e-05,5e-05,0.0,-4e-05,3e-05,-0.00025,5e-05,0.00019,-6e-05,8e-05,...,-0.0,-1e-05,1e-05,-8e-05,0.02575,-5e-05,0.00032,0.00042,0.0062,4e-05
4,-1e-05,-3e-05,0.0,0.0,1e-05,8e-05,2e-05,0.00017,4e-05,-8e-05,...,5e-05,-1e-05,-2e-05,2e-05,0.00723,-0.00064,-6e-05,-0.00152,-0.01279,5e-05
5,-3e-05,1e-05,-0.0,-1e-05,-3e-05,6e-05,-3e-05,-7e-05,-0.00011,3e-05,...,-7e-05,3e-05,-5e-05,3e-05,-0.01171,0.00084,3e-05,0.00083,-0.00015,-8e-05
6,-3e-05,-2e-05,0.0,0.0,1e-05,-0.00024,4e-05,0.00012,5e-05,-4e-05,...,4e-05,1e-05,-4e-05,6e-05,0.00729,0.00048,0.0001,-0.00129,-0.00568,0.00011
7,0.0,5e-05,-0.0,-4e-05,4e-05,-3e-05,-0.0,0.00062,5e-05,-4e-05,...,2e-05,5e-05,3e-05,2e-05,-0.0071,-0.00011,0.00012,-0.00156,-0.0033,-4e-05
8,6e-05,8e-05,0.0,3e-05,-4e-05,0.00022,2e-05,-0.00021,-0.00023,8e-05,...,-0.0001,4e-05,-1e-05,-0.00019,0.01445,2e-05,0.00034,-0.0002,-0.00961,-3e-05
9,-3e-05,-2e-05,0.0,9e-05,1e-05,0.00024,3e-05,-0.00068,-0.0,-1e-05,...,-9e-05,5e-05,-0.0,-9e-05,-0.00024,-6e-05,0.00031,0.00047,0.00485,-6e-05


In [15]:
# get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() when it exists and fall back otherwise. The result is
# converted to a plain list, matching what the old method returned.
_get_feature_names = (
    getattr(vectorizer, 'get_feature_names_out', None)
    or vectorizer.get_feature_names
)

# Show the 10 highest-weighted terms for each LSA topic.
# NOTE(review): display_topics is defined in project4_functions (not shown
# here); the description above is inferred from its printed output.
project4_functions.display_topics(lsa, list(_get_feature_names()), 10)


Topic  0
pepper, oil, salt, ground, garlic, fresh, sauce, onions, black, chicken

Topic  1
sugar, flour, sauce, butter, eggs, purpose, water, baking, milk, large

Topic  2
sauce, oil, soy, sesame, rice, garlic, onions, ginger, chicken, green

Topic  3
fresh, juice, olive, lemon, chopped, cheese, lime, cloves, oil, parsley

Topic  4
ground, fresh, cumin, ginger, cilantro, juice, coriander, cinnamon, lime, turmeric

Topic  5
cheese, chicken, cream, sauce, shredded, ground, cilantro, onions, green, chopped

Topic  6
pepper, fresh, bell, green, juice, lime, red, chopped, cilantro, chicken

Topic  7
chicken, powder, oil, salt, broth, onions, tomatoes, olive, garlic, cumin

Topic  8
chicken, broth, ground, boneless, skinless, white, sodium, butter, wine, breasts

Topic  9
onions, green, water, fresh, butter, parsley, tomatoes, ground, celery, leaves


In [16]:
# Document-topic weights: one row per recipe, one column per LSA component,
# rounded to 5 decimals for display.
# NOTE(review): despite the name, this wraps the fit_transform output
# (doc_topic), not the V^T factor of the SVD (that is lsa.components_).
Vt = pd.DataFrame(np.round(doc_topic, 5))
Vt

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.30098,-0.84635,-0.30508,-0.20061,-0.82243,0.41008,-0.00268,-0.22122,-0.46641,-0.55618
1,2.86948,-0.36027,-1.30699,-1.20747,0.29918,-0.50620,-0.05337,-0.13468,-0.22841,0.29912
2,2.38582,0.66339,0.62161,-0.82463,-0.60551,0.40178,0.38794,0.99214,0.62451,-0.77200
3,0.78745,0.42585,0.09000,0.06970,-0.06087,-0.51041,-0.51289,0.46342,-0.25974,-0.02600
4,3.35583,-0.02829,-0.68810,-0.99802,0.39396,0.00828,0.55928,0.99479,0.25718,0.00884
...,...,...,...,...,...,...,...,...,...,...
39769,1.32952,2.94040,-1.33512,0.30671,-0.38576,-0.80465,0.33169,-0.06185,0.08831,0.49779
39770,1.06608,-0.74332,-0.43467,-0.16080,-1.03160,0.20231,0.07055,-0.79044,-0.08499,-0.52348
39771,1.32729,1.21214,-1.74558,-0.42715,1.06681,-0.06795,-0.34693,-0.46602,0.45400,0.38075
39772,2.94254,2.55612,1.70647,-0.01549,0.13156,0.26542,-0.10113,0.99503,1.61689,-1.06701


In [17]:
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
# Pairwise cosine similarity between every pair of recipes' count vectors,
# rounded to 3 decimals.
# Timing note: faster than the previous attempt (15~16 mins instead of 41), but
# it still almost crashed -- the result is a dense 39774 x 39774 array.
np.round(cosine_similarity(doc_word), 3)

array([[1.   , 0.25 , 0.16 , ..., 0.   , 0.035, 0.241],
       [0.25 , 1.   , 0.298, ..., 0.367, 0.112, 0.462],
       [0.16 , 0.298, 1.   , ..., 0.147, 0.299, 0.287],
       ...,
       [0.   , 0.367, 0.147, ..., 1.   , 0.096, 0.177],
       [0.035, 0.112, 0.299, ..., 0.096, 1.   , 0.135],
       [0.241, 0.462, 0.287, ..., 0.177, 0.135, 1.   ]])

In [19]:
#try clustering?
from sklearn.preprocessing import scale
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.datasets import fetch_mldata
from sklearn.utils import shuffle

import matplotlib.pyplot as plt
import seaborn as sns

# see https://ipython.readthedocs.io/en/stable/interactive/magics.html
%pylab inline

# sets backend to render higher res images
%config InlineBackend.figure_formats = ['retina']

Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  "\n`%matplotlib` prevents importing * from pylab and numpy"


In [20]:
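#check kmeans pair solution: what are we plotting?
# NOTE(review): the code below is disabled; the TypeError recorded under this cell
# presumably comes from an earlier run of it. The 10-element boolean key suggests
# display_cluster indexes its first argument like a NumPy array (X[mask, 0]), which a
# DataFrame rejects -- passing topic_word.values may avoid that (unverified).
# topic_word has only 10 rows (one per LSA component), so clustering doc_topic
# may be what was intended -- confirm.

# num_clusters = 10
# km = KMeans(n_clusters=num_clusters)
# km.fit(topic_word)
# project4_functions.display_cluster(topic_word,km,num_clusters)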

TypeError: '(array([False, False, False,  True, False, False, False, False, False,
       False]), 0)' is an invalid key