In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.ensemble import BaggingClassifier

In [2]:
pd.set_option('display.max_rows', 10000)
pd.set_option('display.max_columns', 500)
pd.set_option('display.precision',150)

In [3]:
df = pd.read_json('../data/whats-cooking/train.json')

In [4]:
def lower_list(arr):
    return [x.lower() for x in arr]

In [5]:
df['ingredient_count'] = df['ingredients'].map(lambda x: len(x))

In [6]:
df['ingredients'] = df['ingredients'].map(lower_list)

In [7]:
df.head(10)

Unnamed: 0,id,cuisine,ingredients,ingredient_count
0,10259,greek,"[romaine lettuce, black olives, grape tomatoes...",9
1,25693,southern_us,"[plain flour, ground pepper, salt, tomatoes, g...",11
2,20130,filipino,"[eggs, pepper, salt, mayonaise, cooking oil, g...",12
3,22213,indian,"[water, vegetable oil, wheat, salt]",4
4,13162,indian,"[black pepper, shallots, cornflour, cayenne pe...",20
5,6602,jamaican,"[plain flour, sugar, butter, eggs, fresh ginge...",12
6,42779,spanish,"[olive oil, salt, medium shrimp, pepper, garli...",13
7,3735,italian,"[sugar, pistachio nuts, white almond bark, flo...",10
8,16903,mexican,"[olive oil, purple onion, fresh pineapple, por...",13
9,12734,italian,"[chopped tomatoes, fresh basil, garlic, extra-...",6


In [8]:
def remove_spaces(arr):
    return [x.replace(' ', '_') for x in arr]

In [9]:
def strip_words(arr):
     return [x.strip() for x in arr]

In [10]:
def remove_commas(arr):
    return [x.replace(',', '') for x in arr]

In [11]:
def remove_dots(arr):
    return [x.replace('.', '') for x in arr]

In [12]:
def remove_weird_characters(arr):
    weird_characters = [',', '.', '(', ')', "'", 'u"\u2122"']
    for char in weird_characters:
        for i in range(len(arr)):
            arr[i].replace(char, '')
    return arr

In [13]:

df['ingredients'] = df['ingredients'].map(strip_words)
df['ingredients'] = df['ingredients'].map(remove_weird_characters)
df['ingredients'] = df['ingredients'].map(remove_spaces)


In [14]:
df['ingredients'].head(15)

0     [romaine_lettuce, black_olives, grape_tomatoes...
1     [plain_flour, ground_pepper, salt, tomatoes, g...
2     [eggs, pepper, salt, mayonaise, cooking_oil, g...
3                   [water, vegetable_oil, wheat, salt]
4     [black_pepper, shallots, cornflour, cayenne_pe...
5     [plain_flour, sugar, butter, eggs, fresh_ginge...
6     [olive_oil, salt, medium_shrimp, pepper, garli...
7     [sugar, pistachio_nuts, white_almond_bark, flo...
8     [olive_oil, purple_onion, fresh_pineapple, por...
9     [chopped_tomatoes, fresh_basil, garlic, extra-...
10    [pimentos, sweet_pepper, dried_oregano, olive_...
11    [low_sodium_soy_sauce, fresh_ginger, dry_musta...
12    [italian_parsley_leaves, walnuts, hot_red_pepp...
13    [ground_cinnamon, fresh_cilantro, chili_powder...
14    [fresh_parmesan_cheese, butter, all-purpose_fl...
Name: ingredients, dtype: object

In [15]:
df['ingredients'] = df['ingredients'].map(lambda x: " ".join(x)) 

In [16]:
df['ingredients'] = df['ingredients'].replace('[^a-zA-Z\d\s:]', '', regex=True)

In [17]:
pd.set_option('max_colwidth', 300)

In [18]:
print(df[df['ingredients'].str.contains('or_best_food_real_mayonnais')]['ingredients'])

Series([], Name: ingredients, dtype: object)


In [19]:
print(df[df['ingredients'].str.contains(',')]['ingredients'])

Series([], Name: ingredients, dtype: object)


In [20]:
count = CountVectorizer(min_df=2)

In [21]:
X = df['ingredients']
y = df['cuisine']

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

In [23]:
X_train_cv = count.fit_transform(X_train)
X_test_cv = count.transform(X_test)
df_X_train = pd.DataFrame(X_train_cv.todense(), columns=count.get_feature_names())

In [24]:
df_X_train['koshersalt']

0        0
1        0
2        0
3        0
4        0
        ..
29825    0
29826    0
29827    0
29828    0
29829    1
Name: koshersalt, Length: 29830, dtype: int64

In [25]:
df_X_train.head()

Unnamed: 0,10ozfrozenchoppedspinachthawedandsqueezeddry,145ozdicedtomatoes,14ozsweetenedcondensedmilk,15ozrefriedbeans,1lowfatbuttermilk,1lowfatcottagecheese,1lowfatmilk,212to3lbchickencutintoservingpieces,25lesssodiumchickenbroth,2lowfatcottagecheese,2reducedfatmilk,33lesssodiumham,40lesssodiumtacoseasoning,40lesssodiumtacoseasoningmix,7up,95leangroundbeef,aai,aaipowder,abalone,aburaage,accentseasoning,achiote,achiotepaste,achiotepowder,acinidipepe,ackee,acornsquash,activedryyeast,adobo,adobosauce,adoboseasoning,adzukibeans,agar,agavenectar,agedbalsamicvinegar,agedcheddarcheese,ahi,ahitunasteaks,aioli,ajinomoto,ajwain,akamiso,alaskankingcrablegs,albacoretunainwater,alcohol,ale,aleppopepper,alfredosauce,allbeefhotdogs,allpotatopurpos,allpurposeflour,allpurposeseasoning,allpurposeunbleachedflour,allspice,allspiceberries,almondbutter,almondextract,almondflour,almondliqueur,almondmeal,almondmilk,almondpaste,almonds,alphabetpasta,amaranth,amaretti,amaretticookies,amaretto,amarettoliqueur,amchur,americancheese,americancheeseslices,ampalaya,anaheimchile,anasazibeans,ancho,anchochilepepper,anchochiligroundpepper,anchopowder,anchovies,anchovyfilets,anchovyfillets,anchovypaste,andfatfreehalfhalf,andouillechickensausage,andouillesausage,andouillesausagelinks,angelfoodcake,angelfoodcakemix,angelhair,angosturabitters,anise,aniseextract,aniseoil,anisepowder,aniseseed,anisette,anjoupears,annatto,annattoseeds,applebrandy,applebutter,applecider,applecidervinegar,applejelly,applejuice,applejuiceconcentrate,applepiefilling,applepiespice,apples,applesauce,applewoodsmokedbacon,apricothalves,apricotjam,apricotnectar,apricotpreserves,apricots,arbolchile,arboriorice,armagnac,arrowroot,arrowrootpowder,artichokebottoms,artichokehearts,artichokes,artichokheartmarin,arugula,asadero,asafetida,asafetidapowder,asafoetida,asafoetidapowder,asiago,asianbasil,asianchilepaste,asianchiliredsauc,asianchilisauce,asianeggplants,asianfishsauce,asiannoodles,asianpear,asianricenoodles,asianwheatnoodles,asparagus,asparagusspears,asparagustips,aspic,assortedfreshvegetables,atta,aujusgravymix,avocado,avocadoleaves,avocadooil,babyartichokes,babyarugula,babybackribs,babybokchoy,babybroccoli,babycarrots,babycorn,babyeggplants,babygemlettuce,babygreens,babyleaflettuce,babylimabeans,babyportobellomushrooms,babypotatoes,babyradishes,babyspinach,babyspinachleaves,babytatsoi,babyturnips,babyzucchini,bacardi,backbacon,backbaconrashers,bacon,baconbits,bacondrippings,baconfat,bacongrease,baconpieces,baconsalt,baconslices,bagels,baguette,baileysirishcreamliqueur,bakedbeans,bakedham,bakedpizzacrust,bakedtortillachips,bakingapples,bakingchocolate,bakingmix,bakingpotatoes,bakingpowder,bakingsoda,bakingspray,bakingyeast,balm,balsamicobianco,balsamicreduction,balsamicvinaigrette,balsamicvinaigrettesaladdressing,balsamicvinegar,bambooshoots,bananablossom,bananaleaves,bananaliqueur,bananapeppers,bananas,bananasquash,banger,barbecuedpork,barbecuerub,barbecuesauce,barilla,barley,barleyflour,barleymiso,bartlettpears,base,basil,basildriedleaves,basilleaves,basilmayonnaise,basilpestosauce,basmati,basmatirice,bassfillets,baton,batter,bawanggoreng,bayleaf,bayleaves,bayscallops,bbqsauce,bbqseasoning,beancurd,beancurdskins,beandip,beanpaste,beans,beansauce,beansoup,beansprouts,beanthreads,beanthreadvermicelli,beateneggs,beaujolais,...,vealchops,vealcutlets,vealdemiglace,vealforstew,vealloinchops,vealribchops,vealscallops,vealshanks,vealshoulder,vealstock,veganbutter,veganmargarine,veganmayonnaise,veganparmesancheese,vegansourcream,veganworcestershiresauce,vegetablebouillon,vegetablebouilloncube,vegetablebroth,vegetabledemiglace,vegetablegumbo,vegetablejuice,vegetablejuicecocktail,vegetableoil,vegetableoilcookingspray,vegetableoilspray,vegetables,vegetableseasoning,vegetableshortening,vegetablestock,vegetarianoystersauce,vegetarianrefriedbeans,veggiecrumbles,veggies,velveeta,venison,verjus,vermicelli,vermicellinoodles,vermouth,vidalia,vidaliaonion,vietnamesecoriander,vietnamesefishsauce,vietnamesericepaper,vinaigrette,vinaigrettedressing,vinegar,vineripenedtomatoes,vinetomatoes,vinsanto,virgincoconutoil,virginiaham,virginoliveoil,vitalwheatgluten,vodka,wafer,waffle,wakame,walnuthalves,walnutoil,walnutpieces,walnuts,warmwater,wasabi,wasabipaste,wasabipowder,water,waterchestnuts,waterchestnutsdrainedandchopped,watercress,watercressleaves,watermelon,waterspinach,waxbeans,waxypotatoes,wheat,wheatbeer,wheatberries,wheatbran,wheatbread,wheatcereal,wheatflour,wheatfreesoysauce,wheatgerm,wheatstarch,wheels,whey,whippedcream,whippedcreamcheese,whippedtopping,whippingcream,whippingheavycream,whiskey,whiteasparagus,whitebakingbar,whitebeans,whitebread,whitebreadflour,whitebuttonmushrooms,whitecabbage,whitecakemix,whitecheddarcheese,whitecheese,whitechocolate,whitechocolatechips,whitecorn,whitecornmeal,whitecornsyrup,whitecorntortillas,whitedistilledvinegar,whitefish,whitefishfillets,whitefleshedfish,whiteflour,whitefrostings,whitehominy,whiteitaliantunainoliveoil,whitekidneybeans,whitemiso,whitemushrooms,whiteonion,whitepeaches,whitepepper,whitepeppercorns,whitepoppyseeds,whiteradish,whiterice,whitericeflour,whitericevinegar,whiterum,whitesandwichbread,whitesesameseeds,whitesugar,whitetequila,whitetruffleoil,whitetunainwater,whitevinegar,whitewine,whitewinevinegar,wholeallspice,wholealmonds,wholechicken,wholecloves,wholecranberrysauce,wholegrainbread,wholegraindijonmustard,wholegrainmustard,wholegrainpasta,wholegrainrice,wholekernelcorndrain,wholemealflour,wholemilk,wholemilkgreekyogurt,wholemilkricottacheese,wholemilkyoghurt,wholenutmegs,wholeokra,wholepeeledtomatoes,wholepeppercorn,wholewheatangelhairpasta,wholewheatbread,wholewheatbreadcrumbs,wholewheatbreadslices,wholewheatbuns,wholewheatcouscous,wholewheatfettuccine,wholewheatflour,wholewheatfrenchbread,wholewheathamburgerbuns,wholewheatlasagnanoodles,wholewheatlinguine,wholewheatpasta,wholewheatpastryflour,wholewheatpenne,wholewheatpennepasta,wholewheatpita,wholewheatpitabread,wholewheatpizzacrust,wholewheatpizzadough,wholewheatrigatoni,wholewheatrotinipasta,wholewheatspaghetti,wholewheatspaghettinoodles,wholewheatthinspaghetti,wholewheattortillas,wideeggnoodles,widericenoodles,wildflowerhoney,wildgarlic,wildmushrooms,wildrice,wildsalmon,wine,winevinegar,wintermelon,wintersquash,wishboneitaliandressing,wishboneranchdress,wondraflour,wontonnoodles,wontonskins,wontonwrappers,woodearmushrooms,worcestershiresauce,xanthangum,yakinori,yakisobanoodles,yams,yardlongbeans,yeast,yellowbeansauce,yellowbellpepper,yellowcakemix,yellowchives,yellowcorn,yellowcornmeal,yellowcrooknecksquash,yellowcurrypaste,yellowfintuna,yellowfoodcoloring,yellowhominy,yellowlentils,yellowmiso,yellowmustard,yellowmustardseeds,yellowonion,yellowpeppers,yellowrice,yellowrocksugar,yellowsplitpeas,yellowsquash,yellowsummersquash,yellowtomato,yoghurt,yogurtcheese,yolk,youngcoconutmeat,yuca,yukongold,yukongoldpotatoes,yuzu,yuzujuice,yuzukosho,zaatar,zest,zestyitaliandressing,zinfandel,ziti,zucchini
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [26]:
y.value_counts(normalize = True)

italian         0.1970634082566500855371316447417484596371650695800781250000
mexican         0.1618645346206064317140516095605562441051006317138671875000
southern_us     0.1086136672197918207061206885555293411016464233398437500000
indian          0.0755015839493136248217197703525016549974679946899414062500
chinese         0.0672047065922461855924652240901195909827947616577148437500
french          0.0665258711721224971213928256474900990724563598632812500000
cajun_creole    0.0388696133152310538405593831612350186333060264587402343750
thai            0.0386936189470508395960024472515215165913105010986328125000
japanese        0.0357771408457786474488493411172385094687342643737792968750
greek           0.0295419118016794907122246627295680809766054153442382812500
spanish         0.0248654900186051179555857260083939763717353343963623046875
korean          0.0208679036556544475478069955443061189725995063781738281250
vietnamese      0.0207421933926685772564546539342700270935893058776855468750

# Random Forest Modeling

In [27]:
forest = RandomForestClassifier()

In [28]:
forest.fit(X_train_cv, y_train)

RandomForestClassifier()

In [29]:
forest.score(X_train_cv, y_train), forest.score(X_test_cv, y_test)

(0.9997653369091518, 0.7089702333065165)

# Tuning the Model

In [30]:
forest = RandomForestClassifier()

In [44]:
forest_params = {'n_estimators': [750, 1000],
          'max_depth': [17, 19],
         'min_samples_split': [4, 5],
         'min_samples_leaf': [2, 3],
         'max_features': ['auto']}

In [45]:
# forest_params = {
#     'n_estimators' : [5, 10, 15, 20],
#     'max_depth' : [10, 20, 30],
#     'max_features' : [.2, .3]
# }
gs_forest = GridSearchCV(forest, forest_params, cv = 3)
gs_forest.fit(X_train_cv, y_train)

GridSearchCV(cv=3, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [17, 19], 'max_features': ['auto'],
                         'min_samples_leaf': [2, 3],
                         'min_samples_split': [4, 5],
                         'n_estimators': [750, 1000]})

In [32]:
gs_forest.best_params_

{'max_depth': 30, 'max_features': 0.2, 'n_estimators': 20}

In [47]:
gs_forest.score(X_train_cv, y_train), gs_forest.score(X_test_cv, y_test)

(0.5301709688233323, 0.5160901045856798)

# Extremely Randomized Trees (Extra Trees)

In [34]:
et = ExtraTreesClassifier()

In [35]:
et.fit(X_train_cv, y_train)

ExtraTreesClassifier()

In [36]:
et.score(X_train_cv, y_train), et.score(X_test_cv, y_test)

(0.9997653369091518, 0.7194288012872083)

# Improve the overfit by TFIDF

In [38]:
# fit the winning model and vectorizer
tf = TfidfVectorizer()

X_train_tf = tf.fit_transform(X_train)
X_test_tf = tf.transform(X_test)
print(X_train_tf.shape)

(29830, 6184)


In [40]:
random_forest = RandomForestClassifier()
random_forest.fit(X_train_tf, y_train)
preds = random_forest.predict(X_test_tf)

In [41]:
print('accuracy score of TFDIF for Random Forest on testing data: ', random_forest.score(X_test_tf, y_test).round(4))
print('cross-validation score is: ', \
    cross_val_score(random_forest, X_train_tf, y_train, cv = 5).mean().round(4)) 

accuracy score of TFDIF for Random Forest on testing data:  0.7041
cross-validation score is:  0.6989


# Try bagging

In [None]:
# Instantiate BaggingClassifier.
bag = BaggingClassifier()

# Fit BaggingClassifier.
bag.fit(X_train_cv, y_train)

# Score BaggingClassifier.
bag.score(X_test_cv, y_test)

# View the feature importances

In [48]:
gs_forest.best_estimator_.feature_importances_

array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
       0.00000000e+00, 1.95392358e-06, 2.10685348e-04])

In [51]:
d = {'Column Name':df_X_train.columns,'Feature Importance':gs_forest.best_estimator_.feature_importances_}
fi = pd.DataFrame(d)

In [50]:
fi.sort_values(by=['Feature Importance'], ascending = False).head(20)

Unnamed: 0,Column Name,Feature Importance
1868,gratedparmesancheese,0.0352217690977306
3802,soysauce,0.0300992968736636
1747,garammasala,0.0284462153391949
3501,salsa,0.0235399018960356
3595,sesameoil,0.0219410504024597
1088,corntortillas,0.0217198696800994
150,avocado,0.0215603361062632
1557,flourtortillas,0.0215032795726948
1947,groundcumin,0.0208653063527408
2790,oliveoil,0.0207906173900862
