**Random Forest**

In [48]:
import pandas as pd
import numpy as np
import re
from collections import Counter
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [26]:
food_train=pd.read_csv('data/food_train.csv')
food_test=pd.read_csv('data/food_test.csv')
nutri=pd.read_csv('data/nutrients.csv')
food_nutri=pd.read_csv('data/food_nutrients.csv')

In [27]:
nutrients = pd.merge(food_nutri, nutri, how='left',on='nutrient_id') #merge the nutrients with our food description


In [28]:
food_train_w_nut=pd.merge(food_train, nutrients, how='left', on='idx') #merge the nutrients data with our food train data


In [29]:
categories=list(food_train_w_nut['category'].drop_duplicates())


In [30]:

popcorn_peanuts_keywords = "|".join(["cashews", "popcorn", "seeds", "nuts", "macadamias", "peanuts","roasted", "pistachios"])
candy_keywords = "|".join(["gummi","chewy", "gummy", "lolli", "candy", "candies", "fruit", "licorice", "drops", "peeps", "jelly", "fizz"])
cookies_keywords = "|".join(["cookies", "sandwich","wafers"])
chips_keywords = "|".join(["chips", "pretzel", "tortilla"])
chocolate_keywords = "|".join(["truffles", "bar"])
cakes_keywords = "|".join(["cake", "brownie", "pie", "eclair", "donut"])

def predict_category(description):
    if re.search(popcorn_peanuts_keywords, description):
        return "popcorn_peanuts_seeds_related_snacks"
    if re.search(candy_keywords, description):
        return "candy"
    if re.search(cookies_keywords, description):
        return "cookies_biscuits"
    if re.search(chips_keywords, description):
        return "chips_pretzels_snacks"
    if re.search(chocolate_keywords, description):
        return "chocolate"
    if re.search(cakes_keywords, description):
        return "cakes_cupcakes_snack_cakes"

    return None



food_train['predicted_category'] = food_train['description'].apply(predict_category)



In [31]:
popcorn_peanuts_key_brand = "|".join(['star snacks co., inc.', "nabisco food company",'john b. sanfilippo & son, inc.','kar nut products company','ferris coffee & nut co.'])
candy_key_brand = "|".join(['sunmark','wm. wrigley jr. company','perfetti van melle usa inc.','jelly belly candy company'])
cookies_key_brand = "|".join(['keebler company'])
chips_key_brand = "|".join(["the hain celestial group, inc."])
chocolate_key_brand = "|".join(["lindt & sprungli (schweiz) ag"])
cakes_key_brand = "|".join(['hostess brands, llc','tasty baking company'])

def predict_category_by_brand(brand):
    if brand in popcorn_peanuts_key_brand:
        return "popcorn_peanuts_seeds_related_snacks"
    if brand in candy_key_brand:
        return "candy"
    if brand in cookies_key_brand:
        return "cookies_biscuits"
    if brand in chips_key_brand:
        return "chips_pretzels_snacks"
    if brand in chocolate_key_brand:
        return "chocolate"
    if brand in cakes_key_brand:
        return "cakes_cupcakes_snack_cakes"

    return None



food_train['predicted_category_by_brand'] = food_train['brand'].apply(predict_category_by_brand)

In [32]:
def create_predict_y(row):
    if row['predicted_category_by_brand'] == row['predicted_category']:
        return row['predicted_category']
    elif row['predicted_category_by_brand'] is not None and row['predicted_category'] is not None:
        return None
    elif row['predicted_category_by_brand'] is not None:
        return row['predicted_category_by_brand']
    elif row['predicted_category'] is not None:
        return row['predicted_category']
    else:
        return None


food_train['predict_y'] = food_train.apply(create_predict_y, axis=1)




In [33]:
unique_characters = set()
food_train['ingredients']=food_train['ingredients'].str.replace('[*&$%]', '', regex=True)
food_train['ingredients_list']=food_train['ingredients'].apply(lambda x :re.split(r',\s*(?![^()]*\))',str(x)))
food_train['len_ingredients']=food_train['ingredients_list'].str.len()


for ingredient in food_train['ingredients']:
    if isinstance(ingredient, str):
        unique_characters.update(ingredient)



In the code provided below, our goal is to identify all the ingredients that exist within the dataset.
Subsequently, we aim to determine the frequency of occurrence for each ingredient throughout the dataset.
Following this, we identify the commonly occurring ingredients and generate a new column that indicates the presence of ingredients that intersect with these frequently appearing terms


In [34]:

food_test['ingredients_list']=food_test['ingredients'].apply(lambda x :re.split(r',\s*(?![^()]*\))',str(x)))

food_train_no_ch = food_train.copy()
food_test_no_ch= food_test.copy()

def clean_text(text):
    if isinstance(text, str):
        cleaned_text = re.sub(r'[^a-zA-Z0-9\s,%]', '', text)
        return cleaned_text
    else:
        return text


food_train_no_ch['ingredients'] = food_train_no_ch['ingredients'].apply(clean_text)
food_test_no_ch['ingredients'] = food_test_no_ch['ingredients'].apply(clean_text)


ingredient_counts = Counter()

for i in food_train_no_ch['ingredients'].str.lower().str.split(', '):
    if isinstance(i, list):
        for ingredient in i:
            if ingredient in ingredient_counts:
                ingredient_counts[ingredient] += 1
            else:
                ingredient_counts[ingredient] = 1


common_ingredient = sorted(ingredient_counts.keys(), key=lambda x: ingredient_counts[x], reverse=True)
top_common = common_ingredient[:1700]


def intersection(list_1, list_2):
    temp = set(list_2)
    common_list = [value for value in list_1 if value in temp]
    return common_list


food_train_no_ch['top_ingredients']=food_train_no_ch['ingredients_list'].apply(lambda x:intersection(top_common,x))
food_test_no_ch['top_ingredients']=food_test_no_ch['ingredients_list'].apply(lambda x:intersection(top_common,x))


In [35]:

mlb = MultiLabelBinarizer()
mlb.fit([top_common])


train_ing = pd.DataFrame(mlb.transform(food_train_no_ch['top_ingredients'])
,
 columns=mlb.classes_,
 index=food_train_no_ch.index)

test_ing = pd.DataFrame(mlb.transform(food_test_no_ch['top_ingredients'])
,
 columns=mlb.classes_,
 index=food_test_no_ch.index)


Here, once again, we retrieve only the words that appear most frequently in the 'household_serving_fulltext' column


In [36]:
threshold_percentage = 0.007
value_counts = food_train_no_ch['household_serving_fulltext'].value_counts()
threshold_frequency = len(food_train_no_ch) * threshold_percentage
values_above_threshold = value_counts[value_counts > threshold_frequency].index.tolist()

In [37]:
def check_value(value, values_above_threshold):
    if value in values_above_threshold:
        return value
    else:
        return 'other'
food_train_no_ch['household_serving_common'] = food_train_no_ch['household_serving_fulltext'].apply(lambda x: check_value(x,values_above_threshold))
food_test_no_ch['household_serving_common'] = food_test_no_ch['household_serving_fulltext'].apply(lambda x: check_value(x,values_above_threshold))


In [38]:
from sklearn.preprocessing import OneHotEncoder

def hotone(train_col, test_col):
    oh = OneHotEncoder(handle_unknown='ignore')

    trained_oh = oh.fit(train_col.values.reshape(-1, 1))
    train_new = trained_oh.transform(train_col.values.reshape(-1, 1))
    train_new = pd.DataFrame.sparse.from_spmatrix(train_new, index=train_col.index)

    test_new = trained_oh.transform(test_col.values.reshape(-1, 1))
    test_new = pd.DataFrame.sparse.from_spmatrix(test_new, index=test_col.index)

    return train_new, test_new


encoded_column_train,encoded_column_test = hotone(food_train_no_ch['household_serving_common'],food_test_no_ch['household_serving_common'])


train_encoded = pd.concat([food_train_no_ch, encoded_column_train], axis=1)
test_encoded = pd.concat([food_test_no_ch, encoded_column_test], axis=1)




In [39]:
final_train = pd.concat([train_encoded, train_ing], axis=1)
final_test = pd.concat([test_encoded, test_ing], axis=1)



In [40]:
train_y=final_train['category']
train_predict_y=final_train['predict_y']
final_train=final_train.drop(['category','brand','ingredients','serving_size_unit','household_serving_fulltext','ingredients_list','description','household_serving_common','top_ingredients','brand','serving_size','len_ingredients','predicted_category_by_brand','predicted_category','predict_y'],axis=1)
final_test=final_test.drop(['brand','ingredients','household_serving_fulltext','ingredients_list','description','household_serving_common','top_ingredients','brand','serving_size', "serving_size_unit"],axis=1)
final_train.columns = final_train.columns.astype(str)




In [41]:
train_n,test_n,train_y_n,test_y_n=train_test_split(final_train,train_y,train_size=0.80,random_state=100)
clf = RandomForestClassifier(max_depth=45, random_state=0,n_estimators=2000)
clf.fit(train_n,train_y_n)

test_predict_forest=clf.predict(test_n)




In [42]:
(test_predict_forest==test_y_n).sum()/len(test_predict_forest)


0.8455361360415683

In the provided code snippet, our intention is to create a new model based on the 'predict_y' column.
If there exists a value that is not None within this column, we will use those values.
 Alternatively, if no such non-None value is present, we will use the prediction value generated by the random forest model.


In [43]:
for index in test_n.index:
    if train_predict_y.loc[index] is None:
        test_index = test_n.index.get_loc(index)
        train_predict_y.loc[index] = test_predict_forest[test_index]

train_predict_y = train_predict_y.loc[test_n.index]

In [44]:
(train_predict_y==test_y_n).sum()/len(train_predict_y)


0.8548260116517084

The achieved accuracy did not meet our expectations. Therefore, we have made the decision to proceed with another model.


In [45]:
final_test.columns = final_test.columns.astype(str)

clf = RandomForestClassifier(max_depth=45, random_state=0,n_estimators=2000)
clf.fit(final_train,train_y)
test_predict_forest=clf.predict(final_test)



In [49]:
np.savetxt('test_predict_RF.txt', test_predict_forest, fmt='%s')

In [51]:
pd.Series(test_predict_forest, index=food_test['idx'], name='pred_cat').to_csv('model01.csv',index_label='idx')
