In [138]:
import pandas as pd
import numpy as np

## Read the dataset and make some cleaning

In [238]:
df = pd.read_csv('food_coded.csv')

In [239]:
df['healthy_meal'] = df.healthy_meal.str.lower()
df['healthy_meal'] = df.healthy_meal.str.replace('[\.,&;\(\)]', '')

In [240]:
df.healthy_meal

0                                        looks not oily 
1      grains veggies more of grains and veggies smal...
2      usually includes natural ingredients nonproces...
3                 fresh fruits vegetables organic meats 
4      a lean protein such as grilled chicken green v...
5             requires veggies fruits and a cooked meal 
6                protein vegetables fruit and some carbs
7      a healthy meal has a piece of meat followed by...
8                                               colorful
9               chicken and rice with a side of veggies 
10                                  chicken and veggies 
11              lean protein veggies fruit complex carbs
12     a salad with a reasonable amount of dressing a...
13     lots of vegetabls with some grains like rice a...
14                                  green and not greasy
15                                  chicken veggies rice
16     not too much carbs a lot of protein healthy fa...
17     for me usually a big pie

In [241]:
df.healthy_meal.isnull().any()

True

In [242]:
df_healthy = df[['healthy_meal', 'healthy_feeling']]
df_healthy = df_healthy.rename(columns={'healthy_feeling': 'target'})
#df_healthy['target'] = df_healthy.target.astype(np.float64)

In [243]:
df_healthy = df_healthy.dropna()

In [244]:
df_healthy.shape

(124, 2)

## Split data in train and test

In [270]:
from sklearn.model_selection import train_test_split
df_train, df_test, y_train, y_test = train_test_split(df_healthy, df_healthy.target, test_size=0.2, random_state=1567, stratify=df_healthy.target)

## Transform Train using TF-IDF

In [271]:
from sklearn.feature_extraction.text import CountVectorizer

In [272]:
count_vect = CountVectorizer(stop_words='english', )
X_train_counts = count_vect.fit_transform(df_train.healthy_meal)
X_train_counts.shape

(99, 159)

In [273]:
X_train_counts

<99x159 sparse matrix of type '<class 'numpy.int64'>'
	with 476 stored elements in Compressed Sparse Row format>

In [274]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=True).fit(X_train_counts)
X_train_tfidf = tf_transformer.transform(X_train_counts)
X_train_tfidf.shape

(99, 159)

## Train the model

In [275]:
from sklearn.ensemble import RandomForestRegressor
clfr = RandomForestRegressor(n_estimators=15, max_depth=7)
clfr.fit(X_train_tfidf, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=7,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False)

In [276]:
from sklearn.ensemble import RandomForestClassifier
clfc = RandomForestClassifier(n_estimators=15, max_depth=9)
clfc.fit(X_train_tfidf, df_train.target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=9, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

## Transform the test

In [277]:
X_test_counts = count_vect.transform(df_test.healthy_meal)
X_test_tfidf = tf_transformer.transform(X_test_counts)

In [278]:
X_test_tfidf.shape

(25, 159)

In [279]:
y_proba_reg = clfr.predict(X_test_tfidf)
y_pred_class = clfc.predict(X_test_tfidf)

## Evaluate the model regression

In [280]:
from sklearn.metrics import mean_squared_error

In [285]:
mean_squared_error(y_test, y_proba_reg)

9.015613313464724

## Evaluate the model classifier

In [284]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred_class, average='micro')

0.20000000000000004