In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Simple data analysis for numeric features

## 1. Load and prepare the data

In [2]:
# Read the numeric recipe table, then keep only rows without any missing value
df = pd.read_csv('../data/epi-recipes-num.csv')
df2 = df.dropna(axis=0, how='any')

In [3]:
def rm_outliers(data, m=1):
    """Keep the entries of ``data`` that lie close to its mean.

    An entry is kept when its absolute distance from ``np.mean(data)`` is
    strictly smaller than ``m * np.std(data)``.

    Parameters
    ----------
    data : array-like supporting arithmetic and boolean-mask indexing
        (e.g. a NumPy array or a pandas Series).
    m : number, default 1
        Width of the accepted band, in standard deviations.

    Returns
    -------
    Same kind of object as ``data``, restricted to the kept entries.
    """
    center = np.mean(data)
    tolerance = m * np.std(data)
    within_band = abs(data - center) < tolerance
    return data[within_band]

In [4]:
# Apply the calorie outlier filter twice; the second pass uses the mean/std of
# the values that survived the first pass.
# The surviving index is used to select rows instead of writing a temporary
# 'calories' column into df2 (df2 is the result of dropna(), and assigning a
# column to it is what raised SettingWithCopyWarning).
calories_kept = rm_outliers(rm_outliers(df2['nutritions.Calories']))
# .copy() makes df3 an independent frame so later column assignments are safe
df3 = df2.loc[calories_kept.index].copy()
df3.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


(17064, 15)

In [5]:
# RRR = (mean of protein and fiber shares) / (mean of the five shares below).
# NOTE(review): the divisors look like daily reference amounts - confirm.
rrr_numerator = (df3['nutritions.Protein']/50 + df3['nutritions.Fiber']/25) / 2
rrr_denominator = (
    df3['nutritions.Calories']/2000
    + df3['nutritions.Saturated Fat']/20
    + df3['nutritions.Cholesterol']/300
    + df3['nutritions.Carbohydrates']/300
    + df3['nutritions.Sodium']/2400
) / 5
df3["RRR"] = rrr_numerator / rrr_denominator
df3.shape

(17064, 16)

In [6]:
# Ten nutrition columns used as model inputs
feature_columns = [
    'nutritions.Calories',
    'nutritions.Carbohydrates',
    'nutritions.Cholesterol',
    'nutritions.Fat',
    'nutritions.Fiber',
    'nutritions.Monounsaturated Fat',
    'nutritions.Polyunsaturated Fat',
    'nutritions.Protein',
    'nutritions.Saturated Fat',
    'nutritions.Sodium',
]
data = df3[feature_columns]
target = df3['healthy']
# Single-feature input: the RRR column as an (n_rows, 1) array
rrr = np.array(df3['RRR']).reshape(-1, 1)

In [7]:
from sklearn.decomposition import PCA
# Fit a 2-component PCA on the feature table, then project the same table.
# Note: `pca` holds the projected data, not the fitted estimator.
pca_model = PCA(n_components=2)
pca_model.fit(data)
pca = pca_model.transform(data)
pca.shape

(17064, 2)

In [8]:
# Number of rows labelled healthy
int(target.sum())

1233

## 2. build models

### 2.1 linear regression

In [10]:
from sklearn.linear_model import LinearRegression
# `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2, so passing
# it raises TypeError on current versions. Plain least squares is unaffected by
# feature scaling (up to floating-point rounding), so the argument is dropped.
reg = LinearRegression()
reg.fit(data, target)
# R^2 on the same data the model was fitted on
reg.score(data, target)

0.0512697796286844

In [11]:
# How many fitted values exceed 0.2
(reg.predict(data) > 0.2).sum()

241

In [12]:
# `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2; it does not
# change an ordinary least-squares fit (up to rounding), so it is dropped.
reg2 = LinearRegression()
reg2.fit(rrr, target)
# R^2 of the single-feature (RRR) model on its own training data
reg2.score(rrr, target)

0.05236463326332452

In [13]:
# How many fitted values of the RRR-only model exceed 0.2
(reg2.predict(rrr) > 0.2).sum()

605

In [14]:
# `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2; it does not
# change an ordinary least-squares fit (up to rounding), so it is dropped.
reg3 = LinearRegression()
reg3.fit(pca, target)
# R^2 of the model fitted on the 2-D PCA projection, on its own training data
reg3.score(pca, target)

0.00760514904778964

In [15]:
# How many fitted values of the PCA-input model exceed 0.2
(reg3.predict(pca) > 0.2).sum()

16

### 2.2 logistic regression

In [16]:
from sklearn.linear_model import LogisticRegression
# fit() returns the estimator itself, so construction and fitting are chained
logreg = LogisticRegression().fit(data, target)
# Mean accuracy on the training data
logreg.score(data, target)

0.9272737927801219

In [17]:
from sklearn import metrics
# Predict once and reuse the labels for the report and the positive count
pred_labels = logreg.predict(data)
print(metrics.classification_report(target, pred_labels), sum(pred_labels))

             precision    recall  f1-score   support

      False       0.93      1.00      0.96     15831
       True       0.36      0.01      0.02      1233

avg / total       0.89      0.93      0.89     17064
 28


In [18]:
# Logistic regression on the single RRR feature
logreg2 = LogisticRegression().fit(rrr, target)
# Mean accuracy on the training data
logreg2.score(rrr, target)

0.9265119549929677

In [19]:
# Predict once and reuse the labels for the report and the positive count
pred_labels = logreg2.predict(rrr)
print(metrics.classification_report(target, pred_labels), sum(pred_labels))

             precision    recall  f1-score   support

      False       0.93      1.00      0.96     15831
       True       0.30      0.01      0.02      1233

avg / total       0.88      0.93      0.89     17064
 53


In [20]:
# Logistic regression on the 2-D PCA projection
logreg3 = LogisticRegression().fit(pca, target)
# Mean accuracy on the training data
logreg3.score(pca, target)

0.9273909985935302

In [21]:
# Predict once and reuse the labels for the report and the positive count
pred_labels = logreg3.predict(pca)
print(metrics.classification_report(target, pred_labels), sum(pred_labels))

             precision    recall  f1-score   support

      False       0.93      1.00      0.96     15831
       True       0.00      0.00      0.00      1233

avg / total       0.86      0.93      0.89     17064
 6


### 2.3 random forest

In [22]:
from sklearn.ensemble import RandomForestClassifier
# Depth-limited forest with a fixed seed; fit() returns the estimator itself
rf = RandomForestClassifier(max_depth=8, random_state=12345).fit(data, target)
# Mean accuracy on the training data
rf.score(data, target)

0.9289732770745429

In [23]:
# Predict once and reuse the labels for the report and the positive count
pred_labels = rf.predict(data)
print(metrics.classification_report(target, pred_labels), sum(pred_labels))

             precision    recall  f1-score   support

      False       0.93      1.00      0.96     15831
       True       1.00      0.02      0.03      1233

avg / total       0.93      0.93      0.90     17064
 21


In [24]:
# Same forest configuration, trained on the single RRR feature
rf2 = RandomForestClassifier(max_depth=8, random_state=12345).fit(rrr, target)
# Mean accuracy on the training data
rf2.score(rrr, target)

0.9281528363806845

In [25]:
# Predict once and reuse the labels for the report and the positive count
pred_labels = rf2.predict(rrr)
print(metrics.classification_report(target, pred_labels), sum(pred_labels))

             precision    recall  f1-score   support

      False       0.93      1.00      0.96     15831
       True       0.89      0.01      0.01      1233

avg / total       0.93      0.93      0.89     17064
 9


In [26]:
# Same forest configuration, trained on the 2-D PCA projection
rf3 = RandomForestClassifier(max_depth=8, random_state=12345).fit(pca, target)
# Mean accuracy on the training data
rf3.score(pca, target)

0.9279184247538678

In [27]:
# Predict once and reuse the labels for the report and the positive count
pred_labels = rf3.predict(pca)
print(metrics.classification_report(target, pred_labels), sum(pred_labels))

             precision    recall  f1-score   support

      False       0.93      1.00      0.96     15831
       True       1.00      0.00      0.00      1233

avg / total       0.93      0.93      0.89     17064
 3


## 3. Resample the unbalanced dataset

In [28]:
# Split the rows by the value of the `healthy` column
is_healthy = df3['healthy'] == 1
is_not_healthy = df3['healthy'] == 0
df_h = df3[is_healthy]
df_n = df3[is_not_healthy]
len(df_h)

1233

In [29]:
# Per-class sample size: half the number of rows in df3 (rounded down)
n = len(df3) // 2
n

8532

In [30]:
from sklearn.utils import resample, shuffle
# Draw n rows with replacement from each class so both classes are equally
# represented in the combined frame.
df_h2 = resample(df_h, replace=True, n_samples=n, random_state=100)
df_n2 = resample(df_n, replace=True, n_samples=n, random_state=100)
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat stacks the two frames in the same order.
df4 = pd.concat([df_h2, df_n2])
# Shuffle so the two classes are interleaved
df4 = shuffle(df4, random_state=100)
df4.shape

(17064, 16)

In [31]:
# Same ten nutrition columns as before, taken from the resampled frame
feature_columns = [
    'nutritions.Calories',
    'nutritions.Carbohydrates',
    'nutritions.Cholesterol',
    'nutritions.Fat',
    'nutritions.Fiber',
    'nutritions.Monounsaturated Fat',
    'nutritions.Polyunsaturated Fat',
    'nutritions.Protein',
    'nutritions.Saturated Fat',
    'nutritions.Sodium',
]
data2 = df4[feature_columns]
target2 = df4['healthy']
# Single-feature input: the RRR column as an (n_rows, 1) array
rrr2 = np.array(df4['RRR']).reshape(-1, 1)

In [32]:
# Number of resampled rows labelled healthy
int(target2.sum())

8532

## 4. Rebuild the models on the resampled data

### 4.1 linear regression

In [33]:
# `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2; it does not
# change an ordinary least-squares fit (up to rounding), so it is dropped.
# Note: this re-binds `reg` to a model trained on the resampled data.
reg = LinearRegression()
reg.fit(data2, target2)
# R^2 on the resampled training data
reg.score(data2, target2)

0.20469164894330016

In [34]:
# How many fitted values exceed 0.5 on the resampled data
(reg.predict(data2) > 0.5).sum()

9797

In [35]:
# `normalize` was deprecated in scikit-learn 1.0 and removed in 1.2; it does not
# change an ordinary least-squares fit (up to rounding), so it is dropped.
# Note: this re-binds `reg2` to a model trained on the resampled data.
reg2 = LinearRegression()
reg2.fit(rrr2, target2)
# R^2 on the resampled training data
reg2.score(rrr2, target2)

0.15430778042139415

In [36]:
# How many fitted values of the RRR-only model exceed 0.5 on the resampled data
(reg2.predict(rrr2) > 0.5).sum()

7482

### 4.2 logistic regression

In [37]:
# Re-bind `logreg` to a model trained on the resampled (balanced) data
logreg = LogisticRegression().fit(data2, target2)
# Mean accuracy on the resampled training data
logreg.score(data2, target2)

0.7163619315518049

In [38]:
# Predict once and reuse the labels for the report and the positive count
pred_labels = logreg.predict(data2)
print(metrics.classification_report(target2, pred_labels), sum(pred_labels))

             precision    recall  f1-score   support

      False       0.74      0.67      0.70      8532
       True       0.70      0.77      0.73      8532

avg / total       0.72      0.72      0.72     17064
 9382


In [39]:
# Re-bind `logreg2` to an RRR-only model trained on the resampled data
logreg2 = LogisticRegression().fit(rrr2, target2)
# Mean accuracy on the resampled training data
logreg2.score(rrr2, target2)

0.6831926863572433

In [40]:
# Predict once and reuse the labels for the report and the positive count
pred_labels = logreg2.predict(rrr2)
print(metrics.classification_report(target2, pred_labels), sum(pred_labels))

             precision    recall  f1-score   support

      False       0.67      0.72      0.70      8532
       True       0.70      0.64      0.67      8532

avg / total       0.68      0.68      0.68     17064
 7870


### 4.3 random forest

In [41]:
# Re-bind `rf` to a forest trained on the resampled (balanced) data
rf = RandomForestClassifier(max_depth=8, random_state=12345).fit(data2, target2)
# Mean accuracy on the resampled training data
rf.score(data2, target2)

0.8495663384903891

In [42]:
# Predict once and reuse the labels for the report and the positive count
pred_labels = rf.predict(data2)
print(metrics.classification_report(target2, pred_labels), sum(pred_labels))

             precision    recall  f1-score   support

      False       0.92      0.77      0.84      8532
       True       0.80      0.93      0.86      8532

avg / total       0.86      0.85      0.85     17064
 9879


In [43]:
# Re-bind `rf2` to an RRR-only forest trained on the resampled data
rf2 = RandomForestClassifier(max_depth=8, random_state=12345).fit(rrr2, target2)
# Mean accuracy on the resampled training data
rf2.score(rrr2, target2)

0.7519924988279418

In [44]:
# Predict once and reuse the labels for the report and the positive count
pred_labels = rf2.predict(rrr2)
print(metrics.classification_report(target2, pred_labels), sum(pred_labels))

             precision    recall  f1-score   support

      False       0.79      0.68      0.73      8532
       True       0.72      0.82      0.77      8532

avg / total       0.76      0.75      0.75     17064
 9730


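## 5. Evaluate the resampled-data models on the original (unbalanced) data

Note: `data` / `target` are the original rows from which the resampled training set was drawn, so this is not a held-out test set and the scores below are likely optimistic.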

In [48]:
# Mean accuracy of `rf` (re-fitted on the resampled data in section 4.3) on the
# original, unbalanced data.
# NOTE(review): the saved output is identical to the one shown for
# rf2.score(rrr, target) and the execution counts are out of order - re-run to
# confirm the value.
rf.score(data, target)

0.6729957805907173

In [46]:
# Predict once and reuse the labels for the report and the positive count
pred_labels = rf.predict(data)
print(metrics.classification_report(target, pred_labels), sum(pred_labels))

             precision    recall  f1-score   support

      False       0.99      0.74      0.85     15831
       True       0.21      0.91      0.35      1233

avg / total       0.93      0.75      0.81     17064
 5274


In [53]:
# Mean accuracy of `rf2` (the RRR-only forest re-fitted on the resampled data
# in section 4.3) on the original, unbalanced data.
rf2.score(rrr, target)

0.6729957805907173

In [54]:
# Predict once and reuse the labels for the report and the positive count
pred_labels = rf2.predict(rrr)
print(metrics.classification_report(target, pred_labels), sum(pred_labels))

             precision    recall  f1-score   support

      False       0.98      0.66      0.79     15831
       True       0.16      0.82      0.27      1233

avg / total       0.92      0.67      0.75     17064
 6359


In [103]:
# Per-row class probabilities from the forest; one column per class.
# NOTE(review): column 1 is used below as the "healthy" probability - this
# assumes rf.classes_ is ordered [False, True]; confirm.
prob = rf.predict_proba(data)

In [116]:
# Per-column count of rows whose probability exceeds 0.5
(prob > 0.5).sum(axis=0)

array([11790,  5274])

In [117]:
# Show the second-column probability of the first row only
for first_row in prob[:1]:
    print(first_row[1])

0.3591093513383067


In [120]:
# Label a row 1 only when its second-column probability is above 0.7, else 0;
# kept as a plain list of ints.
prob_tran = (prob[:, 1] > 0.7).astype(int).tolist()
sum(prob_tran)

1926

In [123]:
# Report for the 0.7-threshold labels against the original targets
threshold_report = metrics.classification_report(target, prob_tran)
print(threshold_report)

             precision    recall  f1-score   support

      False       0.97      0.92      0.94     15831
       True       0.37      0.58      0.45      1233

avg / total       0.92      0.90      0.91     17064



### Comments
Training a random forest (max_depth=8) on the resampled data and raising the decision threshold to 0.7 gives an F1 score of 0.45 for the healthy label on the original data.

## OneClassSVM

In [135]:
from sklearn import svm
# NOTE(review): data2 comes from df4, which contains both healthy and
# non-healthy rows, so the one-class model is not trained on a single class -
# confirm this is intended.
ocs = svm.OneClassSVM().fit(data2)
# predict() yields +1 for inliers and -1 for outliers
y_pred = ocs.predict(data)

In [136]:
# Map inlier (+1) to 1 and everything else to 0, as a plain list of ints
yy = (y_pred == 1).astype(int).tolist()

In [137]:
# Number of rows the one-class SVM marked as inliers (mapped to 1 above)
sum(yy)

1542

In [138]:
# Treat "inlier" as the healthy prediction and compare with the original targets
ocs_report = metrics.classification_report(target, yy)
print(ocs_report)

             precision    recall  f1-score   support

      False       0.96      0.94      0.95     15831
       True       0.41      0.52      0.46      1233

avg / total       0.92      0.91      0.92     17064



In [139]:
# One-class SVM on the single RRR feature.
# NOTE(review): rrr2 comes from df4, which contains both healthy and
# non-healthy rows - confirm this is intended.
ocs2 = svm.OneClassSVM().fit(rrr2)
# predict() yields +1 for inliers and -1 for outliers
y_pred2 = ocs2.predict(rrr)

In [140]:
# Map inlier (+1) to 1 and everything else to 0, then count the inliers
yy2 = (y_pred2 == 1).astype(int).tolist()
sum(yy2)

8201

In [141]:
# Treat "inlier" as the healthy prediction and compare with the original targets
ocs2_report = metrics.classification_report(target, yy2)
print(ocs2_report)

             precision    recall  f1-score   support

      False       0.93      0.52      0.67     15831
       True       0.08      0.52      0.14      1233

avg / total       0.87      0.52      0.63     17064

