In [1]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import MinMaxScaler
%matplotlib inline


def my_remove_highly_correlated(X,threshold):

    X = X.select_dtypes(include=[np.number])  #.dropna()
    
    cols = X.columns

    #run correlation matrix
    df = X.corr()

    
    #put df into array
    a = df.values
    #label top half with -99999 
    #we want to ignore top half of matrix
    iu1 = np.triu_indices(len(df))
    a[iu1] = -99999
    #put data back into dataframe
    df = pd.DataFrame(a, columns=cols)
    df['var'] = cols

    #unstack to get a list of var1, var2, correlation
    df = pd.melt(df, id_vars='var')

    #remove those flagged with -99999
    df = df[df.value != -99999].sort_values(by='var', ascending=True)

    #flag remove vs keep based on corr threshold
    df_remove = df[df.value > threshold]
    keep_list = df.loc[df.value <= threshold,'var'].unique()
 
    print('{} out of {} vars removed due to corr greater than {}'.
          format(df_remove.shape[0],X.shape[1],threshold))
    
    print(df_remove)
    print('\nShape before: ' + str(X.shape))
    print('Shape after: ' + str(X[keep_list].shape))
    print('\n')
    return X[keep_list]

def my_drop_na_columns(X,NANthreshold):

    df = X
    colcount = df.shape[1]
    #Get count of NA in each column
    series_cols = df.isnull().sum(axis = 0).sort_values(ascending=False)

    #filter the list to include only those with counts above threshold
    series_cols_remove = series_cols[series_cols.values >= NANthreshold]
    series_cols_keep = series_cols[(series_cols.values < NANthreshold) & (series_cols.values > 0) ]
    
    #put the to-remove column names in a list
    list_colstodrop = series_cols_remove.index.tolist()

    #drop the columns
    df = df.drop(labels = list_colstodrop, axis=1)

    #print the results
    print('Dropped {} of {} Columns - containing more than {} NANs\n'.format(
          series_cols_remove.shape[0],colcount,NANthreshold))
    print(series_cols_remove)
    
    print('\n{} Columns remain with NANs\n'.format(series_cols_keep.shape[0]))
    print(series_cols_keep)
    print('\nShape before: ' + str(X.shape))
    print('Shape after: ' + str(df.shape))
    print('\n')    
    return df


def my_feature_selector(mytype,X,y=[],threshold=None,k=None):  
    
    #reduces # of features in given x dataset by one of 3 ways:
    #1) selectkbest  2) rfecv (recursive feature elim)  3) variancethreshold

    mytype = str.lower(mytype)
    
    #Retain original column names
    orig_cols = X.columns
    
    #Evaluate type and instatiate object
    if mytype == 'selectkbest':
        if k == None:
            print('Try Again: To run selectKBest you must pass a k for number of features to select')
            return X  
        if len(y) == 0:
            print('Try Again: To run selectKBest you must pass a y vector reflecting outcomes')
            return X             
        
        selector = SelectKBest(chi2, k=k)
        df_new = selector.fit_transform(X,y)
        keep_list = selector.get_support()
        scores = [orig_cols, selector.pvalues_, keep_list]
        score_type = 'PValue'
        
    if mytype == 'rfecv':
        if len(y) == 0:
            print('Try Again: To run rfecv you must pass a y vector reflecting outcomes')
            return X 
       
        estimator = SVR(kernel="linear")
        selector = RFECV(estimator, n_jobs=-1)
        df_new = selector.fit_transform(X,y)
        keep_list = selector.get_support()
        scores = [orig_cols, selector.ranking_, keep_list]
        score_type = 'Ranking'
            
    if mytype == 'variancethreshold':
        if threshold == None:
            print('Try Again: To run VarianceThreshold you must pass a threshold value from 0 to 1. (0 returns all)')
            return X     
    
        selector = VarianceThreshold(threshold=threshold)
        df_new = selector.fit_transform(X)
        keep_list = selector.get_support()   
        scores = [orig_cols, selector.variances_, keep_list]
        score_type = 'Variance'
        
    if (mytype != 'selectkbest') & (mytype != 'rfecv') & (mytype != 'variancethreshold'):  
        print('Try Again: Type must be passed as selectkbest, rfecv, or variancethreshold')
        return X  
    
    
    #Print Scores
    print('---- Running Feature Selection Method: ' + mytype + '-------\n')
    scores_df = pd.DataFrame(scores)
    scores_df = scores_df.transpose()
    scores_df.columns= ['Feature',score_type,'Keep']
    print(scores_df.sort_values(by=score_type, ascending=True))

    #List features that were removed vs Kept
    i = 0
    keep = []

    for item in keep_list:
        col_name = orig_cols[i]
        if item==True:
            keep.append(col_name)
        i = i + 1

    #Place resultset of kept features into DataFrame with correct column headers
    df_keep = pd.DataFrame(df_new, columns=keep)
    
    #Print Stats
    print('\nShape before: ' + str(X.shape))
    print('Shape after: ' + str(df_keep.shape))
    print('\n')
    return df_keep

def my_confusion_matrix(array_Expected,array_Predicted,colName):
    a = np.array(confusion_matrix(array_Expected, array_Predicted ))
    totalExpectedFalse = a[0,0] + a[0,1]
    totalExpectedTrue = a[1,0] + a[1,1]
    correctFalse = a[0,0] 
    correctTrue = a[1,1] 
    correctTruePct = np.round(correctTrue / totalExpectedTrue,3)
    correctFalsePct = np.round(correctFalse / totalExpectedFalse,3)
    print('Regarding {}, the model correctly predicted {} Negatives out of {} expected Negatives: {}'.format(
        colName,correctFalse,totalExpectedFalse,correctFalsePct))
    print('Regarding {}, the model correctly predicted {} Positives out of {} expected Positives: {}'.format(
        colName,correctTrue,totalExpectedTrue,correctTruePct))    
    print(a)

def my_minmax_scaler(df, min_val, max_val):
    #Take in a dataframe and return a dataframe scaled with min 0, max 1
    print('------Scaling Data to Min {}, Max {}------\n'.format(min_val,max_val))
    # Save the column names.
    names=df.columns
    
    #instatiate scaler object
    #you can use StandardScaler instead to scale with mean 0 and std 1
    scaler = MinMaxScaler(feature_range=(min_val,max_val), copy=True)
    
    # Scale, then turn the resulting numpy array back into a data frame with the
    # correct column names.
    scaler.fit(df)
    df_scaled = pd.DataFrame(scaler.transform(df), columns=names)
    print('Scaling Complete')
    return df_scaled


In [2]:
raw_data = pd.read_csv('epi_r.csv')
raw_data.head()


Unnamed: 0,title,rating,calories,protein,fat,sodium,#cakeweek,#wasteless,22-minute meals,3-ingredient recipes,...,yellow squash,yogurt,yonkers,yuca,zucchini,cookbooks,leftovers,snack,snack week,turkey
0,"Lentil, Apple, and Turkey Wrap",2.5,426.0,30.0,7.0,559.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,Boudin Blanc Terrine with Red Onion Confit,4.375,403.0,18.0,23.0,1439.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Potato and Fennel Soup Hodge,3.75,165.0,6.0,7.0,165.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Mahi-Mahi in Tomato Olive Sauce,5.0,,,,,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Spinach Noodle Casserole,3.125,547.0,20.0,32.0,452.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
df = raw_data
df = my_drop_na_columns(df,NANthreshold=2000)
#df.dropna(inplace=True)
#df[df.isnull().any(axis=1)]

#df['fat'] = np.where(df.fat > 100, 17, df.fat)
#df['fat'] = np.where(df.fat > 62, 62, df.fat)
#df['protein'] = np.where(df.protein > 50, 8, df.protein)
#df['protein'] = np.where(df.protein > 28, 28, df.protein)
#df['sodium'] = np.where(df.sodium > 1000, 296, df.sodium)
#df['calories'] = np.where(df.calories > 2000, 2000, df.calories)
df.drop(columns=['bon appétit'], axis=1, inplace=True)

Y = np.where(df.rating >= 4, 1.0, 0.0 )
X = df.drop(columns=['rating'], axis=1)


Dropped 4 of 680 Columns - containing more than 2000 NANs

fat         4183
protein     4162
sodium      4119
calories    4117
dtype: int64

0 Columns remain with NANs

Series([], dtype: int64)

Shape before: (20052, 680)
Shape after: (20052, 676)




In [4]:
X = my_remove_highly_correlated(X, threshold=.80)

7 out of 673 vars removed due to corr greater than 0.8
                  var     variable     value
5569            drink    alcoholic  0.851944
133588         london      england  1.000000
203584     louisville     kentucky  0.816476
208418    pescatarian       kosher  0.871690
286510       portland       oregon  0.829149
304769       soy free  peanut free  0.940721
304819  tree nut free  peanut free  0.829673

Shape before: (20052, 673)
Shape after: (20052, 672)




In [5]:
X = my_feature_selector(mytype='selectkbest', X=X, y=Y, k=30)

---- Running Feature Selection Method: selectkbest-------

                Feature       PValue   Keep
276      house & garden  1.39663e-38   True
185               drink  2.66413e-30   True
7             alcoholic  1.06162e-26   True
234                 gin  1.42847e-23   True
520               roast  1.02085e-20   True
618        thanksgiving  5.74861e-20   True
132      cocktail party  2.45704e-18   True
178              dinner  2.60384e-16   True
582              spirit  9.64424e-16   True
122           christmas  6.27071e-14   True
49              bitters   7.7588e-14   True
576            soy free  6.14919e-13   True
453         peanut free  1.10902e-12   True
250      grill/barbecue  1.81116e-12   True
343             low fat  9.91295e-12   True
29         backyard bbq  4.33409e-11   True
131            cocktail  1.29407e-10   True
203                fall   1.2177e-09   True
249               grill  2.07034e-09   True
595   stuffing/dressing  2.08636e-09   True
261       harperc

In [6]:
X = my_minmax_scaler(X, 0, 1)

------Scaling Data to Min 0, Max 1------

Scaling Complete


In [7]:
#'----------  SVC ---------------

from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.svm import NuSVC

svc = LinearSVC(C=30)
svc.fit(X,Y)
print(svc.score(X, Y))

svm_pred = svc.predict(X)

0.5799920207460603


In [15]:
#'----------- K Nearest Neighbor --------------

from sklearn.neighbors import KNeighborsClassifier
neighbors = KNeighborsClassifier(n_neighbors=10, weights='distance')
neighbors.fit(X,Y)
print(neighbors.score(X, Y))
neighbors_pred = neighbors.predict(X)

0.5831837223219629


In [10]:
#'----------- Naive Bayes --------------
from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()
bnb.fit(X, Y)
print(bnb.score(X, Y))
bnb_pred = bnb.predict(X)

0.5662776780371035


In [20]:
#'----------- Logistic Regression --------------
from sklearn import linear_model 
lr = linear_model.LogisticRegression(penalty='l1', C=1 )
lr.fit(X, Y)
print(lr.score(X, Y))
lr_pred = lr.predict(X)

0.5779473369239976


In [17]:
print('\n---- svm -------\n')
my_confusion_matrix(Y, svm_pred, 'highly_rated')


print('\n---- K Nearest Neighbor -------\n')
my_confusion_matrix(Y, neighbors_pred, 'highly_rated')

print('\n---- Naive Bayes -------\n')
my_confusion_matrix(Y, bnb_pred, 'highly_rated')

print('\n---- Logistic Regression -------\n')
my_confusion_matrix(Y, lr_pred, 'highly_rated')



---- svm -------

Regarding highly_rated, the model correctly predicted 2608 Negatives out of 9314 expected Negatives: 0.28
Regarding highly_rated, the model correctly predicted 9022 Positives out of 10738 expected Positives: 0.84
[[2608 6706]
 [1716 9022]]

---- K Nearest Neighbor -------

Regarding highly_rated, the model correctly predicted 4457 Negatives out of 9314 expected Negatives: 0.479
Regarding highly_rated, the model correctly predicted 7237 Positives out of 10738 expected Positives: 0.674
[[4457 4857]
 [3501 7237]]

---- Naive Bayes -------

Regarding highly_rated, the model correctly predicted 4691 Negatives out of 9314 expected Negatives: 0.504
Regarding highly_rated, the model correctly predicted 6664 Positives out of 10738 expected Positives: 0.621
[[4691 4623]
 [4074 6664]]

---- Logistic Regression -------

Regarding highly_rated, the model correctly predicted 2685 Negatives out of 9314 expected Negatives: 0.288
Regarding highly_rated, the model correctly predicted 

In [21]:
cv=5

from sklearn.model_selection import cross_val_score
print('\n---- svm -------')
score = cross_val_score(svc, X, Y, cv=cv)
print("\nCross Validation Accuracy %i folds: %.2f (+/- %.2f)" % (cv, score.mean(), (score.std() * 2)))

print('\n---- K Nearest Neighbor -------')
score = cross_val_score(neighbors, X, Y, cv=cv)
print("\nCross Validation Accuracy %i folds: %.2f (+/- %.2f)" % (cv, score.mean(), (score.std() * 2)))


print('\n---- Naive Bayes -------')
score = cross_val_score(bnb, X, Y, cv=cv)
print("\nCross Validation Accuracy %i folds: %.2f (+/- %.2f)" % (cv, score.mean(), (score.std() * 2)))



print('\n---- Logistic Regression -------')
score = cross_val_score(lr, X, Y, cv=cv)
print("\nCross Validation Accuracy %i folds: %.2f (+/- %.2f)" % (cv, score.mean(), (score.std() * 2)))


---- svm -------

Cross Validation Accuracy 5 folds: 0.58 (+/- 0.01)

---- K Nearest Neighbor -------

Cross Validation Accuracy 5 folds: 0.55 (+/- 0.02)

---- Naive Bayes -------

Cross Validation Accuracy 5 folds: 0.57 (+/- 0.02)

---- Logistic Regression -------

Cross Validation Accuracy 5 folds: 0.58 (+/- 0.02)


Oh dear, so this did seem not to work very well. In fact it is remarkably poor. Now there are many things that we could do here. 

Firstly the overfit is a problem, even though it was poor in the first place. We could go back and clean up our feature set. There might be some gains to be made by getting rid of the noise.

We could also see how removing the nulls but including dietary information performs. Though its a slight change to the question we could still possibly get some improvements there.

Lastly, we could take our regression problem and turn it into a classifier. With this number of features and a discontinuous outcome, we might have better luck thinking of this as a classification problem. We could make it simpler still by instead of classifying on each possible value, group reviews to some decided high and low values.

__And that is your challenge.__

Transform this regression problem into a binary classifier and clean up the feature set. You can choose whether or not to include nutritional information, but try to cut your feature set down to the 30 most valuable features.

Good luck!

In [61]:
review = X
review['rating'] = Y

table = review.pivot_table(review, index=['rating'], aggfunc=np.average, margins=True, )


table = table.transpose().sort_values(by='All', ascending=False)


table.columns = ['Low','High','All']
table['PercentBoost'] = round((table['High'] - table['Low']) / table.Low * 100,2)
table.sort_values(by='PercentBoost', ascending=False)

Unnamed: 0,Low,High,All,PercentBoost
stuffing/dressing,0.003114,0.010151,0.006882,226.02
goat cheese,0.009878,0.020022,0.01531,102.7
father's day,0.009985,0.018812,0.014712,88.4
meat,0.009663,0.01816,0.014213,87.93
roast,0.04767,0.081579,0.065829,71.13
grill,0.020507,0.034736,0.028127,69.39
thanksgiving,0.054005,0.088936,0.072711,64.68
christmas,0.038866,0.063047,0.051815,62.22
backyard bbq,0.035967,0.056156,0.046778,56.13
grill/barbecue,0.042517,0.065934,0.055057,55.08


When you've finished that, also take a moment to think about bias. Is there anything in this dataset that makes you think it could be biased, perhaps extremely so?

There is. Several things in fact, but most glaringly is that we don't actually have a random sample. It could be, and probably is, that the people more likely to choose some kinds of recipes are more likely to give high reviews.

After all, people who eat chocolate _might_ just be happier people.