In [1]:
## Import packages
# data analysis and wrangling
import pandas as pd
import numpy as np

# visualization
import matplotlib.pyplot as plt
%matplotlib inline

# encoders
from category_encoders import OneHotEncoder, OrdinalEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

#metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, classification_report, roc_auc_score

#pipeline
from sklearn.pipeline import make_pipeline

# machine learning
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier


### I wrangle the data

In [2]:
# absolute path
df = pd.read_csv('/Users/sisichen/Desktop/Unit2-sprint3-ds25/data/burritos/burritos.csv')

In [23]:
# relative path
df = pd.read_csv('../data/burritos/burritos.csv', parse_dates=['Date'], index_col='Date')

In [24]:
df.head()

Unnamed: 0_level_0,Location,Burrito,Neighborhood,Address,URL,Yelp,Google,Chips,Cost,Hunger,...,Nopales,Lobster,Queso,Egg,Mushroom,Bacon,Sushi,Avocado,Corn,Zucchini
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2016-01-18,Donato's taco shop,California,Miramar,6780 Miramar Rd,http://donatostacoshop.net/,3.5,4.2,,6.49,3.0,...,,,,,,,,,,
2016-01-24,Oscar's Mexican food,California,San Marcos,225 S Rancho Santa Fe Rd,http://www.yelp.com/biz/oscars-mexican-food-sa...,3.5,3.3,,5.45,3.5,...,,,,,,,,,,
2016-01-24,Oscar's Mexican food,Carnitas,,,,,,,4.85,1.5,...,,,,,,,,,,
2016-01-24,Oscar's Mexican food,Carne asada,,,,,,,5.25,2.0,...,,,,,,,,,,
2016-01-27,Pollos Maria,California,Carlsbad,3055 Harding St,http://pollosmaria.com/,4.0,3.8,x,6.59,4.0,...,,,,,,,,,,


In [None]:
df['great'] = (df['overall'] > 4)*1

In [25]:
df.isnull().sum()/len(df)

Location        0.000000
Burrito         0.000000
Neighborhood    0.782506
Address         0.791962
URL             0.794326
                  ...   
Bacon           0.992908
Sushi           0.995272
Avocado         0.969267
Corn            0.992908
Zucchini        0.997636
Length: 65, dtype: float64

In [26]:
na_cols = df.columns[df.isnull().mean()>0.5]

In [27]:
df.drop(columns=na_cols, inplace=True)

In [35]:
df.head()

Unnamed: 0_level_0,Location,Burrito,Cost,Hunger,Tortilla,Temp,Meat,Fillings,Meat:filling,Uniformity,Salsa,Synergy,Wrap,overall,Reviewer,great
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2016-01-18,Donato's taco shop,California,6.49,3.0,3.0,5.0,3.0,3.5,4.0,4.0,4.0,4.0,4.0,3.8,Scott,0
2016-01-24,Oscar's Mexican food,California,5.45,3.5,2.0,3.5,2.5,2.5,2.0,4.0,3.5,2.5,5.0,3.0,Scott,0
2016-01-24,Oscar's Mexican food,Carnitas,4.85,1.5,3.0,2.0,2.5,3.0,4.5,4.0,3.0,3.0,5.0,3.0,Emily,0
2016-01-24,Oscar's Mexican food,Carne asada,5.25,2.0,3.0,2.0,3.5,3.0,4.0,5.0,4.0,4.0,5.0,3.75,Ricardo,0
2016-01-27,Pollos Maria,California,6.59,4.0,4.0,5.0,4.0,3.5,4.5,5.0,2.5,4.5,4.0,4.2,Scott,1


In [40]:
cat_cols = df.select_dtypes(include='object').columns

In [41]:
df.drop(columns=cat_cols, inplace=True)

In [42]:
df

Unnamed: 0_level_0,Cost,Hunger,Tortilla,Temp,Meat,Fillings,Meat:filling,Uniformity,Salsa,Synergy,Wrap,overall,great
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2016-01-18,6.49,3.0,3.0,5.0,3.0,3.5,4.0,4.0,4.0,4.0,4.0,3.80,0
2016-01-24,5.45,3.5,2.0,3.5,2.5,2.5,2.0,4.0,3.5,2.5,5.0,3.00,0
2016-01-24,4.85,1.5,3.0,2.0,2.5,3.0,4.5,4.0,3.0,3.0,5.0,3.00,0
2016-01-24,5.25,2.0,3.0,2.0,3.5,3.0,4.0,5.0,4.0,4.0,5.0,3.75,0
2016-01-27,6.59,4.0,4.0,5.0,4.0,3.5,4.5,5.0,2.5,4.5,4.0,4.20,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-08-27,6.00,1.0,5.0,4.0,3.5,,4.0,4.0,2.0,2.0,5.0,3.50,0
2019-08-27,6.00,4.0,4.0,5.0,,3.5,4.0,4.0,5.0,4.0,3.0,4.00,0
2019-08-27,7.90,3.0,4.0,4.0,4.0,3.7,3.0,2.0,3.5,4.0,4.5,3.50,0
2019-08-27,7.90,3.0,5.0,2.0,5.0,5.0,5.0,2.0,5.0,5.0,2.0,4.00,0


In [43]:
## create wrangle function
def wrangle(data_path):
    df = pd.read_csv(data_path, parse_dates=['Date'], index_col='Date')
    
    df['great'] = (df['overall'] > 4)*1
    
    na_cols = df.columns[df.isnull().mean()>0.5]
    
    df.drop(columns=na_cols, inplace=True)
    
    cat_cols = df.select_dtypes(include='object').columns
    # high_car = [col for col in cat_cols if df[col].nunique()> cutoff]
    df.drop(columns=cat_cols, inplace=True)
    
    df.drop(columns='overall',inplace=True)
    
    return df

In [44]:
data_path = '../data/burritos/burritos.csv'
df = wrangle(data_path)

In [45]:
df

Unnamed: 0_level_0,Cost,Hunger,Length,Circum,Volume,Tortilla,Temp,Meat,Fillings,Meat:filling,Uniformity,Salsa,Synergy,Wrap,great
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2016-01-18,6.49,3.0,,,,3.0,5.0,3.0,3.5,4.0,4.0,4.0,4.0,4.0,0
2016-01-24,5.45,3.5,,,,2.0,3.5,2.5,2.5,2.0,4.0,3.5,2.5,5.0,0
2016-01-24,4.85,1.5,,,,3.0,2.0,2.5,3.0,4.5,4.0,3.0,3.0,5.0,0
2016-01-24,5.25,2.0,,,,3.0,2.0,3.5,3.0,4.0,5.0,4.0,4.0,5.0,0
2016-01-27,6.59,4.0,,,,4.0,5.0,4.0,3.5,4.5,5.0,2.5,4.5,4.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-08-27,6.00,1.0,17.0,20.5,0.57,5.0,4.0,3.5,,4.0,4.0,2.0,2.0,5.0,0
2019-08-27,6.00,4.0,19.0,26.0,1.02,4.0,5.0,,3.5,4.0,4.0,5.0,4.0,3.0,0
2019-08-27,7.90,3.0,20.0,22.0,0.77,4.0,4.0,4.0,3.7,3.0,2.0,3.5,4.0,4.5,0
2019-08-27,7.90,3.0,22.5,24.5,1.07,5.0,2.0,5.0,5.0,5.0,2.0,5.0,5.0,2.0,0


In [47]:
df = df.sort_index()

In [48]:
df

Unnamed: 0_level_0,Cost,Hunger,Length,Circum,Volume,Tortilla,Temp,Meat,Fillings,Meat:filling,Uniformity,Salsa,Synergy,Wrap,great
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2011-05-16,8.00,4.0,,,,3.0,,2.0,3.0,2.0,3.0,2.0,3.0,2.0,0
2015-04-20,,4.0,,,,5.0,,5.0,5.0,5.0,4.0,5.0,5.0,5.0,1
2016-01-18,6.49,3.0,,,,3.0,5.0,3.0,3.5,4.0,4.0,4.0,4.0,4.0,0
2016-01-24,5.25,2.0,,,,3.0,2.0,3.5,3.0,4.0,5.0,4.0,4.0,5.0,0
2016-01-24,4.85,1.5,,,,3.0,2.0,2.5,3.0,4.5,4.0,3.0,3.0,5.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-08-27,6.75,3.0,19.00,25.0,0.94,3.0,4.0,4.0,3.0,4.0,4.0,3.0,3.0,5.0,0
2019-08-27,5.50,2.0,19.00,23.0,0.80,4.5,5.0,5.0,3.5,4.0,4.5,4.0,4.9,4.5,1
2019-08-27,6.00,3.0,17.50,21.5,0.64,4.0,4.0,4.5,4.0,3.0,3.0,4.5,4.0,4.5,0
2019-08-27,5.50,3.5,17.00,21.3,0.61,3.0,5.0,4.3,4.0,4.9,3.8,3.0,4.5,4.0,1


### II.split the data

In [49]:
X = df.drop(columns='great')
y = df['great']

In [None]:
## Ven
cutoff1 = '2009-01-01'
cutoff2 = '2017-07-01'
X_train = X[(X.index > cutoff1) & ( X.index < cutoff2)]
y_train = y[(X.index > cutoff1) & ( X.index < cutoff2)]

In [54]:
cutoff1 = int(0.6*len(df))
cutoff2 = int(0.8*len(df))

In [55]:
X_train, y_train = X.iloc[:cutoff1], y.iloc[:cutoff1]
X_val, y_val = X.iloc[cutoff1:cutoff2], y.iloc[cutoff1:cutoff2]
X_test, y_test = X.iloc[cutoff2:], y.iloc[cutoff2:]

### III.Establish the baseline

In [57]:
y_train.value_counts()

0    195
1     58
Name: great, dtype: int64

In [60]:
# Baseline accuracy
print(accuracy_score(y_train, [0]*len(y_train)))

0.7707509881422925


In [61]:
print(y_train.value_counts(normalize=True).max())

0.7707509881422925


### IV. Build the model

In [62]:
model_rf = make_pipeline(OrdinalEncoder(), 
                         SimpleImputer(), 
                         RandomForestClassifier())

In [63]:
model_rf.fit(X_train,y_train)

Pipeline(steps=[('ordinalencoder', OrdinalEncoder(cols=[], mapping=[])),
                ('simpleimputer', SimpleImputer()),
                ('randomforestclassifier', RandomForestClassifier())])

### V. check the metrics

In [64]:
print("train acc", accuracy_score(y_train, model_rf.predict(X_train)))
print("val acc", accuracy_score(y_val, model_rf.predict(X_val)))
print("test acc", accuracy_score(y_test, model_rf.predict(X_test)))

train acc 1.0
val acc 0.8823529411764706
test acc 0.8470588235294118


In [67]:
# imbalanced data
y_actual = [1, 0, 0, 0, 0, 0, 0, 0]
y_pred = [0, 0, 0, 0, 0, 0 ,0, 0]

In [68]:
accuracy_score(y_actual, y_pred)

0.875

In [None]:
# confusion matrix
# classification report 
# f1 score
# PR curve and PR score

In [75]:
from sklearn.metrics import roc_curve
fpr, tpr, threshold = roc_curve(y_val,model_rf.predict_proba(X_val)[:,-1])

In [76]:
pd.DataFrame({'threshold': threshold, 'fpr': fpr, 'tpr': tpr})

Unnamed: 0,threshold,fpr,tpr
0,1.95,0.0,0.0
1,0.95,0.0,0.04
2,0.85,0.0,0.2
3,0.76,0.0,0.36
4,0.65,0.0,0.52
5,0.61,0.0,0.6
6,0.59,0.0,0.64
7,0.58,0.016667,0.68
8,0.38,0.1,0.68
9,0.36,0.1,0.76
