In [1]:
# A. Import Basic Libraries

from collections import OrderedDict

# Libraries for analysis
import pandas as pd
import numpy as np
from sklearn import svm
import csv
import copy
import datetime
import time

# Libraries for visuals
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(font_scale=1.2)

In [2]:
# B. Import Data

#2010-2012
survey_trips_full = pd.read_csv('desktop/Caltrans/California_Household_Travel_Survey_2010_2012/caltrans_full_survey/survey_activity.csv', encoding = "ISO-8859-1", low_memory=False)
survey_households_full = pd.read_csv('desktop/Caltrans/California_Household_Travel_Survey_2010_2012/caltrans_full_survey/survey_households.csv', encoding = "ISO-8859-1", low_memory=False)
survey_person_full = pd.read_csv('desktop/Caltrans/California_Household_Travel_Survey_2010_2012/caltrans_full_survey/survey_person.csv', encoding = "ISO-8859-1", low_memory=False)
survey_place_full = pd.read_csv('desktop/Caltrans/California_Household_Travel_Survey_2010_2012/caltrans_full_survey/survey_place.csv', encoding = "ISO-8859-1", low_memory=False)
survey_vehicles_full = pd.read_csv('desktop/Caltrans/California_Household_Travel_Survey_2010_2012/caltrans_full_survey/survey_vehicles.csv', encoding = "ISO-8859-1", low_memory=False)




In [3]:
# C. Get Data in Right Form

# get only the columns needed from each data set
survey_trips_working = survey_trips_full[['sampno', 'perno', 'purpose', 'arr_time', 'dep_time', 'travel_date', 'county_id', 
                                          'zipcode']]
working = survey_trips_working.copy(deep=True)
working_ = working[0:10]

survey_households_working = survey_households_full[['sampno', 'home_county_id', 'home_zipcode',
                                                    'vehicle_count', 'persons_count', 'worker_count', 'student_count',
                                                    'income', 'residence_type', 'home_own']]

survey_person_working = survey_person_full[['sampno', 'perno', 'vehno', 'gender', 'education', 'race1',
                                           'empl_status', 'empl_industry', 'employment', 'empl_occupation',
                                            'student', 'school_zipcode']]

survey_vehicles_working = survey_vehicles_full[['sampno', 'vehno', 'veh_type', 'model_year', 'veh_make', 'body_type', 'fuel_type1', 'purchase_type', 'ownership']]



In [4]:
def series_day_to_num(df):
    
    '''Function that takes a DataFrame with dateTime object
    and builds a Series in the same order with the day as
    the corresponding number
    0-Sun, 1-Mon, 2-Tues, 3-Wed, 4-Thur, 5-Fri, 6-Sat'''
    
    dic = OrderedDict()
    key=0
    
    for i, row in df.iterrows():
        if row[0] == "Sunday":
            day_num = 0
        elif row[0] == "Monday":
            day_num = 1
        elif row[0] == "Tuesday":
            day_num = 2
        elif row[0] == "Wednesday":
            day_num = 3
        elif row[0] == "Thursday":
            day_num = 4
        elif row[0] == "Friday":
            day_num = 5
        elif row[0] == "Saturday":
            day_num = 6
            
        dic[key] = day_num   
        key += 1
        
    new_series = pd.Series(dic)
    return new_series
    

In [5]:
#Join all the datasets into one
working1 = pd.merge(survey_trips_working, survey_person_working, on=['sampno', 'perno'])
working2 = pd.merge(working1, survey_vehicles_working, on=['sampno', 'vehno'])
working3 = pd.merge(working2, survey_households_working, on=['sampno'])

working3



Unnamed: 0,sampno,perno,purpose,arr_time,dep_time,travel_date,county_id,zipcode,vehno,gender,...,ownership,home_county_id,home_zipcode,vehicle_count,persons_count,worker_count,student_count,income,residence_type,home_own
0,1050668,1,4,8:00:00,8:30:00,5/4/12,111.0,91360,1.0,1,...,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0
1,1050668,1,9,8:45:00,14:55:00,5/4/12,111.0,91360,1.0,1,...,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0
2,1050668,1,11,11:30:00,12:00:00,5/4/12,111.0,91360,1.0,1,...,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0
3,1050668,1,1,15:00:00,2:59:00,5/4/12,111.0,91360,1.0,1,...,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0
4,1050668,1,1,5:30:00,7:30:00,5/4/12,111.0,91360,1.0,1,...,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0
5,1050668,1,2,7:30:00,8:00:00,5/4/12,111.0,91360,1.0,1,...,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0
6,1050668,2,1,3:00:00,7:30:00,5/4/12,111.0,91360,2.0,2,...,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0
7,1050668,2,2,7:30:00,8:00:00,5/4/12,111.0,91360,2.0,2,...,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0
8,1050668,2,4,8:00:00,8:30:00,5/4/12,111.0,91360,2.0,2,...,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0
9,1055989,2,1,3:00:00,6:30:00,5/24/12,77.0,95330,2.0,1,...,1,77.0,95330.0,2.0,2.0,0.0,0.0,7.0,1.0,1.0


In [6]:
#make a copy as a checkpoint
working_trips = working3.copy(deep=True)
working_trips

#change travel_data column to day of the week
working_trips.travel_date = pd.to_datetime(working_trips.travel_date, format='%m/%d/%y').dt.weekday_name

#get hour and minute of arrival and departure and make new colums
working_trips['arrival_hour'] = pd.to_datetime(working_trips.arr_time, format='%H:%M:%S').dt.hour
working_trips['departure_hour'] = pd.to_datetime(working_trips.dep_time, format='%H:%M:%S').dt.hour

working_trips.head(n=10)


Unnamed: 0,sampno,perno,purpose,arr_time,dep_time,travel_date,county_id,zipcode,vehno,gender,...,home_zipcode,vehicle_count,persons_count,worker_count,student_count,income,residence_type,home_own,arrival_hour,departure_hour
0,1050668,1,4,8:00:00,8:30:00,Friday,111.0,91360,1.0,1,...,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,8,8
1,1050668,1,9,8:45:00,14:55:00,Friday,111.0,91360,1.0,1,...,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,8,14
2,1050668,1,11,11:30:00,12:00:00,Friday,111.0,91360,1.0,1,...,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,11,12
3,1050668,1,1,15:00:00,2:59:00,Friday,111.0,91360,1.0,1,...,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,15,2
4,1050668,1,1,5:30:00,7:30:00,Friday,111.0,91360,1.0,1,...,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,5,7
5,1050668,1,2,7:30:00,8:00:00,Friday,111.0,91360,1.0,1,...,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,7,8
6,1050668,2,1,3:00:00,7:30:00,Friday,111.0,91360,2.0,2,...,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,3,7
7,1050668,2,2,7:30:00,8:00:00,Friday,111.0,91360,2.0,2,...,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,7,8
8,1050668,2,4,8:00:00,8:30:00,Friday,111.0,91360,2.0,2,...,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,8,8
9,1055989,2,1,3:00:00,6:30:00,Thursday,77.0,95330,2.0,1,...,95330.0,2.0,2.0,0.0,0.0,7.0,1.0,1.0,3,6


In [7]:
#now change travel_date column from string to corresponding int
Series_travel_day = working_trips[['travel_date']]
num_travel_day = series_day_to_num(Series_travel_day)
num_travel_day.head(n=10)

0    5
1    5
2    5
3    5
4    5
5    5
6    5
7    5
8    5
9    4
dtype: int64

In [8]:
#make another copy as a checkpoint
working_trips_f = working_trips.copy(deep=True)

#turn num_travel_day (Series into DataFrame for easy joining)
DF_num_travel_day = num_travel_day.to_frame()
DF_num_travel_day.columns = ['travel_day_of_week']

#add column to the data frame
df_concat = pd.concat([working_trips_f, DF_num_travel_day], axis=1)
df_concat 

#drop old columns that aren't dateTime objects
df_concat.drop(columns=['dep_time', 'arr_time', 'travel_date', 'sampno', 'vehno', 'perno'], axis=1,  inplace=True)

#so i can see all columns in the DataFrame without truncation
pd.set_option('display.max_columns', 500)
df_concat

Unnamed: 0,purpose,county_id,zipcode,gender,education,race1,empl_status,empl_industry,employment,empl_occupation,student,school_zipcode,veh_type,model_year,veh_make,body_type,fuel_type1,purchase_type,ownership,home_county_id,home_zipcode,vehicle_count,persons_count,worker_count,student_count,income,residence_type,home_own,arrival_hour,departure_hour,travel_day_of_week
0,4,111.0,91360,1,5,1.0,,31.0,1.0,11.0,3,,2,2007,27,1,1.0,1,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,8,8,5
1,9,111.0,91360,1,5,1.0,,31.0,1.0,11.0,3,,2,2007,27,1,1.0,1,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,8,14,5
2,11,111.0,91360,1,5,1.0,,31.0,1.0,11.0,3,,2,2007,27,1,1.0,1,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,11,12,5
3,1,111.0,91360,1,5,1.0,,31.0,1.0,11.0,3,,2,2007,27,1,1.0,1,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,15,2,5
4,1,111.0,91360,1,5,1.0,,31.0,1.0,11.0,3,,2,2007,27,1,1.0,1,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,5,7,5
5,2,111.0,91360,1,5,1.0,,31.0,1.0,11.0,3,,2,2007,27,1,1.0,1,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,7,8,5
6,1,111.0,91360,2,6,1.0,1.0,,2.0,,3,,2,2001,13,2,1.0,1,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,3,7,5
7,2,111.0,91360,2,6,1.0,1.0,,2.0,,3,,2,2001,13,2,1.0,1,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,7,8,5
8,4,111.0,91360,2,6,1.0,1.0,,2.0,,3,,2,2001,13,2,1.0,1,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,8,8,5
9,1,77.0,95330,1,1,97.0,1.0,,2.0,,3,,2,2001,35,3,1.0,1,1,77.0,95330.0,2.0,2.0,0.0,0.0,7.0,1.0,1.0,3,6,4


In [9]:
data = df_concat.dropna(axis="columns")
data

Unnamed: 0,purpose,gender,education,race1,student,veh_type,model_year,veh_make,body_type,fuel_type1,purchase_type,ownership,home_county_id,home_zipcode,vehicle_count,persons_count,worker_count,student_count,income,residence_type,home_own,arrival_hour,departure_hour,travel_day_of_week
0,4,1,5,1.0,3,2,2007,27,1,1.0,1,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,8,8,5
1,9,1,5,1.0,3,2,2007,27,1,1.0,1,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,8,14,5
2,11,1,5,1.0,3,2,2007,27,1,1.0,1,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,11,12,5
3,1,1,5,1.0,3,2,2007,27,1,1.0,1,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,15,2,5
4,1,1,5,1.0,3,2,2007,27,1,1.0,1,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,5,7,5
5,2,1,5,1.0,3,2,2007,27,1,1.0,1,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,7,8,5
6,1,2,6,1.0,3,2,2001,13,2,1.0,1,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,3,7,5
7,2,2,6,1.0,3,2,2001,13,2,1.0,1,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,7,8,5
8,4,2,6,1.0,3,2,2001,13,2,1.0,1,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,8,8,5
9,1,1,1,97.0,3,2,2001,35,3,1.0,1,1,77.0,95330.0,2.0,2.0,0.0,0.0,7.0,1.0,1.0,3,6,4


In [10]:
#Check if all the types are numbers
#df_concat.dtypes
data.dtypes

#have to change columns that are objects to ints

purpose                 int64
gender                  int64
education               int64
race1                 float64
student                 int64
veh_type                int64
model_year              int64
veh_make                int64
body_type               int64
fuel_type1            float64
purchase_type           int64
ownership               int64
home_county_id        float64
home_zipcode          float64
vehicle_count         float64
persons_count         float64
worker_count          float64
student_count         float64
income                float64
residence_type        float64
home_own              float64
arrival_hour            int64
departure_hour          int64
travel_day_of_week      int64
dtype: object

In [11]:
#Break up data into X (data) and y (target)

#make a final copy of the data
trips = data.copy(deep=True)

#get the targets (y)
y = trips['purpose']
y.head()


0     4
1     9
2    11
3     1
4     1
Name: purpose, dtype: int64

In [12]:
#make one with less features so less likely to overfit
simple_trips = trips[['gender', 'education','income',
                    'arrival_hour','travel_day_of_week']]
simple_trips

Unnamed: 0,gender,education,income,arrival_hour,travel_day_of_week
0,1,5,10.0,8,5
1,1,5,10.0,8,5
2,1,5,10.0,11,5
3,1,5,10.0,15,5
4,1,5,10.0,5,5
5,1,5,10.0,7,5
6,2,6,10.0,3,5
7,2,6,10.0,7,5
8,2,6,10.0,8,5
9,1,1,7.0,3,4


In [13]:
# Data (X)

# Have to drop purpose to get X

# all features
trips_minus_purpose = trips.drop(columns=['purpose'])
X = trips_minus_purpose.copy(deep=True)


#simple features
#simple_trips_minus_purpose = simple_trips.drop(columns=['purpose'])
X_simple = simple_trips.copy(deep=True)

#test that X and Y are the same length
print(X.shape)
print(y.shape)

print(X_simple.shape)
print(y.shape)

(430803, 23)
(430803,)
(430803, 5)
(430803,)


In [14]:
# D. Fit the Model (Using Train/Test Split)

# D.a ~~~~~~~~~~~~~~ Train/Test split ~~~~~~~~~~~~~~~

# Step 1: Split X and y into training and test **
from sklearn.model_selection import train_test_split

#on all features
X_train, X_test, y_train, y_test = train_test_split(X , y,
                                                    test_size = 0.3, random_state = 4)

X_train_simple, X_test_simple, y_train_simple, y_test_simple = train_test_split(X_simple , 
                                                                                y, test_size = 0.3, random_state = 4)


In [15]:
# Make sure training data is of the same length
print(X_train.shape)
print(y_train.shape)
print(X_train_simple.shape)
print(y_train_simple.shape)


# Make sure training data is of the same length
print(X_test.shape)
print(y_test.shape)
print(X_test_simple.shape)
print(y_test_simple.shape)


(301562, 23)
(301562,)
(301562, 5)
(301562,)
(129241, 23)
(129241,)
(129241, 5)
(129241,)


In [16]:
#Build Mini Data to use for testing

# on all the features

X_train_small = X_train[:1000]
y_train_small = y_train[:1000]
X_test_small = X_test[:250]
y_test_small = y_test[:250]

X_train_med = X_train[:5000]
y_train_med = y_train[:5000]
X_test_med = X_test[:1250]
y_test_med = y_test[:1250]

X_train_large = X_train[:10000]
y_train_large = y_train[:10000]
X_test_large = X_test[:2500]
y_test_large = y_test[:2500]


# new stuff

# youtube 1 Ensemble Methods


# Gradient Tree Boosting

In [17]:
from sklearn.ensemble import GradientBoostingClassifier

GradientBoostingClassifier?

# small

In [18]:
GTB_s = GradientBoostingClassifier(n_estimators=100)

start_time = time.time()

GTB_s.fit(X_train_small ,y_train_small)

y_pred_small = GTB_s.predict(X_test_small)

from sklearn import metrics

print("classification accuracy for logreg model: ", 
      metrics.accuracy_score(y_test_small, y_pred_small))

print("--- %s seconds ---" % (time.time() - start_time))

classification accuracy for logreg model:  0.372
--- 6.555329084396362 seconds ---


# medium

n_estimators = 100

In [19]:
GTB_m = GradientBoostingClassifier(n_estimators=100)

start_time = time.time()

GTB_m.fit(X_train_med ,y_train_med)

y_pred_med = GTB_m.predict(X_test_med)

from sklearn import metrics

print("classification accuracy for logreg model: ", 
      metrics.accuracy_score(y_test_med, y_pred_med))

print("--- %s seconds ---" % (time.time() - start_time))

classification accuracy for logreg model:  0.3856
--- 33.807536125183105 seconds ---


# Large

n_estimators = 100

In [20]:
GTB_l = GradientBoostingClassifier(n_estimators=100)

start_time = time.time()

GTB_l.fit(X_train_large ,y_train_large)

y_pred_large = GTB_l.predict(X_test_large)

from sklearn import metrics

print("classification accuracy: ", 
      metrics.accuracy_score(y_test_large, y_pred_large))

print("--- %s seconds ---" % (time.time() - start_time))

classification accuracy:  0.4244
--- 66.927316904068 seconds ---


# Full data

n_estimators = 100

In [21]:
#m = GradientBoostingClassifier(n_estimators=100)

#start_time = time.time()

#m.fit(X_train ,y_train)

#y_pred = m.predict(X_test)

#from sklearn import metrics

#print("classification accuracy: ", 
#      metrics.accuracy_score(y_test, y_pred))

#print("--- %s seconds ---" % (time.time() - start_time))

#classification accuracy:  0.4508631161937775
#--- 4036.9263598918915 seconds ---

# Best features in Gradient Tree Boosting

    1. Departure Hour (0.417)
    2. Arrival Hour (0.375)
    3. Travel_day of week (0.052)
    4. student (0.0448)
    5. worker_count (0.022)


# Voting Classifier

In [22]:
from sklearn.ensemble import VotingClassifier

VotingClassifier?


In [23]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

vc = VotingClassifier(
    estimators=[('lr', LogisticRegression()), 
                ('rf', RandomForestClassifier()), 
                ('gnb', GaussianNB())], 
    voting='hard')

# small

In [24]:
start_time = time.time()

vc_s = VotingClassifier(
    estimators=[('lr', LogisticRegression()), 
                ('rf', RandomForestClassifier()), 
                ('gnb', GaussianNB())], 
    voting='hard')

vc_s.fit(X_train_small ,y_train_small)

y_pred_small = vc_s.predict(X_test_small)

from sklearn import metrics

print("classification accuracy: ", 
      metrics.accuracy_score(y_test_small, y_pred_small))

print("--- %s seconds ---" % (time.time() - start_time))

classification accuracy:  0.332
--- 0.46448206901550293 seconds ---


  if diff:


# medium

In [25]:
start_time = time.time()

vc_m = VotingClassifier(
    estimators=[('lr', LogisticRegression()), 
                ('rf', RandomForestClassifier()), 
                ('gnb', GaussianNB())], 
    voting='hard')

vc_m.fit(X_train_med ,y_train_med)

y_pred_med = vc_m.predict(X_test_med)

from sklearn import metrics

print("classification accuracy: ", 
      metrics.accuracy_score(y_test_med, y_pred_med))

print("--- %s seconds ---" % (time.time() - start_time))

classification accuracy:  0.3448
--- 2.470761775970459 seconds ---


  if diff:


# Large

In [26]:
start_time = time.time()

vc_l = VotingClassifier(
    estimators=[('lr', LogisticRegression()), 
                ('rf', RandomForestClassifier()), 
                ('gnb', GaussianNB())], 
    voting='hard')

vc_l.fit(X_train_large ,y_train_large)

y_pred_large = vc_l.predict(X_test_large)

from sklearn import metrics

print("classification accuracy: ", 
      metrics.accuracy_score(y_test_large, y_pred_large))

print("--- %s seconds ---" % (time.time() - start_time))

classification accuracy:  0.3592
--- 6.644634962081909 seconds ---


  if diff:


# Full VC

In [None]:
start_time = time.time()

vc_full = VotingClassifier(
    estimators=[('lr', LogisticRegression()), 
                ('rf', RandomForestClassifier()), 
                ('gnb', GaussianNB())], 
    voting='hard')

vc_full.fit(X_train ,y_train)

y_pred_full = vc_full.predict(X_test_)

from sklearn import metrics

print("classification accuracy: ", 
      metrics.accuracy_score(y_test, y_pred_full))

print("--- %s seconds ---" % (time.time() - start_time))

# Full Data

In [27]:
#start_time = time.time()

#m.fit(X_train ,y_train)

#y_pred = m.predict(X_test)

#from sklearn import metrics

#print("classification accuracy: ", 
#      metrics.accuracy_score(y_test, y_pred))

#print("--- %s seconds ---" % (time.time() - start_time))

#classification accuracy:  0.45088632864183964
#--- 3957.926488876343 seconds ---

# Best Features Voting Classifier

    1. Departure Hour (0.248)
    2. Arrival Hour (0.224)
    3. Home Zipcode (0.0885)
    4. Travel Day of Week (0.0586)
    5. Student (0.0388)
    6. Worker Count (0.0351)

# NEWWWW

# Feature Selection

at this point

Gradient Tree Boost (n_estimators = 100) &
Voting Classifier 

perform at about the same classification accuracy (0.450 on all the trips)

# Univariate Feature Selection

In [28]:
from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2


# chisq and k=5

Features Selected: 
    - race1, model_year, departure hour, arrival hour, student_count

# chisq and f_classif

features selected:
    - student, departure hour, arrival hour, student_count, persons_count

# Recursive Feature Elimination


In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV


# with Random Forest

In [30]:
start_time = time.time()

RFE_l = RFECV(RandomForestClassifier(), scoring='accuracy')

RFE_l.fit(X_train_large, y_train_large)

y_pred_large = RFE_l.predict(X_test_large)

from sklearn import metrics

print("classification accuracy: ", 
      metrics.accuracy_score(y_test_large, y_pred_large))

print("--- %s seconds ---" % (time.time() - start_time))





classification accuracy:  0.386
--- 17.180453777313232 seconds ---


# with Gradient Boosting Tree Boosting

# small data

In [31]:
#start_time = time.time()

#RFE_gbc = RFECV(GradientBoostingClassifier(n_estimators=100), scoring='accuracy')

#RFE_gbc.fit(X_train_small, y_train_small)

#y_pred_small = RFE_gbc.predict(X_test_small)
#print("classification accuracy: ", 
#      metrics.accuracy_score(y_test_small, y_pred_small))

#cc = SFM_gbc_l.transform(X_train_small)
#print(cc.shape)

#print("--- %s seconds ---" % (time.time() - start_time))

#classification accuracy:  0.3614457831325301
#(1000, 7)
#--- 448.52055191993713 seconds ---

# Full Data

# Select From Model

In [32]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

SelectFromModel?

# Gradient Tree Boosting

In [33]:
#start_time = time.time()
#SFM_gbc_s = SelectFromModel(GradientBoostingClassifier(n_estimators=100))

#SFM_gbc_s.fit(X_train_small, y_train_small)

#a = SFM_gbc_s.transform(X_train_small)
#print(a.shape)
#print("--- %s seconds ---" % (time.time() - start_time))

#(1000, 8)
#--- 6.920062065124512 seconds ---

In [34]:
X.shape

(430803, 23)

In [35]:
#start_time = time.time()
#SFM_gbc_full = SelectFromModel(GradientBoostingClassifier(n_estimators=100))

#SFM_gbc_full.fit(X_train, y_train)

#trans_SFM_gbc = SFM_gbc_full.transform(X_train)
#print(trans_SFM_gbc.shape)
#print("--- %s seconds ---" % (time.time() - start_time))
#
#(301562, 4)
#--- 4374.2707459926605 seconds ---

In [36]:
import numpy
#numpy.set_printoptions(threshold=numpy.nan)
#trans_SFM_gbc

In [37]:
X_train.head()

Unnamed: 0,gender,education,race1,student,veh_type,model_year,veh_make,body_type,fuel_type1,purchase_type,ownership,home_county_id,home_zipcode,vehicle_count,persons_count,worker_count,student_count,income,residence_type,home_own,arrival_hour,departure_hour,travel_day_of_week
15552,1,6,1.0,3,2,1987,50,2,1.0,2,1,37.0,90265.0,4.0,5.0,4.0,4.0,8.0,1.0,2.0,3,2,5
356364,1,6,1.0,3,2,9998,24,1,1.0,2,1,53.0,93923.0,2.0,2.0,0.0,0.0,5.0,1.0,1.0,11,11,6
347623,2,4,2.0,3,2,2007,17,1,1.0,2,1,37.0,93534.0,1.0,8.0,1.0,3.0,98.0,1.0,2.0,22,2,5
160663,1,2,1.0,3,2,2003,16,3,1.0,2,1,65.0,92220.0,4.0,3.0,0.0,0.0,2.0,1.0,1.0,3,5,3
371240,1,3,1.0,3,2,2000,35,1,1.0,2,1,85.0,95123.0,2.0,3.0,1.0,1.0,99.0,1.0,1.0,9,9,2


# Features selected by SelectFromModel using (Gradient Tree Boost)

 - home_zipcode
 - arrival_hour
 - departure_hour
 - travel_day_of_week

In [38]:
from sklearn.linear_model import LassoCV

start_time = time.time()

r=SelectFromModel(LassoCV())
r.fit(X_train, y_train)
rr = r.transform(X_train)

print("--- %s seconds ---" % (time.time() - start_time))

--- 4.282165765762329 seconds ---


In [39]:
rr.shape

(301562, 10)

In [40]:
X_train.head()

Unnamed: 0,gender,education,race1,student,veh_type,model_year,veh_make,body_type,fuel_type1,purchase_type,ownership,home_county_id,home_zipcode,vehicle_count,persons_count,worker_count,student_count,income,residence_type,home_own,arrival_hour,departure_hour,travel_day_of_week
15552,1,6,1.0,3,2,1987,50,2,1.0,2,1,37.0,90265.0,4.0,5.0,4.0,4.0,8.0,1.0,2.0,3,2,5
356364,1,6,1.0,3,2,9998,24,1,1.0,2,1,53.0,93923.0,2.0,2.0,0.0,0.0,5.0,1.0,1.0,11,11,6
347623,2,4,2.0,3,2,2007,17,1,1.0,2,1,37.0,93534.0,1.0,8.0,1.0,3.0,98.0,1.0,2.0,22,2,5
160663,1,2,1.0,3,2,2003,16,3,1.0,2,1,65.0,92220.0,4.0,3.0,0.0,0.0,2.0,1.0,1.0,3,5,3
371240,1,3,1.0,3,2,2000,35,1,1.0,2,1,85.0,95123.0,2.0,3.0,1.0,1.0,99.0,1.0,1.0,9,9,2


# Features Selected by Lasso

 - education, race1, model_year, home_county_id, home_zipcode, worker_count, income, arrival_hour, departure_hour, travel_day_of_week

# Cross Validation

At this point the best two models are
 - Gradient Boosting Classifier
 - Random Forest 
 
so test both in full

In [41]:
from sklearn.model_selection import cross_val_score

# Gradient Boosting Classifier

# small

In [42]:
clf = GradientBoostingClassifier(n_estimators=100)
scores = cross_val_score(clf, X_train_small, y_train_small, cv=10)
scores



array([0.32478632, 0.36936937, 0.3490566 , 0.32692308, 0.39393939,
       0.36082474, 0.41489362, 0.37634409, 0.40659341, 0.38636364])

In [43]:
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std()*2))

Accuracy: 0.37 (+/- 0.06)


# medium

In [44]:
#start_time = time.time()

#clf_med = GradientBoostingClassifier(n_estimators=100)
#scores_med = cross_val_score(clf_med, X_train_med, y_train_med, cv=10)
#print("Accuracy: %0.2f (+/- %0.2f)" % (scores_med.mean(), scores_med.std()*2))

#print("--- %s seconds ---" % (time.time() - start_time))

#Accuracy: 0.40 (+/- 0.03)
#--- 453.61482214927673 seconds ---

# Full Data

In [45]:
#start_time = time.time()

#clf_full = GradientBoostingClassifier(n_estimators=100)
#scores_full = cross_val_score(clf_full, X_train, y_train, cv=10)
#print("Accuracy: %0.2f (+/- %0.2f)" % (scores_full.mean(), scores_full.std()*2))

#print("--- %s seconds ---" % (time.time() - start_time))

#accuracy =  0.44993105271152445 +/- 0.003899528858285363


# Random Forest

# small

In [46]:
from sklearn.ensemble import RandomForestClassifier

start_time = time.time()

ranFor_small = RandomForestClassifier(n_estimators=20, oob_score=True)
scores_ranFor = cross_val_score(ranFor_small, X_train_small, y_train_small, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_ranFor.mean(), scores_ranFor.std()*2))

print("--- %s seconds ---" % (time.time() - start_time))

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])


Accuracy: 0.37 (+/- 0.10)
--- 0.8047959804534912 seconds ---


# medium

In [47]:
start_time = time.time()

ranFor_med = RandomForestClassifier(n_estimators=20, oob_score=True)
scores_ranFor_med = cross_val_score(ranFor_med, X_train_med, y_train_med, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_ranFor_med.mean(), scores_ranFor_med.std()*2))

print("--- %s seconds ---" % (time.time() - start_time))

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])


Accuracy: 0.38 (+/- 0.04)
--- 3.0234439373016357 seconds ---


# large

In [48]:
start_time = time.time()

ranFor_l = RandomForestClassifier(n_estimators=20, oob_score=True)
scores_ranFor_l = cross_val_score(ranFor_l, X_train_large, y_train_large, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_ranFor_l.mean(), scores_ranFor_l.std()*2))

print("--- %s seconds ---" % (time.time() - start_time))

#Accuracy: 0.38 (+/- 0.02)
#--- 8.715481042861938 seconds ---

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])


Accuracy: 0.38 (+/- 0.01)
--- 6.150175333023071 seconds ---


# full ****2

In [49]:
start_time = time.time()

ranFor_full = RandomForestClassifier(n_estimators=20, oob_score=True)
scores_ranFor_full = cross_val_score(ranFor_full, X, y, cv=10)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_ranFor_full.mean(), scores_ranFor_full.std()*2))

print("--- %s seconds ---" % (time.time() - start_time))

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  warn("Some inputs do not have OOB scores. "
  

Accuracy: 0.40 (+/- 0.02)
--- 408.6780619621277 seconds ---


Random Forest with Cross Val
 - always around 0.38-0.42
 
Gradient Boosting with

# Voting Classifier

always around 0.36

# Hyperparameter Tuning

# with Gradient Boosting


First get data with only the Best Features Picked by Feature Selection

In [50]:
trips.head()

Unnamed: 0,purpose,gender,education,race1,student,veh_type,model_year,veh_make,body_type,fuel_type1,purchase_type,ownership,home_county_id,home_zipcode,vehicle_count,persons_count,worker_count,student_count,income,residence_type,home_own,arrival_hour,departure_hour,travel_day_of_week
0,4,1,5,1.0,3,2,2007,27,1,1.0,1,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,8,8,5
1,9,1,5,1.0,3,2,2007,27,1,1.0,1,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,8,14,5
2,11,1,5,1.0,3,2,2007,27,1,1.0,1,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,11,12,5
3,1,1,5,1.0,3,2,2007,27,1,1.0,1,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,15,2,5
4,1,1,5,1.0,3,2,2007,27,1,1.0,1,1,111.0,91360.0,2.0,2.0,1.0,0.0,10.0,1.0,1.0,5,7,5


In [51]:
tuned_features = trips[['home_zipcode', 'arrival_hour', 'departure_hour', 'travel_day_of_week']]
tuned_features.head()

Unnamed: 0,home_zipcode,arrival_hour,departure_hour,travel_day_of_week
0,91360.0,8,8,5
1,91360.0,8,14,5
2,91360.0,11,12,5
3,91360.0,15,2,5
4,91360.0,5,7,5


In [52]:
y.head()

0     4
1     9
2    11
3     1
4     1
Name: purpose, dtype: int64

In [56]:

X_train_tuned, X_test_tuned, y_train_tuned, y_test_tuned = train_test_split(
    tuned_features, y, test_size = 0.3, random_state = 4)


X_train_tuned_small = X_train_tuned[:1000]
y_train_tuned_small = y_train_tuned[:1000]
X_test_tuned_small = X_test_tuned[:250]
y_test_tuned_small = y_test_tuned[:250]

X_train_tuned_med = X_train_tuned[:5000]
y_train_tuned_med = y_train_tuned[:5000]
X_test_tuned_med = X_test_tuned[:1250]
y_test_tuned_med = y_test_tuned[:1250]

X_train_tuned_large = X_train_tuned[:10000]
y_train_tuned_large = y_train_tuned[:10000]
X_test_tuned_large = X_test_tuned[:2500]
y_test_tuned_large = y_test_tuned[:2500]




In [57]:
from sklearn.model_selection import GridSearchCV
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.svm import SVC

# 3 instances on small

In [59]:
start_time = time.time()


tuned_params = [{'n_estimators': [100,200],
                 'max_depth': [2,3, 5]}]

grid_searched_small = GridSearchCV(GradientBoostingClassifier(),
                                   tuned_params, cv=3)

grid_searched_small.fit(X_train_tuned_small, y_train_tuned_small)

print("--- %s seconds ---" % (time.time() - start_time))

grid_searched_small.best_params_


#previous best when i fit of t_tra



--- 101.63563179969788 seconds ---


{'max_depth': 2, 'n_estimators': 100}

In [64]:
for p, s in zip(grid_searched_small.cv_results_['params'], grid_searched_small.cv_results_['mean_test_score']):
    print (p, s)

{'max_depth': 2, 'n_estimators': 100} 0.36
{'max_depth': 2, 'n_estimators': 200} 0.348
{'max_depth': 3, 'n_estimators': 100} 0.354
{'max_depth': 3, 'n_estimators': 200} 0.353
{'max_depth': 5, 'n_estimators': 100} 0.342
{'max_depth': 5, 'n_estimators': 200} 0.335


# 6 instances on medium


In [65]:
start_time = time.time()


tuned_params = [{'n_estimators': [50 ,100,200],
                 'max_depth': [2,3, 5]}]

grid_searched_med = GridSearchCV(GradientBoostingClassifier(),
                                   tuned_params, cv=3)

grid_searched_med.fit(X_train_tuned_med, y_train_tuned_med)

print("--- %s seconds ---" % (time.time() - start_time))
for p, s in zip(grid_searched_small.cv_results_['params'], grid_searched_small.cv_results_['mean_test_score']):
    print (p, s)
print()

grid_searched_med.best_params_


#previous best when i fit of X_train and y_train and not X_trained_tuned...
# {'max_depth': 3, 'n_estimators': 200} 



--- 522.3546690940857 seconds ---
{'max_depth': 2, 'n_estimators': 100} 0.36
{'max_depth': 2, 'n_estimators': 200} 0.348
{'max_depth': 3, 'n_estimators': 100} 0.354
{'max_depth': 3, 'n_estimators': 200} 0.353
{'max_depth': 5, 'n_estimators': 100} 0.342
{'max_depth': 5, 'n_estimators': 200} 0.335



{'max_depth': 2, 'n_estimators': 50}

# Large 5 

100,200, 150
3, 5

In [66]:
start_time = time.time()


tuned_params = [{'n_estimators': [50 ,100],
                 'max_depth': [2, 3]}]

grid_searched_l = GridSearchCV(GradientBoostingClassifier(),
                                   tuned_params, cv=3)

grid_searched_l.fit(X_train_tuned_large, y_train_tuned_large)

print("--- %s seconds ---" % (time.time() - start_time))
for p, s in zip(grid_searched_l.cv_results_['params'], grid_searched_l.cv_results_['mean_test_score']):
    print (p, s)
print()

grid_searched_l.best_params_
   



--- 312.3906409740448 seconds ---
{'max_depth': 2, 'n_estimators': 50} 0.412
{'max_depth': 2, 'n_estimators': 100} 0.4143
{'max_depth': 3, 'n_estimators': 50} 0.4069
{'max_depth': 3, 'n_estimators': 100} 0.3982



{'max_depth': 2, 'n_estimators': 100}

# Large, 7, CV=5

In [62]:
#start_time = time.time()


#tuned_params_l7 = [{'n_estimators': [100, 25, 50, 75],
#                 'max_depth': [3, 2, 4]}]

#grid_searched_l7 = GridSearchCV(GradientBoostingClassifier(),
#                                   tuned_params_l7, cv=5)

#grid_searched_l7.fit(X_train_large, y_train_large)

#print("--- %s seconds ---" % (time.time() - start_time))

#print(grid_searched_l7.best_params_)
#print()
#for p, s in zip(grid_searched_l7.cv_results_['params'], grid_searched_l7.cv_results_['mean_test_score']):
#    print (p, s)
    
#--- 2528.1611819267273 seconds ---
#{'max_depth': 2, 'n_estimators': 100}

# Large, 9, CV 7

In [63]:
#start_time = time.time()


#tuned_params_l9 = [{'n_estimators': [100, 125, 150, 200],
#                 'max_depth': [1, 2, 3, 4, 5]}]

#grid_searched_l9 = GridSearchCV(GradientBoostingClassifier(),
                                   tuned_params_l9, cv=7)

#grid_searched_l9.fit(X_train_large, y_train_large)

#print("--- %s seconds ---" % (time.time() - start_time))

#print(grid_searched_l9.best_params_)
#print()
#for p, s in zip(grid_searched_l9.cv_results_['params'], grid_searched_l9.cv_results_['mean_test_score']):
#    print (p, s)
    
#--- 19914.24623990059 seconds ---
#{'max_depth': 2, 'n_estimators': 100}

IndentationError: unexpected indent (<ipython-input-63-b59bd7e1f609>, line 8)

# Feature selection full

# ** Overnight

In [143]:
start_time = time.time()


tuned_params_full = [{'n_estimators': [100, 200],
                 'max_depth': [2,3]}]

grid_searched_full = GridSearchCV(GradientBoostingClassifier(),
                                   tuned_params_full, cv=5)

grid_searched_full.fit(X_train, y_train)

print("--- %s seconds ---" % (time.time() - start_time))

print(grid_searched_full.best_params_)
print()
for p, s in zip(grid_searched_full.cv_results_['params'], grid_searched_full.cv_results_['mean_test_score']):
    print (p, s)
    
#--- 19914.24623990059 seconds ---
#{'max_depth': 2, 'n_estimators': 100}

KeyboardInterrupt: 

best was max_depth=2 and n_estimators =100

# Confusion Matrix

In [67]:
from sklearn.metrics import confusion_matrix

In [70]:
m = GradientBoostingClassifier(n_estimators=100, max_depth=2)

start_time = time.time()

m.fit(X_train_tuned ,y_train_tuned)

y_pred_tuned = m.predict(X_test_tuned)

from sklearn import metrics

print("classification accuracy: ", 
      metrics.accuracy_score(y_test_tuned, y_pred_tuned))

print()

print("--- %s seconds ---" % (time.time() - start_time))

#classification accuracy:  0.4508631161937775
#--- 4036.9263598918915 seconds ---

classification accuracy:  0.43118669771976387

--- 1508.2533848285675 seconds ---


In [86]:
from sklearn.dummy import DummyClassifier
cd = DummyClassifier(strategy='most_frequent',random_state=0)
cd.fit(X_train_tuned, y_train_tuned)

cd.score(X_test_tuned, y_test_tuned)

0.3279532037047067

# improved prediction by 31%

In [77]:
start_time = time.time()

returned = confusion_matrix(y_test_tuned, y_pred_tuned).ravel()
if len(returned) == 4:
    tn, fp, fn, tp = returned
    print("yay")


print("--- %s seconds ---" % (time.time() - start_time))
print()
returned


--- 0.15036582946777344 seconds ---



array([34696,  3491,     0, ...,     0,     0,     0])

In [88]:
print ('True:', y_test_tuned.values[0:20])
print ('Pred:', y_pred_tuned[0:20])

True: [ 7  2  1 31 22  8  9  1  5  7 34  9  7  1 22 22 35 36  7  9]
Pred: [ 1  2  1  2 27  1  2  1  2  2  1  2  1  2  2  2  1 33  1  1]


In [92]:
import numpy
numpy.set_printoptions(threshold=numpy.nan)
print(metrics.confusion_matrix(y_test_tuned, y_pred_tuned))

[[34696  3491     0     0     0     0     1    62  1345     0   170     0
      0     0     0     0     0     0     0     0    64   369     0     0
      0     0  1947     0     1     0     6     0   149     0     0     8
     75     0     1     0]
 [ 2114  8431     0     0     0     0     0    11   227     0   420     0
      0     0     0     0     0     0     0     0    40   369     0     0
      0     0  1919     0     0     0    20     0    76     0     0     4
     29     0     0     0]
 [  404   177     0     0     0     0     1     7    72     0     4     0
      0     0     0     0     0     0     0     0     0    13     0     0
      0     0    73     0     0     0     2     0    13     0     0     4
     29     0     0     0]
 [  187   611     0     0     0     0     0     0    14     0    14     0
      0     0     0     0     0     0     0     0     6    23     0     0
      0     0   149     0     1     0     1     0     9     0     0     1
      0     0     0     0]
 [  

In [120]:
from collections import defaultdict
d = defaultdict()

for i in y_pred_tuned[:]:
    if i not in d.keys():
        d[i] = 1
    else:
        d[i] += 1

d

defaultdict(None,
            {1: 54189,
             2: 30360,
             27: 22291,
             33: 1023,
             11: 3040,
             9: 11493,
             22: 5208,
             37: 462,
             8: 206,
             21: 705,
             36: 100,
             29: 18,
             31: 137,
             3: 2,
             25: 2,
             7: 3,
             39: 2})

In [128]:
dic = defaultdict()

for i in y_test_tuned[:]:
    if i not in dic.keys():
        dic[i] = 1
    else:
        dic[i] += 1

dic


defaultdict(None,
            {7: 4835,
             2: 13660,
             1: 42385,
             31: 4300,
             22: 5890,
             8: 6083,
             9: 8541,
             5: 601,
             34: 2035,
             35: 1131,
             36: 1642,
             27: 7874,
             30: 1656,
             21: 3153,
             39: 2602,
             32: 1337,
             23: 1128,
             25: 2728,
             6: 1171,
             33: 1537,
             38: 981,
             37: 4608,
             11: 1411,
             4: 1016,
             29: 2199,
             17: 572,
             3: 799,
             28: 761,
             14: 95,
             24: 322,
             16: 277,
             26: 1283,
             13: 85,
             15: 155,
             20: 94,
             19: 48,
             12: 32,
             10: 87,
             99: 46,
             18: 81})

In [123]:
confusion_matrix = pd.crosstab(y_test_tuned, y_pred_tuned, rownames=['True'], colnames=['Predicted'], margins=True)
confusion_matrix

Predicted,1,2,3,7,8,9,11,21,22,25,27,29,31,33,36,37,39,All
True,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
1,34696,3491,0,1,62,1345,170,64,369,0,1947,1,6,149,8,75,1,42385
2,2114,8431,0,0,11,227,420,40,369,0,1919,0,20,76,4,29,0,13660
3,404,177,0,1,7,72,4,0,13,0,73,0,2,13,4,29,0,799
4,187,611,0,0,0,14,14,6,23,0,149,1,1,9,1,0,0,1016
5,314,135,0,0,2,55,7,1,7,0,69,0,2,5,0,4,0,601
6,412,232,0,0,1,405,6,1,12,0,91,0,1,9,0,1,0,1171
7,1817,1574,0,0,28,439,67,14,102,0,709,1,6,52,7,19,0,4835
8,3686,1053,0,0,62,553,41,6,101,1,479,0,4,56,12,29,0,6083
9,1604,509,1,0,14,5888,47,6,49,0,363,0,1,38,1,20,0,8541
10,18,9,0,0,0,37,2,0,1,0,20,0,0,0,0,0,0,87


In [None]:
# Tp = 129,241 trips * .43118669 classification accuracy = 55,727