<a href="https://colab.research.google.com/github/charlie9526/TaxiFare/blob/master/majority_voting_ensemble_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1 . Mounting

In [None]:
# Mount Google Drive inside the Colab runtime so the project files are reachable.
from google.colab import drive 
drive.mount('/content/drive')
# Make the project folder the working directory; later cells read and write
# files using relative paths.
%cd /content/drive/My\ Drive/Colab\ Notebooks/FAIR_TAXI

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive/Colab Notebooks/FAIR_TAXI


# 2 . Imports

In [None]:
import pandas as pd
from datetime import datetime
import random
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns; sns.set(style="ticks", color_codes=True)              
import numpy as np                 
from scipy.stats import norm   
import warnings
warnings.filterwarnings('ignore')   
%matplotlib inline                  
from math import sin, cos, sqrt, atan2, radians
import math
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.cluster import KMeans

from sklearn import metrics
from sklearn.metrics import roc_curve, roc_auc_score,fbeta_score,f1_score

from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder,LabelBinarizer
from sklearn.preprocessing import OneHotEncoder

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier

from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn.svm import SVC,LinearSVC
from sklearn.decomposition import PCA
from sklearn.neural_network import MLPClassifier
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from xgboost import XGBClassifier
from sklearn.cluster import KMeans



# 3 . Read files

In [None]:
# Load the training and test tables from the current working directory.
train_df, test_df = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv')
)

# 4 . Functions

In [None]:

# set distance column
class set_distance(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends a ``distance`` column.

    The column is computed row by row with ``get_distance`` from the
    ``pick_lat``/``pick_lon``/``drop_lat``/``drop_lon`` columns.
    """

    def fit(self, X, y=None):
      """Nothing is learned from the data; returns ``self`` unchanged."""
      return self

    def transform(self, X):
      """Return a copy of ``X`` with the extra ``distance`` column.

      ``X`` must support ``copy()`` and row-wise ``apply(..., axis=1)``
      (i.e. a pandas DataFrame).
      """
      # Work on a copy so the caller's DataFrame is not mutated as a side
      # effect of calling transform().
      X = X.copy()
      X['distance'] = X.apply(get_distance, axis=1)
      return X
# Row-wise helper (for DataFrame.apply with axis=1): distance between the drop-off and pickup coordinates
def get_distance(x):
  """Great-circle (haversine) distance between drop-off and pickup points.

  ``x`` is a mapping/row with ``drop_lat``, ``drop_lon``, ``pick_lat`` and
  ``pick_lon`` given in degrees (anything ``float()`` accepts). The result is
  scaled by a sphere radius of 6373.0 (an approximate Earth radius in
  kilometres) and rounded to two decimals.
  """
  sphere_radius = 6373.0
  drop_lat, drop_lon, pick_lat, pick_lon = (
      radians(float(x[key]))
      for key in ('drop_lat', 'drop_lon', 'pick_lat', 'pick_lon')
  )

  delta_lon = pick_lon - drop_lon
  delta_lat = pick_lat - drop_lat

  # Haversine term followed by the central angle between the two points.
  haversine = sin(delta_lat / 2)**2 + cos(drop_lat) * cos(pick_lat) * sin(delta_lon / 2)**2
  central_angle = 2 * atan2(sqrt(haversine), sqrt(1 - haversine))

  return round(sphere_radius * central_angle, 2)


def encoding_label(label):
  """Translate the textual label into its binary class code.

  'correct' maps to 1 and 'incorrect' maps to 0. Any other value prints
  "invalid label" and yields None.
  """
  for text, code in (('correct', 1), ('incorrect', 0)):
    if label == text:
      return code
  print ("invalid label")
  return None

def get_hour_value(x,date_time_column_name):
  """Extract the hour (0-23) from a "%m/%d/%Y %H:%M" timestamp string.

  ``x`` is a row/mapping and ``date_time_column_name`` is the key that holds
  the timestamp text. Returns None when the value is missing.
  """
  raw_value = x[date_time_column_name]
  # pandas represents missing cells as NaN/NaT rather than None, so a plain
  # `== None` comparison lets them through to strptime, which then raises
  # TypeError. pd.isna covers None, NaN and NaT.
  if pd.isna(raw_value):
    return None
  return datetime.strptime(raw_value,"%m/%d/%Y %H:%M").hour

def get_minute_value(x,date_time_column_name):
  """Extract the minute (0-59) from a "%m/%d/%Y %H:%M" timestamp string.

  ``x`` is a row/mapping and ``date_time_column_name`` is the key that holds
  the timestamp text. Returns None when the value is missing.
  """
  raw_value = x[date_time_column_name]
  # pandas represents missing cells as NaN/NaT rather than None, so a plain
  # `== None` comparison lets them through to strptime, which then raises
  # TypeError. pd.isna covers None, NaN and NaT.
  if pd.isna(raw_value):
    return None
  return datetime.strptime(raw_value,"%m/%d/%Y %H:%M").minute


def get_fare_per_hr(x):
  """Fare divided by the trip duration.

  Returns ``fare / duration`` for a positive duration, 0 for a zero
  duration, and None otherwise (negative or NaN duration).
  NOTE(review): despite the name, ``duration`` is used as-is with no unit
  conversion - confirm the intended unit.
  """
  duration = x["duration"]
  if duration == 0:
    return 0
  if duration > 0:
    return float(x['fare']) / duration
  return None


def get_addfare_per_hr(x):
  """Additional fare divided by the trip distance.

  Returns ``additional_fare / distance`` for a positive distance, 0 for a
  zero distance, and None otherwise (negative or NaN distance).
  NOTE(review): the name says "per hr" but the divisor is ``distance``.
  """
  distance = x["distance"]
  if distance == 0:
    return 0
  if distance > 0:
    return float(x['additional_fare']) / distance
  return None


def get_dis_per_fare(x):
  """Distance covered per unit of fare.

  Returns ``distance / fare`` when the fare is positive; every other fare
  value (zero, negative, NaN) yields 0.
  """
  fare = x['fare']
  return float(x['distance']) / fare if fare > 0 else 0


def get_waiting_rate(x):
  """Waiting fare charged per unit of metered waiting.

  Returns None when ``meter_waiting`` is NaN. Returns
  ``meter_waiting_fare / meter_waiting`` when the waiting value is non-zero
  and the waiting fare is non-negative. Returns 0 in every other case.
  """
  waiting = x['meter_waiting']
  if math.isnan(waiting):
    return None
  if waiting != 0 and x['meter_waiting_fare'] >= 0:
    return float(x['meter_waiting_fare']) / waiting
  return 0


def get_velocity(x):
  """Distance divided by the non-waiting part of the trip duration.

  Returns None if ``duration``, ``distance`` or ``meter_waiting`` is NaN,
  0 if ``duration - meter_waiting`` is exactly zero, and
  ``distance / (duration - meter_waiting)`` otherwise.
  """
  required_keys = ('duration', 'distance', 'meter_waiting')
  if any(math.isnan(x[key]) for key in required_keys):
    return None
  moving_time = x['duration'] - x['meter_waiting']
  if moving_time == 0:
    return 0
  return float(x['distance']) / moving_time


def get_mw_ratio(x):
  """Share of the trip duration spent in metered waiting.

  Returns ``meter_waiting / duration``, or 0 when the duration is zero.
  """
  duration = x['duration']
  return 0 if duration == 0 else float(x['meter_waiting']) / duration


def get_mwfare_ratio(x):
  """Share of the total fare that comes from metered waiting.

  Returns ``meter_waiting_fare / fare``, or 0 when the fare is zero.
  """
  total_fare = x['fare']
  if total_fare != 0:
    return float(x['meter_waiting_fare']) / total_fare
  return 0



def invget_fare_per_hr(x):
  """Fare divided by duration, gated on the fare being positive.

  Returns ``fare / duration`` when the fare is positive, 0 when the fare is
  zero, and None otherwise (negative or NaN fare). A zero duration with a
  positive fare returns 0 instead of dividing by zero, matching how
  ``get_fare_per_hr`` treats a zero duration.
  NOTE(review): the "inv" prefix suggests the inverse ratio
  (``duration / fare``) may have been intended - confirm before relying on
  this feature.
  """
  fare = x["fare"]
  if fare > 0:
    duration = x["duration"]
    # The branch is selected by the fare, but the divisor is the duration,
    # so the divisor needs its own zero guard.
    if duration == 0:
      return 0
    return float(fare) / duration
  if fare == 0:
    return 0
  return None

################### Helper for the fare_per_min column
################### (row-wise function, applied with DataFrame.apply(axis=1))

def get_fare_per_min(x):
  """Fare net of waiting and additional charges, per non-waiting minute.

  The non-waiting time is ``(duration - meter_waiting) / 60.0``
  (presumably converting seconds to minutes - confirm the unit of
  ``duration``). When the fare is non-negative and that time is positive,
  returns ``(fare - meter_waiting_fare - additional_fare) / minutes``.
  Otherwise returns 0 if ``duration`` is zero, else None.
  """
  moving_minutes = (x["duration"] - x['meter_waiting']) / 60.0
  if x["fare"] >= 0 and moving_minutes > 0:
    net_fare = x['fare'] - x['meter_waiting_fare'] - x['additional_fare']
    return net_fare / moving_minutes
  return 0 if x["duration"] == 0 else None

########### Helper for the fare_per_distance column
########### (row-wise function, applied with DataFrame.apply(axis=1))
def get_fare_per_distance(x):
  """Fare per unit of distance; 0 when the distance is zero.

  When ``meter_waiting_fare`` is positive, the waiting fare and the
  additional fare are subtracted from the fare before dividing. Otherwise
  the full fare is divided by the distance.
  NOTE(review): ``additional_fare`` is only subtracted in the first case -
  confirm this asymmetry is intended.
  """
  distance = x['distance']
  if distance == 0:
    return 0
  if x['meter_waiting_fare'] > 0:
    chargeable = x['fare'] - x['meter_waiting_fare'] - x['additional_fare']
  else:
    chargeable = x['fare']
  return float(chargeable) / distance


# 5 . PreProcess

### Preliminary cleaning steps (currently disabled)


In [None]:
# train_df = train_df.drop_duplicates()
# train_df = train_df.dropna()
################# ugly indexes
# bads = [203757140,213112837,193653017,209365576,190167541]
# train_df = train_df[~(train_df['tripid'].isin(bads))]

### Create feature_df and label

In [None]:
# Keep the test trip ids aside; they are needed later to label the predictions.
trip_id  = test_df['tripid']

# Target vector: textual labels encoded to 1/0.
label_df = train_df['label'].apply(encoding_label).values

# Model inputs: everything except the target and the trip identifier.
features_df = train_df.drop(columns=['label', 'tripid'], errors='ignore')
test_df = test_df.drop(columns=['tripid'], errors='ignore')

### New columns

In [None]:
# (output column, row-wise function) pairs, listed in dependency order:
# 'distance' has to exist before 'fare_per_distance' is derived from it.
derived_columns = [
    ('pickup_hour', lambda row: get_hour_value(row, 'pickup_time')),
    ('pickup_minute', lambda row: get_minute_value(row, 'pickup_time')),
    ('drop_hour', lambda row: get_hour_value(row, 'drop_time')),
    ('drop_minute', lambda row: get_minute_value(row, 'drop_time')),
    ('distance', get_distance),
    ('fare_per_distance', get_fare_per_distance),
    ('fare_per_min', get_fare_per_min),
]

# Add every derived column to the training features and to the test set.
for column_name, row_function in derived_columns:
  for frame in (features_df, test_df):
    frame[column_name] = frame.apply(row_function, axis=1)




### Drop columns (keep only the selected features)

In [None]:
# Columns fed to the model; every other column is discarded.
feature_list = [
    'pickup_hour', 'fare_per_distance', 'fare_per_min',
    'meter_waiting', 'meter_waiting_fare', 'additional_fare',
    'duration', 'distance', 'fare',
]

# Apply the same selection (and column order) to the train and test frames.
features_df, test_df = features_df[feature_list], test_df[feature_list]


# 6 . Transformers


In [None]:
# Every non-object column is treated as numeric.
numeric_cols = features_df.columns[features_df.dtypes != "object"].values

# Numeric branch: standardise, then fill remaining missing values with the
# column mean.
# NOTE(review): scaling runs before imputation here; the more common order
# is impute-then-scale - confirm this is intentional.
numeric_preprocessing_steps = Pipeline(steps=[
    ('standard_scaler', StandardScaler()),
    ('simple_imputer', SimpleImputer(strategy='mean')),
])

# Only the numeric branch is kept; any other column is dropped.
preprocessor = ColumnTransformer(
    transformers=[("numeric", numeric_preprocessing_steps, numeric_cols)],
    remainder="drop",
)


# 7 . Cross validation and result

In [None]:
# Local import: cross_validate evaluates several metrics from a single fit
# per fold, so the pipeline is trained 10 times instead of 40.
from sklearn.model_selection import cross_validate

pca = PCA()

# Base learners of the hard-voting (majority vote) ensemble.
xgb = XGBClassifier(
    learning_rate=0.21,
)
rf = RandomForestClassifier(
    random_state=1
)
lr = LogisticRegression(
    random_state=1
)

vc = VotingClassifier(estimators=[
                             ('lr', lr),
                             ('rf', rf),
                             ('xgb', xgb)],
                 voting='hard'
)

# Preprocessing -> PCA -> voting ensemble, cross-validated and fitted as one unit.
full_pipeline = Pipeline([
                          ("preprocessor", preprocessor),
                          ("pca",pca),
                          ("esti", vc)
                          ])

kfold = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)

# (scorer name, printed label, print format) - formats are kept exactly as
# they were so the printed report is unchanged.
metric_reports = [
    ('f1', "F1", '%s: %f (%f) '),
    ('precision', "precision", '%s: %f (%f)'),
    ('recall', "recall", '%s: %f (%f)  '),
    ('accuracy', "accuracy", '%s: %f (%f) '),
]

cv_scores = cross_validate(
    full_pipeline, features_df, label_df, cv=kfold,
    scoring=[scorer_name for scorer_name, _, _ in metric_reports],
)

# NOTE(review): the mean is printed as a percentage while the standard
# deviation is printed as a raw fraction.
for scorer_name, report_label, report_format in metric_reports:
  cv_results = cv_scores['test_' + scorer_name]
  print(report_format % (report_label, cv_results.mean()*100, cv_results.std()))
print("\n")


F1: 97.143253 (0.002765) 
precision: 95.379596 (0.003196)
recall: 98.973865 (0.003114)  
accuracy: 94.748529 (0.005083) 




In [None]:
# F1: 97.619646 (0.003619) for rs = 0.000000 
# precision: 96.418998 (0.004029) for rs = 0.000000 
# recall: 98.851296 (0.004098) for rs = 0.000000 
# accuracy: 95.651002 (0.006617) for rs = 0.000000 

# F1: 97.644779 (0.003251) for rs = 0.000000 
# precision: 96.468063 (0.004293) for rs = 0.000000 
# recall: 98.851280 (0.003037) for rs = 0.000000 
# accuracy: 95.697551 (0.005975) for rs = 0.000000 


# 8 . Train model

In [None]:
# Fit the complete pipeline (preprocessing, PCA, voting ensemble) on all
# training rows; as the cell's last expression, the fitted pipeline is displayed.
full_pipeline.fit(features_df,label_df)
# pd.DataFrame({'feature':feature_list,'importance':full_pipeline.feature_importances_}).sort_values(by=['importance'])

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('numeric',
                                                  Pipeline(memory=None,
                                                           steps=[('standard_scaler',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True)),
                                                                  ('simple_imputer',
                                                                   SimpleImputer(add_indicator=False,
                                                                 

# 9 . Predict and save result

In [None]:
# Predict the class of every test trip with the fitted pipeline.
predicted = full_pipeline.predict(test_df)

# Pair each prediction with its trip id and write the result, indexed by
# trip id, to a CSV file.
data_dict = dict(tripid=trip_id, prediction=predicted)
pd_test_result = (
    pd.DataFrame(data=data_dict, columns=['tripid', 'prediction'])
    .set_index('tripid')
)
pd_test_result.to_csv("final_ensemble.csv", index=True)

In [None]:
# Sanity check: how many test trips were predicted as each class.
pd_test_result['prediction'].value_counts()

1    8212
0     364
Name: prediction, dtype: int64