<a href="https://colab.research.google.com/github/dilanSachi/ML_Ride_Fare_Classification/blob/master/ML_Ride_Fare_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

In [None]:
# Read the raw training and test tables from the working directory.
training_data, testing_data = (
    pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')
)

In [None]:
# Encode the textual target as integers: 'correct' -> 1, 'incorrect' -> 0.
# Series.map turns any value that is not a key of this dict into NaN.
label_codes = {'correct': 1, 'incorrect': 0}
training_data['label'] = training_data['label'].map(label_codes)

In [None]:
# Detach the target from the feature table: keep it as `training_labels`
# and remove the "label" column from `training_data`.
training_labels = training_data["label"]
del training_data["label"]

In [None]:
# Parse the pickup timestamps of both tables with pd.to_datetime.
for frame in (training_data, testing_data):
    frame['pickup_time'] = pd.to_datetime(frame['pickup_time'])

In [None]:
# Parse the drop-off timestamps of both tables with pd.to_datetime.
for frame in (training_data, testing_data):
    frame['drop_time'] = pd.to_datetime(frame['drop_time'])

In [None]:
# Trip length in seconds, computed from the two parsed timestamp columns.
# Subtracting the columns directly is vectorized; it replaces a row-wise
# DataFrame.apply that called a Python lambda once per trip.
# Rows where either timestamp is missing yield NaN.
durations_trdata = (training_data['drop_time'] - training_data['pickup_time']).dt.total_seconds()
durations_tsdata = (testing_data['drop_time'] - testing_data['pickup_time']).dt.total_seconds()

In [None]:
# Where the recorded `duration` is missing or zero, fall back to the duration
# computed from the pickup/drop timestamps.
# A boolean mask with .loc replaces the per-row Python loop, which used chained
# label indexing (frame["duration"][i]) and therefore only worked when the
# index was exactly 0..n-1. Assignment aligns on the index, so each row
# receives its own computed value.
for frame, computed_seconds in (
    (training_data, durations_trdata),
    (testing_data, durations_tsdata),
):
    needs_fill = frame['duration'].isna() | frame['duration'].eq(0)
    frame.loc[needs_fill, 'duration'] = computed_seconds[needs_fill]

In [None]:
# Remove the trip identifier and the raw timestamps from both tables, in place.
for frame in (training_data, testing_data):
    frame.drop(columns=["tripid", "pickup_time", "drop_time"], inplace=True)

In [None]:
!pip install pyproj
from pyproj import Geod

In [None]:
# Shared geodesic calculator configured for the WGS84 ellipsoid.
wgs84_geod = Geod(ellps='WGS84')


def Distance(lat1, lon1, lat2, lon2):
    """Return the distance component of ``wgs84_geod.inv`` for the given points.

    Arguments arrive latitude-first but are forwarded to ``inv`` in
    (lon1, lat1, lon2, lat2) order. ``inv`` yields three values; the first
    two are discarded and only the third (the distance) is returned.
    NOTE(review): units are presumably metres, per pyproj -- confirm.
    """
    return wgs84_geod.inv(lon1, lat1, lon2, lat2)[2]

In [None]:
# Pickup -> drop-off distance for every trip in each table.
# The column order matches Distance(lat1, lon1, lat2, lon2).
coord_columns = ("pick_lat", "pick_lon", "drop_lat", "drop_lon")

distances_tr = Distance(*(training_data[name].to_list() for name in coord_columns))
distances_ts = Distance(*(testing_data[name].to_list() for name in coord_columns))

In [None]:
# Attach the computed distances to their tables as a new "distance" column.
for frame, trip_distances in ((training_data, distances_tr), (testing_data, distances_ts)):
    frame["distance"] = trip_distances

In [None]:
# Remove the four raw coordinate columns from both tables, in place.
for frame in (training_data, testing_data):
    frame.drop(columns=["pick_lat", "pick_lon", "drop_lat", "drop_lon"], inplace=True)

In [None]:
# Names of the training columns that contain at least one null value,
# in column order. Only the training table is inspected.
has_missing = training_data.isnull().any()
cols_with_missing = has_missing[has_missing].index.tolist()

In [None]:
# Build new frames (the originals stay untouched) that carry one extra
# boolean "<col>_was_missing" column per entry of cols_with_missing,
# recording which values are about to be imputed.
training_data_plus = training_data.assign(
    **{col + '_was_missing': training_data[col].isnull() for col in cols_with_missing}
)
testing_data_plus = testing_data.assign(
    **{col + '_was_missing': testing_data[col].isnull() for col in cols_with_missing}
)

# Median imputation: medians are learned from the training frame only and
# then applied to the test frame.
my_imputer = SimpleImputer(strategy='median')

# The imputer returns bare arrays, so the column names are re-attached when
# the frames are rebuilt.
imputed_training_data_plus = pd.DataFrame(
    my_imputer.fit_transform(training_data_plus),
    columns=training_data_plus.columns,
)
imputed_testing_data_plus = pd.DataFrame(
    my_imputer.transform(testing_data_plus),
    columns=testing_data_plus.columns,
)

In [None]:
# Inserting additional features

In [None]:
# feature_1 = (duration - meter_waiting) / fare, added to both tables.
# NOTE(review): rows with fare == 0 give inf/NaN; no later visible cell cleans them.
for frame in (imputed_training_data_plus, imputed_testing_data_plus):
    frame['feature_1'] = (frame['duration'] - frame['meter_waiting']) / frame['fare']

In [None]:
# feature_2 = (fare - meter_waiting_fare) / (duration - meter_waiting), added to both tables.
# NOTE(review): rows with duration == meter_waiting give inf/NaN; no later visible cell cleans them.
for frame in (imputed_training_data_plus, imputed_testing_data_plus):
    moving_seconds = frame['duration'] - frame['meter_waiting']
    frame['feature_2'] = (frame['fare'] - frame['meter_waiting_fare']) / moving_seconds

In [None]:
# feature_3 = distance / (duration - meter_waiting), added to both tables.
# NOTE(review): rows with duration == meter_waiting give inf/NaN; no later visible cell cleans them.
for frame in (imputed_training_data_plus, imputed_testing_data_plus):
    moving_seconds = frame['duration'] - frame['meter_waiting']
    frame['feature_3'] = frame['distance'] / moving_seconds

In [None]:
# feature_4 = meter_waiting / duration, added to both tables.
# NOTE(review): rows with duration == 0 give inf/NaN; no later visible cell cleans them.
for frame in (imputed_training_data_plus, imputed_testing_data_plus):
    frame['feature_4'] = frame['meter_waiting'] / frame['duration']

In [None]:
# feature_5 = additional_fare / distance, added to both tables.
# NOTE(review): rows with distance == 0 give inf/NaN; no later visible cell cleans them.
for frame in (imputed_training_data_plus, imputed_testing_data_plus):
    frame['feature_5'] = frame['additional_fare'] / frame['distance']

In [None]:
# feature_6 = distance / fare, added to both tables.
# NOTE(review): rows with fare == 0 give inf/NaN; no later visible cell cleans them.
for frame in (imputed_training_data_plus, imputed_testing_data_plus):
    frame['feature_6'] = frame['distance'] / frame['fare']

In [None]:
# feature_7 = (fare - meter_waiting_fare) / distance, added to both tables.
# NOTE(review): rows with distance == 0 give inf/NaN; no later visible cell cleans them.
for frame in (imputed_training_data_plus, imputed_testing_data_plus):
    frame['feature_7'] = (frame['fare'] - frame['meter_waiting_fare']) / frame['distance']

In [None]:
# imputed_training_data_plus['feature_8'] = (imputed_training_data_plus['additional_fare'])/(imputed_training_data_plus['fare'])
# imputed_testing_data_plus['feature_8'] = (imputed_testing_data_plus['additional_fare'])/(imputed_testing_data_plus['fare'])

In [None]:
# imputed_training_data_plus['feature_9'] = (imputed_training_data_plus['additional_fare'])/(imputed_training_data_plus['duration'])
# imputed_testing_data_plus['feature_9'] = (imputed_testing_data_plus['additional_fare'])/(imputed_testing_data_plus['duration'])

In [None]:
# Hold out 15% of the labelled rows for validation; random_state=0 fixes the shuffle.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    imputed_training_data_plus,
    training_labels,
    test_size=0.15,
    train_size=0.85,
    random_state=0,
)

In [None]:
############################################################################ Testing

In [None]:
# from sklearn.preprocessing import StandardScaler
# from xgboost import XGBClassifier
# from sklearn.ensemble import RandomForestClassifier, VotingClassifier
# from sklearn.model_selection import cross_val_score
# from sklearn.metrics import f1_score

# def get_score_with_no_pipeline(n_estimators, l_rate, early):
#   my_model = XGBClassifier(n_estimators=n_estimators, learning_rate=l_rate)
#   my_model.fit(X_train_full, y_train, 
#              early_stopping_rounds=early, 
#              eval_set=[(X_valid_full, y_valid)],
#              verbose=False)

#   scores = my_model.predict(X_valid_full)
#   f1 = f1_score(y_valid, scores)
#   print("F1 score:", f1)
#   return f1

In [None]:
# results = []
# l_rates = [0.02, 0.05, 0.07, 0.09]
# early_stop = [10, 30, 50, 70, 100, 150, 170, 200]
# for l_rate in l_rates:
#   print("L_rate:---------------------------------------------------------------", l_rate)
#   temptemp = []
#   for early in early_stop:
#     temp = {}
#     print("early:-------------------------------------------------------------", early)
#     for i in range(350, 1450, 50):
#       print("n_est:---------------------------------------------------", i)
#       temp[i] = get_score_with_no_pipeline(i, l_rate, early)
#     temptemp.append(temp)
#   results.append(temptemp)

In [None]:
# import matplotlib.pyplot as plt
# %matplotlib inline

# for i in range(0, 8):
#   plt.plot(list(results[0][i].keys()), list(results[0][i].values()))
# plt.legend(['10', '30', '50', '70', '100', '150', '170', '200'], loc='lower right')
# plt.show()

In [None]:
#########################################################

In [None]:
from xgboost import XGBClassifier

# Gradient-boosted classifier with early stopping on the validation split.
# `early_stopping_rounds` is set on the estimator rather than passed to fit():
# the fit() argument was deprecated in XGBoost 1.6 and removed in later
# releases, where passing it raises a TypeError. Requires XGBoost >= 1.6.
# Training stops once the eval_set metric has not improved for 170 rounds,
# with at most 550 trees.
my_model = XGBClassifier(
    n_estimators=550,
    learning_rate=0.07,
    early_stopping_rounds=170,
)
my_model.fit(
    X_train_full,
    y_train,
    eval_set=[(X_valid_full, y_valid)],
    verbose=False,
)

In [None]:
# Predictions of the fitted model for the held-out validation rows.
preds = my_model.predict(X_valid_full)

In [None]:
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import f1_score

In [None]:
# Mean absolute error between the validation labels and the predictions.
# NOTE(review): with 0/1 labels and 0/1 predictions this equals the
# misclassification rate -- confirm that is the metric intended here.
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)

In [None]:
# F1 score of the predictions on the validation split.
# NOTE(review): the same split is passed as eval_set when fitting my_model,
# so this figure may be optimistic.
f1 = f1_score(y_valid, preds)
print("F1: ", f1)

In [None]:
# Predictions of the fitted model for the imputed, feature-augmented test table.
test_preds=my_model.predict(imputed_testing_data_plus)

In [None]:
# Single-column frame holding the test-set predictions under "prediction".
final_preds = pd.Series(test_preds, name='prediction').to_frame()

In [None]:
# Re-read the raw test file: `tripid` was dropped from testing_data earlier
# and is needed again below to build the output file.
test_set_new = pd.read_csv('./test.csv')

In [None]:
# Put `tripid` in front of the prediction column. The values are matched by
# position, so this relies on test.csv rows being in the same order as the
# rows that were predicted. The trailing True is `allow_duplicates`.
final_preds.insert(0,'tripid',test_set_new.tripid.to_list(),True)

In [None]:
# Write the (tripid, prediction) table to disk without the index column.
final_preds.to_csv('./final_preds.csv',index=False)