In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import random
from prettytable import PrettyTable

In [2]:
#Original DF
df=pd.read_csv("DataSet/yellow_tripdata_2009-01.csv",nrows = 1000000)

#Drop unwanted columns
df_nyc=df.drop(columns=['Rate_Code','mta_tax','store_and_forward'])
df_nyc = df_nyc.dropna()

# Set display option
pd.set_option('display.float_format', lambda x: '%.3f' % x)

#Delete outliers
#1.outliers.Passenger count
df_nyc=df_nyc[df_nyc['Passenger_Count'].between(1,7)]

#For latitude and longitude columns, choose the values between range 2.5% and 97.5% percentile values 
#for col in ['Start_Lat', 'Start_Lon', 'End_Lat', 'End_Lon']:
#    print(f'{col.capitalize():17}: 2.5% = {round(np.percentile(df_nyc[col], 2.5), 2):5} \t 97.5% = {round(np.percentile(df_nyc[col], 97.5), 2)}')
    
df_nyc = df_nyc.loc[df_nyc['Start_Lat'].between(40, 41)]
df_nyc = df_nyc.loc[df_nyc['Start_Lon'].between(-75, -72)]
df_nyc = df_nyc.loc[df_nyc['End_Lat'].between(40, 41)]
df_nyc = df_nyc.loc[df_nyc['End_Lon'].between(-75, -72)]


#Delete outliers, calculate differnce between pickup and dropof lattititude/ longitude differnce and drop outliers
df_nyc['Lat_Diff']=(df_nyc['Start_Lat'] - df_nyc['End_Lat']).abs()
df_nyc['Lon_Diff']=(df_nyc['Start_Lon'] - df_nyc['End_Lon']).abs()

df_nyc = df_nyc[(df_nyc['Lat_Diff'] != 0) & (df_nyc['Lon_Diff'] != 0)]




In [3]:
#Get the types of Payment for rides
df_nyc['Payment_Type'].unique()

#Feature Engineering for Payment Column 
df_nyc['Payment_Type'] = df_nyc['Payment_Type'].str.upper()
df_nyc = df_nyc[(df_nyc['Payment_Type'] != 'NO CHARGE')]
df_nyc = df_nyc[(df_nyc['Payment_Type'] != 'DISPUTE')]

# Add, Day Of week , Hour , WeekDay/WeekEnd Column
df_nyc['hour'] = pd.to_datetime(df.Trip_Pickup_DateTime).dt.hour
df_nyc['dayofweek'] = pd.to_datetime(df.Trip_Pickup_DateTime).dt.dayofweek

In [4]:
#Encode the categorical String Data
from sklearn.preprocessing import LabelEncoder
df_nyc['vendor_name_encoded']=LabelEncoder().fit_transform(df_nyc['vendor_name']) 
df_nyc['Payment_Type_encoded']=LabelEncoder().fit_transform(df_nyc['Payment_Type']) 


In [5]:
from sklearn.cluster import MiniBatchKMeans
coords = np.vstack((df_nyc[['Start_Lat', 'Start_Lon']].values,
                df_nyc[['End_Lat', 'End_Lon']].values))
#df_ = df_nyc.copy()
sample_ind = np.random.permutation(len(coords))[:500000]
kmeans = MiniBatchKMeans(n_clusters=100, batch_size=10000).fit(coords[sample_ind])
df_nyc.loc[:, 'pickup_cluster'] = kmeans.predict(df_nyc[['Start_Lat', 'Start_Lon']])
df_nyc.loc[:, 'dropoff_cluster'] = kmeans.predict(df_nyc[['End_Lat', 'End_Lon']])



In [6]:
#For Classification types of Prolem
from sklearn.metrics import accuracy_score, confusion_matrix, precision_score, f1_score, recall_score

#For Rgression types of Prolem
from sklearn.metrics import explained_variance_score, mean_squared_error, r2_score

from prettytable import PrettyTable

def print_stats_metrices_regression(algorithm_name, y_test, y_pred):
    x = PrettyTable()
    x.field_names = ["MSE", "Variance Score", "R2 Score"]
    
    rmse=mean_squared_error(y_test, y_pred)
    variance_score=explained_variance_score(y_test, y_pred)
    r_score=r2_score(y_test, y_pred)
    
    x.add_row([rmse,variance_score,r_score]);
    print(x)
    
def print_stats_metrices_classification(algorithm_name, y_test, y_pred):
    x = PrettyTable()
    x.field_names = ["Algorithm", "Accuracy Score", "Precision Score", "Recall Score","F1 Score"]
    
    acc_score=accuracy_score(y_test, y_pred)
    prec_score=precision_score(y_true=y_test,y_pred=predictions, average='micro')
    rec_score=recall_score(y_true=y_test,y_pred=predictions, average='micro')
    f_score=recall_score(y_true=y_test,y_pred=predictions, average='micro')
    x.add_row([algorithm_name,acc_score,prec_score ,rec_score, f_score]);
    print(x)   
    

In [7]:
#Machine Learning Part
import sklearn
from sklearn.model_selection import train_test_split
df_nyc_train=df_nyc.drop(columns=['vendor_name','Payment_Type','Trip_Pickup_DateTime','Trip_Dropoff_DateTime','surcharge','Tip_Amt','Total_Amt','Fare_Amt','Tolls_Amt'])
x_train, x_test, y_train, y_test= train_test_split(df_nyc_train,df_nyc['Fare_Amt'],test_size=0.25)

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

lenReg1=LinearRegression()
lenReg1.fit(x_train,y_train)
predictions=lenReg1.predict(x_test)
print_stats_metrices_regression('LinearRegression',y_test,predictions)


+--------------------+--------------------+--------------------+
|        MSE         |   Variance Score   |      R2 Score      |
+--------------------+--------------------+--------------------+
| 7.3779672167505925 | 0.8495446081091033 | 0.8495438484477712 |
+--------------------+--------------------+--------------------+


In [8]:
x_train

Unnamed: 0,Passenger_Count,Trip_Distance,Start_Lon,Start_Lat,End_Lon,End_Lat,Lat_Diff,Lon_Diff,hour,dayofweek,vendor_name_encoded,Payment_Type_encoded,pickup_cluster,dropoff_cluster
166773,1,1.610,-73.962,40.776,-73.978,40.760,0.016,0.015,9,0,2,0,96,74
794788,5,8.140,-73.979,40.763,-73.927,40.854,0.091,0.052,20,4,2,0,74,19
983661,1,3.060,-73.988,40.748,-74.008,40.712,0.036,0.021,1,4,2,0,20,12
725089,1,2.280,-74.009,40.717,-73.983,40.723,0.006,0.027,20,6,2,1,23,32
80175,1,0.200,-73.966,40.763,-73.969,40.761,0.002,0.003,13,5,0,0,26,65
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
114265,1,1.020,-74.002,40.730,-73.990,40.734,0.004,0.012,18,0,2,1,30,9
38282,1,0.600,-74.003,40.739,-73.995,40.740,0.001,0.008,19,3,2,0,67,84
519260,1,3.500,-73.955,40.766,-73.992,40.749,0.016,0.037,17,3,0,0,59,62
549954,1,0.710,-73.986,40.723,-73.978,40.729,0.006,0.008,23,4,2,0,32,53
