In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msgno

import warnings
# Silence every warning for the rest of the session. Note that this also hides
# pandas notices (e.g. SettingWithCopyWarning) that later cells may trigger.
warnings.filterwarnings('ignore')

# Apply seaborn's default theme to all figures drawn below.
sns.set()

#### For reference, this link provides a data dictionary describing every column:
https://www1.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf

In [None]:
# Load the raw trip records from a path relative to the notebook.
# NOTE(review): the '../input/...' layout looks like a Kaggle dataset mount — adjust when running elsewhere.
df = pd.read_csv('../input/taxi-trip-data-nyc/taxi_tripdata.csv')

In [None]:
# Preview the first rows to check that the file parsed as expected.
df.head()

In [None]:
# (rows, columns) of the freshly loaded data.
df.shape

In [None]:
# Summary statistics of the numeric columns; handy for spotting impossible values.
df.describe()

In [None]:
# Column dtypes as inferred by read_csv.
df.dtypes

##### Unnecessary columns to drop.

In [None]:
# Remove the columns that the rest of the notebook does not use.
unused_columns = ['ehail_fee', 'VendorID', 'trip_type', 'congestion_surcharge']
df = df.drop(columns=unused_columns)

##### Total amount must be positive, so I will drop the rows where it is zero or negative.

In [None]:
# Number of rows whose total amount is zero or negative.
int(df['total_amount'].le(0).sum())

In [None]:
# Keep only the trips with a strictly positive total amount.
has_positive_total = df['total_amount'].gt(0)
df = df.loc[has_positive_total]

In [None]:
# Number of remaining rows whose fare amount is zero or negative.
int(df['fare_amount'].le(0).sum())

##### Trip distance cannot be zero or negative, so I will replace such values with a typical distance for the trip's rate code.

In [None]:
# Number of rows whose trip distance is zero or negative.
int(df['trip_distance'].le(0).sum())

In [None]:
# Median trip distance per rate code.
# NOTE(review): presumably the source of the hard-coded replacement distances used in the next cell — confirm.
df.groupby('RatecodeID')['trip_distance'].median()

In [None]:
# Replace non-positive trip distances with a fixed distance per rate code
# (presumably the per-rate-code medians computed in the previous cell).
#
# The column label in `.loc[mask, 'trip_distance']` is essential: without it,
# `.loc[mask] = value` overwrites EVERY column of the matching rows
# (timestamps, RatecodeID, amounts, ...) with the scalar.
# Rows with any other or a missing rate code are left untouched.
replacement_distance = {1.0: 2.8, 2.0: 20, 3.0: 24, 4.0: 12, 5.0: 3.1}

bad_distance = df['trip_distance'] <= 0
for ratecode, distance in replacement_distance.items():
    df.loc[bad_distance & (df['RatecodeID'] == ratecode), 'trip_distance'] = distance

In [None]:
# Parse the pickup and drop-off timestamps.
# The columns are selected by name rather than by position, so the cell no
# longer depends on which columns were dropped earlier or on their order.
for i in ['lpep_pickup_datetime', 'lpep_dropoff_datetime']:
    df[i] = pd.to_datetime(df[i])

##### I will create a new column that contains the trip duration.

In [None]:
# Trip duration in minutes, rounded to two decimals.
# The `.dt` accessor converts the whole column in one vectorised call instead
# of looping over every Timedelta in Python.
timedeltas = df['lpep_dropoff_datetime'] - df['lpep_pickup_datetime']
trip_duration = timedeltas.dt.total_seconds() / 60

df['trip_duration'] = trip_duration.round(2)

##### I will drop the rows where the 'trip_duration' is one minute or less.

In [None]:
# Discard trips lasting one minute or less, then show the remaining size.
longer_than_a_minute = df['trip_duration'].gt(1)
df = df.loc[longer_than_a_minute]
df.shape

In [None]:
# Frequency of each store-and-forward flag value (missing values are not counted by default).
df['store_and_fwd_flag'].value_counts()

In [None]:
# Fill missing store-and-forward flags with 'N'.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `df[col]` is chained assignment, which operates on a temporary object and
# leaves `df` unchanged under pandas copy-on-write.
df['store_and_fwd_flag'] = df['store_and_fwd_flag'].fillna('N')

In [None]:
# Relative frequency of each payment type among the non-missing values.
df['payment_type'].value_counts(normalize= True)

In [None]:
# 95th percentile of total_amount within each payment type.
df.groupby('payment_type')['total_amount'].quantile(0.95)

##### I will assume that if 'total_amount' is more than $50, a credit card (1.0) was used; otherwise cash (2.0).

In [None]:
# Impute missing payment types from the size of the bill:
# totals above 50 get code 1.0, totals of 50 or less get code 2.0.
payment_missing = df['payment_type'].isna()
df.loc[payment_missing & df['total_amount'].gt(50), 'payment_type'] = 1.0
df.loc[payment_missing & df['total_amount'].le(50), 'payment_type'] = 2.0

In [None]:
# Number of trips per rate code (missing values are not counted by default).
df['RatecodeID'].value_counts()

In [None]:
# 95th percentile of total_amount within each rate code.
# NOTE(review): presumably the basis for the amount thresholds used in the next cell — confirm.
df.groupby('RatecodeID')['total_amount'].quantile(0.95)

In [None]:
# Impute missing rate codes from the total amount:
#   >= 184 -> 4.0, [138, 184) -> 3.0, [79, 138) -> 2.0, [0, 79) -> 1.0
total = df['total_amount']
ratecode_missing = df['RatecodeID'].isna()

df.loc[ratecode_missing & total.ge(184), 'RatecodeID'] = 4.0
for lower, upper, code in ((138, 184, 3.0), (79, 138, 2.0), (0, 79, 1.0)):
    df.loc[ratecode_missing & total.ge(lower) & total.lt(upper), 'RatecodeID'] = code

In [None]:
# Relative frequency of each passenger count among the non-missing values.
df['passenger_count'].value_counts(normalize= True)

In [None]:
# Passenger counts of 0, 7 and 32 are replaced with a single passenger.
odd_counts = df['passenger_count'].isin([0, 7, 32])
df.loc[odd_counts, 'passenger_count'] = 1.0

In [None]:
# Missing passenger counts default to a single passenger.
df['passenger_count'] = df['passenger_count'].fillna(1.0)

In [None]:
# Count of missing values left in each column after the imputations above.
df.isna().sum()

In [None]:
# Preview the cleaned frame, including the derived trip_duration column.
df.head()

In [None]:
# (rows, columns) after cleaning.
df.shape

In [None]:
# Correlation heatmap of the numeric columns.
# Non-numeric columns (timestamps, text flags) are excluded explicitly:
# pandas >= 2.0 no longer drops them silently, so DataFrame.corr() raises on
# a frame that still contains them.
plt.figure(figsize= (18, 10))
sns.heatmap(df.select_dtypes(include='number').corr(), annot= True);

##### Which is the busiest day ?

In [None]:
# Add the weekday name of each pickup as a new column.
df = df.assign(day_of_week=df['lpep_pickup_datetime'].dt.day_name())

In [None]:
# Horizontal bar chart of the number of trips per weekday.
fig, ax = plt.subplots(figsize=(18, 7))
sns.countplot(y='day_of_week', data=df, ax=ax)
ax.set_ylabel('');

##### Which is the busiest hour ?

In [None]:
# Bar chart of the number of trips per pickup hour (0-23).
pickup_hour = df['lpep_pickup_datetime'].dt.hour

fig, ax = plt.subplots(figsize=(18, 7))
sns.countplot(x=pickup_hour, data=df, color='goldenrod', ax=ax)
ax.set_ylabel('')
ax.set_xlabel('Hour of Day');

In [None]:
from sklearn.ensemble import RandomForestRegressor
# The metric helpers used by this and the following model cells are not
# imported in any visible cell, so bring them into scope here.
from sklearn.metrics import mean_squared_error, r2_score

# NOTE(review): X_train, X_test, y_train and y_test are not defined in any
# visible cell — a feature-selection / train-test-split cell must run first.

# Random forest with 100 trees and a fixed seed for reproducibility.
rf = RandomForestRegressor(n_estimators= 100, random_state= 42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

# Score the predictions against y_test.
print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
print('Coefficient of determination: %.2f' % r2_score(y_test, y_pred))

In [None]:
from xgboost import XGBRegressor

# XGBoost regressor: 100 boosting rounds, fixed seed.
xgb = XGBRegressor(random_state=42, n_estimators=100)
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

# Print both metrics for the predictions on X_test.
for template, metric in (
    ('Mean squared error: %.2f', mean_squared_error),
    ('Coefficient of determination: %.2f', r2_score),
):
    print(template % metric(y_test, y_pred))


In [None]:
from lightgbm import LGBMRegressor

# LightGBM regressor: 100 boosting rounds, fixed seed.
lgbm = LGBMRegressor(random_state=42, n_estimators=100)
lgbm.fit(X_train, y_train)
y_pred = lgbm.predict(X_test)

# Print both metrics for the predictions on X_test.
for template, metric in (
    ('Mean squared error: %.2f', mean_squared_error),
    ('Coefficient of determination: %.2f', r2_score),
):
    print(template % metric(y_test, y_pred))

In [None]:
from catboost import CatBoostRegressor

# CatBoost regressor: 100 boosting rounds, fixed seed.
cat = CatBoostRegressor(random_state=42, n_estimators=100)
cat.fit(X_train, y_train)
y_pred = cat.predict(X_test)

# Print both metrics for the predictions on X_test.
for template, metric in (
    ('Mean squared error: %.2f', mean_squared_error),
    ('Coefficient of determination: %.2f', r2_score),
):
    print(template % metric(y_test, y_pred))

In [None]:

from sklearn.svm import SVR

# Support vector regressor with the library's default hyper-parameters.
svr = SVR()
svr.fit(X_train, y_train)
y_pred = svr.predict(X_test)

# Print both metrics for the predictions on X_test.
for template, metric in (
    ('Mean squared error: %.2f', mean_squared_error),
    ('Coefficient of determination: %.2f', r2_score),
):
    print(template % metric(y_test, y_pred))
