In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Load the raw training data; 'train.csv' is resolved relative to the working directory.
df = pd.read_csv('train.csv')

In [None]:
# Parse both timestamp columns into datetime64. Each column is parsed from its
# own values: parsing 'pickup_datetime' into 'dropoff_datetime' would silently
# replace every dropoff timestamp with the pickup timestamp.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime'])
# Encode the store-and-forward flag as an integer: 1 where the value is "Y",
# 0 for every other value (including missing ones).
df['store_and_fwd_flag'] = df['store_and_fwd_flag'].eq("Y").astype(int)

In [None]:
# Work on a copy: the cells below filter and extend `dfs`, leaving `df` as loaded and parsed above.
dfs = df.copy()

## Simple Feature Engineering

In [None]:
# Show the column index of the working frame before any filtering or feature creation.
dfs.columns

In [None]:
# Outlier removal: keep rows with trip_duration below 2*3600 (two hours if the
# column is in seconds -- TODO confirm unit) and with pickup_latitude < 45 and
# pickup_longitude > -90.
# The explicit .copy() makes `dfs` an independent frame, so the column
# assignments in later cells do not trigger pandas' SettingWithCopyWarning.
dfs = (
    dfs
    .query('trip_duration < 2*3600')
    .query('pickup_latitude < 45 and pickup_longitude > - 90')
    .copy()
)

In [None]:
# Calendar features taken from the pickup timestamp. Each datetime component is
# stored in a column named 'pu_<component>', appended in the order listed.
dfs = dfs.assign(**{
    'pu_' + component: getattr(dfs['pickup_datetime'].dt, component)
    for component in ('day', 'hour', 'month', 'year', 'minute', 'weekday')
})

In [None]:
# Remove the identifier columns and the raw timestamp columns from the working frame.
dfs = dfs.drop(columns=['id', 'vendor_id', 'pickup_datetime', 'dropoff_datetime'])

In [None]:
# Preview the first rows of the working frame after the simple feature step.
dfs.head()

# Baseline

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, normalize
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error,mean_squared_error,accuracy_score

In [None]:
def get_train_test(X, y, test_size=0.2, random_state=42):
    """Split features and target into training and validation subsets.

    Thin wrapper around ``train_test_split``.

    Parameters
    ----------
    X : feature table passed through to ``train_test_split``.
    y : target values aligned with ``X``.
    test_size : share of rows held out for validation (default 0.2).
    random_state : seed for the shuffle (default 42), so repeated calls on the
        same data produce the same split.

    Returns
    -------
    tuple
        ``(X_train, X_val, y_train, y_val)``.
    """
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_val, y_train, y_val

def get_new_baseline(X_train, X_val, y_train, y_val, plot=True, random_state=42):
    """Fit a default random forest and score it on the validation split.

    Parameters
    ----------
    X_train, y_train : training features and target used to fit the model.
    X_val, y_val : validation features and target used for scoring.
    plot : when truthy, also return the validation targets and predictions so
        the caller can plot them.
    random_state : seed for the forest (default 42). A fixed seed keeps the
        score comparable between runs that differ only in their features;
        pass ``None`` for an unseeded forest.

    Returns
    -------
    float or tuple
        The validation mean absolute error, or ``(mae, y_val, preds)`` when
        ``plot`` is truthy.
    """
    rf = RandomForestRegressor(random_state=random_state)
    rf.fit(X_train, y_train)
    preds = rf.predict(X_val)
    mae = mean_absolute_error(y_val, preds)
    if plot:
        return mae, y_val, preds
    return mae

In [None]:
# Baseline run: every column except 'trip_duration' is a feature, and
# 'trip_duration' is the target. Prints the validation mean absolute error.
X_train, X_val, y_train, y_val = get_train_test(
    dfs.drop(columns='trip_duration'),
    dfs['trip_duration'],
)
res, true, pred = get_new_baseline(X_train, X_val, y_train, y_val)
print(res)

In [None]:
# Overlay the distribution of the validation targets with that of the model's
# predictions. Each curve is labeled so the two can be told apart.
sns.distplot(true, label='actual')
sns.distplot(pred, label='predicted')
plt.xlabel('trip_duration')
plt.legend();

# More Feature Engineering

### Distance
Added horizontal, vertical and Manhattan distance

In [None]:
# Signed coordinate offsets, pickup minus dropoff, in raw coordinate units.
# Longitude varies east-west, so its offset is the horizontal component;
# latitude varies north-south, so its offset is the vertical component.
dfs['horiz_dist'] = dfs['pickup_longitude'] - dfs['dropoff_longitude']
dfs['vertical_dist'] = dfs['pickup_latitude'] - dfs['dropoff_latitude']
# Manhattan (L1) distance: the sum of the absolute offsets along both axes.
dfs['mann_dist'] = dfs['horiz_dist'].abs() + dfs['vertical_dist'].abs()

In [None]:
# Re-run the same evaluation with the distance columns included as features.
X_train, X_val, y_train, y_val = get_train_test(
    dfs.drop(columns='trip_duration'),
    dfs['trip_duration'],
)
res, true, pred = get_new_baseline(X_train, X_val, y_train, y_val)
print(res)

### Coords

* Interaction between coordinates - like lat and long
* neighborhood?

In [None]:
# direction of travel
# Keep only trips whose pickup AND dropoff both satisfy
# 40 < latitude < 42 and -75 < longitude < -73. The explicit .copy() makes the
# filtered frame independent, so the column assignment below does not trigger
# pandas' SettingWithCopyWarning.
dfs = (
    dfs
    .query('pickup_latitude > 40 and pickup_latitude < 42 and pickup_longitude < -73 and pickup_longitude > -75')
    .query('dropoff_latitude > 40 and dropoff_latitude < 42 and dropoff_longitude < -73 and dropoff_longitude > -75')
    .copy()
)
# Angle of the (horiz_dist, vertical_dist) offset vector in radians, in
# [-pi, pi]. arctan2 is defined for every input, including a zero horizontal
# offset, so no epsilon is needed in a denominator. Applying tan() to the ratio
# of the offsets does not yield an angle and blows up near its singularities.
dfs['direction'] = np.arctan2(dfs['vertical_dist'], dfs['horiz_dist'])

In [None]:
from sklearn.cluster import AgglomerativeClustering, KMeans
# Group the pickup coordinates into 5 clusters. The seed is fixed so the
# cluster labels, and every feature derived from them, are the same on each run.
clus = KMeans(n_clusters=5, random_state=42)
clus.fit(dfs[['pickup_latitude','pickup_longitude']])
cluster_lab = clus.predict(dfs[['pickup_latitude','pickup_longitude']])
# Both axes are read from `dfs`, the frame the labels were computed for; the
# name `tmp` is not defined anywhere in the visible notebook.
plt.scatter(dfs['pickup_latitude'],dfs['pickup_longitude'],c=cluster_lab)
# plt.scatter(dfs['dropoff_latitude'],dfs['dropoff_longitude'],alpha=0.4)

In [None]:
# Label each trip's pickup and dropoff point with its nearest pickup-based cluster.
dfs['pickup_nbd'] = clus.predict(dfs[['pickup_latitude','pickup_longitude']])
# `clus` was fitted on a frame whose columns are named 'pickup_latitude' and
# 'pickup_longitude'. scikit-learn releases that validate feature names at
# predict time reject a frame with different column names, so the dropoff
# coordinates are renamed to the fitted names (same column order) first.
dfs['dropoff_nbd'] = clus.predict(
    dfs[['dropoff_latitude','dropoff_longitude']].rename(
        columns={
            'dropoff_latitude': 'pickup_latitude',
            'dropoff_longitude': 'pickup_longitude',
        }
    )
)

In [None]:
# Third evaluation: the neighborhood and direction columns are now part of the
# feature set, and the additional coordinate outliers have been removed.
X_train, X_val, y_train, y_val = get_train_test(
    dfs.drop(columns='trip_duration'),
    dfs['trip_duration'],
)
res, true, pred = get_new_baseline(X_train, X_val, y_train, y_val)
print(res)

In [None]:
# Overlay the distribution of the validation targets with that of the latest
# model's predictions. Each curve is labeled so the two can be told apart.
sns.distplot(true, label='actual')
sns.distplot(pred, label='predicted')
plt.xlabel('trip_duration')
plt.legend();

In [None]:
# List the columns of the working frame after all feature-engineering steps.
dfs.columns

## Model Inspection

In [None]:
# from sklearn.inspection import permutation_importance
# result = permutation_importance(model, X_val, y_cal, n_repeats=10,random_state=42)
# perm_sorted_idx = result.importances_mean.argsort()

# tree_importance_sorted_idx = np.argsort(model.feature_importances_)
# tree_indices = np.arange(0, len(model.feature_importances_)) + 0.5

# fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 8))
# ax1.barh(tree_indices,
#          clf.feature_importances_[tree_importance_sorted_idx], height=0.7)
# ax1.set_yticklabels(data.feature_names)
# ax1.set_yticks(tree_indices)
# ax1.set_ylim((0, len(clf.feature_importances_)))
# ax2.boxplot(result.importances[perm_sorted_idx].T, vert=False,
#             labels=data.feature_names)
# fig.tight_layout()
# plt.show()