In [14]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

## Import boostrapped dataframe

In [3]:
df = pd.read_csv('../data/mod_dataset.csv')

### splitting dataset

In [15]:
X = df.drop(columns='fare_amount')
y = df['fare_amount']
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

### Implementing a RandomForestRegressor

In [9]:
rf = RandomForestRegressor(n_estimators=100)

In [10]:
cross_val_score(rf, X_train, y_train, cv=5).mean()

0.6188033997160852

In [17]:
cross_val_score(rf, X_test, y_test, cv=5).mean()

0.5476064297113357

### Implementing a Ridge

In [12]:
ridge = RidgeCV(alphas=np.linspace(.1, 10, 100))

In [13]:
ridge_scores = cross_val_score(ridge, X_train, y_train, cv=3)
ridge_scores.mean()

0.11245399323473253

In [45]:
ridge.fit(X_train, y_train)
ridge.score(X_test, y_test)

0.10468487016715067

### Implementing a RandomForestRegressor V.2

In [18]:
rf2 = RandomForestRegressor(n_estimators=100)

In [19]:
rf2_params = {
    'n_estimators': [100, 150, 200],
    'max_depth': [None, 1, 2, 3, 4, 5],
}
gs = GridSearchCV(rf2, param_grid=rf2_params, cv=5)
gs.fit(X_train, y_train)
print(gs.best_score_)
gs.best_params_

0.6725641300501907


{'max_depth': 5, 'n_estimators': 150}

In [20]:
gs.score(X_train, y_train)

0.6834456984535701

In [21]:
gs.score(X_test, y_test)

0.6340633634346092

### Importing Test dataframe and preprocess

In [26]:
df_test = pd.read_csv('../data/test.csv')

In [27]:
# make sure to convert objects to floats
df_test.pickup_longitude.astype('float64')
df_test.pickup_latitude.astype('float64')
df_test.dropoff_longitude.astype('float64')
df_test.dropoff_latitude.astype('float64')

0       40.743835
1       40.739201
2       40.746139
3       40.751635
4       40.744427
          ...    
9909    40.780388
9910    40.776371
9911    40.647011
9912    40.801731
9913    40.759220
Name: dropoff_latitude, Length: 9914, dtype: float64

In [28]:
df_test = df_test.dropna()

In [29]:
from math import sin, cos, sqrt, atan2, radians

def distance_loc(lat1, lon1, lat2, lon2):
    R = 6373.0
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    distance = R * c
    return distance

In [30]:
list_distance = []
for row in df_test.itertuples():
    calc_dist =  distance_loc(row[4], row[5], row[6], row[7])
    list_distance.append(calc_dist)

In [32]:
df_test['distance'] = pd.Series(list_distance, index=df_test.index) 
df_test['pickup_datetime'] = df_test['pickup_datetime'].astype('datetime64[ns]')

In [33]:
import datetime

df_test['month'] = pd.DatetimeIndex(df_test['pickup_datetime']).month
df_test['year'] = pd.DatetimeIndex(df_test['pickup_datetime']).year

In [34]:
max_val = 2500
min_val = df_test['distance'].max()

df_test['distance'] = df_test['distance'].apply(lambda x: None if ((x == max_val) or (x == min_val)) else x)

In [36]:
df_test = df_test.dropna()

In [37]:
df_test.head(2)

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance,month,year
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24,-73.97332,40.763805,-73.98143,40.743835,1,2649.073465,1,2015
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24,-73.986862,40.719383,-73.998886,40.739201,1,2532.097137,1,2015


### Create predictions

In [38]:
features = ['passenger_count', 'distance', 'month', 'year']
preds = gs.predict(df_test[features])

In [39]:
preds[:10]

array([82.43557118, 82.43557118, 54.77768093, 55.51822503, 55.51822503,
       55.51822503, 37.06514911, 54.77768093, 54.77768093, 81.75252713])

In [40]:
final_sub = df_test.copy()
final_sub['fare_amount'] = pd.Series(preds)

In [41]:
final_sub.head(2)

Unnamed: 0,key,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance,month,year,fare_amount
0,2015-01-27 13:08:24.0000002,2015-01-27 13:08:24,-73.97332,40.763805,-73.98143,40.743835,1,2649.073465,1,2015,82.435571
1,2015-01-27 13:08:24.0000003,2015-01-27 13:08:24,-73.986862,40.719383,-73.998886,40.739201,1,2532.097137,1,2015,82.435571


In [42]:
final_sub = final_sub[['key', 'fare_amount']]
final_sub.head(2)

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,82.435571
1,2015-01-27 13:08:24.0000003,82.435571


In [43]:
final_sub.to_csv('../data/preds_dataset.csv', index=False)