# Modeling Notebook

In this notebook, we train a regression model on the data produced by the `Data_Prep` notebook.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

## Modeling

In [2]:
# Read the training and test tables from CSV files in the working directory
# (training file first, then the test file).
train_data, test_data = (
    pd.read_csv(csv_name)
    for csv_name in ('cleaned_train_data.csv', 'formatted_test_data.csv')
)

In [3]:
# Target: the fare to predict.
y = train_data['fare_amount']

# Features: every remaining column except `key`, which is not a model input,
# and the target itself.
X = train_data.drop(columns=['key', 'fare_amount'])

In [4]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the
# split repeatable across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [5]:
# Peek at the first five training rows to sanity-check the feature columns.
X_train.head(n=5)

Unnamed: 0,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance_km,hour,day,month,year
3186651,-73.967938,40.800352,-73.955082,40.80101,1,1.084601,7,2,4,2013
3498976,-73.978187,40.773262,-73.973315,40.764113,1,1.096942,13,2,4,2012
3513723,-73.979995,40.751558,-73.963377,40.810577,1,6.710116,11,1,9,2014
3569066,-73.99442,40.72625,-73.978702,40.751,1,3.054113,18,1,8,2011
393247,-74.01612,40.711237,-74.006488,40.733105,1,2.563515,21,0,5,2014


In [6]:
# Baseline: linear regression fitted on the training split and scored by
# RMSE on the held-out validation split.
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_preds = lr.predict(X_val)

lr_mse = mean_squared_error(y_val, lr_preds)
rmse = np.sqrt(lr_mse)
print(rmse)

3.8422910293446852


In [7]:
# Same procedure with Ridge at its default settings, for comparison with the
# plain linear fit. Note that `rmse` is rebound here, so after this cell it
# holds the Ridge score.
lm_r = Ridge()
lm_r.fit(X_train, y_train)
lm_r_preds = lm_r.predict(X_val)

ridge_mse = mean_squared_error(y_val, lm_r_preds)
rmse = np.sqrt(ridge_mse)
print(rmse)

3.8422917302339403


In [19]:
# Peek at the first five test rows; unlike the training features, these still
# carry the `key` column.
test_data.head(n=5)

Unnamed: 0,key,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,distance_km,hour,day,month,year
0,2015-01-27 13:08:24.0000002,-73.97332,40.763805,-73.98143,40.743835,1,2.32326,13,1,1,2015
1,2015-01-27 13:08:24.0000003,-73.986862,40.719383,-73.998886,40.739201,1,2.425353,13,1,1,2015
2,2011-10-08 11:53:44.0000002,-73.982524,40.75126,-73.979654,40.746139,1,0.618628,11,5,10,2011
3,2012-12-01 21:12:12.0000002,-73.98116,40.767807,-73.990448,40.751635,1,1.961033,21,5,12,2012
4,2012-12-01 21:12:12.0000003,-73.966046,40.789775,-73.988565,40.744427,1,5.387301,21,5,12,2012


In [20]:
# Keep the keys aside: they label the rows of the submission but are not a
# model input.
keys = test_data['key']

# Select exactly the columns the model was fitted on, in the same order,
# instead of assuming the test CSV lines up once `key` is dropped. A column
# missing from the test data raises a KeyError here rather than feeding
# misaligned features to the model.
test_X = test_data[list(X_train.columns)]

# Predict fares for the test rows with the fitted linear regression model.
t_preds = lr.predict(test_X)

In [14]:
# Assemble the submission table in one step: one row per test key, paired
# with its predicted fare.
test_df = pd.DataFrame({
    'key': keys,
    'fare_amount': t_preds,
})

In [15]:
# Check the first five submission rows before writing them out.
test_df.head(n=5)

Unnamed: 0,key,fare_amount
0,2015-01-27 13:08:24.0000002,10.482075
1,2015-01-27 13:08:24.0000003,10.212826
2,2011-10-08 11:53:44.0000002,5.236367
3,2012-12-01 21:12:12.0000002,8.665985
4,2012-12-01 21:12:12.0000003,16.278435


In [16]:
# Write the submission without the DataFrame index, so the file holds only
# the `key` and `fare_amount` columns.
# current score: 5.60997
submission_path = 'submission.csv'
test_df.to_csv(submission_path, index=False)