In [None]:
# Put these at the top of every notebook, to get automatic reloading and inline plotting.
# Load (or reload) IPython's autoreload extension.
%reload_ext autoreload
# Mode 2: re-import all modules before each cell executes, so edits to imported .py files are picked up.
%autoreload 2
# Render matplotlib figures inside the notebook output.
%matplotlib inline

In [None]:
!pip install fastai==0.7.0

In [None]:
from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics

In [None]:
# Folder holding the input CSV files. File names are appended directly to this
# string, so the trailing slash is required.
PATH = "../input/"

In [None]:
# List the files available under PATH.
!ls {PATH}

In [None]:
!head ../input/train.csv

In [None]:
# Number of training rows to read from train.csv.
NROWS = 500_000
# Columns that read_csv should parse as timestamps.
col_datetime = ["pickup_datetime"]
# Compact dtypes applied while reading, to keep memory usage down.
# Each key must match a CSV column name exactly, otherwise the dtype is not applied
# to the intended column.
col_types = {"key": "object",
             "fare_amount": "float32",
             "pickup_longitude": "float32",
             "pickup_latitude": "float32",
             "dropoff_longitude": "float32",
             "dropoff_latitude": "float32",
             "passenger_count": "int8"}

In [None]:
# Read the first NROWS rows of the training CSV, parsing the timestamp column(s)
# named in col_datetime and applying the dtypes declared in col_types.
df_raw = pd.read_csv(
    f'{PATH}train.csv',
    nrows=NROWS,
    dtype=col_types,
    parse_dates=col_datetime,
    infer_datetime_format=True,
)

In [None]:
# Read the whole test CSV with the same timestamp parsing and dtypes as the training data.
df_test_raw = pd.read_csv(
    f'{PATH}test.csv',
    dtype=col_types,
    parse_dates=col_datetime,
    infer_datetime_format=True,
)

In [None]:
def display_all(df, max_rows=1000, max_columns=1000):
    """Display ``df`` with pandas' row/column display limits temporarily raised.

    Parameters
    ----------
    df : object
        The object handed to ``display`` (typically a DataFrame).
    max_rows : int, optional
        Value used for the ``display.max_rows`` option while rendering.
        Defaults to 1000.
    max_columns : int, optional
        Value used for the ``display.max_columns`` option while rendering.
        Defaults to 1000.

    The options are set through ``pd.option_context``, so the previous
    display settings are restored as soon as rendering finishes.
    """
    with pd.option_context("display.max_rows", max_rows,
                           "display.max_columns", max_columns):
        display(df)

In [None]:
%%time

# Summary statistics for every column of the training sample (include='all' keeps
# non-numeric columns too); the %%time cell magic reports how long the cell takes.
display_all(df_raw.describe(include='all'))

In [None]:
# NOTE(review): this repeats the summary from the timed cell directly above,
# only without the timing wrapper — confirm whether both cells are needed.
display_all(df_raw.describe(include='all'))

In [None]:
# Summary statistics for every column of the test set, for comparison with the training sample.
display_all(df_test_raw.describe(include='all'))

In [None]:
# Snapshot both raw frames in feather format under PATH_TMP, before any cleaning is applied.
# File names are appended directly to PATH_TMP, so it must end with a slash.
PATH_TMP = "/tmp/"
df_raw.to_feather(f'{PATH_TMP}train_raw')
df_test_raw.to_feather(f'{PATH_TMP}test_raw')

## Cleaning the Dataset

In [None]:
# Remove rides whose fare is at or below 2.5.
# NOTE(review): rows with a fare of exactly 2.5 are removed as well — confirm that is intended.
fare_too_low = df_raw.fare_amount <= 2.5
df_raw.drop(index=df_raw.index[fare_too_low], inplace=True)

In [None]:
# Remove rides recorded with zero (or a negative number of) passengers.
no_passengers = df_raw.passenger_count <= 0
df_raw.drop(index=df_raw.index[no_passengers], inplace=True)

In [None]:
# Bounding box for plausible coordinates. A ride is removed when any of its four
# coordinates lies on or outside these limits.
lon_lo, lon_hi = -75, -72
lat_lo, lat_hi = 39, 42

bad_longitude = (
    (df_raw.pickup_longitude <= lon_lo)
    | (df_raw.pickup_longitude >= lon_hi)
    | (df_raw.dropoff_longitude <= lon_lo)
    | (df_raw.dropoff_longitude >= lon_hi)
)
bad_latitude = (
    (df_raw.pickup_latitude <= lat_lo)
    | (df_raw.pickup_latitude >= lat_hi)
    | (df_raw.dropoff_latitude <= lat_lo)
    | (df_raw.dropoff_latitude >= lat_hi)
)
df_raw.drop(index=df_raw.index[bad_longitude | bad_latitude], inplace=True)

# Renumber the surviving rows 0..n-1 and discard the old index.
df_raw.reset_index(inplace=True, drop=True)

## Feature Engineering

In [None]:
# Expand 'pickup_datetime' into date/time part columns with fastai's add_datepart
# (time=True also requests the time-of-day parts).
# NOTE(review): presumably this modifies df_raw in place and removes the original
# column, which would make the cell fail if run twice — confirm.
add_datepart(df_raw, 'pickup_datetime', time=True)

In [None]:
# Inspect the column names after the datetime expansion.
df_raw.columns

In [None]:
def add_travel_vector_features(df):
    """Add travel-vector columns to ``df`` in place.

    Three columns are written, in this order:

    * ``lat_diff`` - absolute difference between dropoff and pickup latitude
    * ``lon_diff`` - absolute difference between dropoff and pickup longitude
    * ``distance`` - square root of ``lat_diff**2 + lon_diff**2``, i.e. the
      straight-line length of the travel vector in the same units as the
      coordinate columns

    ``df`` must contain ``pickup_latitude``, ``pickup_longitude``,
    ``dropoff_latitude`` and ``dropoff_longitude``. Nothing is returned.
    """
    lat_delta = df["dropoff_latitude"] - df["pickup_latitude"]
    lon_delta = df["dropoff_longitude"] - df["pickup_longitude"]

    df["lat_diff"] = lat_delta.abs()
    df["lon_diff"] = lon_delta.abs()

    squared_length = df["lat_diff"] ** 2 + df["lon_diff"] ** 2
    df["distance"] = squared_length ** .5

# Add the lat_diff / lon_diff / distance columns to the training frame (modified in place).
add_travel_vector_features(df_raw)

In [None]:
# proc_df picks its `subset` rows at random using NumPy's global random state.
# Seed it first so a fresh "Restart & Run All" trains on the same 100,000-row sample.
np.random.seed(42)
# Split the target ('fare_amount') from the features, skip the 'key' identifier column,
# and keep a 100,000-row sample. The third return value is not used here.
df, y, _ = proc_df(df_raw, 'fare_amount', skip_flds=["key"], subset=100000)

In [None]:
# Sanity check: shapes of the processed features, the target, and the full cleaned frame.
print(df.shape, y.shape, df_raw.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the sample for validation. random_state pins the shuffle so the
# split (and every score computed from it) is the same on each run.
X_train, X_valid, y_train, y_valid = train_test_split(df, y, test_size=0.2, random_state=42)
print(X_train.shape, X_valid.shape, y_train.shape, y_valid.shape)

In [None]:
def rmse(x, y):
    """Return the root-mean-square error between ``x`` and ``y`` as a Python float.

    ``x - y`` must support ``** 2`` and expose ``.mean()`` (e.g. NumPy arrays
    or pandas Series).
    """
    residual = x - y
    mean_squared_error = (residual ** 2).mean()
    return math.sqrt(mean_squared_error)

def print_score(m):
    """Print a list of evaluation numbers for the fitted model ``m``.

    The list holds, in order: RMSE on the training set, RMSE on the validation
    set, ``m.score`` on the training set, ``m.score`` on the validation set,
    and - only when the model exposes it - ``m.oob_score_``.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_valid`` and ``y_valid``.
    """
    train_rmse = rmse(m.predict(X_train), y_train)
    valid_rmse = rmse(m.predict(X_valid), y_valid)
    train_score = m.score(X_train, y_train)
    valid_score = m.score(X_valid, y_valid)

    res = [train_rmse, valid_rmse, train_score, valid_score]
    # The out-of-bag score only exists when the model was fitted with it enabled.
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)

In [None]:
%%time
m = RandomForestRegressor(n_estimators=10, n_jobs=-1)
m.fit(X_train, y_train)
print_score(m)

In [None]:
%%time
m = RandomForestRegressor(n_estimators=120, max_features=0.5, min_samples_leaf=3, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)