# INTRACITY DRIVER

In [11]:
%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
from sklearn import neural_network
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [12]:
# Shared standardisation step; the helper functions in this notebook fit it
# and apply it to the feature matrix before training / predicting.
scaler = StandardScaler()

# Multi-layer perceptron regressor used for the validation run and for the
# final submission run. The fixed random_state keeps its training repeatable.
ann = neural_network.MLPRegressor(
    hidden_layer_sizes=(100, 100),
    alpha=0.5,
    max_iter=10000,
    shuffle=True,
    random_state=100,
    verbose=False,
)

In [13]:
def crossValidate(X, y, clf, random_state=100):
    """Fit ``clf`` on a train split and print its hold-out and train scores.

    The data is split once with ``train_test_split`` (default proportions),
    the notebook-level ``scaler`` is fitted on the training part only and then
    applied to both parts, and ``clf`` is trained on the scaled training part.

    Parameters
    ----------
    X : feature matrix accepted by ``train_test_split`` / ``scaler``.
    y : target values aligned with ``X``.
    clf : estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted
        in place.
    random_state : seed forwarded to ``train_test_split`` so that repeated
        runs evaluate on the same split and the printed scores are
        comparable. Pass ``None`` to get a different random split each call.

    Returns
    -------
    None. The two scores are printed.

    Side effects
    ------------
    Refits the notebook-level ``scaler`` and the passed ``clf``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state)

    # Fit the scaler on the training part only, so no statistics from the
    # held-out rows leak into training.
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    clf.fit(X_train_scaled, y_train)

    # NOTE(review): both scores are R^2 multiplied by 200. The reason for the
    # factor is not visible in this notebook - presumably an external scoring
    # convention; confirm before comparing against plain R^2 values.
    cv_score = 200 * r2_score(y_test, clf.predict(X_test_scaled))
    train_score = 200 * r2_score(y_train, clf.predict(X_train_scaled))

    print("CV performance")
    print(cv_score)
    print("Train performance")
    print(train_score)

In [14]:
def testRUN_crCsv(X, y, clf):
    
    scaler.fit(X)
    X = scaler.transform(X)
    
    clf.fit(X, y)
    
    predict = clf.predict(scaler.transform(test))
    
    predict = predict.round(decimals=2)
    predict = predict.reshape(predict.shape[0], 1)
    predict = np.concatenate([id_vec, predict], axis=1)
    predict = pd.DataFrame(data=predict, columns=['ID', 'FARE'])
    predict.to_csv("../answer.csv", index=False, header=True)
    print("Done! - check answer.csv file")


In [None]:
# Load the pre-processed train and test tables.
train = pd.read_csv('../data/processed_train.csv')
test = pd.read_csv('../data/processed_test.csv')

# Keep a 2-D array copy of the test column(s) named 'ID' so the identifiers
# stay available once 'ID' is dropped from the feature frame.
id_vec = test.loc[:, test.columns == 'ID'].to_numpy(copy=True)

In [None]:
# Show pandas' summary statistics for the training frame as the cell output.
train.describe()

In [None]:
# Scatter plot of one training column against another.
X_LABEL = 'VEHICLE_TYPE'
Y_LABEL = 'WAIT_TIME'

fig = plt.figure(figsize=(8, 8), dpi=80, facecolor='w', edgecolor='k')

plt.xlabel(X_LABEL)
plt.ylabel(Y_LABEL)

# plt.scatter takes (x, y): the X_LABEL column must come first so the data on
# each axis matches the axis label set above.
plt.scatter(train[X_LABEL], train[Y_LABEL])

In [None]:
# Columns removed from both frames before modelling.
drop_lab = [
    'ID', 'cooling', 'bus', 'mean_lat', 'mean_long',
    'TIME_AM', 'YEAR', 'DAY', 'TIMESTAMP',
]

# The frames are modified in place, so running this cell a second time raises
# a KeyError because the columns are already gone; reload the data first.
train.drop(columns=drop_lab, inplace=True)
test.drop(columns=drop_lab, inplace=True)

In [None]:
# Target: the FARE column of the training frame.
y = train['FARE']

# Features: every remaining training column except the target.
X = train.drop(columns=['FARE'])

In [None]:
# Hyperparameter tuning

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.pipeline import Pipeline

# Tune the network behind the same standardisation step that crossValidate and
# testRUN_crCsv apply before fitting. Searching on the raw, unscaled features
# would select hyperparameters for a differently scaled input than the one the
# final model is trained on. Inside a pipeline the scaler is refitted on the
# training part of every CV fold, so validation rows do not leak into it.
tuning_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('mlp', ann),
])

gs = GridSearchCV(tuning_pipeline,
            param_grid={
                # Single-layer candidates are written as one-element tuples;
                # "(8)" is just the integer 8, not a tuple.
                'mlp__hidden_layer_sizes': [(8,), (10,), (10, 10, 10), (70, 50, 20), (15, 15, 15), (40, 40, 40)],
                'mlp__random_state': [100, 1000, 10000],
                'mlp__alpha': [0.01, 0.1, 1.0]
            },
            n_jobs=-1,
            scoring=make_scorer(r2_score),
            verbose=5)

gs.fit(X, y)
print("best estimator :\n",gs.best_estimator_)
print("Best parameters :\n",gs.best_params_)
print("CV RESULTS : \n",gs.cv_results_)

In [None]:
# Hold-out validation: fits `ann` on a train split and prints its scaled R^2
# score on the held-out split and on the train split.
crossValidate(X, y, ann)

In [None]:
# Final run: refit `ann` on all training rows and write the rounded test-set
# fare predictions to ../answer.csv.
testRUN_crCsv(X, y, ann)