## K-Means++ 

### Handle latitude/longitude in predictions 


In [None]:
from nextbike.io import input, output
from nextbike.model.utils import prepare_data
from nextbike.model.clustering import cluster
from nextbike.model.classification import logistic_regression
from nextbike.model.regression import polynomial_regression, linear_regression
from nextbike.model.ensemble import random_forest
from nextbike.model.ensemble.random_forest import dic
from nextbike.model.regression.polynomial_regression import dic
from nextbike.model.regression.parameters import model_dic


import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd


from sklearn.cluster import KMeans

In [None]:
# Read the Dortmund trip data from the "processed" location via the
# project's input helper.
df_final = input.read_csv(
    loc="processed",
    name="dortmund_trips.csv",
    index_col=0,
)

In [None]:
# Scatterplot of the trip start positions.
# NOTE(review): latitude is plotted on the x-axis and longitude on the
# y-axis, so the picture is transposed relative to a conventional map.
sns.scatterplot(
    data=df_final,
    x="latitude_start",
    y="longitude_start",
)
plt.title("Starting position - Scatterplot")
plt.show()

In [None]:
# Run the project's elbow-method helper on the trip data. The returned object
# is indexed with "X_scaled" in the following cells.
cluster_value = cluster.elbow_method(df_final)

In [None]:
# Display the "X_scaled" entry that is fed to KMeans below.
cluster_value["X_scaled"]

In [None]:
# Based on the elbow method, the number of clusters is set to 4.
# - init="k-means++" is spelled out to match the section title (it is also
#   scikit-learn's default initialisation).
# - n_init is pinned because its default differs between scikit-learn versions.
# - random_state fixes the seed so the cluster numbering is reproducible; the
#   labels are stored in "area_start" and written out with the trip data.
km = KMeans(n_clusters=4, init="k-means++", n_init=10, random_state=0)
km.fit(cluster_value["X_scaled"])
# Shift the zero-based cluster labels so that the areas are numbered from 1.
df_final["area_start"] = km.predict(cluster_value["X_scaled"]) + 1

sns.scatterplot(x="latitude_start", y="longitude_start", data=df_final, hue="area_start", palette="cool")
plt.title("Start positions clusterd to areas")
plt.show()

The start positions of the trips are grouped into four areas of the city (North-West, South-West, City-Center and North-East). 
This feature could be very useful for the predictions in Task 3. 

In [None]:
# Hand the trip data (now including the "area_start" column) to the project's
# output helper.
output.write_trip_data(df_final)

### Task 3.1 Linear Models

In [None]:
# Summary statistics, restricted to the target column.
df_final.describe().loc[:, ["trip_duration"]]

In [None]:
# Correlation of every feature with the target column trip_duration.
df_final.corr().loc[:, ["trip_duration"]]

In [None]:
# Heatmap of the full correlation matrix; the figure is saved to the doc
# folder before it is shown.
plt.figure(figsize=(15, 12))
sns.heatmap(
    df_final.corr(),
    cmap="Reds",
)
plt.title("Correlation Matrix", fontsize=16)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.tight_layout()
plt.savefig("../doc/figures/CorrelationMatrix.png")
plt.show()

In [None]:
# Build the shared input object from the trip data; it is passed as `init`
# to every linear_regression call that follows.
init = linear_regression.__init__(df_final)

# Linear Regression

In [None]:
# Train the "Linear" variant on the prepared input.
linear_regression.train(init=init, model="Linear")

# Lasso Regression

In [None]:
# Train the "Lasso" variant on the prepared input.
linear_regression.train(init=init, model="Lasso")

# Ridge Regression 

In [None]:
# Train the "Ridge" variant on the prepared input.
linear_regression.train(init=init, model="Ridge")

# Compare results Linear / Lasso / Ridge

In [None]:
# Compare the Linear / Lasso / Ridge models on the same prepared input.
linear_regression.compare_regression_models(init)

# Grid Search
## Parameter Tuning for Ridge and Lasso Regression 

In [None]:
# Run the project's hyperparameter calculation for the prepared input and
# keep its result for inspection.
df_results = linear_regression.calculate_hyper_parameters(init)

### Ridge with calculated values

In [None]:
# Ridge run with the calculated hyperparameter values.
# NOTE(review): the keyword is spelled "random_sate"; confirm that it matches
# the parameter name declared by linear_regression.model before renaming it.
linear_regression.model(
    model="Ridge",
    alpha=4,
    max_iter=500,
    random_sate=0,
    fit_intercept=True,
    init=init,
)

### Lasso with calculated values

In [None]:
# Lasso run with the calculated hyperparameter values.
# NOTE(review): the keyword is spelled "random_sate"; confirm that it matches
# the parameter name declared by linear_regression.model before renaming it.
linear_regression.model(
    model="Lasso",
    alpha=0.001,
    max_iter=100,
    random_sate=0,
    fit_intercept=True,
    init=init,
)

### Task 3.2 Polynomial Regression

In [None]:
from sklearn.linear_model import Lasso,Ridge, LinearRegression

In [None]:
# Rebuild `init` with the polynomial-regression module's own initialiser;
# this replaces the object created for the linear models.
init = polynomial_regression.__init__(df_final)

In [None]:
# Degree-3 polynomial regression with a plain LinearRegression estimator.
polynomial_regression.polynomial_reg(
    model_name="LinearRegression",
    estimator=LinearRegression(),
    degree=3,
    init=init,
)

In [None]:
# Degree-4 polynomial regression with a plain LinearRegression estimator.
polynomial_regression.polynomial_reg(
    model_name="LinearRegression",
    estimator=LinearRegression(),
    degree=4,
    init=init,
)

In [None]:
# Degree-3 polynomial regression with a default Ridge estimator.
polynomial_regression.polynomial_reg(
    model_name="Ridge",
    estimator=Ridge(),
    degree=3,
    init=init,
)

In [None]:
# Degree-4 polynomial regression with a default Ridge estimator.
# The garbled "estimator=ator=" keyword (a syntax error) is repaired here.
polynomial_regression.polynomial_reg(model_name="Ridge", estimator=Ridge(), degree=4, init=init)

In [None]:
# Hyperparameters found with RandomizedSearch.
# `init` is passed explicitly, as in the other polynomial_reg calls of this
# notebook, so that this run uses the same prepared input.
polynomial_regression.polynomial_reg(
    model_name="Ridge",
    estimator=Ridge(solver="cholesky", max_iter=40, fit_intercept=True, copy_X=False, alpha=50),
    degree=4,
    init=init,
)

In [None]:
# Degree-4 polynomial regression with a default Lasso estimator.
# Keyword arguments and an explicit `init` keep this call consistent with the
# other polynomial_reg calls of this notebook.
polynomial_regression.polynomial_reg(
    model_name="Lasso",
    estimator=Lasso(),
    degree=4,
    init=init,
)

In [None]:
# Degree-4 polynomial regression with a Lasso estimator using explicit
# hyperparameters.
# Keyword arguments and an explicit `init` keep this call consistent with the
# other polynomial_reg calls of this notebook.
polynomial_regression.polynomial_reg(
    model_name="Lasso",
    estimator=Lasso(max_iter=40, fit_intercept=True, copy_X=False, alpha=50),
    degree=4,
    init=init,
)

In [None]:
# Show full cell contents in the results table. None means "no truncation";
# the former -1 sentinel is deprecated since pandas 1.0 and rejected by
# pandas 2.x.
pd.set_option("display.max_colwidth", None)

# Collect the results recorded by the polynomial-regression module in a table.
df_result = pd.DataFrame(polynomial_regression.dic)
df_result

### Task 3.3 Random Forest Regressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Rebuild `init` with the random-forest module's own initialiser; it is
# passed to every random_forest.rfr call below.
init = random_forest.__init__(df_final)

In [None]:
# Baseline: random forest with scikit-learn's default hyperparameters.
random_forest.rfr(init, RandomForestRegressor())

--> The model is overfitted: its metrics are much better on the training data than on the test data. 

In [None]:
# Choose hyperparameters that reduce the complexity of the random forest
# in order to avoid overfitting.
random_forest.rfr(
    init,
    RandomForestRegressor(
        n_estimators=50,
        max_depth=12,
        min_samples_split=2,
        min_samples_leaf=15,
    ),
)

In [None]:
# Random forest with explicit hyperparameters.
# max_features=1.0 considers all features at each split, which is what "auto"
# meant for regressors; the "auto" spelling was removed in scikit-learn 1.3.
random_forest.rfr(
    init,
    RandomForestRegressor(
        max_features=1.0,
        n_estimators=1155,
        max_depth=70,
        min_samples_split=10,
        min_samples_leaf=8,
        bootstrap=True,
    ),
)

#### Using log of trip duration 

In [None]:
# Rebuild `init` with log=True for the "log of trip duration" experiments.
init = random_forest.__init__(df_final, log=True)

In [None]:
# Baseline random forest on the log=True input; the result is kept in `rfr`.
rfr = random_forest.rfr(init, RandomForestRegressor())

In [None]:
# Random forest with explicit hyperparameters on the log=True input; the
# result replaces the baseline stored in `rfr`.
# max_features=1.0 considers all features at each split, which is what "auto"
# meant for regressors; the "auto" spelling was removed in scikit-learn 1.3.
rfr = random_forest.rfr(
    init,
    RandomForestRegressor(
        max_features=1.0,
        n_estimators=1155,
        max_depth=70,
        min_samples_split=10,
        min_samples_leaf=8,
        bootstrap=True,
    ),
)

In [None]:
# Apply exp to the predictions to get values on the original scale and
# compare them with the actual values.
# NOTE(review): whether this helper modifies `init` / `rfr` in place or only
# reports the comparison cannot be seen from this notebook; confirm.
random_forest.convert_log_to_exp(init, rfr)

In [None]:
# Scatter of predictions (y-axis) against the held-out target values (x-axis).
f, axs = plt.subplots(1, 1, figsize=(15, 15))
plt.scatter(
    init["y_test"],
    rfr["pred"],
)
plt.xlabel("Actual durations", fontsize=18)
plt.ylabel("Predictions", fontsize=18)
plt.title(
    "Predicted trip durations \n Random Forrest and log of duration used",
    fontsize=20,
)
plt.show()

In [None]:
# Collect the results recorded by the random-forest module in a table and
# display it.
df_results = pd.DataFrame(data=random_forest.dic)
df_results

### Task 3.4 Support Vector Regressor