**TODOs**  
- [x] Challenge description
- [ ] Feature importance

* [Challenge description](#Challenge-description)
* [Data exploration](#Data-exploration)

#TODO: finish or delete

## Challenge description

The goal for this week’s project is to build and train a regression model on the Capital Bikeshare (Washington, D.C.) Kaggle data set in order to predict the demand for bicycle rentals at any given hour, based on time and weather, e.g. how many bikes will be rented on a given Saturday at 2 pm under the forecasted weather conditions.

**Data description**  
[https://www.kaggle.com/c/bike-sharing-demand](https://www.kaggle.com/c/bike-sharing-demand)

## Data exploration

### Load data

In [None]:
import pandas as pd

# Parse the `datetime` column explicitly. `parse_dates=True` only asks pandas
# to try parsing the index, and no index column is set here, so the timestamps
# would otherwise be loaded as plain strings.
df = pd.read_csv('./data/train.csv', parse_dates=['datetime'])
df.tail()

### Train/Test split

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split is reproducible between runs.
RANDOM_STATE = 42

# Features: every column except `count` (the target) and `registered`/`casual`.
X = df[df.columns.difference(['count', 'registered', 'casual'])].copy(deep=True)
y = df['count']

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)
X_train.sort_index().tail()

In [None]:
# Show the last rows of the training target, ordered by original row index.
y_train.sort_index().tail()

### Heatmap

In [None]:
import seaborn as sns

# Pairwise correlations between the season/weather features and the target,
# drawn on a fixed [-1, 1] colour scale centred at 0.
tmp = df[['season', 'weather', 'temp', 'atemp', 'humidity', 'windspeed', 'count']]
sns.heatmap(tmp.corr(), vmin=-1, vmax=1, center=0, cmap=sns.diverging_palette(20, 220, n=200), square=True)

### Pairplot

In [None]:
import seaborn as sns
from pathlib import Path
# Import `display` once, from the public `IPython.display` module. The earlier
# version imported the `display` module and then shadowed it with the
# `display` function, which is why `display.Image(...)` could not work.
from IPython.display import HTML, display

PAIRPLOT_FROM = '2012-01-01'
PAIRPLOT_TO = '2013-01-01'
filename = "./output/pairplot-" + PAIRPLOT_FROM + " --" + PAIRPLOT_TO + ".png"

# Drawing a pairplot takes several minutes, so the image is cached on disk the
# first time it is created and only displayed from the cache on later runs.
file = Path(filename)
if not file.is_file():
    # Select the rows through the `datetime` column (the frame is not indexed
    # by date) and keep `datetime` in the selection, because
    # `extract_datetime_data` derives its features from that column.
    timestamps = pd.to_datetime(df['datetime'])
    in_range = timestamps.between(PAIRPLOT_FROM, PAIRPLOT_TO)
    tmp = df.loc[in_range, ['datetime', 'atemp', 'temp', 'humidity', 'season', 'weather', 'windspeed', 'count']]
    # NOTE: `extract_datetime_data` is defined in the "Feature engineering"
    # section below; that cell has to be executed before this one.
    tmp = extract_datetime_data(tmp) # TODO: move this function to the top
    plot = sns.pairplot(tmp, hue='count')
    # Make sure the cache directory exists before writing the image.
    file.parent.mkdir(parents=True, exist_ok=True)
    plot.savefig(file)
else:
    display(HTML('<img src="'+filename+'"></img>'))


Note: in the heatmap above we can clearly see two highly correlated variables, `temp` and `atemp`. We will use only one of them (`temp`) in the model.

### Check NaN values

In [None]:
# Number of missing values per feature column in the training split.
X_train.isna().sum()

In [None]:
# TODO: ask about "weather" and "season" variables - does it make sense to have them in the heatmap?

### Average count by hour

In [None]:
# Mean rental count for each hour of the day, drawn as a bar chart.
hour_of_day = pd.to_datetime(df['datetime']).dt.hour
df.groupby(hour_of_day)['count'].mean().plot.bar()

### Average count by day of the week

In [None]:
# Mean rental count for each day of the week, drawn as a bar chart.
day_of_week = pd.to_datetime(df['datetime']).dt.weekday
df.groupby(day_of_week)['count'].mean().plot.bar()

### Total count by month

In [None]:
# Total (summed) rental count for each calendar month, drawn as a bar chart.
month_of_year = pd.to_datetime(df['datetime']).dt.month
df.groupby(month_of_year)['count'].sum().plot.bar()

### Count of bike rents during a week

In [None]:


import plotly.express as px

# Restrict the plot to a single week. The upper bound previously read
# 2013-12-19, which spans a whole year instead of the week named in the title.
week_mask = (df['datetime'] > '2012-12-12') & (df['datetime'] <= '2012-12-19')
fig = px.line(df[week_mask], x="datetime", y="count", title='Count of bike rents during a week')
fig.show()


## Feature engineering

#### Create hourly weights dictionary

In [None]:
# Rank the hours of the day by their total rental count: the hour with the
# smallest total gets weight 0, the next one 1, and so on.
# NOTE(review): the totals are computed on the full `df`, including rows that
# end up in the test split - confirm this is intended.
hourly_totals = df.groupby(pd.to_datetime(df['datetime']).dt.hour)['count'].sum()
hour_weight_dict = {
    hour: rank for rank, hour in enumerate(hourly_totals.sort_values().index)
}
# hour_weight_dict

#### Create monthly weights dictionary

In [None]:
# Rank the calendar months by their total rental count: the month with the
# smallest total gets weight 1, the next one 2, and so on.
monthly_totals = df.groupby(pd.to_datetime(df['datetime']).dt.month)['count'].sum()
month_weight_dict = {
    month: rank
    for rank, month in enumerate(monthly_totals.sort_values().index, start=1)
}
month_weight_dict

#### Create pipeline

In [None]:
# import sklearn.pipeline as pipeline
# from sklearn.preprocessing import FunctionTransformer
# datetime_pipeline = pipeline.make_pipeline(
#     FunctionTransformer(extract_datetime_data)
# )

In [None]:
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import MinMaxScaler

# preprocessor = ColumnTransformer([
#     # ('extract_datetime_data', datetime_pipeline, ['season']),
#     ('min_max_scaler', MinMaxScaler(), ['temp', 'humidity', 'windspeed']),
#     ('do_nothing', 'passthrough', ['season', 'weather'])
# ])

#### Function for extracting date features

In [None]:
def extract_datetime_data(df: pd.DataFrame, hour_weights=None, month_weights=None) -> pd.DataFrame:
    """Replace the ``datetime`` column with hour and month weight features.

    The input frame is left untouched; a modified copy is returned.

    Args:
        df: Frame containing a ``datetime`` column (timestamps or strings
            that ``pd.to_datetime`` can parse).
        hour_weights: Mapping from hour of day to weight. Defaults to the
            notebook-level ``hour_weight_dict``.
        month_weights: Mapping from month number to weight. Defaults to the
            notebook-level ``month_weight_dict``.

    Returns:
        A copy of ``df`` without ``datetime`` and with the additional columns
        ``hour_weight`` and ``month_weight``. Hours or months missing from the
        mappings become NaN.

    Raises:
        KeyError: If ``df`` has no ``datetime`` column.
    """
    if hour_weights is None:
        hour_weights = hour_weight_dict
    if month_weights is None:
        month_weights = month_weight_dict

    # Work on a real copy so the caller's frame is never modified.
    result = df.copy()

    # Parse the timestamps once and reuse them for both features.
    timestamps = pd.to_datetime(result['datetime'])
    result['hour_weight'] = timestamps.dt.hour.map(hour_weights)
    result['month_weight'] = timestamps.dt.month.map(month_weights)

    return result.drop(columns='datetime')

# TODO: add assertion

#### Transform data

In [None]:
# preprocessor.fit(X_train)
# X_train_fe = preprocessor.transform(X_train)
# X_test_fe = preprocessor.transform(X_test)
# Raw columns fed into feature engineering; `datetime` is replaced by the
# hour/month weight features inside `extract_datetime_data`.
col = ['season', 'weather', 'temp', 'humidity', 'windspeed', 'datetime']

X_train_fe = extract_datetime_data(X_train[col].copy(deep=True))
X_test_fe = extract_datetime_data(X_test[col].copy(deep=True))

# Neither engineered frame may contain missing values. The check avoids the
# positional `Series[0]` lookup (deprecated in pandas) and now covers the test
# split as well.
assert not X_train_fe.isna().to_numpy().any()
assert not X_test_fe.isna().to_numpy().any()

X_train_fe.sort_index().tail()

In [None]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer


def _make_min_max_transformer():
    """Build the (unfitted) transformer that min-max scales the model features."""
    return ColumnTransformer([
        ('mmscaler', MinMaxScaler(), ['temp', 'windspeed', 'humidity', 'season', 'weather', 'hour_weight', 'month_weight'])
    ], remainder='drop')


def apply_min_max_scaller(df, transformer=None):
    """Min-max scale the model features of ``df``.

    Args:
        df: Frame containing the columns listed in the transformer.
        transformer: An already fitted transformer to reuse. When omitted, a
            new one is fitted on ``df`` itself (the previous behaviour).

    Returns:
        A new DataFrame with the scaled columns, named by
        ``transformer.get_feature_names_out()``. All other columns are
        dropped and the row index is reset to the default range index.
    """
    if transformer is None:
        transformer = _make_min_max_transformer().fit(df)

    return pd.DataFrame(transformer.transform(df), columns=transformer.get_feature_names_out())


# Fit the scaler on the training data only and reuse it for the test data.
# Fitting a second scaler on the test set would scale the two splits with
# different minima/maxima.
min_max_transformer = _make_min_max_transformer().fit(X_train_fe)

X_train_fe2 = apply_min_max_scaller(X_train_fe, min_max_transformer)
X_test_fe2 = apply_min_max_scaller(X_test_fe, min_max_transformer)

# Both scaled frames must be free of missing values.
assert not X_train_fe2.isna().to_numpy().any()
assert not X_test_fe2.isna().to_numpy().any()

X_train_fe2.sort_index().tail()

In [None]:
from sklearn.preprocessing import PolynomialFeatures

# Add a bias column and all pairwise interaction terms (no squared terms).
pt = PolynomialFeatures(interaction_only=True)

# Fit on the training data only; the test data is transformed with the
# already fitted transformer instead of being re-fitted.
X_train_fe3 = pd.DataFrame(pt.fit_transform(X_train_fe2), columns=pt.get_feature_names_out())
X_test_fe3 = pd.DataFrame(pt.transform(X_test_fe2), columns=pt.get_feature_names_out())

# Both expanded frames must be free of missing values.
assert not X_train_fe3.isna().to_numpy().any()
assert not X_test_fe3.isna().to_numpy().any()

X_train_fe3

#### Validate transformed data

In [None]:
# Final sanity check before modelling: no missing values in either split.
# This replaces the positional `Series[0]` lookup, which is deprecated in
# pandas, and also covers the test frame.
assert not X_train_fe3.isna().to_numpy().any()
assert not X_test_fe3.isna().to_numpy().any()

# TODO: move it or get rid of it

## Models

### Linear Regression model

In [None]:
from sklearn.linear_model import LinearRegression

# Baseline model, trained on the engineered features before scaling and
# interaction terms are applied.
lr_model = LinearRegression()
lr_model.fit(X_train_fe, y_train)

#### Predict results

In [None]:
y_pred = lr_model.predict(X_test_fe)

# Negative predictions are replaced by 0 before the RMSLE is computed.
negative = y_pred < 0
y_pred[negative] = 0
y_pred

#### Evaluate model

In [None]:
from sklearn.metrics import mean_squared_log_error
import numpy as np

def rmsle(p,a):
    """Return the root mean squared logarithmic error between ``p`` and ``a``.

    Both arguments are forwarded, in this order, to scikit-learn's
    ``mean_squared_log_error``; the square root of its result is returned.
    """
    msle = mean_squared_log_error(p, a)
    return np.sqrt(msle)

In [None]:
# RMSLE of the linear-regression predictions on the held-out test split.
print("RMSLE:", rmsle(y_pred, y_test))

In [None]:
# print("Coefficients:", lr_model.coef_)
# print("Intercept   :", lr_model.intercept_)

# print("train score :", lr_model.score(X_train_fe, y_train))
# print("test score  :", lr_model.score(X_test_fe, y_test))

### Poisson Regressor model

In [None]:
from sklearn.linear_model import PoissonRegressor

# Poisson regression on the scaled features with interaction terms.
pr_model = PoissonRegressor(alpha=1).fit(X_train_fe3, y_train)
y_pred = pr_model.predict(X_test_fe3)

#### Calculate RMSLE

In [None]:
from sklearn.metrics import make_scorer, mean_squared_log_error

def rmslr(y_true, y_pred, **kwargs):
    """Return the root of ``mean_squared_log_error(y_true, y_pred, **kwargs)``.

    Extra keyword arguments are passed through unchanged to scikit-learn's
    ``mean_squared_log_error``.
    """
    msle = mean_squared_log_error(y_true, y_pred, **kwargs)
    return msle ** 0.5


# The metric is an error (lower is better), hence `greater_is_better=False`.
rmslr_scorer = make_scorer(rmslr, greater_is_better=False)

In [None]:
# Error of the Poisson regressor (alpha=1) on the held-out test split.
rmslr(y_test, y_pred)

#### Grid Search Cross Validation

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter combinations to try for the Poisson regressor.
param_grid = {
    'alpha': [0.05, 0.1, 0.2, 0.5, 0.75, 1],
    'fit_intercept': [True, False],
    'max_iter': [1000],
}

# 5-fold cross-validation scored with the custom error scorer; training
# scores are recorded too so they can be compared with the validation scores.
g = GridSearchCV(
    pr_model,
    param_grid,
    cv=5,
    scoring=rmslr_scorer,
    return_train_score=True,
)
g.fit(X_train_fe3, y_train)

In [None]:
# Collect the cross-validation results into a frame (one row per parameter
# combination) and preview the first rows.
res = pd.DataFrame(g.cv_results_)
res.head(3)

In [None]:
# List the available result columns.
res.columns

In [None]:
# Columns worth comparing across parameter combinations.
col_names = [
    'mean_test_score',
    'mean_train_score',
    'mean_fit_time',
    'param_alpha',
    'param_fit_intercept',
    'param_max_iter',
]

# Ten best combinations, highest mean validation score first.
ranked = res.sort_values('mean_test_score', ascending=False)
ranked[col_names].head(10)

In [None]:
# Parameter combination selected by the grid search.
g.best_params_

In [None]:
# Evaluate the estimator selected by the grid search on both splits.
pr_model_best = g.best_estimator_

y_pred_train = pr_model_best.predict(X_train_fe3)
y_pred = pr_model_best.predict(X_test_fe3)

# Displayed as (test error, train error).
(rmslr(y_test, y_pred), rmslr(y_train, y_pred_train))