# Multinomial model
This model takes details about the time and date and predicts the number of spots available at a location that has multiple chargers (either a cluster or a site).

The model might take the form of

$ \text{num\_spots} = \text{linear combination of } (\text{hour},\ \text{month},\ \text{is\_holiday},\ \text{day\_of\_week}) $

In [None]:
import os
# Move the working directory up one level so that the relative data paths and
# the `src` package import used in later cells resolve from there
# (presumably the repository root — confirm).
# NOTE: this is relative to the current directory, so re-running the cell
# moves up one more level each time.
os.chdir('..')

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from src.data_preprocessing import datetime_processing, userinput_processing, holiday_processing, create_x

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score, precision_score, recall_score, accuracy_score


In [None]:
def get_results(y_test, prediction):
    """Plot the confusion matrix and return summary classification metrics.

    Parameters
    ----------
    y_test : array-like
        True labels.
    prediction : array-like
        Predicted labels, in the same order as ``y_test``.

    Returns
    -------
    dict
        Keys ``'tpr'``, ``'fpr'``, ``'accuracy'``, ``'precision'``,
        ``'recall'`` and ``'f1'``. Precision, recall and F1 are
        support-weighted averages over all classes.

        ``'tpr'`` is ``cm[1, 1] / sum(cm[1])`` and ``'fpr'`` is
        ``cm[0, 1] / sum(cm[0])``, i.e. they only look at the first two
        classes of the confusion matrix. They are true/false positive rates
        in the usual sense only for a two-class problem. Either value is
        ``nan`` when the matrix has fewer than two classes or the row it is
        computed from is empty.
    """
    cm = confusion_matrix(y_test, prediction)
    make_classification_plot(cm)

    def _row_rate(row, col):
        # A single-class problem yields a 1x1 matrix, where indexing row or
        # column 1 would raise IndexError.
        if cm.shape[0] <= max(row, col):
            return np.nan
        row_total = np.sum(cm[row])
        # A class that is only ever predicted (never true) has an empty row;
        # avoid the 0/0 division and its RuntimeWarning.
        if row_total == 0:
            return np.nan
        return cm[row, col] / row_total

    results = {
        'tpr': _row_rate(1, 1),
        'fpr': _row_rate(0, 1),
        'accuracy': accuracy_score(y_test, prediction),
        'precision': precision_score(y_test, prediction, average='weighted'),
        'recall': recall_score(y_test, prediction, average='weighted'),
        'f1': f1_score(y_test, prediction, average='weighted'),
    }
    return results

def make_classification_plot(cm):
    """Draw the confusion matrix ``cm`` as a heat map and show the figure.

    Parameters
    ----------
    cm : array-like
        Confusion matrix, passed to ``ConfusionMatrixDisplay``.
    """
    ConfusionMatrixDisplay(cm).plot(
        include_values=True,
        cmap='viridis',
        ax=None,
        xticks_rotation='horizontal',
    )
    # Switch grid lines off before showing the figure.
    plt.grid(False)
    plt.show()

# 1. Make data, X, y

In [None]:
# Load the office charging sessions and run the project's preprocessing
# helpers over them, in this fixed order.
df_of = pd.read_parquet('data/ACN-API/office001/').reset_index(drop=True)
for preprocess in (datetime_processing, userinput_processing, holiday_processing):
    df_of = preprocess(df_of)
df_of.head()

In [None]:
# Build `y`: an hourly table with one column per charging space, holding
# 1 while the space is free and 0 for the hours covered by a session.
df = df_of  # alias, not a copy; `df` is reused by a later cell
start_date = '2019-03-25'
end_date = '2021-09-12'

# Work on a copy restricted to the study window, ordered by connection time.
tmp = df.copy()
tmp.set_index('connectionTime', inplace=True)
tmp = tmp.sort_index().loc[start_date:end_date, :]

space_cols = list(tmp.spaceID.unique().astype('str'))

# Every space starts out available for every hour of the window.
y = pd.DataFrame(
    index=pd.date_range(start_date, end_date, inclusive='both', freq='h', tz=0),
    columns=space_cols,
)
y[space_cols] = 1

# Go back to a plain positional index so each session is visited exactly once,
# even when several sessions share the same connection time.
tmp.reset_index(inplace=True)

sessions = zip(
    tmp.index,
    tmp['connectionTime'],
    tmp['disconnectTime'],
    tmp['sessionID'],
    tmp['spaceID'],
)
for i, start_, end_, session_, space_ in sessions:
    try:
        # Label-based slice: only hourly stamps lying inside
        # [connectionTime, disconnectTime] are marked as occupied.
        y.loc[start_:end_, space_] = 0
    except Exception as err:
        # Best effort: report the offending session and carry on. Catching
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupts working.
        print('bad value:')
        print(i, '\t', start_, '\t', end_, '\t', session_, '\t', space_)
        print('reason:', repr(err))


In [None]:
# Display the availability table (rows: hourly timestamps, columns: space IDs).
y

In [None]:
# tmp = df_of.copy()
# tmp.set_index('connectionTime', inplace=True)
# start_date = '2019-03-25'; end_date = '2021-09-12'
# tmp = tmp.sort_index().loc[start_date:end_date,:]
# 
# space_cols = tmp.spaceID.unique()
# space_cols = (list(space_cols.astype('str')))
# 
# y = pd.DataFrame(index=pd.date_range(start_date,end_date, inclusive='both', freq='h', tz=0),columns=space_cols)
# y[space_cols] = 1
# 
# print('there is a problem here. Some charging sessions have the EXACT same start time (see index 335)', tmp.iloc[335:337, 0:2])
# print('the solution is to go through the dataframe by index number, using iloc, instead of by connection time using .loc')
# disconnect_col = np.where(tmp.columns == 'disconnectTime')[0][0]
# session_col = np.where(tmp.columns == 'sessionID')[0][0]
# space_col = np.where(tmp.columns == 'spaceID')[0][0]
# 
# print(f'so we use the column number instead of the column name, hence why we disconnect is {disconnect_col}, session is {session_col}, and space is {space_col}')
# 
# for i in range(len(tmp)):
#     start_ = tmp.index[i]
#     end_ = tmp.iloc[i, disconnect_col] 
#     session_ = tmp.iloc[i, session_col]
#     space_ = tmp.iloc[i, space_col]
#     # print(start_,'\t', end_,'\t', session_, '\t', space_)
#     try:
#         y.loc[start_:end_,space_] = 0
#     except:
#         print('bad value:')
#         print(i, '\t', start_,'\t', end_,'\t', session_, '\t', space_)

In [None]:
# Row-wise sum = number of spaces free in each hour across all columns of `y`.
available_per_hour = y.sum(axis=1)
available_per_hour.plot()
plt.title('available spots over time at the office')

In [None]:
# Feature table on an hourly timestamp index: day of week, hour and month
# are read straight off the index.
hourly_index = pd.date_range('2019-03-25', '2021-09-12', inclusive='both', freq='h', tz=0)
X = pd.DataFrame(index=hourly_index, columns=['dow', 'hour', 'month']).assign(
    dow=hourly_index.dayofweek,
    hour=hourly_index.hour,
    month=hourly_index.month,
    # Temporary column handed to holiday_processing and dropped right after
    # (presumably the helper reads the timestamp from it — confirm).
    connectionTime=hourly_index,
)
X = holiday_processing(X)
X = X.drop(columns=['connectionTime'])
X.head()

# 1.2 split data

In [None]:
# Presumably the date meant to separate training data from test data.
# NOTE(review): nothing visible in this notebook reads this value; the split
# is done with train_test_split instead — confirm whether a chronological
# split at this date was intended.
train_test_cutoff_date = '2021-06-01'

In [None]:
# Create hold out test set: a random 80/20 split of the hourly rows.
# The target is the number of free spaces per hour (row-wise sum of `y`).
# random_state is fixed so the split, and every metric or saved model derived
# from it, is reproducible from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X, y.sum(axis=1), test_size=.2, random_state=42
)
print(f'the training data has an average availability of {np.round(y_train.mean(),3)} spots available')

In [None]:
# One-hot encode every feature column. The encoder is fitted on the training
# rows only and then re-used, unchanged, on the test rows.
ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first')
X_train_ohe = ohe.fit_transform(X_train)
X_test_ohe = ohe.transform(X_test)

# 4. Model

In [None]:
# Display the held-out target values.
y_test

In [None]:
from sklearn.pipeline import Pipeline

# One-hot encoding and the classifier bundled into a single estimator, so the
# raw feature table can be passed straight to fit/predict.
# NOTE(review): recent scikit-learn releases deprecate `multi_class` —
# confirm against the version this project pins.
encoder_step = OneHotEncoder(sparse_output=False, handle_unknown='ignore', drop='first')
classifier = LogisticRegression(C=1e5, solver='newton-cg', multi_class='multinomial')
pipe = Pipeline(steps=[('ohe', encoder_step), ('lr', classifier)])
pipe.fit(X_train, y_train)

# Keep the test timestamps as the index of the predictions.
test_predictions = pipe.predict(X_test)
pred_labels = pd.Series(test_predictions, index=X_test.index)

In [None]:
# Plot the predicted labels ordered by their timestamp index.
pred_labels.sort_index().plot()

In [None]:
# Show the confusion matrix and the summary metrics for the current predictions.
get_results(y_test, pred_labels)

In [None]:
# save basic model
import pickle
# The path is relative to the current working directory (one level above it).
# The context manager guarantees the file is flushed and closed once the dump
# finishes, instead of relying on garbage collection of the open handle.
with open('../model.pkl', 'wb') as model_file:
    pickle.dump(pipe, model_file)
# Show the working directory the relative path above was resolved against.
os.getcwd()

In [None]:
# Reload the pickled pipeline and predict for a short date range.
# NOTE: unpickling can execute arbitrary code; only load files you created.
with open('../model.pkl', 'rb') as model_file:
    model = pickle.load(model_file)
start_date = '2021-01-04'
end_date = '2021-01-06'
from src.data_preprocessing import create_x
# NOTE(review): this rebinds the notebook-level `X`, `start_date` and
# `end_date`; cells run after this one see these values, not the earlier ones.
X = create_x(df, start_date=start_date, end_date=end_date)
print(model.predict(X))

# Display the in-memory pipeline.
pipe

In [None]:
# IPython help lookup for precision_score (notebook-only syntax, not plain Python).
?precision_score

In [None]:
# Multinomial logistic regression with no explicit C or penalty, fitted
# directly on the pre-encoded feature arrays.
classifier = LogisticRegression(multi_class='multinomial', max_iter=10000)
classifier.fit(X_train_ohe, y_train)
test_predictions = classifier.predict(X_test_ohe)
pred_labels = pd.Series(test_predictions, index=X_test.index)
pred_labels.sort_index().plot()

In [None]:
# L1-penalised multinomial logistic regression on the pre-encoded features.
C = 10
classifier = LogisticRegression(
    C=C,
    penalty='l1',
    solver='saga',
    multi_class='multinomial',
    max_iter=10000,
).fit(X_train_ohe, y_train)

# Predictions keep the test timestamps as index and are plotted in time order.
test_predictions = classifier.predict(X_test_ohe)
pred_labels = pd.Series(test_predictions, index=X_test.index)
pred_labels.sort_index().plot()


In [None]:
# Predictions as markers over the observed test values as a dashed line,
# both ordered by timestamp.
predicted_in_order = pred_labels.sort_index()
observed_in_order = y_test.sort_index()
plt.plot(predicted_in_order, 'o')
plt.plot(observed_in_order, '--')

# For fun, linear regression
It doesn't work well because the predictions are wrong in two ways:
1. They are not discrete: it chooses values like 5.5 spaces available.
2. They can be too high: it chooses values like more than 8 spaces, which is impossible.

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the same one-hot encoded features, for comparison
# with the classifiers.
lm = LinearRegression().fit(X_train_ohe, y_train)
linear_predictions = lm.predict(X_test_ohe)
pred = pd.Series(linear_predictions, index=X_test.index)


In [None]:
# Regression output as markers over the observed test values as a dashed
# line, both ordered by timestamp.
regression_in_order = pred.sort_index()
observed_in_order = y_test.sort_index()
plt.plot(regression_in_order, 'o')
plt.plot(observed_in_order, '--')

In [None]:
# consider interaction terms
# https://stackoverflow.com/questions/45828964/how-to-add-interaction-term-in-python-sklearn
# PolynomialFeatures is not imported anywhere else in the notebook; without
# this import the cell fails with NameError.
from sklearn.preprocessing import PolynomialFeatures

# interaction_only=True keeps products of distinct features and skips pure
# powers; include_bias=False leaves out the constant column.
poly = PolynomialFeatures(interaction_only=True, include_bias=False)
poly.fit_transform(X)