In [55]:
import numpy as np
import pandas as pd

In [56]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [57]:
# Load the CSV that feeds every spend model fitted in this notebook.
filepath = "../data/Final Categorization Data.csv"
df = pd.read_csv(filepath)

# Drop the extra binned columns that were built for the categorization models.
# They sit at column positions 82..99; their labels are passed to drop()
# explicitly rather than handing it a DataFrame slice and relying on DataFrame
# iteration yielding column names.
binned_columns = df.columns[82:100]
df = df.drop(columns=binned_columns)

# Combined origin/destination feature: the two columns are joined with `+`
# and no separator (presumably both hold string labels -- TODO confirm).
df['Origin Destination New Region'] = df['Origin Province'] + df['New Regions']

# One-hot encode the remaining categorical variables. The combined region
# column is encoded in a second pass on the already-encoded frame.
one_hot_columns = ["Main Transportation Method", "Quarter", "Main Reason" ]
df2 = pd.get_dummies(df, columns = one_hot_columns)
df2 = pd.get_dummies(df2, columns = ["Origin Destination New Region"])

In [58]:
def _labels_containing(fragment):
    """List the df2 column labels in which `fragment` occurs as a substring."""
    return [label for label in df2.columns if fragment in label]


# Column labels per categorical feature. Matching is by substring, so any df2
# column whose label contains the text is selected, not only the dummy columns.
transport_cols = _labels_containing('Main Transportation Method')
quarter_cols = _labels_containing('Quarter')
mainreason_cols = _labels_containing('Main Reason')
Origin_Destination_New_Region_cols = _labels_containing('Origin Destination New Region')

# Independent-variable blocks: each one is a DataFrame cut out of df2.
num_features = df2[["DURATION", "GNCQ06A", "GNCQ06B"]]
transport = df2[transport_cols]
quarter = df2[quarter_cols]
main_reason = df2[mainreason_cols]
ori_dest_newregion = df2[Origin_Destination_New_Region_cols]
# Despite the `_cols` suffix this is a DataFrame, not a list of labels. The
# labels below are looked up verbatim in df2, so their spelling is left as is.
high_correlation_activities_cols = df2[[
    'Visit Friends & Family',
    "Restaurant/Bar/Club",
    "Shopping",
    "Sightseeing",
    "Museum/Art Gallery",
    "Historic site",
    "Zoo/Aquarium",
    "Wildife Viewing/Bird Watching",
    "National/Provincial/Nature Park",
]]

# Dependent variables: single-column DataFrames, one per spend model.
total_spend = df2[["SPD_DTOT"]]
transport_spend = df2[["Total Spending for transport"]]
foodbev_spend = df2[["Total Spending for food/bevs"]]
activities_spend = df2[["Total spending for activities/entertainment"]]
shopping_spend = df2[["Total spending on shopping"]]
accomodation_spend = df2[["SPD_D06"]]

In [59]:
# Total-spend model inputs: the numeric features plus the origin/destination
# region, transport, quarter, main-reason and high-correlation activity blocks.
x_list = [num_features, ori_dest_newregion, transport, quarter, main_reason, high_correlation_activities_cols]
# x3_list = [num_features, ori_dest_newregion, transport, quarter, main_reason, all_activities_cols]

# Place the blocks side by side, aligned on the row index. A column-wise concat
# yields the same rows and column order as transposing every block, stacking
# them and transposing back, but without copying the data twice and without
# collapsing the mixed column dtypes into a single common dtype.
x = pd.concat(x_list, axis=1)

# Dependent variable for each cost model (single-column DataFrames).
y = total_spend
yt = transport_spend
yfb = foodbev_spend
yat = activities_spend
ys = shopping_spend
yacc = accomodation_spend

In [60]:
# Transport-spend model: pair the shared feature matrix with its target so that
# rows lacking a target value are removed from both at the same time.
dataframes_list = [x, yt]
# Column-wise concat aligns on the row index and keeps each column's dtype
# (a transpose round-trip would force every column to one common dtype).
merged_dataframe = pd.concat(dataframes_list, axis=1).dropna(
    subset=['Total Spending for transport']
)
yt = merged_dataframe.loc[:, ["Total Spending for transport"]]
xt = merged_dataframe.loc[:, merged_dataframe.columns != "Total Spending for transport"]
# Kept as the cell's last expression so the notebook displays it: True means a
# NaN is still present, which can only be in a feature column at this point.
merged_dataframe.isnull().any().any()

In [61]:
# Food/beverage-spend model: pair the shared feature matrix with its target so
# that rows lacking a target value are removed from both at the same time.
dataframes_list = [x, yfb]
# Column-wise concat aligns on the row index and keeps each column's dtype
# (a transpose round-trip would force every column to one common dtype).
merged_dataframe = pd.concat(dataframes_list, axis=1).dropna(
    subset=['Total Spending for food/bevs']
)
yfb = merged_dataframe.loc[:, ["Total Spending for food/bevs"]]
xfb = merged_dataframe.loc[:, merged_dataframe.columns != "Total Spending for food/bevs"]
# Kept as the cell's last expression so the notebook displays it: True means a
# NaN is still present, which can only be in a feature column at this point.
merged_dataframe.isnull().any().any()

In [62]:
# Activities/entertainment-spend model: pair the shared feature matrix with its
# target so that rows lacking a target value are removed from both together.
dataframes_list = [x, yat]
# Column-wise concat aligns on the row index and keeps each column's dtype
# (a transpose round-trip would force every column to one common dtype).
merged_dataframe = pd.concat(dataframes_list, axis=1).dropna(
    subset=['Total spending for activities/entertainment']
)
yat = merged_dataframe.loc[:, ["Total spending for activities/entertainment"]]
xat = merged_dataframe.loc[:, merged_dataframe.columns != "Total spending for activities/entertainment"]
# Kept as the cell's last expression so the notebook displays it: True means a
# NaN is still present, which can only be in a feature column at this point.
merged_dataframe.isnull().any().any()

In [63]:
# Shopping-spend model: pair the shared feature matrix with its target so that
# rows lacking a target value are removed from both at the same time.
dataframes_list = [x, ys]
# Column-wise concat aligns on the row index and keeps each column's dtype
# (a transpose round-trip would force every column to one common dtype).
merged_dataframe = pd.concat(dataframes_list, axis=1).dropna(
    subset=['Total spending on shopping']
)
ys = merged_dataframe.loc[:, ["Total spending on shopping"]]
xs = merged_dataframe.loc[:, merged_dataframe.columns != "Total spending on shopping"]
# Kept as the cell's last expression so the notebook displays it: True means a
# NaN is still present, which can only be in a feature column at this point.
merged_dataframe.isnull().any().any()

In [64]:
# SPD_D06 model (bound to the accommodation-spend names): pair the shared
# feature matrix with its target so that rows lacking a target value are
# removed from both at the same time.
dataframes_list = [x, yacc]
# Column-wise concat aligns on the row index and keeps each column's dtype
# (a transpose round-trip would force every column to one common dtype).
merged_dataframe = pd.concat(dataframes_list, axis=1).dropna(
    subset=['SPD_D06']
)
yacc = merged_dataframe.loc[:, ["SPD_D06"]]
xacc = merged_dataframe.loc[:, merged_dataframe.columns != "SPD_D06"]
# Kept as the cell's last expression so the notebook displays it: True means a
# NaN is still present, which can only be in a feature column at this point.
merged_dataframe.isnull().any().any()

*** MODELS BELOW HERE ***

In [65]:
# Hold-out variant, kept for reference (not executed):
# X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# model = LinearRegression().fit(X_train, y_train)
# y_pred = model.predict(X_test)

# Each estimator is fitted on every available row of its own (features, target)
# pair; no rows are held back for evaluation in this cell.
training_pairs = [
    (x, y),        # total spend
    (xt, yt),      # transport
    (xfb, yfb),    # food / beverages
    (xat, yat),    # activities / entertainment
    (xs, ys),      # shopping
    (xacc, yacc),  # SPD_D06
]
model, modelt, modelfb, modelat, models, modelacc = [
    LinearRegression().fit(features, target) for features, target in training_pairs
]


In [66]:
# from sklearn.metrics import mean_squared_error, mean_absolute_error

# X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
# model1 = LinearRegression(positive = True, fit_intercept=False).fit(X_train, y_train)

# y_pred = model1.predict(X_test)
# mse_model = mean_squared_error(y_test, y_pred)
# mae_model = mean_absolute_error(y_test, y_pred)
# print(f"Model - MSE: {mse_model} MAE: {mae_model}")

In [67]:
# X_train, X_test, y_train, y_test = train_test_split(xt, yt, test_size=0.2, random_state=42)
# model1 = LinearRegression(positive = True, fit_intercept=False).fit(X_train, y_train)

# y_pred = model1.predict(X_test)
# mse_model = mean_squared_error(y_test, y_pred)
# mae_model = mean_absolute_error(y_test, y_pred)
# print(f"Model - MSE: {mse_model} MAE: {mae_model}")

In [68]:
# X_train, X_test, y_train, y_test = train_test_split(xfb, yfb, test_size=0.2, random_state=42)
# model1 = LinearRegression(positive = True, fit_intercept=False).fit(X_train, y_train)

# y_pred = model1.predict(X_test)
# mse_model = mean_squared_error(y_test, y_pred)
# mae_model = mean_absolute_error(y_test, y_pred)
# print(f"Model - MSE: {mse_model} MAE: {mae_model}")

In [69]:
# X_train, X_test, y_train, y_test = train_test_split(xat, yat, test_size=0.2, random_state=42)
# model1 = LinearRegression(positive = True, fit_intercept=False).fit(X_train, y_train)

# y_pred = model1.predict(X_test)
# mse_model = mean_squared_error(y_test, y_pred)
# mae_model = mean_absolute_error(y_test, y_pred)
# print(f"Model - MSE: {mse_model} MAE: {mae_model}")

In [70]:
# X_train, X_test, y_train, y_test = train_test_split(xs, ys, test_size=0.2, random_state=42)
# model1 = LinearRegression(positive = True, fit_intercept=False).fit(X_train, y_train)

# y_pred = model1.predict(X_test)
# mse_model = mean_squared_error(y_test, y_pred)
# mae_model = mean_absolute_error(y_test, y_pred)
# print(f"Model - MSE: {mse_model} MAE: {mae_model}")

In [71]:
# X_train, X_test, y_train, y_test = train_test_split(xacc, yacc, test_size=0.2, random_state=42)
# model1 = LinearRegression(positive = True, fit_intercept=False).fit(X_train, y_train)

# y_pred = model1.predict(X_test)
# mse_model = mean_squared_error(y_test, y_pred)
# mae_model = mean_absolute_error(y_test, y_pred)
# print(f"Model - MSE: {mse_model} MAE: {mae_model}")

In [72]:
import pickle

# Registry of the fitted estimators; each key doubles as the output file stem.
model_map = dict(
    model=model,
    modelt=modelt,
    modelfb=modelfb,
    modelat=modelat,
    models=models,
    modelacc=modelacc,
)

# Serialize every estimator to ../src/<key>.pkl; 'wb' replaces an existing file.
for key in model_map:
    with open(f'../src/{key}.pkl', 'wb') as handle:
        pickle.dump(model_map[key], handle)