In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import chi2_contingency, chisquare
from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVR

In [None]:
# Read the full extract, then keep a seeded 0.5% random sample (`dfs`)
# that every later cell works on.
df = pd.read_csv('boscun-longitudinal.csv.gz')
dfs = df.sample(random_state=1, frac=0.005)

In [None]:
# Show the dtype pandas assigned to each column of the sampled frame.
dfs.dtypes

In [None]:
# Convert each epoch-millisecond column `<stage>_ms` into a datetime column
# `<stage>_dt` on the sampled frame.
for stage in ('received', 'departure', 'return'):
    dfs[f'{stage}_dt'] = pd.to_datetime(dfs[f'{stage}_ms'], unit='ms')

In [None]:
def convert_date(label, frame=None):
    """Add calendar features derived from the ``{label}_dt`` column, in place.

    New columns written to the frame (all prefixed with ``{label}_``):

    * ``day`` / ``day_name``       - day of week (0 = Monday) and its name
    * ``month`` / ``month_name``   - month number (1-12) and its name
    * ``season`` / ``season_name`` - 1 Winter (Dec-Feb), 2 Spring (Mar-May),
      3 Summer (Jun-Aug), 4 Autumn (Sep-Nov)
    * ``time_window`` / ``time_window_name`` - 4-hour bucket of the hour,
      1 (00-03, 'Late Night') through 6 (20-23, 'Night')
    * ``is_weekend``               - True when day of week is 5 or 6

    Parameters
    ----------
    label : str
        Column prefix; ``frame[f'{label}_dt']`` must be a datetime column.
    frame : pandas.DataFrame, optional
        Frame to extend. Defaults to the notebook-level ``dfs`` so existing
        ``convert_date('departure')`` calls keep working.

    Returns
    -------
    None
        The frame is modified in place.
    """
    if frame is None:
        # Looked up at call time, so the function follows whatever `dfs`
        # currently refers to in the notebook.
        frame = dfs

    stamps = frame[f'{label}_dt'].dt

    frame[f'{label}_day'] = stamps.dayofweek
    frame[f'{label}_day_name'] = stamps.day_name()
    frame[f'{label}_month'] = stamps.month
    frame[f'{label}_month_name'] = stamps.month_name()

    # `% 12` wraps December to 0 so Dec/Jan/Feb share bucket 1.
    frame[f'{label}_season'] = (frame[f'{label}_month'] % 12 + 3) // 3
    season_names = {1: 'Winter', 2: 'Spring', 3: 'Summer', 4: 'Autumn'}
    frame[f'{label}_season_name'] = frame[f'{label}_season'].map(season_names)

    # Hours are already in 0..23, so no modulo is needed: 0-3 -> 1, ..., 20-23 -> 6.
    frame[f'{label}_time_window'] = stamps.hour // 4 + 1
    window_names = {
        1: 'Late Night',
        2: 'Early Morning',
        3: 'Morning',
        4: 'Noon',
        5: 'Evening',
        6: 'Night',
    }
    frame[f'{label}_time_window_name'] = frame[f'{label}_time_window'].map(window_names)

    frame[f'{label}_is_weekend'] = stamps.dayofweek.isin([5, 6])

In [None]:
# Derive the calendar feature columns for each of the three timestamps.
for label in ('departure', 'return', 'received'):
    convert_date(label)

In [None]:
# Distribution of `total_usd`: 50-bin histogram with a KDE overlay.
sns.histplot(
    data=dfs,
    x='total_usd',
    bins=50,
    kde=True,
)

In [None]:
# Distribution of `advance`: 50-bin histogram with a KDE overlay.
sns.histplot(
    data=dfs,
    x='advance',
    bins=50,
    kde=True,
)

In [None]:
# Summary statistics of `total_usd`.
dfs.loc[:, 'total_usd'].describe()

In [None]:
# Same summary, but reporting the 99th percentile to see how far the tail reaches.
dfs.loc[:, 'total_usd'].describe(percentiles=[0.99])

In [None]:
# Summary statistics of `outgoing_duration`.
dfs.loc[:, 'outgoing_duration'].describe()

In [None]:
# Pairwise correlation between `total_usd` and `returning_duration`.
dfs.loc[:, ['total_usd', 'returning_duration']].corr()

In [None]:
# `total_usd` against `advance`, colour and marker keyed by the return day name.
sns.scatterplot(
    data=dfs,
    x="advance",
    y="total_usd",
    hue="return_day_name",
    style="return_day_name",
)

In [None]:
# `total_usd` against `advance`, colour and marker keyed by the return month name.
sns.scatterplot(
    data=dfs,
    x="advance",
    y="total_usd",
    hue="return_month_name",
    style="return_month_name",
)

In [None]:
# `total_usd` against `advance`, colour and marker keyed by the return season name.
sns.scatterplot(
    data=dfs,
    x="advance",
    y="total_usd",
    hue="return_season_name",
    style="return_season_name",
)

In [None]:
# `total_usd` against `advance`, colour and marker keyed by the return weekend flag.
sns.scatterplot(
    data=dfs,
    x="advance",
    y="total_usd",
    hue="return_is_weekend",
    style="return_is_weekend",
)

In [None]:
# `total_usd` against `advance`, colour and marker keyed by the return time-window name.
sns.scatterplot(
    data=dfs,
    x="advance",
    y="total_usd",
    hue="return_time_window_name",
    style="return_time_window_name",
)

In [None]:
# Chi-square test of independence between return season and return month.
# `scipy.stats.chisquare` is a goodness-of-fit test over observed/expected
# *frequency counts*; feeding it the raw per-row category codes does not test
# association. Build the season x month contingency table and test that instead.
# NOTE(review): season is computed from month in `convert_date`, so a very
# strong association is expected by construction.
season_by_month = pd.crosstab(dfs['return_season'], dfs['return_month'])
chi2_contingency(season_by_month)

In [None]:
# Number of rows in the sampled frame.
len(dfs)

In [None]:
# Joint 2-D histogram of `total_usd` (x) and `advance` (y) with marginal histograms.
sns.jointplot(
    data=dfs,
    x='total_usd',
    y='advance',
    kind="hist",
)

In [None]:
# One-hot encode the categorical predictors and append the indicator columns
# to `dfs`. The source columns are kept; `drop_first=True` drops the first
# level of each so the indicators are not perfectly collinear.
# (source column, indicator-column prefix)
dummy_specs = [
    ('departure_month', 'dep_mnt'),
    ('departure_time_window_name', 'dep_win'),
    ('departure_is_weekend', 'dep_we'),
    ('return_month', 'rtn_mnt'),  # was misspelled 'ttn_mnt'; all other return-leg prefixes are 'rtn_'
    ('return_time_window_name', 'rtn_win'),
    ('return_is_weekend', 'rtn_we'),
    ('highest_cabin_class', 'high_class'),
    ('lowest_cabin_class', 'low_class'),
]
dummy_frames = [
    pd.get_dummies(dfs[source_col], prefix=prefix, drop_first=True)
    for source_col, prefix in dummy_specs
]
# A single concat instead of eight chained ones; resulting column order is unchanged.
dfs = pd.concat([dfs, *dummy_frames], axis=1)

In [None]:
# Preview the first rows of the frame after the indicator columns were appended.
dfs.head()

In [None]:
# Feature matrix: everything in `dfs` except identifiers, raw timestamps,
# the target itself and the other columns listed below.
non_feature_columns = [
    'search_id',
    'trip_index',
    'received_date',
    'received_ms',
    'origin',
    'destination',
    'total_usd',
    'pax_type',
    'refundable',
    'validating_carrier',
    'departure_odate',
    'departure_ms',
    'outgoing_duration',
    'return_odate',
    'return_ms',
    'returning_duration',
    'major_carrier_id',
    'includes_saturday_night_stay',
    'lowest_cabin_class',
    'highest_cabin_class',
]
X = dfs.drop(columns=non_feature_columns)

# Regression target.
y = dfs['total_usd']

In [None]:
# Remove `total_stops`, two of the parsed datetimes, and the intermediate
# calendar columns produced by `convert_date` (the `*_is_weekend` flags are kept).
calendar_suffixes = (
    'day', 'day_name',
    'month', 'month_name',
    'season', 'season_name',
    'time_window', 'time_window_name',
)
calendar_columns = [
    f'{stage}_{suffix}'
    for stage in ('departure', 'return', 'received')
    for suffix in calendar_suffixes
]
X = X.drop(columns=['total_stops', 'departure_dt', 'return_dt', *calendar_columns])

In [None]:
# Drop the remaining parsed datetime column.
X = X.drop(columns=['received_dt'])

In [None]:
# Target, plus a reduced three-column feature set taken from X.
y = dfs['total_usd']
X_prime = X.loc[:, ['advance', 'available_seats', 'outgoing_stops']]

In [None]:
# List every remaining feature column, one name per line.
for column_name in X.columns.tolist():
    print(column_name)

In [None]:
# Standardise the listed columns of X (zero mean, unit variance); the other
# columns are carried over unchanged into X_scaled.
target_variables = ['outgoing_stops', 'returning_stops', 'advance', 'length_of_stay', 'available_seats']
scaler_x = StandardScaler()
X_scaled = X.copy()
X_scaled[target_variables] = scaler_x.fit_transform(X[target_variables])

In [None]:
# Standardise the reduced feature set. This rebinds both `scaler_x` and
# `X_scaled`: from here on `X_scaled` is the transformed X_prime only, as
# returned by `StandardScaler.transform`.
# The former `X_scaled = X_prime.copy()` was a dead store (overwritten by the
# transform result below) and has been removed.
scaler_x = StandardScaler().fit(X_prime)
X_scaled = scaler_x.transform(X_prime)

In [None]:
# Display the standardised feature matrix.
X_scaled

In [None]:
# Display the target (`total_usd`) before scaling.
y

In [None]:
# StandardScaler works on 2-D input, so turn the target into a single column
# before standardising it.
y = np.array(y).reshape(-1, 1)
scaler_y = StandardScaler()
y_scaled = scaler_y.fit_transform(y)

In [None]:
# First five standardised target values.
y_scaled[:5]

In [None]:
# Hold out 30% of the rows for testing (seeded for reproducibility).
# Split the *unscaled* data first and fit the scalers on the training rows
# only: scalers fitted on every row let test-set means/variances leak into
# training. The same seed and row count give the same row partition as before;
# only the scaling statistics change.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X_prime, y, test_size=0.3, random_state=100
)

# `y` is the (n, 1) column array built earlier, which is the 2-D shape
# StandardScaler requires.
scaler_x = StandardScaler().fit(X_train_raw)
scaler_y = StandardScaler().fit(y_train_raw)

X_train = scaler_x.transform(X_train_raw)
X_test = scaler_x.transform(X_test_raw)
y_train = scaler_y.transform(y_train_raw)
y_test = scaler_y.transform(y_test_raw)

In [None]:
# Candidate regressors: an RBF-kernel SVR and two small ReLU/Adam MLPs
# (one and two hidden layers of 10 units).
# MLP weight initialisation and batch shuffling are random; pin `random_state`
# so a Restart & Run All reproduces the same fits.
svr = SVR(kernel='rbf', C=1.0, epsilon=0.1)
mlp_1 = MLPRegressor(hidden_layer_sizes=(10,), activation='relu', solver='adam', random_state=100)
mlp_2 = MLPRegressor(hidden_layer_sizes=(10, 10, ), activation='relu', solver='adam', random_state=100)

In [None]:
# Fit all three regressors on the training split.
# `y_train` is an (n, 1) column; these estimators expect a 1-D target and emit
# a DataConversionWarning otherwise, so flatten it once up front.
y_train_1d = y_train.ravel()
model_svr = svr.fit(X_train, y_train_1d)
model_mlp_1 = mlp_1.fit(X_train, y_train_1d)
model_mlp_2 = mlp_2.fit(X_train, y_train_1d)

In [None]:
# Predict the (standardised) target on the held-out split with each fitted model.
y_pred_svr, y_pred_mlp_1, y_pred_mlp_2 = (
    fitted.predict(X_test)
    for fitted in (model_svr, model_mlp_1, model_mlp_2)
)

In [None]:
# Display the two-hidden-layer MLP's predictions on the test split.
y_pred_mlp_2

In [None]:
# Display the test targets as a 1-D array (first and only column of y_test).
y_test[:,0]

In [None]:
# Test targets (x) against mlp_2 predictions (y).
sns.scatterplot(
    x=y_test[:, 0],
    y=y_pred_mlp_2,
)

In [None]:
# Replace mlp_2 with a larger network: three hidden layers of 100 units.
# `random_state` is pinned so the refit is reproducible across runs.
mlp_2 = MLPRegressor(hidden_layer_sizes=(100, 100, 100,), activation='relu', solver='adam', random_state=100)

In [None]:
# Fit the current mlp_2 and predict on the test split.
# `y_train` is an (n, 1) column; flatten it to the 1-D target the estimator
# expects to avoid a DataConversionWarning.
model_mlp_2 = mlp_2.fit(X_train, y_train.ravel())
y_pred_mlp_2 = model_mlp_2.predict(X_test)

In [None]:
# Test targets (x) against the refitted mlp_2 predictions (y).
sns.scatterplot(
    x=y_test[:, 0],
    y=y_pred_mlp_2,
)

In [None]:
# Coefficient of determination of the mlp_2 predictions on the test split.
r2 = r2_score(y_true=y_test[:, 0], y_pred=y_pred_mlp_2)
r2