# Import bibliotek i danych

In [1]:
import pandas as pd
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor 
import statsmodels.api as sm
from sklearn import metrics
from sklearn.metrics import *
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from scipy import stats
import warnings
import numpy as np
import pickle
import typing
warnings.filterwarnings("ignore")
SEED = 17

# Ładowanie danych

In [2]:
data=pd.read_csv("../data/data_eda.csv")
data=data.drop(columns=['Unnamed: 0'])
features=data.columns.tolist()
features.remove('stars')
target='stars'
data

Unnamed: 0,pages,stars,reviews,series,mix,character,plot,funny,lighthearted,emotional,...,author_stars,Fiction,Nonfiction,Literary,Fantasy,Crime,Social,Children,Romans,Realism
0,273,4.00,2017,0,0.44,0.51,0.02,0.27,0.37,0.91,...,4.305000,1,1,0,0,0,1,0,1,1
1,302,3.78,7330,0,0.39,0.42,0.17,0.03,0.01,0.18,...,3.670000,1,0,0,0,1,0,0,0,0
2,400,4.15,16761,0,0.51,0.39,0.08,0.02,0.01,0.88,...,0.000000,1,0,1,0,0,0,0,0,0
3,459,4.16,2128,1,0.48,0.10,0.40,0.04,0.02,0.07,...,0.000000,1,0,0,1,0,0,0,0,0
4,160,3.65,6634,1,0.28,0.16,0.54,0.92,0.73,0.00,...,4.115000,1,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6925,432,4.15,30643,0,0.48,0.05,0.46,0.00,0.00,0.40,...,3.856667,1,0,0,1,1,0,0,0,0
6926,352,3.62,1058,0,0.55,0.13,0.30,0.15,0.10,0.25,...,3.700000,1,0,0,1,0,1,0,0,0
6927,535,3.88,30975,1,0.45,0.08,0.45,0.14,0.19,0.31,...,3.870000,1,0,0,1,0,0,1,0,0
6928,472,3.88,5914,1,0.64,0.12,0.22,0.07,0.00,0.36,...,3.660000,1,0,0,1,0,0,1,0,0


$\text{Podział danych na zbiór treningowy i testowy}$

In [3]:
train_data, test_data = train_test_split(data, test_size=0.2, random_state=SEED)

In [4]:
def perform_cv(X: pd.DataFrame, y: pd.Series, algorithm: typing.Any, cv: typing.Any = KFold(n_splits=5, shuffle=True, random_state=SEED), metric: typing.Any = mean_squared_error) -> typing.List[float]:
    """
    Perform cross-validation and return list of scores
    
    Args:
        X (pd.DataFrame): input data
        y (pd.Series): target data
        algorithm (typing.Any): algorithm to use for training and prediction
        cv (typing.Any): cross-validation strategy
        metric (typing.Any): metric to use for evaluation
    
    Returns:
        typing.List[float]: list of scores in order: train_scores, validation_scores
    """
    train_scores = []
    validation_scores = []
    for train_idx, val_idx in cv.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        algorithm.fit(X_train, y_train)
        y_train_pred = algorithm.predict(X_train)
        y_val_pred = algorithm.predict(X_val)
        train_scores.append(metric(y_train, y_train_pred, squared=False))
        validation_scores.append(metric(y_val, y_val_pred, squared=False))
    return np.mean(train_scores), np.mean(validation_scores)

def evaluation(X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame, y_test: pd.Series, algorithm: typing.Any, metric: typing.Any = mean_squared_error) -> typing.Tuple[float, float, np.ndarray]:
    """
    Train the algorithm on the train data and evaluate on the train and test data
    
    Args:
        X_train (pd.DataFrame): input train data
        y_train (pd.Series): target train data
        X_test (pd.DataFrame): input test data
        y_test (pd.Series): target test data
        algorithm (typing.Any): algorithm to use for training and prediction
        metric (typing.Any): metric to use for evaluation
    
    Returns:
        typing.Tuple[float, float, np.ndarray]: train_score, test_score, predictions on test data
    """
    algorithm.fit(X_train, y_train)
    y_train_pred = algorithm.predict(X_train)
    y_test_pred = algorithm.predict(X_test)
    train_results = metric(y_train, y_train_pred, squared=False)
    test_results = metric(y_test, y_test_pred, squared=False)
    return train_results, test_results, y_test_pred

### Najbardziej podstawowy model (bez feature engineeringu):

In [5]:
wzor = 'stars~' + '+'.join(features)
mod = smf.ols(formula = wzor, data = train_data)
res = mod.fit()
res.summary()

0,1,2,3
Dep. Variable:,stars,R-squared:,0.464
Model:,OLS,Adj. R-squared:,0.461
Method:,Least Squares,F-statistic:,153.9
Date:,"Sat, 11 May 2024",Prob (F-statistic):,0.0
Time:,19:38:29,Log-Likelihood:,512.46
No. Observations:,5544,AIC:,-960.9
Df Residuals:,5512,BIC:,-749.1
Df Model:,31,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.0217,0.036,84.278,0.000,2.951,3.092
pages,-2.24e-05,2.22e-05,-1.008,0.314,-6.6e-05,2.12e-05
reviews,1.023e-06,1.23e-07,8.348,0.000,7.83e-07,1.26e-06
series,0.0708,0.008,8.948,0.000,0.055,0.086
mix,0.0731,0.033,2.238,0.025,0.009,0.137
character,-0.0894,0.027,-3.357,0.001,-0.142,-0.037
plot,-0.1755,0.038,-4.609,0.000,-0.250,-0.101
funny,0.4076,0.021,19.274,0.000,0.366,0.449
lighthearted,-0.0356,0.036,-0.987,0.324,-0.106,0.035

0,1,2,3
Omnibus:,206.478,Durbin-Watson:,2.014
Prob(Omnibus):,0.0,Jarque-Bera (JB):,306.045
Skew:,-0.36,Prob(JB):,3.49e-67
Kurtosis:,3.898,Cond. No.,768000.0


In [6]:
model = LinearRegression(fit_intercept=True)
train_scores, validation_scores = perform_cv(train_data[features], train_data[target], model)
print("Train RMSE:", train_scores)
print("Validation RMSE:", validation_scores)

Train RMSE: 0.2204369364743511
Validation RMSE: 0.2220773168372384


## Model z interakcjami

In [7]:
data_interactions=pd.read_csv("../data/data_add.csv")
data_interactions=data_interactions.drop(columns=['Unnamed: 0'])
features_interactions=data_interactions.columns.tolist()
features_interactions.remove('stars')
train_data_interactions, test_data_interactions = train_test_split(data_interactions, test_size=0.2, random_state=SEED)
wzor = 'stars~' + '+'.join(features_interactions)
mod = smf.ols(formula = wzor, data = train_data_interactions)
res = mod.fit()
res.summary()

0,1,2,3
Dep. Variable:,stars,R-squared:,0.509
Model:,OLS,Adj. R-squared:,0.503
Method:,Least Squares,F-statistic:,90.01
Date:,"Sat, 11 May 2024",Prob (F-statistic):,0.0
Time:,19:38:29,Log-Likelihood:,753.11
No. Observations:,5544,AIC:,-1378.0
Df Residuals:,5480,BIC:,-954.5
Df Model:,63,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.0548,0.056,54.741,0.000,2.945,3.164
index_0,-1.187e-05,1.46e-06,-8.137,0.000,-1.47e-05,-9.01e-06
pages,-1.542e-05,2.17e-05,-0.710,0.478,-5.8e-05,2.71e-05
reviews,7.376e-07,1.2e-07,6.162,0.000,5.03e-07,9.72e-07
series,0.0021,0.022,0.094,0.925,-0.042,0.046
mix,0.0449,0.022,2.018,0.044,0.001,0.088
character,-0.0488,0.013,-3.774,0.000,-0.074,-0.023
plot,-0.0608,0.020,-3.112,0.002,-0.099,-0.023
funny,0.4203,0.021,19.939,0.000,0.379,0.462

0,1,2,3
Omnibus:,201.113,Durbin-Watson:,2.006
Prob(Omnibus):,0.0,Jarque-Bera (JB):,319.413
Skew:,-0.332,Prob(JB):,4.37e-70
Kurtosis:,3.971,Cond. No.,1.16e+16


In [8]:
model = LinearRegression(fit_intercept=True)
train_scores, validation_scores = perform_cv(train_data_interactions[features_interactions], train_data_interactions[target], model)
print("Train RMSE:", train_scores)
print("Validation RMSE:", validation_scores)

Train RMSE: 0.21091275251763042
Validation RMSE: 0.21410239828253755


## Model z transformacjami zmiennych

In [9]:
data_transformations=pd.read_csv("../data/data_fe.csv")
data_transformations=data_transformations.drop(columns=['Unnamed: 0'])
features_transformations=data_transformations.columns.tolist()
features_transformations.remove('stars')
train_data_transformations, test_data_transformations = train_test_split(data_transformations, test_size=0.2, random_state=SEED)
wzor = 'stars~' + '+'.join(features_transformations)
mod = smf.ols(formula = wzor, data = train_data_transformations)
res = mod.fit()
res.summary()

0,1,2,3
Dep. Variable:,stars,R-squared:,0.412
Model:,OLS,Adj. R-squared:,0.408
Method:,Least Squares,F-statistic:,98.76
Date:,"Sat, 11 May 2024",Prob (F-statistic):,0.0
Time:,19:38:29,Log-Likelihood:,254.48
No. Observations:,5544,AIC:,-429.0
Df Residuals:,5504,BIC:,-164.1
Df Model:,39,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.5155,0.116,21.716,0.000,2.288,2.743
Crime,0.0159,0.013,1.268,0.205,-0.009,0.040
Romans,-0.1002,0.010,-10.071,0.000,-0.120,-0.081
sad_mm,0.2819,0.025,11.277,0.000,0.233,0.331
reviews_boxcox,-0.3657,0.037,-9.832,0.000,-0.439,-0.293
reflective_boxcox,0.0244,0.007,3.341,0.001,0.010,0.039
mysterious_rs,0.0424,0.012,3.654,0.000,0.020,0.065
inspiring_log,0.0111,0.004,2.914,0.004,0.004,0.019
informative_boxcox,0.0024,0.001,1.860,0.063,-0.000,0.005

0,1,2,3
Omnibus:,116.895,Durbin-Watson:,2.017
Prob(Omnibus):,0.0,Jarque-Bera (JB):,164.332
Skew:,-0.245,Prob(JB):,2.07e-36
Kurtosis:,3.687,Cond. No.,402.0


In [10]:
model = LinearRegression(fit_intercept=True)
train_scores, validation_scores = perform_cv(train_data_transformations[features_transformations], train_data_transformations[target], model)
print("Train RMSE:", train_scores)
print("Validation RMSE:", validation_scores)

Train RMSE: 0.23086218185156407
Validation RMSE: 0.23336460790373134


Najlepszy wynik dla modelu z interkacjami bez transformacji, ale nadal dużo zmiennych wyszło jako nieistotne statystyczne, nawet te o dużej wartości MI, dlatego przeprowadzimy analizę VIF współliniowości

In [11]:
# the independent variables set 
X = train_data_interactions[features_interactions]
# VIF dataframe 
vif_data = pd.DataFrame() 
vif_data["feature"] = X.columns 
# calculating VIF for each feature 
vif_data["VIF"] = [variance_inflation_factor(X.values, i) for i in range(len(X.columns))] 
vif_data.sort_values(by=["VIF"], ascending=False).head(50)

Unnamed: 0,feature,VIF
40,Fiction_character,inf
45,Fiction_mix,inf
4,mix,inf
5,character,inf
6,plot,inf
47,Fiction_plot,inf
46,Fiction_mysterious,344.819537
19,mysterious,344.498437
50,Fiction_tense,258.057208
13,tense,249.380952


Na początek usuniemy z modelu zmienne: mix, character, plot, Fiction_mysterious, Fiction_tense, Fiction_adventurous, Fiction_dark, Fiction_emotional, Fiction_challenging, Fiction_hopeful, Fiction_author_stars, Fiction_sad, Fiction_reflective, Nonfiction_challenging, series_mix, Fiction, Fanstasy_mix i Literary_sad

In [12]:
features_to_remove = ["mix", "character", "plot", "Fiction_mysterious", "Fiction_tense", "Fiction_adventurous", "Fiction_dark", "Fiction_emotional", "Fiction_challenging", "Fiction_hopeful", "Fiction_author_stars", "Fiction_sad", "Fiction_reflective", "Nonfiction_challenging", "series_mix", "Fiction", "Fantasy_mix", "Literary_sad"]
for feature in features_to_remove:
    features_interactions.remove(feature)

In [13]:
wzor = 'stars~' + '+'.join(features_interactions)
mod = smf.ols(formula = wzor, data = train_data_interactions)
res = mod.fit()
res.summary()

0,1,2,3
Dep. Variable:,stars,R-squared:,0.491
Model:,OLS,Adj. R-squared:,0.487
Method:,Least Squares,F-statistic:,110.5
Date:,"Sat, 11 May 2024",Prob (F-statistic):,0.0
Time:,19:38:32,Log-Likelihood:,657.04
No. Observations:,5544,AIC:,-1216.0
Df Residuals:,5495,BIC:,-891.7
Df Model:,48,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.0893,0.035,89.400,0.000,3.022,3.157
index_0,-1.223e-05,1.48e-06,-8.264,0.000,-1.51e-05,-9.33e-06
pages,-2.326e-05,2.19e-05,-1.063,0.288,-6.61e-05,1.96e-05
reviews,7.963e-07,1.21e-07,6.567,0.000,5.59e-07,1.03e-06
series,0.0412,0.020,2.088,0.037,0.003,0.080
funny,0.3933,0.021,18.793,0.000,0.352,0.434
lighthearted,-0.0076,0.036,-0.213,0.832,-0.078,0.063
emotional,0.3710,0.031,11.987,0.000,0.310,0.432
hopeful,0.2893,0.041,7.032,0.000,0.209,0.370

0,1,2,3
Omnibus:,220.83,Durbin-Watson:,2.008
Prob(Omnibus):,0.0,Jarque-Bera (JB):,356.18
Skew:,-0.353,Prob(JB):,4.54e-78
Kurtosis:,4.022,Cond. No.,830000.0


Pozostały zmienne nieistotne statystycznie: Fantasy_author_stars, Fantasy_inspiring, Nonfiction_author_stars, Romans_author_stars oraz reflective, które mają niską wartość VIF.

In [14]:
features_to_remove = ["Fantasy_author_stars", "Fantasy_inspiring", "Nonfiction_author_stars", "Romans_author_stars", "reflective"]
for feature in features_to_remove:
    features_interactions.remove(feature)

Również: lighthearted, która mają stosunkową wysoką wartość VIF.
Z EDA wiemy, że zmienna lighthearted jest skorelowana z funny oraz relaxing, więc spróbujemy utworzyć nową zmienną.

In [15]:
train_data_interactions['lighthearted_relaxing']=train_data_interactions['lighthearted']+train_data_interactions['relaxing']
train_data_interactions['lighthearted_funny']=train_data_interactions['lighthearted']+train_data_interactions['funny']
features_to_add = ["lighthearted_funny", "lighthearted_relaxing"]
for feature in features_to_add:
    features_interactions.append(feature)

In [16]:
wzor = 'stars~' + '+'.join(features_interactions)
mod = smf.ols(formula = wzor, data = train_data_interactions)
res = mod.fit()
res.summary()

0,1,2,3
Dep. Variable:,stars,R-squared:,0.491
Model:,OLS,Adj. R-squared:,0.487
Method:,Least Squares,F-statistic:,123.4
Date:,"Sat, 11 May 2024",Prob (F-statistic):,0.0
Time:,19:38:32,Log-Likelihood:,656.18
No. Observations:,5544,AIC:,-1224.0
Df Residuals:,5500,BIC:,-933.1
Df Model:,43,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.0791,0.031,100.632,0.000,3.019,3.139
index_0,-1.221e-05,1.48e-06,-8.257,0.000,-1.51e-05,-9.31e-06
pages,-2.215e-05,2.18e-05,-1.014,0.311,-6.5e-05,2.07e-05
reviews,7.955e-07,1.21e-07,6.571,0.000,5.58e-07,1.03e-06
series,0.0448,0.019,2.323,0.020,0.007,0.083
funny,0.3396,0.025,13.849,0.000,0.291,0.388
lighthearted,-0.2875,0.033,-8.600,0.000,-0.353,-0.222
emotional,0.3679,0.031,11.953,0.000,0.308,0.428
hopeful,0.2933,0.041,7.178,0.000,0.213,0.373

0,1,2,3
Omnibus:,220.368,Durbin-Watson:,2.008
Prob(Omnibus):,0.0,Jarque-Bera (JB):,358.485
Skew:,-0.35,Prob(JB):,1.43e-78
Kurtosis:,4.031,Cond. No.,1.16e+16


Reszta zmiennych w modelu jest istotna statystycznie

Większość zmiennych wpływa pozytywnie na zmienną objaśnianą oprócz: pages, Fantasy, Crime, Children, Romans, Realism, Fiction_character, Fiction_plot, Literary_inspiring, Literary_mix, Nonfiction_hopeful, series_emotional i lighthearted_funny.
Jednak zmienna Fantasy jest również obecna w interakcji z adventurous, a zmienna Crime w interkacji z mix.

Można wyciągnąc wniosek, że dobrze są oceniane ksiązki Fantasy jedynie, gdy mają wysoki wskaźnik przygodowości, a Crime, gdy mają mix postaci i fabuły. Gorzej są oceniane książki z genre fiction jedynie skupione na postaciach lub fabule, a nie mixie. Dodatkowo kategorie Children, Romans i Realism mają średnio niższe oceny. Również mniej lubione są książki z kategorii Literary, które są inspirujące lub mają mix fabułu i postaci, także książki Nonfiction, które są pełne nadziei i emocjonalne serie.
Zaskakująco ludzie również gorzej oceniają książki, które są śmieszne jeśli są beztroskie, możliwe, że preferowane są książki, które mają tzw. dark humor.

## Zapisanie modelu

Nie usuwamy zmiennych z modelu, gdyż w tym przypadku OLS zmniejszyłoby to zdolność predykcyjną.

In [17]:
data_interactions=pd.read_csv("../data/data_add.csv")
data_interactions=data_interactions.drop(columns=['Unnamed: 0'])
features_interactions=data_interactions.columns.tolist()
features_interactions.remove('stars')
data_interactions['lighthearted_relaxing']=data_interactions['lighthearted']+data_interactions['relaxing']
data_interactions['lighthearted_funny']=data_interactions['lighthearted']+data_interactions['funny']
train_data_interactions, test_data_interactions = train_test_split(data_interactions, test_size=0.2, random_state=SEED)
test_indices = test_data_interactions.index
features_interactions.append('lighthearted_funny')
features_interactions.append('lighthearted_relaxing')
target='stars'
#Ewaluacja modelu
model = LinearRegression(fit_intercept=True)
train_results, test_results, y_test_pred = evaluation(train_data_interactions[features_interactions], train_data_interactions[target], test_data_interactions[features_interactions], test_data_interactions[target], model)
print("Train RMSE: {}".format(round(train_results, 5)))
print("Test RMSE: {}".format(round(test_results, 5)))

#Zapisanie modelu
modelOLS = {
    "name": "OLS",
    "trainResults": train_results,
    "testResults": test_results,
    "predictions": y_test_pred,
    "indices": test_indices,
}

with open("../data/model_OLS.p", "wb") as fp:
    pickle.dump(modelOLS, fp)

Train RMSE: 0.21124
Test RMSE: 0.21884


## Podsumowanie

$\text{Najlepsze wyniki walidacji krzyżowej uzyskano dla modelu bazującego na danych z interakcjami, bez transformacji zmiennych.}$<p>
$\text{Wyniki na zbiorze treningowym (RMSE): 0.21124}$<p>
$\text{Wyniki na zbiorze testowym (RMSE): 0.21884}$<p>