# Import bibliotek i danych

In [31]:
import pandas as pd
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor 
import statsmodels.api as sm
from sklearn.model_selection import KFold
from sklearn import metrics
from sklearn.metrics import *
from sklearn.linear_model import LinearRegression
from scipy import stats
import warnings
import numpy as np
import pickle
warnings.filterwarnings("ignore")

# Oszacowanie modelu OLS

### Najbardziej podstawowy model (bez feature engineeringu):

In [40]:
data=pd.read_csv("data_eda.csv")
data=data.drop(columns=['Unnamed: 0'])

features=data.columns.tolist()
features.remove('stars')
wzor = 'stars~' + '+'.join(features)
mod = smf.ols(formula = wzor, 
              data = data)
res = mod.fit()
res.summary()

0,1,2,3
Dep. Variable:,stars,R-squared:,0.462
Model:,OLS,Adj. R-squared:,0.46
Method:,Least Squares,F-statistic:,191.3
Date:,"Mon, 22 Apr 2024",Prob (F-statistic):,0.0
Time:,16:07:31,Log-Likelihood:,599.87
No. Observations:,6930,AIC:,-1136.0
Df Residuals:,6898,BIC:,-916.7
Df Model:,31,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.0092,0.032,94.115,0.000,2.946,3.072
pages,-3.883e-05,2e-05,-1.940,0.052,-7.81e-05,4.07e-07
reviews,9.675e-07,1.08e-07,8.988,0.000,7.56e-07,1.18e-06
series,0.0750,0.007,10.502,0.000,0.061,0.089
mix,0.0690,0.030,2.333,0.020,0.011,0.127
character,-0.0956,0.024,-3.962,0.000,-0.143,-0.048
plot,-0.1866,0.035,-5.407,0.000,-0.254,-0.119
funny,0.4076,0.019,21.337,0.000,0.370,0.445
lighthearted,-0.0072,0.032,-0.221,0.825,-0.071,0.057

0,1,2,3
Omnibus:,238.608,Durbin-Watson:,1.931
Prob(Omnibus):,0.0,Jarque-Bera (JB):,343.875
Skew:,-0.349,Prob(JB):,2.13e-75
Kurtosis:,3.839,Cond. No.,774000.0


In [41]:
rmse=[]
kf = KFold(n_splits=5, shuffle=True, random_state=2024)
for train, test in kf.split(data.index.values): 
    mod = LinearRegression().fit(data[features].iloc[train], data.iloc[train]["stars"])
    y_pred = mod.predict(data[features].iloc[test])
    y_test=data.iloc[test]["stars"]
    rmse.append(mean_squared_error(y_test, y_pred)**0.5)
print("Linear Regression RMSE: {}".format(round(sum(rmse)/len(rmse),5)))

Linear Regression RMSE: 0.22302


### Model z interakcjami:

In [42]:
data=pd.read_csv("data_add.csv")
data=data.drop(columns=['Unnamed: 0','index_0'])

features=data.columns.tolist()
features.remove('stars')
wzor = 'stars~' + '+'.join(features)
mod = smf.ols(formula = wzor, 
              data = data)
res = mod.fit()
res.summary()

0,1,2,3
Dep. Variable:,stars,R-squared:,0.502
Model:,OLS,Adj. R-squared:,0.497
Method:,Least Squares,F-statistic:,111.5
Date:,"Mon, 22 Apr 2024",Prob (F-statistic):,0.0
Time:,16:07:37,Log-Likelihood:,863.17
No. Observations:,6930,AIC:,-1600.0
Df Residuals:,6867,BIC:,-1169.0
Df Model:,62,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.0085,0.050,59.734,0.000,2.910,3.107
pages,-3.953e-05,1.97e-05,-2.011,0.044,-7.81e-05,-9.89e-07
reviews,8.056e-07,1.05e-07,7.691,0.000,6e-07,1.01e-06
series,0.0066,0.020,0.328,0.743,-0.033,0.046
mix,0.0470,0.020,2.339,0.019,0.008,0.086
character,-0.0504,0.012,-4.289,0.000,-0.073,-0.027
plot,-0.0623,0.018,-3.499,0.000,-0.097,-0.027
funny,0.4308,0.019,22.551,0.000,0.393,0.468
lighthearted,0.0065,0.033,0.197,0.844,-0.058,0.071

0,1,2,3
Omnibus:,217.779,Durbin-Watson:,1.941
Prob(Omnibus):,0.0,Jarque-Bera (JB):,336.436
Skew:,-0.304,Prob(JB):,8.79e-74
Kurtosis:,3.892,Cond. No.,1.11e+16


In [43]:
rmse=[]
kf = KFold(n_splits=5, shuffle=True, random_state=2024)
for train, test in kf.split(data.index.values): 
    mod = LinearRegression().fit(data[features].iloc[train], data.iloc[train]["stars"])
    y_pred = mod.predict(data[features].iloc[test])
    y_test=data.iloc[test]["stars"]
    rmse.append(mean_squared_error(y_test, y_pred)**0.5)
print("Linear Regression RMSE: {}".format(round(sum(rmse)/len(rmse),5)))

Linear Regression RMSE: 0.21586


### Model z transformacjami zmiennych:

In [45]:
data=pd.read_csv("data_fe.csv")
data=data.drop(columns=['Unnamed: 0'])
features=data.columns.tolist()
features.remove('stars')
wzor = 'stars~' + '+'.join(features)
mod = smf.ols(formula = wzor, 
              data = data)
res = mod.fit()
res.summary()

0,1,2,3
Dep. Variable:,stars,R-squared:,0.411
Model:,OLS,Adj. R-squared:,0.408
Method:,Least Squares,F-statistic:,123.5
Date:,"Mon, 22 Apr 2024",Prob (F-statistic):,0.0
Time:,16:08:05,Log-Likelihood:,286.89
No. Observations:,6930,AIC:,-493.8
Df Residuals:,6890,BIC:,-220.0
Df Model:,39,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,2.4971,0.104,24.059,0.000,2.294,2.701
Crime,0.0121,0.011,1.067,0.286,-0.010,0.034
Romans,-0.1011,0.009,-11.391,0.000,-0.118,-0.084
sad_mm,0.2771,0.022,12.399,0.000,0.233,0.321
reviews_boxcox,-0.3770,0.033,-11.312,0.000,-0.442,-0.312
reflective_boxcox,0.0222,0.007,3.396,0.001,0.009,0.035
mysterious_rs,0.0434,0.011,4.123,0.000,0.023,0.064
inspiring_log,0.0119,0.003,3.497,0.000,0.005,0.018
informative_boxcox,0.0028,0.001,2.395,0.017,0.001,0.005

0,1,2,3
Omnibus:,120.773,Durbin-Watson:,1.975
Prob(Omnibus):,0.0,Jarque-Bera (JB):,162.387
Skew:,-0.225,Prob(JB):,5.47e-36
Kurtosis:,3.6,Cond. No.,402.0


In [46]:
rmse=[]
kf = KFold(n_splits=5, shuffle=True, random_state=2024)
for train, test in kf.split(data.index.values): 
    mod = LinearRegression().fit(data[features].iloc[train], data.iloc[train]["stars"])
    y_pred = mod.predict(data[features].iloc[test])
    y_test=data.iloc[test]["stars"]
    rmse.append(mean_squared_error(y_test, y_pred)**0.5)
print("Linear Regression RMSE: {}".format(round(sum(rmse)/len(rmse),5)))

Linear Regression RMSE: 0.23359


Najlepszy wynik dla modelu z interkacjami bez transformacji, ale nadal dużo zmiennych wyszło jako nieistotne statystyczne, nawet te o dużej wartości MI, dlatego przeprowadzimy analizę VIF współliniowości

In [110]:
data=pd.read_csv("data_add.csv")
data=data.drop(columns=['Unnamed: 0','index_0'])

features=data.columns.tolist()
features.remove('stars')

In [98]:
# the independent variables set 
X = data[features]
  
# VIF dataframe 
vif_data = pd.DataFrame() 
vif_data["feature"] = X.columns 
  
# calculating VIF for each feature 
vif_data["VIF"] = [variance_inflation_factor(X.values, i) 
                          for i in range(len(X.columns))] 

vif_data.sort_values(by=["VIF"], ascending=False).head(50)

Unnamed: 0,feature,VIF
46,Fiction_plot,inf
3,mix,inf
4,character,inf
5,plot,inf
39,Fiction_character,inf
44,Fiction_mix,inf
45,Fiction_mysterious,343.076395
18,mysterious,342.279517
49,Fiction_tense,244.430939
12,tense,236.183548


Na początek usuniemy z modelu zmienne: mix, character, plot, Fiction_mysterious, Fiction_tense, Fiction_adventurous, Fiction_dark, Fiction_emotional, Fiction_challenging, Fiction_hopeful, Fiction_author_stars, Fiction_sad, Fiction_reflective, Nonfiction_challenging, series_mix, Fiction, Fanstasy_mix i Literary_sad

In [100]:
features.remove('mix')
features.remove('character')
features.remove('plot')
features.remove('Fiction_mysterious')
features.remove('Fiction_tense')
features.remove('Fiction_adventurous')
features.remove('Fiction_dark')
features.remove('Fiction_emotional')
features.remove('Fiction_challenging')
features.remove('Fiction_hopeful')
features.remove('Fiction_author_stars')
features.remove('Fiction_sad')
features.remove('Fiction_reflective')
features.remove('Nonfiction_challenging')
features.remove('series_mix')
features.remove('Fiction')
features.remove('Fantasy_mix')
features.remove('Literary_sad')

In [101]:
wzor = 'stars~' + '+'.join(features)
mod = smf.ols(formula = wzor, 
              data = data)
res = mod.fit()
res.summary()

0,1,2,3
Dep. Variable:,stars,R-squared:,0.485
Model:,OLS,Adj. R-squared:,0.481
Method:,Least Squares,F-statistic:,137.9
Date:,"Mon, 22 Apr 2024",Prob (F-statistic):,0.0
Time:,11:17:06,Log-Likelihood:,749.24
No. Observations:,6930,AIC:,-1402.0
Df Residuals:,6882,BIC:,-1074.0
Df Model:,47,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.0361,0.031,98.771,0.000,2.976,3.096
pages,-4.713e-05,1.98e-05,-2.382,0.017,-8.59e-05,-8.34e-06
reviews,8.546e-07,1.06e-07,8.056,0.000,6.47e-07,1.06e-06
series,0.0417,0.018,2.345,0.019,0.007,0.077
funny,0.4018,0.019,21.191,0.000,0.365,0.439
lighthearted,0.0202,0.032,0.624,0.532,-0.043,0.084
emotional,0.3723,0.028,13.308,0.000,0.317,0.427
hopeful,0.3049,0.037,8.187,0.000,0.232,0.378
inspiring,0.1955,0.043,4.503,0.000,0.110,0.281

0,1,2,3
Omnibus:,243.015,Durbin-Watson:,1.943
Prob(Omnibus):,0.0,Jarque-Bera (JB):,376.718
Skew:,-0.331,Prob(JB):,1.57e-82
Kurtosis:,3.931,Cond. No.,835000.0


Pozostały zmienne nieistotne statystycznie: Fantasy_author_stars, Fantasy_inspiring, Nonfiction_author_stars, Romans_author_stars oraz reflective, które mają niską wartość VIF.

In [102]:
features.remove('Fantasy_author_stars')
features.remove('Fantasy_inspiring')
features.remove('Nonfiction_author_stars')
features.remove('Romans_author_stars')
features.remove('reflective')

Również: lighthearted, która mają stosunkową wysoką wartość VIF.
Z EDA wiemy, że zmienna lighthearted jest skorelowana z funny oraz relaxing, więc spróbujemy utworzyć nową zmienną.

In [112]:
data['lighthearted_relaxing']=data['lighthearted']+data['relaxing']
data['lighthearted_funny']=data['lighthearted']+data['funny']
features.append('lighthearted_funny')
features.append('lighthearted_relaxing')
features.remove('lighthearted')

In [104]:
wzor = 'stars~' + '+'.join(features)
mod = smf.ols(formula = wzor, 
              data = data)
res = mod.fit()
res.summary()

0,1,2,3
Dep. Variable:,stars,R-squared:,0.485
Model:,OLS,Adj. R-squared:,0.482
Method:,Least Squares,F-statistic:,154.3
Date:,"Mon, 22 Apr 2024",Prob (F-statistic):,0.0
Time:,11:18:32,Log-Likelihood:,748.53
No. Observations:,6930,AIC:,-1411.0
Df Residuals:,6887,BIC:,-1117.0
Df Model:,42,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,3.0426,0.027,112.962,0.000,2.990,3.095
pages,-4.646e-05,1.98e-05,-2.351,0.019,-8.52e-05,-7.73e-06
reviews,8.579e-07,1.06e-07,8.099,0.000,6.5e-07,1.07e-06
series,0.0415,0.017,2.387,0.017,0.007,0.076
funny,0.4761,0.036,13.104,0.000,0.405,0.547
emotional,0.3721,0.028,13.371,0.000,0.318,0.427
hopeful,0.3075,0.037,8.310,0.000,0.235,0.380
inspiring,0.2001,0.043,4.656,0.000,0.116,0.284
relaxing,0.6439,0.060,10.790,0.000,0.527,0.761

0,1,2,3
Omnibus:,241.889,Durbin-Watson:,1.944
Prob(Omnibus):,0.0,Jarque-Bera (JB):,373.491
Skew:,-0.331,Prob(JB):,7.899999999999999e-82
Kurtosis:,3.925,Cond. No.,1.11e+16


Reszta zmiennych w modelu jest istotna statystycznie

Większość zmiennych wpływa pozytywnie na zmienną objaśnianą oprócz: pages, Fantasy, Crime, Children, Romans, Realism, Fiction_character, Fiction_plot, Literary_inspiring, Literary_mix, Nonfiction_hopeful, series_emotional i lighthearted_funny.
Jednak zmienna Fantasy jest również obecna w interakcji z adventurous, a zmienna Crime w interkacji z mix.

Można wyciągnąc wniosek, że dobrze są oceniane ksiązki Fantasy jedynie, gdy mają wysoki wskaźnik przygodowości, a Crime, gdy mają mix postaci i fabuły. Gorzej są oceniane książki z genre fiction jedynie skupione na postaciach lub fabule, a nie mixie. Dodatkowo kategorie Children, Romans i Realism mają średnio niższe oceny. Również mniej lubione są książki z kategorii Literary, które są inspirujące lub mają mix fabułu i postaci, także książki Nonfiction, które są pełne nadziei i emocjonalne serie.
Zaskakująco ludzie również gorzej oceniają książki, które są śmieszne jeśli są beztroskie, możliwe, że preferowane są książki, które mają tzw. dark humor.

# Jakość predykcyjna modelu OLS

In [25]:
def CVTestOLS(nFolds = 5, randomState=2024, debug=False, features=features):
    kf = KFold(n_splits=nFolds, shuffle=True, random_state=randomState)
    # Listy z wynikami
    testResults = []
    trainResults = []
    predictions = []
    indices = []
    # Pętla walidująca model na kolejnych foldach
    for train, test in kf.split(df.index.values):
        # Przygotowanie estymatora
        clf = LinearRegression()
        if debug:
            print(clf)
        # Trenowanie modelu
        clf.fit(df.iloc[train][features], df.iloc[train][target])
        # Przygotowanie prognoz dla zbioru treningowego i testowego
        predsTrain = clf.predict(df.iloc[train][features])
        preds = clf.predict(df.iloc[test][features])
        # Zachowajmy informacje o predykcjach dla tego foldu
        predictions.append(preds.tolist().copy())
        # Razem z indeksami w oryginalnym data frame
        indices.append(df.iloc[test].index.tolist().copy())
        # Policzenie RMSE dla foldów
        trainScore = metrics.mean_squared_error(df.iloc[train][target], predsTrain)**0.5
        testScore = metrics.mean_squared_error(df.iloc[test][target], preds)**0.5
        # Zapisanie wyników dla foldów
        trainResults.append(trainScore)
        testResults.append(testScore)
        # Informowanie o każdym foldzie razem z wynikami treningowymi możemy opcjonalnie wyświetlać w trakcie
        if debug:
            print("Train RMSE:", trainScore,
                  "Valid RMSE:", testScore)
        
    return trainResults, testResults, predictions, indices

Nie usuwamy zmiennych z modelu, gdyż w tym przypadku OLS zmniejszyłoby to zdolność predykcyjną.

In [50]:
df=pd.read_csv("data_add.csv")
df=df.drop(columns=['Unnamed: 0','index_0'])

features=df.columns.tolist()
features.remove('stars')
df['lighthearted_relaxing']=df['lighthearted']+df['relaxing']
df['lighthearted_funny']=df['lighthearted']+df['funny']
features.append('lighthearted_funny')
features.append('lighthearted_relaxing')
target='stars'
#Zapisujemy model
trainResults, testResults, predictions, indices = CVTestOLS(features=features)
print(np.mean(testResults))

modelOLS = {
    "name":"OLS",
    "trainResults":trainResults.copy(),
    "testResults":testResults.copy(),
    "predictions":predictions.copy(),
    "indices":indices.copy(),
}

with open("model_OLS.p", "wb") as fp:
    pickle.dump(modelOLS, fp)

0.21585546946069475


Widzimy, że w przypadku modelu regresji feature engineering nie pomógł w mocy predycyjnej, jednak możemy nadal ustalić tą wartość jako bazową dla dalszych modeli: *0.21586*