# Import bibliotek

In [1]:
import pandas as pd
import numpy as np
import pickle
import statsmodels.api as sm
from sklearn.metrics import *
from sklearn.model_selection import train_test_split, KFold
from sklearn import neighbors
import typing
# Random seed passed to train_test_split and to the default KFold of perform_cv,
# so that data splits are reproducible between runs
SEED = 17

# Ładowanie danych

In [2]:
# Read the EDA output and discard the 'Unnamed: 0' column
# (presumably the index written by an earlier to_csv call - verify against the EDA notebook)
data = pd.read_csv("../data/data_eda.csv").drop(columns=['Unnamed: 0'])

# Every column except the target is used as a predictor
target = 'stars'
features = list(data.columns)
features.remove(target)

data

Unnamed: 0,pages,stars,reviews,series,mix,character,plot,funny,lighthearted,emotional,...,author_stars,Fiction,Nonfiction,Literary,Fantasy,Crime,Social,Children,Romans,Realism
0,273,4.00,2017,0,0.44,0.51,0.02,0.27,0.37,0.91,...,4.305000,1,1,0,0,0,1,0,1,1
1,302,3.78,7330,0,0.39,0.42,0.17,0.03,0.01,0.18,...,3.670000,1,0,0,0,1,0,0,0,0
2,400,4.15,16761,0,0.51,0.39,0.08,0.02,0.01,0.88,...,0.000000,1,0,1,0,0,0,0,0,0
3,459,4.16,2128,1,0.48,0.10,0.40,0.04,0.02,0.07,...,0.000000,1,0,0,1,0,0,0,0,0
4,160,3.65,6634,1,0.28,0.16,0.54,0.92,0.73,0.00,...,4.115000,1,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6925,432,4.15,30643,0,0.48,0.05,0.46,0.00,0.00,0.40,...,3.856667,1,0,0,1,1,0,0,0,0
6926,352,3.62,1058,0,0.55,0.13,0.30,0.15,0.10,0.25,...,3.700000,1,0,0,1,0,1,0,0,0
6927,535,3.88,30975,1,0.45,0.08,0.45,0.14,0.19,0.31,...,3.870000,1,0,0,1,0,0,1,0,0
6928,472,3.88,5914,1,0.64,0.12,0.22,0.07,0.00,0.36,...,3.660000,1,0,0,1,0,0,1,0,0


Podział danych na zbiór treningowy i testowy

In [3]:
# Hold out 20% of the rows as the test set; SEED makes the split reproducible
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=SEED,
)

In [4]:
def perform_cv(X: pd.DataFrame, y: pd.Series, algorithm: typing.Any, cv: typing.Any = KFold(n_splits=5, shuffle=True, random_state=SEED), metric: typing.Any = mean_squared_error) -> typing.Tuple[float, float]:
    """
    Cross-validate ``algorithm`` and return its mean train and validation scores.

    For every fold produced by ``cv`` the algorithm is refitted on the training
    part of the fold and scored on both the training and the validation part.
    Each fold score is the square root of ``metric``, i.e. RMSE for the default
    ``mean_squared_error``.

    Args:
        X (pd.DataFrame): input data
        y (pd.Series): target data
        algorithm (typing.Any): estimator exposing ``fit(X, y)`` and ``predict(X)``
        cv (typing.Any): splitter exposing ``split(X, y)`` that yields positional
            (train, validation) index arrays
        metric (typing.Any): callable ``metric(y_true, y_pred)`` returning a squared
            error; its square root is reported

    Returns:
        typing.Tuple[float, float]: (mean train score, mean validation score)
    """
    train_scores = []
    validation_scores = []
    for train_idx, val_idx in cv.split(X, y):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        algorithm.fit(X_train, y_train)
        y_train_pred = algorithm.predict(X_train)
        y_val_pred = algorithm.predict(X_val)
        # Take the root explicitly instead of passing squared=False: that keyword
        # was deprecated in scikit-learn 1.4 and removed in 1.6
        train_scores.append(np.sqrt(metric(y_train, y_train_pred)))
        validation_scores.append(np.sqrt(metric(y_val, y_val_pred)))
    return np.mean(train_scores), np.mean(validation_scores)

def evaluation(X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame, y_test: pd.Series, algorithm: typing.Any, metric: typing.Any = mean_squared_error) -> typing.Tuple[float, float, np.ndarray]:
    """
    Fit ``algorithm`` on the train data and score it on the train and test data.

    Both scores are the square root of ``metric``, i.e. RMSE for the default
    ``mean_squared_error``.

    Args:
        X_train (pd.DataFrame): input train data
        y_train (pd.Series): target train data
        X_test (pd.DataFrame): input test data
        y_test (pd.Series): target test data
        algorithm (typing.Any): estimator exposing ``fit(X, y)`` and ``predict(X)``
        metric (typing.Any): callable ``metric(y_true, y_pred)`` returning a squared
            error; its square root is reported

    Returns:
        typing.Tuple[float, float, np.ndarray]: train_score, test_score, predictions on test data
    """
    algorithm.fit(X_train, y_train)
    y_train_pred = algorithm.predict(X_train)
    y_test_pred = algorithm.predict(X_test)
    # Take the root explicitly instead of passing squared=False: that keyword
    # was deprecated in scikit-learn 1.4 and removed in 1.6
    train_results = np.sqrt(metric(y_train, y_train_pred))
    test_results = np.sqrt(metric(y_test, y_test_pred))
    return train_results, test_results, y_test_pred

Zobaczymy, czy w tym przypadku feature engineering poprawił jakość predykcyjną.

## Model bez feature engineeringu

In [5]:
# Baseline: 50-neighbour kNN cross-validated on the features without feature engineering
model = neighbors.KNeighborsRegressor(n_neighbors=50, n_jobs=-1)
train_scores, validation_scores = perform_cv(
    train_data[features],
    train_data[target],
    model,
)
print("Train RMSE:", train_scores)
print("Validation RMSE:", validation_scores)

Train RMSE: 0.2908859671453747
Validation RMSE: 0.297474944141276


## Model z interakcjami

In [6]:
# Same experiment on the dataset extended with interaction terms
data_interactions = pd.read_csv("../data/data_interactions.csv").drop(columns=['Unnamed: 0'])
features_interactions = list(data_interactions.columns)
features_interactions.remove('stars')

# Same test_size and seed as for the base dataset
train_data_interactions, test_data_interactions = train_test_split(
    data_interactions,
    test_size=0.2,
    random_state=SEED,
)

model = neighbors.KNeighborsRegressor(n_neighbors=50, n_jobs=-1)
train_scores, validation_scores = perform_cv(
    train_data_interactions[features_interactions],
    train_data_interactions[target],
    model,
)
print("Train RMSE:", train_scores)
print("Validation RMSE:", validation_scores)

Train RMSE: 0.2915809495320786
Validation RMSE: 0.2977173913653672


## Model z transformacją zmiennych

In [7]:
# Same experiment on the dataset with transformed variables
data_transformations = pd.read_csv("../data/data_feature_engineering.csv").drop(columns=['Unnamed: 0'])
features_transformations = list(data_transformations.columns)
features_transformations.remove('stars')

# Same test_size and seed as for the base dataset
train_data_transformations, test_data_transformations = train_test_split(
    data_transformations,
    test_size=0.2,
    random_state=SEED,
)

model = neighbors.KNeighborsRegressor(n_neighbors=50, n_jobs=-1)
train_scores, validation_scores = perform_cv(
    train_data_transformations[features_transformations],
    train_data_transformations[target],
    model,
)
print("Train RMSE:", train_scores)
print("Validation RMSE:", validation_scores)

Train RMSE: 0.2455440588842674
Validation RMSE: 0.25063166139536924


Widzimy, że w tym przypadku feature engineering znacząco poprawił moc predykcyjną modelu, lecz jest ona nadal słabsza niż w przypadku OLS. Spróbujemy to poprawić tuningiem hiperparametrów.

# Tuning hiperparametrów kNN

1. Wybór liczby najbliższych sąsiadów

In [8]:
# Coarse search over the number of neighbours on the transformed features
X_cv = train_data_transformations[features_transformations]
y_cv = train_data_transformations[target]
for k in (1, 3, 5, 10, 15, 30, 50, 100, 150, 200):
    model = neighbors.KNeighborsRegressor(n_neighbors=k, n_jobs=-1)
    train_scores, validation_scores = perform_cv(X_cv, y_cv, model)
    print("Neighbors: {}; RMSE train: {:.5f}; RMSE validation: {:.5f}".format(k, train_scores, validation_scores))

Neighbors: 1; RMSE train: 0.00000; RMSE validation: 0.30909
Neighbors: 3; RMSE train: 0.18222; RMSE validation: 0.26000
Neighbors: 5; RMSE train: 0.20460; RMSE validation: 0.25124
Neighbors: 10; RMSE train: 0.22202; RMSE validation: 0.24502
Neighbors: 15; RMSE train: 0.22926; RMSE validation: 0.24493
Neighbors: 30; RMSE train: 0.23958; RMSE validation: 0.24762
Neighbors: 50; RMSE train: 0.24554; RMSE validation: 0.25063
Neighbors: 100; RMSE train: 0.25295; RMSE validation: 0.25550
Neighbors: 150; RMSE train: 0.25655; RMSE validation: 0.25824
Neighbors: 200; RMSE train: 0.25895; RMSE validation: 0.26012


In [9]:
# Fine search: every k from 10 to 19 inclusive
X_cv = train_data_transformations[features_transformations]
y_cv = train_data_transformations[target]
for k in range(10, 20):
    model = neighbors.KNeighborsRegressor(n_neighbors=k, n_jobs=-1)
    train_scores, validation_scores = perform_cv(X_cv, y_cv, model)
    print("Neighbors: {}; RMSE train: {:.5f}; RMSE validation: {:.5f}".format(k, train_scores, validation_scores))

Neighbors: 10; RMSE train: 0.22202; RMSE validation: 0.24502
Neighbors: 11; RMSE train: 0.22389; RMSE validation: 0.24460
Neighbors: 12; RMSE train: 0.22537; RMSE validation: 0.24463
Neighbors: 13; RMSE train: 0.22679; RMSE validation: 0.24489
Neighbors: 14; RMSE train: 0.22812; RMSE validation: 0.24497
Neighbors: 15; RMSE train: 0.22926; RMSE validation: 0.24493
Neighbors: 16; RMSE train: 0.23059; RMSE validation: 0.24521
Neighbors: 17; RMSE train: 0.23161; RMSE validation: 0.24545
Neighbors: 18; RMSE train: 0.23245; RMSE validation: 0.24563
Neighbors: 19; RMSE train: 0.23322; RMSE validation: 0.24569


Wydaje się, że najlepsza liczba sąsiadów wynosi 11.

2. Wybór miary dystansu

In [10]:
# Compare distance metrics with k fixed at 11.
# NOTE: 'minkowski' is run with the default p, so its scores coincide with 'euclidean'
distances = ['euclidean', 'manhattan', 'chebyshev', 'minkowski']
X_cv = train_data_transformations[features_transformations]
y_cv = train_data_transformations[target]
for distance in distances:
    model = neighbors.KNeighborsRegressor(n_neighbors=11, n_jobs=-1, metric=distance)
    train_scores, validation_scores = perform_cv(X_cv, y_cv, model)
    print("Distance: {}; RMSE train: {:.5f}; RMSE validation: {:.5f}".format(distance, train_scores, validation_scores))

Distance: euclidean; RMSE train: 0.22389; RMSE validation: 0.24460
Distance: manhattan; RMSE train: 0.21648; RMSE validation: 0.23593
Distance: chebyshev; RMSE train: 0.23942; RMSE validation: 0.26207
Distance: minkowski; RMSE train: 0.22389; RMSE validation: 0.24460


Lepszy wynik dla miary Manhattan

3. Wybór funkcji wag w predykcjach

In [11]:
# Compare uniform vs. distance-based neighbour weighting (k=11, Manhattan distance)
weights = ['uniform', 'distance']
X_cv = train_data_transformations[features_transformations]
y_cv = train_data_transformations[target]
for weight in weights:
    model = neighbors.KNeighborsRegressor(n_neighbors=11, n_jobs=-1, metric='manhattan', weights=weight)
    train_scores, validation_scores = perform_cv(X_cv, y_cv, model)
    print("Weight: {}; RMSE train: {:.5f}; RMSE validation: {:.5f}".format(weight, train_scores, validation_scores))

Weight: uniform; RMSE train: 0.21648; RMSE validation: 0.23593
Weight: distance; RMSE train: 0.00000; RMSE validation: 0.22807


Lepszy wynik przy odwrotności odległości jako wagach (bliżsi sąsiedzi punktu mają większy wpływ niż sąsiedzi, którzy są dalej).

4. Wybór zmiennych

In [12]:
# Cross-validate the tuned kNN on each feature alone and keep its validation RMSE
results = {}
y_cv = train_data_transformations[target]
for feature in features_transformations:
    model = neighbors.KNeighborsRegressor(n_neighbors=11, n_jobs=-1, metric='manhattan', weights='distance')
    train_scores, validation_scores = perform_cv(train_data_transformations[[feature]], y_cv, model)
    results[feature] = validation_scores

# Order features from the lowest (best) to the highest validation RMSE
results = {name: results[name] for name in sorted(results, key=results.get)}
results

{'author_stars_log': 0.2753929747802282,
 'Fiction*author_stars_log': 0.28113808631537285,
 'inspiring_ss': 0.2842611266050452,
 'author_count_log*inspiring_ss': 0.28437469236579904,
 'Fiction*inspiring_ss': 0.28571982112122307,
 'author_count_log*author_stars_log': 0.28666610908850404,
 'author_stars_log*inspiring_ss': 0.2884290797309283,
 'mix_quantile': 0.29051793390758907,
 'Fiction*mix_quantile': 0.29051793390758907,
 'Fiction*emotional_log': 0.2917973121899677,
 'Nonfiction*emotional_log': 0.29462948641595127,
 'Nonfiction*author_stars_log': 0.2953096146219243,
 'hopeful_boxcox': 0.2955098847273227,
 'Realism*inspiring_ss': 0.2955121678912151,
 'sad*Nonfiction': 0.2956366714688835,
 'Nonfiction*inspiring_ss': 0.29587937665237,
 'inspiring_ss*mix_quantile': 0.29599154954706897,
 'Social*author_stars_log': 0.29619890173473784,
 'series*author_stars_log': 0.29642984041541914,
 'Nonfiction*mysterious_ss': 0.29650136020943263,
 'Nonfiction*dark_ss': 0.29663269614860815,
 'Fantasy*auth

Bardzo porównywalne wyniki dla każdej zmiennej

Poniżej zweryfikowano wyniki w sytuacji, gdy iteracyjnie usuwamy zmienne (w kolejności od najmniej istotnej do najbardziej istotnej, sugerując się powyższymi rezultatami).

In [13]:
# Backward elimination: drop features one by one, starting with the feature
# that had the worst (highest) single-feature validation RMSE
descending_results = {name: results[name] for name in sorted(results, key=results.get, reverse=True)}
features_transformations_copy = list(features_transformations)
y_cv = train_data_transformations[target]
for feature, score in descending_results.items():
    # Stop before the last remaining feature would be removed
    if len(features_transformations_copy) == 1:
        break
    features_transformations_copy.remove(feature)
    model = neighbors.KNeighborsRegressor(n_neighbors=11, n_jobs=-1, metric='manhattan', weights='distance')
    train_scores, validation_scores = perform_cv(train_data_transformations[features_transformations_copy], y_cv, model)
    print("Feature removed: {}; RMSE train: {:.5f}; RMSE validation: {:.5f}".format(feature, train_scores, validation_scores))

Feature removed: reflective_log*tense_log; RMSE train: 0.00000; RMSE validation: 0.22801
Feature removed: lighthearted*pages_quantile; RMSE train: 0.00000; RMSE validation: 0.22800
Feature removed: relaxing_log*character_ss; RMSE train: 0.00000; RMSE validation: 0.22804
Feature removed: lighthearted*emotional_log; RMSE train: 0.00000; RMSE validation: 0.22804
Feature removed: reviews_log; RMSE train: 0.00000; RMSE validation: 0.22801
Feature removed: lighthearted*character_ss; RMSE train: 0.00000; RMSE validation: 0.22800
Feature removed: lighthearted*relaxing_log; RMSE train: 0.00000; RMSE validation: 0.22798
Feature removed: emotional_log*tense_log; RMSE train: 0.00000; RMSE validation: 0.22804
Feature removed: reflective_log*relaxing_log; RMSE train: 0.00000; RMSE validation: 0.22803
Feature removed: reflective_log*pages_quantile; RMSE train: 0.00000; RMSE validation: 0.22804
Feature removed: lighthearted*mysterious_ss; RMSE train: 0.00000; RMSE validation: 0.22805
Feature removed: 

Najlepszy wynik przy usunięciu zmiennych: reflective_log*tense_log, lighthearted*pages_quantile, relaxing_log*character_ss, lighthearted*emotional_log, reviews_log, lighthearted*character_ss, lighthearted*relaxing_log, emotional_log*tense_log, reflective_log*relaxing_log, reflective_log*pages_quantile, lighthearted*mysterious_ss, tense_log*character_ss, lighthearted*dark_ss, reflective_log*funny_quantile, dark_ss*pages_quantile, sad*tense_log, relaxing_log*funny_quantile, sad*relaxing_log.

Zmienne zostaną usunięte ze zbioru danych.

In [14]:
# Features selected for removal in the backward-elimination experiment
features_to_remove = ["reflective_log*tense_log", "lighthearted*pages_quantile", "relaxing_log*character_ss", "lighthearted*emotional_log", "reviews_log", "lighthearted*character_ss", "lighthearted*relaxing_log", "emotional_log*tense_log", "reflective_log*relaxing_log",
                    "reflective_log*pages_quantile", "lighthearted*mysterious_ss", "tense_log*character_ss", "lighthearted*dark_ss", "reflective_log*funny_quantile", "dark_ss*pages_quantile", "sad*tense_log", "relaxing_log*funny_quantile", "sad*relaxing_log"]

# Fail fast on a misspelled name. The check uses the dataset columns (which never
# change), so it still passes when this cell is executed again.
unknown_features = [name for name in features_to_remove if name not in data_transformations.columns]
if unknown_features:
    raise ValueError("Unknown features: {}".format(unknown_features))

# Rebuild the list instead of calling .remove() in a loop: re-running the cell is
# then a no-op rather than a ValueError for an already-removed feature
features_transformations = [name for name in features_transformations if name not in features_to_remove]

Zweryfikujmy, czy usunięcie zmiennych poprawi wynik modelu.

In [15]:
# Cross-validate the tuned kNN on the reduced feature set
model = neighbors.KNeighborsRegressor(n_neighbors=11, n_jobs=-1, metric='manhattan', weights='distance')
train_scores, validation_scores = perform_cv(
    train_data_transformations[features_transformations],
    train_data_transformations[target],
    model,
)
print("Train RMSE:", train_scores)
print("Validation RMSE:", validation_scores)

Train RMSE: 0.0
Validation RMSE: 0.2278613713091504


Usunięcie wybranych zmiennych poprawiło wynik modelu dla zbioru walidacyjnego (z 0.22807 na 0.22786).

5. Skalowanie zmiennych

W poniższym punkcie zweryfikowano wpływ dwukrotnego zwiększenia wartości zmiennej.

W tym celu iteracyjnie podwajano wartości zmiennej i sprawdzano, czy model zwraca lepsze wyniki.

In [16]:
# For each feature: double it on a fresh copy of the train data and record the validation RMSE
results2 = {}
for feature in features_transformations:
    train_data_transformations_copy = train_data_transformations.copy()
    train_data_transformations_copy[feature] = train_data_transformations[feature].mul(2)
    model = neighbors.KNeighborsRegressor(n_neighbors=11, n_jobs=-1, metric='manhattan', weights='distance')
    train_scores, validation_scores = perform_cv(
        train_data_transformations_copy[features_transformations],
        train_data_transformations_copy[target],
        model,
    )
    results2[feature] = validation_scores

# Order features from the lowest (best) to the highest validation RMSE
results2 = {name: results2[name] for name in sorted(results2, key=results2.get)}
results2

{'emotional_log*reviews_log': 0.22751473279226472,
 'sad*reviews_log': 0.22751639575719268,
 'reviews_log*mix_quantile': 0.22755462145438637,
 'Nonfiction*reviews_log': 0.22759093842708963,
 'Social*reviews_log': 0.22772889967549254,
 'Social*author_stars_log': 0.22775194960418094,
 'Fiction*inspiring_ss': 0.22776189232430663,
 'challenging*reviews_log': 0.22776603800223078,
 'dark_ss*funny_quantile': 0.2277710184290204,
 'Nonfiction*author_stars_log': 0.22777226648371593,
 'author_stars_log*emotional_log': 0.22777482358949003,
 'reviews_log*plot_quantile': 0.22777532774416245,
 'Social*author_count_log': 0.22778445989727142,
 'series*reflective_log': 0.22778522422266279,
 'Literary*Social': 0.22778832962905401,
 'Social': 0.22779190774519922,
 'Nonfiction*dark_ss': 0.22779444789601647,
 'emotional_log*funny_quantile': 0.22779497607921698,
 'reviews_log*inspiring_ss': 0.22779556468688397,
 'Fiction*Nonfiction': 0.22779607938804342,
 'Nonfiction*Social': 0.22779660294905774,
 'author_co

Te z najlepszym wynikiem podwajamy, a te z najgorszym zmniejszamy o połowę.

Jako próg odcięcia posłuży średnia wartość powyższych rezultatów.

Zmienne, po których zwiększeniu uzyskano wynik mniejszy niż średnia (bo metryka to RMSE, więc im mniejsza wartość tym lepiej), zostaną dwukrotnie zwiększone.

In [17]:
# Cut-off: mean validation RMSE over all single-feature doubling runs
threshold = np.mean(list(results2.values()))

# Double every feature whose doubling scored below the cut-off; leave the rest unchanged
train_data_transformations_copy = train_data_transformations.copy()
for feature, score in results2.items():
    if not score < threshold:
        continue
    train_data_transformations_copy[feature] = train_data_transformations_copy[feature].mul(2)

model = neighbors.KNeighborsRegressor(n_neighbors=11, n_jobs=-1, metric='manhattan', weights='distance')
train_scores, validation_scores = perform_cv(
    train_data_transformations_copy[features_transformations],
    train_data_transformations_copy[target],
    model,
)
print("Train RMSE:", train_scores)
print("Validation RMSE:", validation_scores)

Train RMSE: 0.0
Validation RMSE: 0.224199953013302


Warto również zweryfikować, czy zmniejszenie wartości pozostałych zmiennych (mniej istotnych) nie wpłynie na poprawę wyników.

In [18]:
# Cut-off: mean validation RMSE over all single-feature doubling runs
threshold = np.mean(list(results2.values()))

# Double the features that scored below the cut-off and halve all the others
train_data_transformations_copy = train_data_transformations.copy()
for feature, score in results2.items():
    factor = 2 if score < threshold else 0.5
    train_data_transformations_copy[feature] = train_data_transformations_copy[feature] * factor

model = neighbors.KNeighborsRegressor(n_neighbors=11, n_jobs=-1, metric='manhattan', weights='distance')
train_scores, validation_scores = perform_cv(
    train_data_transformations_copy[features_transformations],
    train_data_transformations_copy[target],
    model,
)
print("Train RMSE:", train_scores)
print("Validation RMSE:", validation_scores)

Train RMSE: 0.0
Validation RMSE: 0.2214411136066739


Ponownie poprawa wyników dla zbioru walidacyjnego: względem poprzedniego kroku (samo podwajanie zmiennych) z 0.22420 na 0.22144, a względem modelu sprzed skalowania (punkt 4) z 0.22786 na 0.22144.

## Zapisanie modelu

In [19]:
# Apply the selected per-feature scaling (x2 below the threshold, x0.5 otherwise)
# to COPIES of the train/test frames. Scaling the shared frames in place made this
# cell non-idempotent: every re-run rescaled them again and silently changed the
# inputs of the earlier tuning cells.
train_data_scaled = train_data_transformations.copy()
test_data_scaled = test_data_transformations.copy()
for feature, score in results2.items():
    factor = 2 if score < threshold else 0.5
    train_data_scaled[feature] = train_data_transformations[feature] * factor
    test_data_scaled[feature] = test_data_transformations[feature] * factor

# Row labels of the test set, stored together with the predictions
test_indices = test_data_transformations.index

# Model evaluation
model = neighbors.KNeighborsRegressor(n_neighbors=11, n_jobs=-1, metric='manhattan', weights='distance')
train_results, test_results, y_test_pred = evaluation(
    train_data_scaled[features_transformations],
    train_data_scaled[target],
    test_data_scaled[features_transformations],
    test_data_scaled[target],
    model,
)
print("Train RMSE: {}".format(round(train_results, 5)))
print("Test RMSE: {}".format(round(test_results, 5)))

# Persist the scores, test predictions and test indices (the fitted estimator itself is not stored)
model_KNN = {
    "name": "KNN",
    "trainResults": train_results,
    "testResults": test_results,
    "predictions": y_test_pred,
    "indices": test_indices,
}
with open("../data/model_KNN.p", "wb") as fp:
    pickle.dump(model_KNN, fp)

Train RMSE: 0.0
Test RMSE: 0.22613


## Podsumowanie

Najlepsze wyniki walidacji krzyżowej uzyskano dla modelu bazującego na danych z transformacją zmiennych.

Ponadto optymalizacja hiperparametrów, usunięcie najmniej istotnych zmiennych oraz skalowanie wybranych cech pozwoliły na poprawę wyników.

Wyniki na zbiorze treningowym (RMSE): 0.0 (wartość ta wynika z wag `distance`: każda obserwacja treningowa jest własnym najbliższym sąsiadem w odległości 0, więc nie świadczy o jakości modelu).

Wyniki na zbiorze testowym (RMSE): 0.22613