# Import bibliotek

In [1]:
import inspect
import pickle
import typing

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import *
from sklearn.model_selection import train_test_split, KFold
from sklearn import neighbors
SEED = 17

# Ładowanie danych

In [2]:
# Load the EDA dataset and drop the 'Unnamed: 0' column right after reading
# (presumably a row index written by an earlier to_csv -- verify against the producer).
target = 'stars'
data = pd.read_csv("../data/data_eda.csv").drop(columns=['Unnamed: 0'])
# Every remaining column except the target is treated as a model feature.
features = list(data.columns)
features.remove(target)
data

Unnamed: 0,pages,stars,reviews,series,mix,character,plot,funny,lighthearted,emotional,...,author_stars,Fiction,Nonfiction,Literary,Fantasy,Crime,Social,Children,Romans,Realism
0,273,4.00,2017,0,0.44,0.51,0.02,0.27,0.37,0.91,...,4.305000,1,1,0,0,0,1,0,1,1
1,302,3.78,7330,0,0.39,0.42,0.17,0.03,0.01,0.18,...,3.670000,1,0,0,0,1,0,0,0,0
2,400,4.15,16761,0,0.51,0.39,0.08,0.02,0.01,0.88,...,0.000000,1,0,1,0,0,0,0,0,0
3,459,4.16,2128,1,0.48,0.10,0.40,0.04,0.02,0.07,...,0.000000,1,0,0,1,0,0,0,0,0
4,160,3.65,6634,1,0.28,0.16,0.54,0.92,0.73,0.00,...,4.115000,1,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6925,432,4.15,30643,0,0.48,0.05,0.46,0.00,0.00,0.40,...,3.856667,1,0,0,1,1,0,0,0,0
6926,352,3.62,1058,0,0.55,0.13,0.30,0.15,0.10,0.25,...,3.700000,1,0,0,1,0,1,0,0,0
6927,535,3.88,30975,1,0.45,0.08,0.45,0.14,0.19,0.31,...,3.870000,1,0,0,1,0,0,1,0,0
6928,472,3.88,5914,1,0.64,0.12,0.22,0.07,0.00,0.36,...,3.660000,1,0,0,1,0,0,1,0,0


$\text{Podział danych na zbiór treningowy i testowy}$

In [3]:
# Hold out 20% of the rows as the test set; SEED makes the split reproducible.
train_data, test_data = train_test_split(
    data,
    random_state=SEED,
    test_size=0.2,
)

In [4]:
# Names of metrics whose square root is the "root" variant that ``squared=False`` used to request.
_SQUARED_ERROR_METRIC_NAMES = frozenset({"mean_squared_error", "mean_squared_log_error"})


def _root_score(metric: typing.Any, y_true: typing.Any, y_pred: typing.Any) -> float:
    """
    Evaluate ``metric`` the way the old ``metric(y_true, y_pred, squared=False)`` call did,
    without depending on the ``squared`` keyword being available.

    The ``squared`` keyword of scikit-learn's squared-error metrics is deprecated and removed in
    newer releases, so for those metrics the root is taken explicitly. For single-output targets
    this equals the ``squared=False`` result.

    Args:
        metric (typing.Any): callable ``metric(y_true, y_pred)``
        y_true (typing.Any): true target values
        y_pred (typing.Any): predicted target values

    Returns:
        float: the root of the metric for squared-error metrics, the ``squared=False`` result for
            other metrics that accept that keyword, otherwise the plain metric value
    """
    if getattr(metric, "__name__", "") in _SQUARED_ERROR_METRIC_NAMES:
        return np.sqrt(metric(y_true, y_pred))
    try:
        parameters = inspect.signature(metric).parameters
    except (TypeError, ValueError):
        # Some callables expose no introspectable signature; call them without extra keywords.
        parameters = {}
    if "squared" in parameters:
        # Custom metric that still supports the keyword: keep the original call.
        return metric(y_true, y_pred, squared=False)
    return metric(y_true, y_pred)


def perform_cv(X: pd.DataFrame, y: pd.Series, algorithm: typing.Any, cv: typing.Any = KFold(n_splits=5, shuffle=True, random_state=SEED), metric: typing.Any = mean_squared_error) -> typing.Tuple[float, float]:
    """
    Perform cross-validation and return the mean train and validation scores

    With the default metric the scores are RMSE values.

    Args:
        X (pd.DataFrame): input data
        y (pd.Series): target data
        algorithm (typing.Any): algorithm to use for training and prediction (refitted on every fold)
        cv (typing.Any): cross-validation strategy exposing ``split(X, y)``
        metric (typing.Any): metric to use for evaluation

    Returns:
        typing.Tuple[float, float]: mean train score and mean validation score over all folds
    """
    train_scores = []
    validation_scores = []
    for train_idx, val_idx in cv.split(X, y):
        # The splitter yields positional indices, hence iloc.
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
        algorithm.fit(X_train, y_train)
        train_scores.append(_root_score(metric, y_train, algorithm.predict(X_train)))
        validation_scores.append(_root_score(metric, y_val, algorithm.predict(X_val)))
    return np.mean(train_scores), np.mean(validation_scores)

def evaluation(X_train: pd.DataFrame, y_train: pd.Series, X_test: pd.DataFrame, y_test: pd.Series, algorithm: typing.Any, metric: typing.Any = mean_squared_error) -> typing.Tuple[float, float, np.ndarray]:
    """
    Train the algorithm on the train data and evaluate on the train and test data

    With the default metric the scores are RMSE values.

    Args:
        X_train (pd.DataFrame): input train data
        y_train (pd.Series): target train data
        X_test (pd.DataFrame): input test data
        y_test (pd.Series): target test data
        algorithm (typing.Any): algorithm to use for training and prediction
        metric (typing.Any): metric to use for evaluation

    Returns:
        typing.Tuple[float, float, np.ndarray]: train_score, test_score, predictions on test data
    """
    algorithm.fit(X_train, y_train)
    y_train_pred = algorithm.predict(X_train)
    y_test_pred = algorithm.predict(X_test)
    train_results = _root_score(metric, y_train, y_train_pred)
    test_results = _root_score(metric, y_test, y_test_pred)
    return train_results, test_results, y_test_pred

Zobaczymy, czy w tym przypadku feature engineering poprawił jakość predykcyjną.

## Model bez feature engineeringu

In [5]:
# Baseline: kNN with 50 neighbours, cross-validated on the untransformed feature set.
model = neighbors.KNeighborsRegressor(n_jobs=-1, n_neighbors=50)
train_scores, validation_scores = perform_cv(
    X=train_data[features],
    y=train_data[target],
    algorithm=model,
)
print("Train RMSE:", train_scores)
print("Validation RMSE:", validation_scores)

Train RMSE: 0.2908859671453747
Validation RMSE: 0.297474944141276


## Model z interakcjami

In [6]:
# Same kNN setup, this time on the dataset read from data_add.csv (interaction features).
data_interactions = pd.read_csv("../data/data_add.csv").drop(columns=['Unnamed: 0'])
features_interactions = list(data_interactions.columns)
features_interactions.remove('stars')
# Same test_size and SEED as the earlier split.
train_data_interactions, test_data_interactions = train_test_split(
    data_interactions,
    random_state=SEED,
    test_size=0.2,
)
model = neighbors.KNeighborsRegressor(n_jobs=-1, n_neighbors=50)
train_scores, validation_scores = perform_cv(
    X=train_data_interactions[features_interactions],
    y=train_data_interactions[target],
    algorithm=model,
)
print("Train RMSE:", train_scores)
print("Validation RMSE:", validation_scores)

Train RMSE: 0.2897043196010073
Validation RMSE: 0.2952206662560052


## Model z transformacją zmiennych

In [7]:
# Same kNN setup, this time on the dataset read from data_fe.csv (transformed features).
data_transformations = pd.read_csv("../data/data_fe.csv").drop(columns=['Unnamed: 0'])
features_transformations = list(data_transformations.columns)
features_transformations.remove('stars')
# Same test_size and SEED as the earlier splits.
train_data_transformations, test_data_transformations = train_test_split(
    data_transformations,
    random_state=SEED,
    test_size=0.2,
)
model = neighbors.KNeighborsRegressor(n_jobs=-1, n_neighbors=50)
train_scores, validation_scores = perform_cv(
    X=train_data_transformations[features_transformations],
    y=train_data_transformations[target],
    algorithm=model,
)
print("Train RMSE:", train_scores)
print("Validation RMSE:", validation_scores)

Train RMSE: 0.25260235114061624
Validation RMSE: 0.25775968118975995


Widzimy, że w tym przypadku feature engineering znacząco poprawił moc predykcyjną modelu, lecz nadal jest ona słabsza niż w przypadku OLS. Spróbujemy to poprawić tuningiem hiperparametrów.

# Tuning hiperparametrów kNN

1. Wybór liczby najbliższych sąsiadów

In [8]:
# Sweep the neighbourhood size and report cross-validated train/validation scores for each k.
for k in [1, 3, 5, 10, 15, 30, 50, 100, 150, 200]:
    model = neighbors.KNeighborsRegressor(n_jobs=-1, n_neighbors=k)
    train_scores, validation_scores = perform_cv(
        X=train_data_transformations[features_transformations],
        y=train_data_transformations[target],
        algorithm=model,
    )
    print(
        "Neighbors: {}; RMSE train: {:.5f}; RMSE validation: {:.5f}".format(
            k, train_scores, validation_scores
        )
    )

Neighbors: 1; RMSE train: 0.00000; RMSE validation: 0.30768
Neighbors: 3; RMSE train: 0.18172; RMSE validation: 0.25928
Neighbors: 5; RMSE train: 0.20468; RMSE validation: 0.25255
Neighbors: 10; RMSE train: 0.22417; RMSE validation: 0.24995
Neighbors: 15; RMSE train: 0.23321; RMSE validation: 0.25031
Neighbors: 30; RMSE train: 0.24438; RMSE validation: 0.25278
Neighbors: 50; RMSE train: 0.25260; RMSE validation: 0.25776
Neighbors: 100; RMSE train: 0.26136; RMSE validation: 0.26389
Neighbors: 150; RMSE train: 0.26634; RMSE validation: 0.26792
Neighbors: 200; RMSE train: 0.26991; RMSE validation: 0.27090


Wydaje się, że najlepsza liczba sąsiadów wynosi 10.

2. Wybór miary dystansu

In [9]:
# Compare distance metrics at k=10.
# NOTE(review): 'minkowski' is used with the estimator's default power parameter, which
# presumably makes it equivalent to 'euclidean' (their scores in the output coincide) -- confirm.
distances = ['euclidean', 'manhattan', 'chebyshev', 'minkowski']
for distance in distances:
    model = neighbors.KNeighborsRegressor(metric=distance, n_jobs=-1, n_neighbors=10)
    train_scores, validation_scores = perform_cv(
        X=train_data_transformations[features_transformations],
        y=train_data_transformations[target],
        algorithm=model,
    )
    print(
        "Distance: {}; RMSE train: {:.5f}; RMSE validation: {:.5f}".format(
            distance, train_scores, validation_scores
        )
    )

Distance: euclidean; RMSE train: 0.22417; RMSE validation: 0.24995
Distance: manhattan; RMSE train: 0.21356; RMSE validation: 0.23689
Distance: chebyshev; RMSE train: 0.23902; RMSE validation: 0.26541
Distance: minkowski; RMSE train: 0.22417; RMSE validation: 0.24995


Lepszy wynik dla miary Manhattan

3. Wybór funkcji wag w predykcjach

In [10]:
# Compare neighbour weighting schemes at k=10 with the Manhattan distance.
weights = ['uniform', 'distance']
for weight in weights:
    model = neighbors.KNeighborsRegressor(
        metric='manhattan',
        n_jobs=-1,
        n_neighbors=10,
        weights=weight,
    )
    train_scores, validation_scores = perform_cv(
        X=train_data_transformations[features_transformations],
        y=train_data_transformations[target],
        algorithm=model,
    )
    print(
        "Weight: {}; RMSE train: {:.5f}; RMSE validation: {:.5f}".format(
            weight, train_scores, validation_scores
        )
    )

Weight: uniform; RMSE train: 0.21356; RMSE validation: 0.23689
Weight: distance; RMSE train: 0.00000; RMSE validation: 0.22873


Lepszy wynik przy odwrotności odległości jako wagach (bliżsi sąsiedzi punktu mają większy wpływ niż sąsiedzi, którzy są dalej).

4. Wybór zmiennych

In [11]:
# Score every feature on its own: fit the tuned kNN on a single column and keep the
# cross-validated validation score for that feature.
results = {}
for feature in features_transformations:
    model = neighbors.KNeighborsRegressor(
        metric='manhattan',
        n_jobs=-1,
        n_neighbors=10,
        weights='distance',
    )
    train_scores, validation_scores = perform_cv(
        X=train_data_transformations[[feature]],
        y=train_data_transformations[target],
        algorithm=model,
    )
    results[feature] = validation_scores
# Re-key the dict in ascending order of validation score (lowest error first).
results = {name: results[name] for name in sorted(results, key=results.get)}
results

{'author_stars': 0.2764297242255428,
 'inspiring_log': 0.28531859636488044,
 'Fiction_mix_qt': 0.2907843972716632,
 'Fiction_emotional_boxcox': 0.2920820264250924,
 'Fiction_hopeful_rs': 0.2923358930926584,
 'Nonfiction_author_stars': 0.29430447287788725,
 'series_author_stars': 0.29502469862736386,
 'Social_author_stars': 0.29595190476471733,
 'Fantasy_author_stars': 0.29646088661698056,
 'Nonfiction_dark_ss': 0.29708194065932386,
 'Nonfiction_author_count_ss': 0.2983919369702595,
 'series_mix_qt': 0.298477414262312,
 'emotional_boxcox': 0.3003059294720321,
 'informative_boxcox': 0.3008279535811423,
 'Social_challenging_mm': 0.30111730615497556,
 'Fantasy_mix_qt': 0.3018817297166998,
 'Literary_emotional_boxcox': 0.3020605393435344,
 'Nonfiction_challenging_mm': 0.3025284569433614,
 'Literary_inspiring_log': 0.30254866076248715,
 'Fiction_adventurous_qt': 0.3035452760603026,
 'Literary_hopeful_rs': 0.30479530126108056,
 'Crime': 0.3048286850096803,
 'Romans': 0.3054794874342523,
 'Fic

Bardzo porównywalne wyniki dla każdej zmiennej

Poniżej zweryfikowano wyniki w sytuacji, gdy iteracyjnie usuwamy zmienne (w kolejności od najmniej istotnej do najbardziej istotnej, sugerując się powyższymi rezultatami).

In [12]:
# Drop features one at a time, starting with the one whose single-feature validation score was
# highest (worst), and re-run cross-validation on whatever is left after each removal.
descending_results = {
    name: results[name] for name in sorted(results, key=results.get, reverse=True)
}
features_transformations_copy = list(features_transformations)
for feature, score in descending_results.items():
    # Stop before the feature set would become empty.
    if len(features_transformations_copy) == 1:
        break
    features_transformations_copy.remove(feature)
    model = neighbors.KNeighborsRegressor(
        metric='manhattan',
        n_jobs=-1,
        n_neighbors=10,
        weights='distance',
    )
    train_scores, validation_scores = perform_cv(
        X=train_data_transformations[features_transformations_copy],
        y=train_data_transformations[target],
        algorithm=model,
    )
    print(
        "Feature removed: {}; RMSE train: {:.5f}; RMSE validation: {:.5f}".format(
            feature, train_scores, validation_scores
        )
    )

Feature removed: reviews_boxcox; RMSE train: 0.00000; RMSE validation: 0.22907
Feature removed: tense_rs_dark_ss; RMSE train: 0.00000; RMSE validation: 0.22906
Feature removed: lighthearted_mm__funny_qt; RMSE train: 0.00000; RMSE validation: 0.22936
Feature removed: lighthearted_mm__relaxing_boxcox; RMSE train: 0.00000; RMSE validation: 0.22573
Feature removed: series_inspiring_log; RMSE train: 0.00000; RMSE validation: 0.22545
Feature removed: Literary_author_stars; RMSE train: 0.00000; RMSE validation: 0.22566
Feature removed: reflective_boxcox; RMSE train: 0.00000; RMSE validation: 0.22569
Feature removed: Fiction_pages_qt; RMSE train: 0.00000; RMSE validation: 0.22649
Feature removed: sad_mm; RMSE train: 0.00000; RMSE validation: 0.22752
Feature removed: challenging_mm; RMSE train: 0.00000; RMSE validation: 0.22779
Feature removed: Fiction_character_ss; RMSE train: 0.00000; RMSE validation: 0.22743
Feature removed: mysterious_rs; RMSE train: 0.00000; RMSE validation: 0.22686
Featur

Najlepszy wynik przy usunięciu zmiennych: reviews_boxcox, tense_rs_dark_ss, lighthearted_mm__funny_qt, lighthearted_mm__relaxing_boxcox, series_inspiring_log.

Zmienne zostaną usunięte ze zbioru danych

In [13]:
features_to_remove = ["reviews_boxcox", "tense_rs_dark_ss", "lighthearted_mm__funny_qt", "lighthearted_mm__relaxing_boxcox", "series_inspiring_log"]
# Guard against typos: every name must be a real column of the transformed dataset. The check
# is made against the frame, not the feature list, so it still passes when the cell is re-run.
unknown_features = set(features_to_remove) - set(data_transformations.columns)
assert not unknown_features, "Unknown features: {}".format(sorted(unknown_features))
# Rebuild the list instead of calling list.remove() in place: re-running this cell is then a
# no-op rather than a ValueError for names that are already gone.
features_transformations = [name for name in features_transformations if name not in features_to_remove]

Zweryfikujmy, czy usunięcie zmiennych poprawi wynik modelu.

In [14]:
# Re-evaluate the tuned kNN on the reduced feature list.
model = neighbors.KNeighborsRegressor(
    metric='manhattan',
    n_jobs=-1,
    n_neighbors=10,
    weights='distance',
)
train_scores, validation_scores = perform_cv(
    X=train_data_transformations[features_transformations],
    y=train_data_transformations[target],
    algorithm=model,
)
print("Train RMSE:", train_scores)
print("Validation RMSE:", validation_scores)

Train RMSE: 0.0
Validation RMSE: 0.2254511593378382


Usunięcie wybranych zmiennych poprawiło wynik modelu dla zbioru walidacyjnego (z 0.22873 na 0.22545).

5. Skalowanie zmiennych

W poniższym punkcie zweryfikowano wpływ dwukrotnego zwiększenia wartości zmiennej.

W tym celu iteracyjnie podwajano wartości zmiennej i sprawdzano, czy model zwraca lepsze wyniki.

In [15]:
# For each feature in turn: double that single column (all others untouched), cross-validate
# the tuned kNN and record the validation score.
results2 = {}
for feature in features_transformations:
    # assign() returns a new frame, so the original training data is never modified.
    train_data_transformations_copy = train_data_transformations.assign(
        **{feature: train_data_transformations[feature] * 2}
    )
    model = neighbors.KNeighborsRegressor(
        metric='manhattan',
        n_jobs=-1,
        n_neighbors=10,
        weights='distance',
    )
    train_scores, validation_scores = perform_cv(
        X=train_data_transformations_copy[features_transformations],
        y=train_data_transformations_copy[target],
        algorithm=model,
    )
    results2[feature] = validation_scores
# Re-key the dict in ascending order of validation score (lowest error first).
results2 = {name: results2[name] for name in sorted(results2, key=results2.get)}
results2

{'author_stars': 0.22376032555967176,
 'emotional_boxcox': 0.22437562431221214,
 'Fiction_mix_qt': 0.2246592205448304,
 'Fiction_pages_qt': 0.22481286575407777,
 'Fiction_emotional_boxcox': 0.22485181681732672,
 'Literary_author_stars': 0.2248595258872267,
 'Fiction_author_count_ss': 0.22491466138966545,
 'Fiction_adventurous_qt': 0.22505001706161593,
 'Fantasy_author_stars': 0.22508901636247053,
 'Literary_emotional_boxcox': 0.22510485442004297,
 'Fantasy_mix_qt': 0.2251297159429444,
 'series_mix_qt': 0.22514856312647263,
 'Nonfiction_author_count_ss': 0.22518891939391109,
 'Literary_mysterious_rs': 0.22525384700268375,
 'mysterious_rs': 0.2252746969331448,
 'Fiction_plot_qt': 0.22533144123470464,
 'sad_mm': 0.22534526232439914,
 'series_author_stars': 0.22535895419518806,
 'Fiction_character_ss': 0.2253852921817016,
 'Fiction_hopeful_rs': 0.22538883644077248,
 'Nonfiction_author_stars': 0.22543488035062964,
 'Fiction_challenging_mm': 0.2254525664257779,
 'challenging_mm': 0.225532621

Zmienne z najlepszym wynikiem podwajamy, a te z najgorszym zmniejszamy o połowę.

Jako próg odcięcia posłuży średnia wartość powyższych rezultatów.

Zmienne, po których zwiększeniu uzyskano wynik mniejszy niż średnia (bo metryka to RMSE, więc im mniejsza wartość tym lepiej), zostaną dwukrotnie zwiększone.

In [16]:
# Cut-off: the mean validation score over all single-feature doubling experiments.
threshold = np.mean(list(results2.values()))
# Double every feature whose doubling experiment scored below the cut-off; the rest stay as is.
train_data_transformations_copy = train_data_transformations.assign(
    **{
        feature: train_data_transformations[feature] * 2
        for feature, score in results2.items()
        if score < threshold
    }
)
model = neighbors.KNeighborsRegressor(
    metric='manhattan',
    n_jobs=-1,
    n_neighbors=10,
    weights='distance',
)
train_scores, validation_scores = perform_cv(
    X=train_data_transformations_copy[features_transformations],
    y=train_data_transformations_copy[target],
    algorithm=model,
)
print("Train RMSE:", train_scores)
print("Validation RMSE:", validation_scores)

Train RMSE: 0.0
Validation RMSE: 0.22176314629610322


Warto również zweryfikować, czy zmniejszenie wartości pozostałych zmiennych (mniej istotnych) nie wpłynie na poprawę wyników.

In [17]:
# Cut-off: the mean validation score over all single-feature doubling experiments.
threshold = np.mean(list(results2.values()))
# Features below the cut-off are doubled, all remaining features are halved.
train_data_transformations_copy = train_data_transformations.assign(
    **{
        feature: train_data_transformations[feature] * (2 if score < threshold else 0.5)
        for feature, score in results2.items()
    }
)
model = neighbors.KNeighborsRegressor(
    metric='manhattan',
    n_jobs=-1,
    n_neighbors=10,
    weights='distance',
)
train_scores, validation_scores = perform_cv(
    X=train_data_transformations_copy[features_transformations],
    y=train_data_transformations_copy[target],
    algorithm=model,
)
print("Train RMSE:", train_scores)
print("Validation RMSE:", validation_scores)

Train RMSE: 0.0
Validation RMSE: 0.22041794184475266


Ponownie poprawa wyników dla zbioru walidacyjnego: względem poprzedniego punktu z 0.22176 na 0.22042, a względem modelu przed skalowaniem zmiennych z 0.22545 na 0.22042.

## Zapisanie modelu

In [18]:
# Final per-feature scaling factors: x2 for features whose doubling experiment scored below the
# mean of all doubling experiments, x0.5 for the rest. The threshold is recomputed here so this
# cell does not rely on a variable left over from an earlier cell.
threshold = np.mean(list(results2.values()))
scale_factors = {feature: (2 if score < threshold else 0.5) for feature, score in results2.items()}

# Scale into NEW frames instead of overwriting the split frames in place: re-running this cell
# no longer compounds the scaling, and no assignment is made into frames produced by the split.
train_data_scaled = train_data_transformations.assign(
    **{feature: train_data_transformations[feature] * factor for feature, factor in scale_factors.items()}
)
test_data_scaled = test_data_transformations.assign(
    **{feature: test_data_transformations[feature] * factor for feature, factor in scale_factors.items()}
)

test_indices = test_data_scaled.index

# Model evaluation
model = neighbors.KNeighborsRegressor(n_neighbors=10, n_jobs=-1, metric='manhattan', weights='distance')
train_results, test_results, y_test_pred = evaluation(train_data_scaled[features_transformations], train_data_scaled[target], test_data_scaled[features_transformations], test_data_scaled[target], model)
print("Train RMSE: {}".format(round(train_results, 5)))
print("Test RMSE: {}".format(round(test_results, 5)))

# Save the model summary (scores, test predictions and their row indices; the fitted
# estimator itself is not part of the pickled dict).
model_KNN = {
    "name": "KNN",
    "trainResults": train_results,
    "testResults": test_results,
    "predictions": y_test_pred,
    "indices": test_indices,
}
with open("../data/model_KNN.p", "wb") as fp:
    pickle.dump(model_KNN, fp)

Train RMSE: 0.0
Test RMSE: 0.225


## Podsumowanie

$\text{Najlepsze wyniki walidacji krzyżowej uzyskano dla modelu bazującego na danych z transformacją zmiennych}$<p>
$\text{Ponadto, optymalizacja hiperparametrów, usunięcie najmniej istotnych zmiennych oraz skalowanie wybranych cech pozwoliło na poprawę wyników}$<p>
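$\text{Wyniki na zbiorze treningowym (RMSE): 0.0}$ (wartość nieinformatywna: przy wagach `weights='distance'` każda obserwacja treningowa jest swoim własnym sąsiadem w odległości 0, więc model odtwarza jej wartość docelową)<p>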
$\text{Wyniki na zbiorze testowym (RMSE): 0.225}$