# Import bibliotek

In [61]:
import pandas as pd
import numpy as np
import pickle
import statsmodels.api as sm
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn import neighbors

# Oszacowanie modelu kNN

In [62]:
def CVTestKNN(nFolds = 5, randomState=2024, debug=False, features=None, *args, **kwargs):
    """Cross-validate a ``KNeighborsRegressor`` on the notebook-level ``df``.

    The data frame ``df`` and the target column name ``target`` are read from
    the notebook's global namespace at call time.

    Parameters
    ----------
    nFolds : int
        Number of shuffled K-fold splits.
    randomState : int
        Seed passed to ``KFold`` so the splits are reproducible.
    debug : bool
        When true, print the estimator and the per-fold train/validation RMSE.
    features : list of str, optional
        Columns used as predictors. When omitted, the global ``features``
        list is looked up at call time (not at function-definition time).
    *args, **kwargs
        Forwarded unchanged to ``neighbors.KNeighborsRegressor``.

    Returns
    -------
    tuple of four lists, one entry per fold
        ``(trainResults, testResults, predictions, indices)``: train RMSE,
        validation RMSE, validation predictions, and the ``df`` index labels
        of the validation rows.

    Raises
    ------
    NameError
        If ``features`` is omitted and no global ``features`` is defined.
    """
    if features is None:
        # A default written as `features=features` would be evaluated once, when
        # the `def` statement runs: it fails on a fresh kernel and goes stale
        # after another data set is loaded. Resolve the global lazily instead.
        try:
            features = globals()["features"]
        except KeyError:
            raise NameError(
                "CVTestKNN: pass `features=` or define a global `features` list first"
            ) from None

    kf = KFold(n_splits=nFolds, shuffle=True, random_state=randomState)
    # Per-fold result containers
    testResults = []
    trainResults = []
    predictions = []
    indices = []
    # Validate the model on each fold in turn
    for train, test in kf.split(df.index.values):
        # Fresh estimator for every fold
        clf = neighbors.KNeighborsRegressor(*args, **kwargs)
        if debug:
            print(clf)
        # Slice each partition once instead of re-indexing for every use
        trainDf = df.iloc[train]
        testDf = df.iloc[test]
        # Fit the model
        clf.fit(trainDf[features], trainDf[target])
        # Predictions for the training and validation partitions
        predsTrain = clf.predict(trainDf[features])
        preds = clf.predict(testDf[features])
        # Keep this fold's predictions ...
        predictions.append(preds.tolist())
        # ... together with their index labels in the original data frame
        indices.append(testDf.index.tolist())
        # RMSE for this fold
        trainScore = metrics.mean_squared_error(trainDf[target], predsTrain)**0.5
        testScore = metrics.mean_squared_error(testDf[target], preds)**0.5
        # Store the fold scores
        trainResults.append(trainScore)
        testResults.append(testScore)
        # Optional per-fold progress report
        if debug:
            print("Train RMSE:", trainScore,
                  "Valid RMSE:", testScore)

    return trainResults, testResults, predictions, indices

Zobaczymy, czy w tym przypadku feature engineering poprawił jakość predykcyjną:

Model bez feature engineeringu:

In [55]:
# Data set without feature engineering.
target = 'stars'
# 'Unnamed: 0' is presumably a row index saved by an earlier to_csv call - TODO confirm.
df = pd.read_csv("data_eda.csv").drop(columns=['Unnamed: 0'])
# Every remaining column except the target is used as a predictor.
features = list(df.columns)
features.remove(target)

In [57]:
# Cross-validate a 50-neighbour model (Minkowski p=2) on the current feature set.
trainResults, testResults, predictions, indices = CVTestKNN(
    n_neighbors=50,
    p=2,
    n_jobs=-1,
    features=features,
    debug=False,
)
# Mean validation RMSE over the folds.
print(np.mean(testResults))

0.29718792358784857


Model z interakcjami:

In [63]:
# Data set with interaction variables added.
target = 'stars'
# 'Unnamed: 0' is presumably a row index saved by an earlier to_csv call - TODO confirm.
df = pd.read_csv("data_add.csv").drop(columns=['Unnamed: 0'])
# Every remaining column except the target is used as a predictor.
features = list(df.columns)
features.remove(target)

In [64]:
# Same 50-neighbour, p=2 configuration, now on the data set with interactions.
trainResults, testResults, predictions, indices = CVTestKNN(
    n_neighbors=50,
    p=2,
    n_jobs=-1,
    features=features,
    debug=False,
)
# Mean validation RMSE over the folds.
print(np.mean(testResults))

0.295654458494012


Model z transformacją zmiennych:

In [58]:
# Data set with transformed variables.
target = 'stars'
# 'Unnamed: 0' is presumably a row index saved by an earlier to_csv call - TODO confirm.
df = pd.read_csv("data_fe.csv").drop(columns=['Unnamed: 0'])
# Every remaining column except the target is used as a predictor.
features = list(df.columns)
features.remove(target)

In [59]:
# Same 50-neighbour, p=2 configuration, now on the data set with transformed variables.
trainResults, testResults, predictions, indices = CVTestKNN(
    n_neighbors=50,
    p=2,
    n_jobs=-1,
    features=features,
    debug=False,
)
# Mean validation RMSE over the folds.
print(np.mean(testResults))

0.25651391317177474


Widzimy, że w tym przypadku feature engineering znacząco poprawił moc predykcyjną modelu, lecz jest ona nadal słabsza niż w przypadku OLS. Spróbujemy to poprawić tuningiem hiperparametrów.

# Tuning hiperparametrów kNN

1. Wybór liczby najbliższych sąsiadów

In [4]:
# Sweep the number of neighbours with the Minkowski p=2 metric.
# Each line prints: k, mean train RMSE, mean validation RMSE.
for k in [1, 3, 5, 10, 15, 30, 50, 100, 150, 200]:
    # Pass the feature list explicitly so the run always uses the feature set
    # of the currently loaded data rather than an implicit default.
    trainResults, testResults, predictions, indices = CVTestKNN(features=features, n_neighbors=k, n_jobs=-1, p=2)
    print(k, np.mean(trainResults), np.mean(testResults))

1 0.0 0.30081044226868137
3 0.1801776704743168 0.2575041102974596
5 0.20297742763910342 0.2506271471840048
10 0.22334966818424307 0.24726283974535637
15 0.23184142042316824 0.24904974090543947
30 0.24353507502423585 0.2518807303043076
50 0.25153642732533055 0.25651391317177474
100 0.2609955583567132 0.26356092682709814
150 0.2651254238254027 0.2670667462538331
200 0.26872643441994504 0.2703872320874618


Wydaje się, że najlepsza jest liczba 10 sąsiadów (najniższy średni RMSE na zbiorach walidacyjnych).


2. Wybór miary dystansu

In [5]:
# Repeat the search around the best k with the Minkowski p=1 (Manhattan) metric.
# Each line prints: k, mean validation RMSE.
for k in [5, 10, 15]:
    # Pass the feature list explicitly so the run always uses the feature set
    # of the currently loaded data rather than an implicit default.
    trainResults, testResults, predictions, indices = CVTestKNN(features=features, n_neighbors=k, n_jobs=-1, p=1)
    print(k, np.mean(testResults))

5 0.2400728402173018
10 0.23714754587900191
15 0.2371800109899858


Lepszy wynik dla miary Manhattan

3. Wybór funkcji wag w predykcjach

In [7]:
# Compare uniform and inverse-distance neighbour weighting for k=10, p=1.
# Each line prints: weighting scheme, mean validation RMSE.
for weighting in ['uniform', 'distance']:
    # Pass the feature list explicitly so the run always uses the feature set
    # of the currently loaded data rather than an implicit default.
    trainResults, testResults, predictions, indices = CVTestKNN(features=features, weights=weighting, n_neighbors=10, n_jobs=-1, p=1)
    print(weighting, np.mean(testResults))

uniform 0.23714754587900191
distance 0.22715809432132272


Lepszy wynik przy odwrotności odległości jako wagach (bliżsi sąsiedzi punktu mają większy wpływ niż sąsiedzi, którzy są dalej).

4. Wybór zmiennych

In [17]:
# Fit a separate model on every variable on its own to rank the variables.
results = []
for feature in features:
    # Cross-validate using this single variable as the only predictor
    trainResults, testResults, predictions, indices = CVTestKNN(
        features=[feature],
        n_neighbors=10,
        p=1,
        weights='distance',
        n_jobs=-1,
    )
    # Record (variable name, mean validation RMSE)
    results.append((feature, np.mean(testResults)))
# Lowest mean validation RMSE first
results.sort(key=lambda pair: pair[1])
results

[('author_stars', 0.28681211312172755),
 ('series_author_stars', 0.2891324674665949),
 ('Nonfiction_author_stars', 0.29510628651844895),
 ('Nonfiction_challenging_mm', 0.29678866578420654),
 ('Fiction_mix_qt', 0.29733237368174076),
 ('Social_author_stars', 0.29834374403982455),
 ('Literary_author_stars', 0.2988068981036811),
 ('inspiring_log', 0.29895734662268425),
 ('Fiction_emotional_boxcox', 0.29923728659543575),
 ('Literary_emotional_boxcox', 0.30067157297427843),
 ('series_inspiring_log', 0.30090466749258127),
 ('Nonfiction_author_count_ss', 0.3009893021621016),
 ('Nonfiction_dark_ss', 0.30138628948276946),
 ('Fiction_hopeful_rs', 0.3014319109482832),
 ('Fiction_author_count_ss', 0.3027270556529116),
 ('Literary_inspiring_log', 0.30350403642234436),
 ('series_mix_qt', 0.3048832060449637),
 ('Social_challenging_mm', 0.30551112465428865),
 ('Literary_hopeful_rs', 0.3066110470894786),
 ('emotional_boxcox', 0.3084932117671919),
 ('Fantasy_mix_qt', 0.31036980316483004),
 ('Romans', 0.3

Bardzo porównywalne wyniki dla każdej zmiennej

In [26]:
# Walk through the variables from the worst single-variable RMSE to the best.
# At each step, keep only the variables whose single-variable RMSE is strictly
# lower than the current one and cross-validate that subset. The last entry is
# skipped because it would leave no variables.
results.sort(key=lambda pair: pair[1], reverse=True)
for droppedName, droppedScore in results[:-1]:
    featuresSet2 = [name for name, score in results if score < droppedScore]
    trainResults, testResults, predictions, indices = CVTestKNN(
        features=featuresSet2,
        n_neighbors=10,
        p=1,
        weights='distance',
        n_jobs=-1,
    )
    # Name of the last variable removed and the resulting mean validation RMSE
    print(droppedName, np.mean(testResults))

reviews_boxcox 0.22710525096738846
tense_rs_dark_ss 0.2271355481851018
lighthearted_mm__relaxing_boxcox 0.224529222133442
lighthearted_mm__funny_qt 0.22439216786036442
mysterious_rs 0.22478657963830173
reflective_boxcox 0.22502317996537502
sad_mm 0.22531469839581514
Crime 0.22502627085510368
challenging_mm 0.22578488339582892
informative_boxcox 0.22274739606854554
Fiction_adventurous_qt 0.22299010544814063
Fiction_challenging_mm 0.22363147324816687
Fiction_plot_qt 0.22410069929760948
Fiction_character_ss 0.22406840199964284
Fantasy_author_stars 0.22400293165855345
Fiction_pages_qt 0.2237239657528983
Literary_mysterious_rs 0.22347514845730693
Romans 0.22354670349230435
Fantasy_mix_qt 0.224240084712019
emotional_boxcox 0.2263706605238595
Literary_hopeful_rs 0.22614166238535086
Social_challenging_mm 0.22712460213166802
series_mix_qt 0.22796110467003103
Literary_inspiring_log 0.2268645976854101
Fiction_author_count_ss 0.22638887247216352
Fiction_hopeful_rs 0.22695072741661262
Nonfiction_da

Najlepszy wynik przy usunięciu zmiennych: reviews_boxcox, tense_rs_dark_ss, lighthearted_mm__relaxing_boxcox, lighthearted_mm__funny_qt, mysterious_rs, reflective_boxcox, sad_mm, Crime, challenging_mm, informative_boxcox.

In [35]:
# Keep the variables whose single-variable RMSE is below a hard-coded cut-off.
# NOTE(review): 0.31785 was presumably picked so that exactly the variables
# listed above are dropped - confirm against the full `results` list.
features = [name for name, score in results if score < 0.31785]

5. Skalowanie zmiennych

In [30]:
# Sensitivity of the CV error to the scale of each variable:
# double one variable at a time and re-validate the model.
dfCopy = df.copy()
results2 = []
try:
    for feature in features:
        # Start every trial from the unscaled data so scalings do not accumulate
        df = dfCopy.copy()
        # Double the variable
        df[feature] = df[feature] * 2
        # Validate the model (CVTestKNN reads the global df, hence the reassignment above)
        trainResults, testResults, predictions, indices = CVTestKNN(weights='distance', n_neighbors=10, n_jobs=-1, p=1, features=features)
        results2.append((feature, np.mean(testResults)))
finally:
    # Restore the unscaled data even if a run fails or the cell is interrupted,
    # so later cells never see a half-modified df.
    df = dfCopy.copy()
# Lowest mean validation RMSE first
results2 = sorted(results2, key=lambda x: x[1], reverse=False)
results2

[('author_stars', 0.22198043295205835),
 ('Fiction_challenging_mm', 0.22207283294488459),
 ('Fiction_mix_qt', 0.2222923675547234),
 ('Literary_author_stars', 0.2223371399802551),
 ('Nonfiction_challenging_mm', 0.22234718583926374),
 ('Fantasy_mix_qt', 0.22244283016634192),
 ('Fiction_emotional_boxcox', 0.22247104377772192),
 ('Fiction_plot_qt', 0.2225133610460412),
 ('emotional_boxcox', 0.22251958961763135),
 ('series_mix_qt', 0.22253571551854553),
 ('Social_challenging_mm', 0.22262037300948906),
 ('Fantasy_author_stars', 0.22266196306275626),
 ('Literary_emotional_boxcox', 0.2227120792191128),
 ('Nonfiction_author_stars', 0.22273073768545312),
 ('Nonfiction_dark_ss', 0.22275095872441994),
 ('Fiction_adventurous_qt', 0.22277835170443056),
 ('series_author_stars', 0.22284355185431814),
 ('Nonfiction_author_count_ss', 0.22285111400798918),
 ('Social_author_stars', 0.22291283892288688),
 ('Fiction_pages_qt', 0.2229586238417117),
 ('Literary_hopeful_rs', 0.22301648840022836),
 ('Fiction_au

Zmienne z najlepszym wynikiem podwajamy, a zmienne z najgorszym zmniejszamy o połowę.

In [36]:
# The threshold matches the mean validation RMSE printed for the selected
# feature set in the variable-selection step. Variables whose doubling beat it
# are doubled; variables whose doubling did worse are halved.
features2 = [x for x,y in results2 if y < 0.22274739606854554]
features05 = [x for x,y in results2 if y > 0.22274739606854554]
dfCopy = df.copy()
try:
    for feature in features2:
        # Double the variable
        df[feature] = df[feature]*2
    for feature in features05:
        # Halve the variable
        df[feature] = df[feature]*0.5
    # Validate the model once, after all rescalings have been applied
    trainResults, testResults, predictions, indices = CVTestKNN(weights='distance', n_neighbors=10, n_jobs=-1, p=1, features=features)
    print(np.mean(testResults))
finally:
    # Restore the unscaled data even if validation fails or is interrupted
    df = dfCopy.copy()

0.21798414066396382


In [39]:
# Stricter thresholds than in the previous experiment: only clearly helpful
# variables (< 0.2224) are doubled and only clearly harmful ones (> 0.223) are
# halved. Variables in between keep their original scale.
features2 = [x for x,y in results2 if y < 0.2224]
features05 = [x for x,y in results2 if y > 0.223]
dfCopy = df.copy()
try:
    for feature in features2:
        # Double the variable
        df[feature] = df[feature]*2
    for feature in features05:
        # Halve the variable
        df[feature] = df[feature]*0.5
    # Validate the model once, after all rescalings have been applied
    trainResults, testResults, predictions, indices = CVTestKNN(weights='distance', n_neighbors=10, n_jobs=-1, p=1, features=features)
    print(np.mean(testResults))
finally:
    # Restore the unscaled data even if validation fails or is interrupted
    df = dfCopy.copy()

0.21710026652719422


In [54]:
# First tier: double the helpful variables (< 0.2224), halve the harmful ones (> 0.223).
features2 = [x for x,y in results2 if y < 0.2224]
features05 = [x for x,y in results2 if y > 0.223]
# Second tier, applied on top of the first: the thresholds are stricter, so
# these variables are subsets of the first-tier lists and end up scaled by
# 2 * 1.5 = 3 and 0.5 * 0.75 = 0.375 respectively.
features3 = [x for x,y in results2 if y < 0.222]
features03 = [x for x,y in results2 if y > 0.2235]

dfCopy = df.copy()
try:
    for feature in features2:
        # Double the variable
        df[feature] = df[feature]*2
    for feature in features05:
        # Halve the variable
        df[feature] = df[feature]*0.5

    for feature in features3:
        # Scale the variable up by a further factor of 1.5
        df[feature] = df[feature]*1.5
    for feature in features03:
        # Scale the variable down by a further factor of 0.75
        df[feature] = df[feature]*0.75
    # Validate the model once, after all rescalings have been applied
    trainResults, testResults, predictions, indices = CVTestKNN(weights='distance', n_neighbors=10, n_jobs=-1, p=1, features=features)
    print(np.mean(testResults))
finally:
    # Restore the unscaled data even if validation fails or is interrupted
    df = dfCopy.copy()

0.21691452248443724


Ostateczny wynik modelu kNN (*0.21691*) jest nieznacznie gorszy od wyniku OLS.