In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from scipy import stats
import seaborn as sns
from scipy.linalg import sqrtm
from sklearn.metrics import mean_squared_error
from math import sqrt

plt.style.use('fivethirtyeight')

In [None]:
# Common S3 location of every CSV file used in this notebook.
DATA_BASE_URL = 'https://ids-storage-football-prediction.s3-eu-west-1.amazonaws.com/data_mmwd'

REVIEWS_URL = f'{DATA_BASE_URL}/reviews.csv'
# Reviews enriched with the `language` and `polarity` columns.
REVIEWS_AND_POLARITY_URL = f'{DATA_BASE_URL}/reviews_en_polarity.csv'
CALENDAR_URL = f'{DATA_BASE_URL}/calendar.csv'
LISTINGS_URL = f'{DATA_BASE_URL}/listings.csv'

**Załadowanie danych przedstawiających recenzje użytkowników**


Dane są umieszczone w pliku o formacie csv:

In [None]:
# Load the reviews CSV that already carries the `language` and `polarity` columns.
reviews = pd.read_csv(REVIEWS_AND_POLARITY_URL)

In [None]:
# Preview the first rows to check the columns that were loaded.
reviews.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,language,polarity
0,36642,63694,2010-07-11,126491,Robert,I gave Donna stars only because she is new t...,en,0.807
1,36642,92464,2010-09-07,204008,Eduardo,Donna is a nice and pleasant person. We enjoye...,en,0.985
2,36642,100164,2010-09-19,164372,Asha,"Our trip was too short in Ottawa, we really wi...",en,0.743
3,36642,262662,2011-05-14,414166,Emie,Donna is a nice person. I had a short stay at ...,en,0.8304
4,36642,266613,2011-05-16,562356,Deepak,Our stay at Donna's home was very pleasant. Sh...,en,0.9537


Polarity to wskaźnik określający, jak bardzo pozytywna była recenzja, wyrażony w skali od 0 do 1.

In [None]:
# Keep only the (reviewer, listing, polarity) triple the recommender works on.
rec_columns = ['reviewer_id', 'listing_id', 'polarity']
df_rec = reviews.loc[:, rec_columns]

In [None]:
# Number of distinct reviewers, listings and polarity values.
df_rec.nunique()

reviewer_id    75014
listing_id      2183
polarity        4614
dtype: int64

**Budowa systemu rekomendacji na podstawie polarity**

In [None]:
# Utility matrix: one row per reviewer, one column per listing, polarity in the cells.
# It is extremely sparse (mostly NaN), which is the classic recommender-system problem.
matrix = pd.pivot_table(
    df_rec,
    index='reviewer_id',
    columns='listing_id',
    values='polarity',
)
matrix.head()

listing_id,36642,59258,158824,261065,336692,365499,413300,490182,674799,678641,682632,682634,685334,764035,793593,875133,902575,943979,987450,1026310,1038420,1097429,1113071,1135469,1151370,1225070,1242836,1247877,1450947,1520644,1574060,1587517,1617217,1641234,1650642,1656607,1676024,1732533,1774018,1823110,...,45201459,45201519,45201564,45201639,45201731,45202463,45203975,45283796,45309318,45332262,45345364,45349175,45359931,45406789,45406790,45406794,45406800,45406836,45406862,45406863,45412462,45423886,45426776,45455016,45468443,45494801,45547813,45568790,45594322,45610710,45649168,45668365,45669755,45682632,45713759,45714208,45725137,45768503,45774987,45796523
reviewer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
3527,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3708,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9176,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9187,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
9365,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [None]:
# Map every reviewer_id to its row position in `matrix`.
userrows = list(matrix.index)
users_index = {reviewer: position for position, reviewer in enumerate(userrows)}

# Map every listing_id to its column position in `matrix`.
itemcols = list(matrix.columns)
items_index = {listing: position for position, listing in enumerate(itemcols)}

In [None]:
# I Rozkład SVD - Rozkład według wartości osobliwych

from scipy.sparse.linalg import svds
# Poniższa funkcja obejmuje także tworzenie utility matrix.

def recommend_predictions(df_rec, k):
    """
    Predict polarity for every (reviewer, listing) pair with a truncated SVD.

    The reviewer x listing utility matrix is built here. Each listing column is
    centred on its mean observed polarity, missing cells are treated as "equal
    to the listing mean" (zero after centring), the ``k`` largest singular
    triplets are kept and the listing means are added back.

    :param df_rec: DataFrame with ``reviewer_id``, ``listing_id`` and ``polarity`` columns
    :param k: number of singular values to compute; ``svds`` requires it to be
        smaller than both dimensions of the utility matrix
    :return: tuple ``(all_predicted_polarity, reviewer_index, listing_index)`` where
        ``all_predicted_polarity`` is a 2-D NumPy array (reviewers x listings),
        ``reviewer_index`` maps reviewer_id -> row position and
        ``listing_index`` maps listing_id -> column position
    """
    # Build the utility matrix (duplicate reviewer/listing pairs are averaged by pivot_table).
    util_mat = df_rec.pivot_table(index='reviewer_id', columns='listing_id', values='polarity')

    # Label -> position lookups for the rows and columns of the prediction matrix.
    reviewer_index = {reviewer: row for row, reviewer in enumerate(util_mat.index)}
    listing_index = {listing: col for col, listing in enumerate(util_mat.columns)}

    # Centre each listing column on its mean; broadcasting avoids materialising
    # a second full-size matrix of repeated means.
    ratings = util_mat.to_numpy(dtype=float)
    item_means = np.nanmean(ratings, axis=0)
    centred = ratings - item_means
    # A missing cell filled with the listing mean is exactly 0 once centred.
    centred[np.isnan(centred)] = 0.0

    # Rank-k reconstruction: scaling U's columns by sigma equals U @ diag(sigma).
    U, sigma, Vt = svds(centred, k=k)
    all_predicted_polarity = (U * sigma) @ Vt + item_means

    return all_predicted_polarity, reviewer_index, listing_index

In [None]:
# Rank-150 SVD predictions; the index dictionaries returned alongside are not needed here.
pred = recommend_predictions(df_rec, k=150)[0]

Ocena na podstawie danych treningowych:

In [None]:
# Predicted polarity for every review row, in the same order as df_rec.
pred_values = np.asarray(pred)
user_pos = df_rec['reviewer_id'].map(users_index)
item_pos = df_rec['listing_id'].map(items_index)

unknown_user = user_pos.isna()
if unknown_user.any():
    # Silently skipping such rows would leave this_pred shorter than
    # df_rec['polarity'] and break the RMSE computation, so fail loudly instead.
    missing_ids = df_rec.loc[unknown_user, 'reviewer_id'].unique()
    raise KeyError(f'reviewer_id values missing from the utility matrix: {missing_ids[:10]}')

user_pos = user_pos.to_numpy(dtype=int)
has_item = item_pos.notna().to_numpy()

this_pred = np.empty(len(df_rec), dtype=float)
this_pred[has_item] = pred_values[user_pos[has_item], item_pos[has_item].to_numpy(dtype=int)]
# Listing absent from the matrix: fall back to the reviewer's mean predicted polarity.
this_pred[~has_item] = pred_values[user_pos[~has_item], :].mean(axis=1)

In [None]:
# Root-mean-square error between observed and predicted polarity on the training data.
mse = mean_squared_error(df_rec['polarity'], this_pred)
rmse = sqrt(mse)
print(f'RMSE wynosi {rmse}')

RMSE wynosi 0.17203368100965857


Optymalizacja hiperparametrów

In [None]:
# Sweep over several values of k (number of singular values) and record the training RMSE.

no_of_features = [70, 100, 120, 150]
rmse = []

for k in no_of_features:
    # Use the index dictionaries returned with this prediction matrix so the
    # lookup cannot drift out of sync with it.
    pred_out, reviewer_index, listing_index = recommend_predictions(df_rec, k=k)
    pred_out = np.asarray(pred_out)

    user_pos = df_rec['reviewer_id'].map(reviewer_index)
    item_pos = df_rec['listing_id'].map(listing_index)

    unknown_user = user_pos.isna()
    if unknown_user.any():
        # Skipping rows would make the prediction vector shorter than the targets.
        missing_ids = df_rec.loc[unknown_user, 'reviewer_id'].unique()
        raise KeyError(f'reviewer_id values missing from the utility matrix: {missing_ids[:10]}')

    user_pos = user_pos.to_numpy(dtype=int)
    has_item = item_pos.notna().to_numpy()

    this_pred = np.empty(len(df_rec), dtype=float)
    this_pred[has_item] = pred_out[user_pos[has_item], item_pos[has_item].to_numpy(dtype=int)]
    # Listing absent from the matrix: fall back to the reviewer's mean predicted polarity.
    this_pred[~has_item] = pred_out[user_pos[~has_item], :].mean(axis=1)

    rmse_i = sqrt(mean_squared_error(df_rec['polarity'], this_pred))
    rmse.append(rmse_i)

    print(k, rmse_i)

70 0.19470533403979248


# Nowa sekcja

In [None]:
# Same RMSE sweep as before, on a finer grid of k values.
no_of_features = [50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150]
rmse = []

for k in no_of_features:
    # Use the index dictionaries returned with this prediction matrix so the
    # lookup cannot drift out of sync with it.
    pred_out, reviewer_index, listing_index = recommend_predictions(df_rec, k=k)
    pred_out = np.asarray(pred_out)

    user_pos = df_rec['reviewer_id'].map(reviewer_index)
    item_pos = df_rec['listing_id'].map(listing_index)

    unknown_user = user_pos.isna()
    if unknown_user.any():
        # Skipping rows would make the prediction vector shorter than the targets.
        missing_ids = df_rec.loc[unknown_user, 'reviewer_id'].unique()
        raise KeyError(f'reviewer_id values missing from the utility matrix: {missing_ids[:10]}')

    user_pos = user_pos.to_numpy(dtype=int)
    has_item = item_pos.notna().to_numpy()

    this_pred = np.empty(len(df_rec), dtype=float)
    this_pred[has_item] = pred_out[user_pos[has_item], item_pos[has_item].to_numpy(dtype=int)]
    # Listing absent from the matrix: fall back to the reviewer's mean predicted polarity.
    this_pred[~has_item] = pred_out[user_pos[~has_item], :].mean(axis=1)

    rmse_i = sqrt(mean_squared_error(df_rec['polarity'], this_pred))
    rmse.append(rmse_i)

    print(k, rmse_i)

Na wykresie prezentujemy wartości RMSE (pierwiastka błędu średniokwadratowego) otrzymane dla poszczególnych wartości k (liczby wartości osobliwych):

In [None]:
# RMSE per k, hard-coded (rounded to 4 decimals) instead of recomputed,
# so the plot below does not require re-running the SVD sweep.
rmse_by_k = {
    50: 0.2016,
    60: 0.1981,
    70: 0.1947,
    80: 0.1916,
    90: 0.1885,
    100: 0.1855,
    110: 0.1828,
    120: 0.1799,
    130: 0.1771,
    140: 0.1746,
    150: 0.1720,
}
no_of_features = list(rmse_by_k.keys())
rmse = list(rmse_by_k.values())

In [None]:
# Scatter of RMSE against k; axes and title are labelled so the figure stands on its own.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(no_of_features, rmse)
ax.set_xlabel('k (liczba wartości osobliwych)')
ax.set_ylabel('RMSE')
ax.set_title('RMSE w zależności od k');

**Na tym etapie chcemy otrzymać jak najlepsze rekomendacje dla wybranego użytkownika**

In [None]:
# Distinct listing ids in order of first appearance; read as a global by get_recommendations.
listing_id_array = df_rec['listing_id'].unique()

def get_recommendations(predMat, reviewer, N):
    """
    Rank every listing for one reviewer by predicted polarity.

    Relies on the module-level ``users_index``, ``items_index`` and
    ``listing_id_array``.

    predMat: 2-D prediction matrix indexed as [user position, listing position]
    reviewer: reviewer_id to build the ranking for (must be a key of users_index)
    N: number of top rows to return

    Returns a DataFrame with columns ``listing_id`` and ``predicted_polarity``,
    sorted from highest to lowest predicted polarity, re-indexed from 0 and
    cut to the first N rows.
    """
    row = users_index[reviewer]

    # Predicted score of each listing, kept in listing_id_array order.
    scores = []
    for listing in listing_id_array:
        scores.append(predMat[row, items_index[listing]])

    ranking = pd.DataFrame(data={'listing_id': listing_id_array, 'predicted_polarity': scores})
    ranking = ranking.sort_values(by=['predicted_polarity'], ascending=False)
    ranking = ranking.reset_index(drop=True)

    return ranking[:N]

In [None]:
# Use the prediction matrix produced by the SVD method.
# .iloc[0] selects the first review row by position, so this keeps working even
# when df_rec no longer carries the default 0..n-1 index labels.
first_reviewer = df_rec['reviewer_id'].iloc[0]
user_rec = get_recommendations(pred, first_reviewer, 100)
user_rec.style.bar(subset=['predicted_polarity'], align='mid', color=['#d65f5f', '#5fba7d'])

In [None]:
# Save the ranking next to the notebook; the positional index is written as the first column.
output_csv = 'user_rec.csv'
user_rec.to_csv(output_csv)