In [32]:
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from datetime import datetime
from sortedcontainers import SortedList
import os

In [33]:
with open("../data/user_to_channel.json", "rb") as f:
    user_to_channel = pickle.load(f)
with open("../data/channel_to_user.json", "rb") as f:
    channel_to_user = pickle.load(f)
with open("../data/to_rating.json", "rb") as f:
    to_rating = pickle.load(f)
with open("../data/to_rating_test.json", "rb") as f:
    to_rating_test = pickle.load(f)
with open("../data/to_rating_val.json", "rb") as f:
    to_rating_val = pickle.load(f)

In [34]:
len(pd.Series([m for (u,m), k in to_rating.items()]).unique())

8553

In [35]:
N = len(user_to_channel.keys())
m1 = np.max(list(channel_to_user.keys()))
m2 = np.max(pd.Series([m for (u,m), k in to_rating_test.items()]).unique())
M = max(m1, m2)

In [36]:
print("number of users: ", N)
print("number of channels: ", M)

number of users:  9821
number of channels:  9999


In [37]:
count = 0
n = 25
limit = 2
neighbors = {}
averages = {}
deviations = []
for i in user_to_channel.keys():
    channel_i = user_to_channel[i]
    channel_i_set = set(channel_i)
    
    channel_to_rating_i = {channel: to_rating[(i, channel)] for channel in channel_i}
    average_i = np.mean(list(channel_to_rating_i.values()))
    rating_to_dev = {channel: rating - average_i for channel, rating in channel_to_rating_i.items()}
    dev_i = np.array(list(rating_to_dev.values()))
    std_i = np.sqrt(dev_i[0].dot(dev_i[0]))
    
    averages[i]=average_i
    deviations.append(rating_to_dev)
    
    sl = SortedList()
    for j in user_to_channel.keys():
        if j != i:
            channel_j = user_to_channel[j]
            channel_j_set = set(channel_j)
            common = (channel_i_set & channel_j_set)
            
            if len(common) >= limit:
                channel_to_rating_j = {channel: to_rating[(j, channel)] for channel in channel_j}
                average_j = np.mean(list(channel_to_rating_j.values()))
                rating_to_dev_j = {channel: rating - average_j for channel, rating in channel_to_rating_j.items()}
                dev_j = np.array(list(rating_to_dev_j.values()))
                std_j = np.sqrt(dev_j[0].dot(dev_j[0]))
                
                nom = sum(rating_to_dev[m] * rating_to_dev_j[m] for m in common)
                denom = std_i * std_j
                w_ij = nom/denom
                
                sl.add((-w_ij, j))
                if len(sl) > n:
                    del sl[-1]

    neighbors[i] = (sl)
    count += 1
    print(count/N)

0.00010182262498727217
0.00020364524997454434
0.0003054678749618165
0.0004072904999490887
0.0005091131249363609
0.000610935749923633
0.0007127583749109052
0.0008145809998981774
0.0009164036248854496
0.0010182262498727218
0.001120048874859994
0.001221871499847266
0.0013236941248345381
0.0014255167498218105
0.0015273393748090826
0.0016291619997963547
0.0017309846247836268


  w_ij = nom/denom


0.0018328072497708992
0.0019346298747581713
0.0020364524997454436
0.0021382751247327157
0.002240097749719988
0.00234192037470726
0.002443742999694532
0.002545565624681804
0.0026473882496690763
0.002749210874656349
0.002851033499643621
0.002952856124630893
0.003054678749618165
0.0031565013746054373
0.0032583239995927094
0.0033601466245799815
0.0034619692495672537
0.003563791874554526
0.0036656144995417983
0.0037674371245290704
0.0038692597495163425
0.003971082374503615
0.004072904999490887
0.004174727624478159
0.0042765502494654314
0.004378372874452703
0.004480195499439976
0.004582018124427247
0.00468384074941452
0.0047856633744017925
0.004887485999389064
0.004989308624376337
0.005091131249363608
0.005192953874350881
0.005294776499338153
0.005396599124325425
0.005498421749312698
0.005600244374299969
0.005702066999287242
0.005803889624274514
0.005905712249261786
0.006007534874249058
0.00610935749923633
0.006211180124223602
0.006313002749210875
0.006414825374198147
0.006516647999185419
0.

  w_ij = nom/denom


0.01771713674778536
0.01781895937277263
0.017920781997759903
0.018022604622747174
0.018124427247734446
0.018226249872721718
0.01832807249770899
0.018429895122696265
0.018531717747683536
0.018633540372670808
0.01873536299765808
0.01883718562264535
0.018939008247632623
0.019040830872619895
0.01914265349760717
0.01924447612259444
0.019346298747581713
0.019448121372568985
0.019549943997556257
0.019651766622543528
0.0197535892475308
0.019855411872518075
0.019957234497505347
0.02005905712249262
0.02016087974747989
0.020262702372467162
0.020364524997454433
0.020466347622441705
0.02056817024742898
0.020669992872416252
0.020771815497403524
0.020873638122390795
0.020975460747378067
0.02107728337236534
0.02117910599735261
0.021280928622339886
0.021382751247327157
0.02148457387231443
0.0215863964973017
0.021688219122288972
0.021790041747276244
0.021891864372263516
0.02199368699725079
0.022095509622238062
0.022197332247225334
0.022299154872212606
0.022400977497199877
0.02250280012218715
0.022604622

In [38]:
def predict(i, m):
    numerator = 0
    denominator = 0
    for neg_w, j in neighbors[i]:
        try:
            numerator += -neg_w * deviations[j][m]
            denominator += abs(neg_w)
        except:
            pass
    if denominator == 0:
        prediction = averages[i]
    else:
        prediction = averages[i] + (numerator/denominator)
    if isinstance(prediction, np.ndarray):
        prediction = prediction.tolist()[0]
    return prediction

            

In [39]:
train_prediction = []
train_target = []
for (i,m), target in to_rating.items():
    prediction = predict(i, m)
    train_prediction.append(prediction)
    train_target.append(target[0])


In [40]:
test_prediction = []
test_target = []
for (i,m), target in to_rating_test.items():
    if i in neighbors.keys():
        prediction = predict(i, m)
        test_prediction.append(prediction)
        test_target.append(target[0])

In [41]:
import random
random_sample = random.choices(train_target, k=70158)

In [42]:
A = pd.DataFrame([train_prediction, train_target, random_sample])
A = np.transpose(A)
A.dropna(inplace = True)
A.rename(columns = {0: "predictions", 2: "random", 1: "target"}, inplace = True)

In [43]:
def mse(a, b):
    a = np.array(a)
    b = np.array(b)
    return np.mean((a-b)**2)
print("Predicted mse: ", mse(A.predictions, A.target))
print("Random mse: ", mse(A.random, A.target))

Predicted mse:  2.7398747466564553
Random mse:  6.672812619018783


In [44]:
B = pd.DataFrame([test_prediction, test_target])
B = np.transpose(B)
B.dropna(inplace = True)
B.rename(columns = {0: "predictions", 1: "target"}, inplace = True)

In [45]:
print("Test MSE:", mse(B.predictions, B.target))

Test MSE: 4.209407497419983
