In [1]:
import pandas as pd
import numpy as np

In [2]:
joke_text = pd.read_csv('data/JokeText.csv')

joke_text.head()

Unnamed: 0,JokeId,JokeText
0,0,"A man visits the doctor. The doctor says ""I ha..."
1,1,This couple had an excellent relationship goin...
2,2,Q. What's 200 feet long and has 4 teeth? \n\nA...
3,3,Q. What's the difference between a man and a t...
4,4,Q.\tWhat's O. J. Simpson's Internet address? \...


In [3]:
user_ratings1 = pd.read_csv('data/UserRatings1.csv')
user_ratings2 = pd.read_csv('data/UserRatings2.csv')

display(user_ratings2.head())
display(user_ratings1.head())

Unnamed: 0,JokeId,User36711,User36712,User36713,User36714,User36715,User36716,User36717,User36718,User36719,...,User73412,User73413,User73414,User73415,User73416,User73417,User73418,User73419,User73420,User73421
0,0,,,,3.93,,,,,,...,,,,,,,,,,
1,1,,,,,,,,,4.81,...,,,,,,,,,,
2,2,,,,,,,,,,...,,,,,,,,,,
3,3,,,,,,,,,,...,,,,,,,,,,
4,4,5.68,1.07,8.11,-2.33,-5.83,8.2,-5.83,1.94,0.1,...,3.64,4.32,6.99,-9.66,-8.4,-0.63,9.51,-7.67,-1.6,8.3


Unnamed: 0,JokeId,User1,User2,User3,User4,User5,User6,User7,User8,User9,...,User36701,User36702,User36703,User36704,User36705,User36706,User36707,User36708,User36709,User36710
0,0,5.1,-8.79,-3.5,7.14,-8.79,9.22,-4.03,3.11,-3.64,...,,,,,,,,,2.91,
1,1,4.9,-0.87,-2.91,-3.88,-0.58,9.37,-1.55,0.92,-3.35,...,,,,-5.63,,-6.07,,-1.6,-4.56,
2,2,1.75,1.99,-2.18,-3.06,-0.58,-3.93,-3.64,7.52,-6.46,...,,,,,,4.08,,,8.98,
3,3,-4.17,-4.61,-0.1,0.05,8.98,9.27,-6.99,0.49,-3.4,...,,,,,,,,,,
4,4,5.15,5.39,7.52,6.26,7.67,3.45,5.44,-0.58,1.26,...,2.28,-0.49,5.1,-0.29,-3.54,-1.36,7.48,-5.78,0.73,2.62


In [4]:
user_ratings = pd.merge(user_ratings1, user_ratings2, on='JokeId')
user_ratings = user_ratings.loc[:, 'User1':'User20000']
rating_means = user_ratings.mean(axis=0, skipna=True)
user_ratings = user_ratings.fillna(rating_means, axis=0) - rating_means
user_ratings.head()

Unnamed: 0,User1,User2,User3,User4,User5,User6,User7,User8,User9,User10,...,User19991,User19992,User19993,User19994,User19995,User19996,User19997,User19998,User19999,User20000
0,3.353,-8.2066,-2.7565,6.3833,-11.732,7.6414,-1.6082,2.0867,-4.8942,-5.8432,...,-3.466081,3.071486,2.358243,-5.871486,2.919054,-1.354189,6.146216,1.517568,6.111757,0.857027
1,3.153,-0.2866,-2.1665,-4.6367,-3.522,7.7914,0.8718,-0.1033,-4.6042,-3.3232,...,11.053919,2.341486,2.358243,-6.741486,-4.410946,-1.354189,-2.153784,2.637568,6.741757,4.587027
2,0.003,2.5734,-1.4365,-3.8167,-3.522,-5.5086,-1.2182,6.4967,-7.7142,-1.4232,...,8.283919,-1.148514,2.118243,-0.821486,1.899054,-1.354189,5.216216,2.157568,6.401757,-4.002973
3,-5.917,-4.0266,0.6435,-0.7067,6.038,7.6914,-4.5682,-0.5333,-4.6542,0.1768,...,-7.586081,-3.338514,2.318243,-6.061486,2.869054,-1.354189,-10.843784,1.857568,-1.168243,-3.322973
4,3.403,5.9734,8.2635,5.5033,4.728,1.8714,7.8618,-1.6033,0.0058,5.8568,...,-4.286081,4.871486,0.708243,2.378514,-7.950946,1.855811,6.336216,-2.702432,-11.798243,-4.052973


In [5]:
from sklearn.metrics.pairwise import pairwise_distances

data_matrix = user_ratings.as_matrix()

joke_similarity = pairwise_distances(data_matrix, metric='cosine')
user_similarity = pairwise_distances(data_matrix.T, metric='cosine')

In [6]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=0)
        ratings_diff = (ratings - mean_user_rating[np.newaxis, :])
        pred = mean_user_rating[np.newaxis, :] + ratings_diff.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    elif type == 'joke':
        pred = similarity.dot(ratings) / np.array([np.abs(similarity).sum(axis=1)]).T
    return pred

In [7]:
from numpy.random import choice

test_matrix = np.copy(data_matrix)
indices = np.zeros(test_matrix.shape[0], dtype='int')

for i, row in enumerate(test_matrix):
    index = choice(row.nonzero()[0])
    indices[i] = index
    row[index] = 0
    
display(indices)

array([ 9959, 15570,  4783,  1316,  5805, 10696,  4947, 17264, 12114,
       13681,  2068, 18116,  7632, 13943,   260, 17457, 11403, 13506,
       16176, 11489, 13894,  1937, 19755, 15516, 16454,  3662,  3313,
        1460,  9714,  6568,   612,  8858,  4129,  2120, 11452,  7037,
         675, 19780,  3534,  4124,  6990, 12157, 19090,  2897,  4321,
       11705, 13405,  1936,  8076,  4120, 16657,  6927, 14159, 10764,
       16778,  6362, 18710, 18121,  9758, 18765,  7850, 14206,   300,
       15854, 18345,  9479,  9296,  9787, 12493, 13606,  3168,  8896,
        6056, 11296,   566, 14226,  9657,  3374,  1601,  2546,  1977,
        9201,  3808,   861, 15183, 14043,  8849, 12118,   460,  8441,
       15649,  3038, 12387,  1085, 15701,  8527, 10109, 15445,  7466, 11146])

In [8]:
prediction = predict(test_matrix, user_similarity, type='user')

prediction
display(prediction.shape)

(100, 20000)

In [9]:
from math import sqrt
display(indices.shape)

truth = data_matrix[np.arange(len(indices)), indices]
predicted = prediction[np.arange(len(indices)), indices]
display(sqrt(np.square(predicted - truth).sum() / 100))

(100,)

4.476809204810378

In [None]:
to_rate