# **Imports**

In [22]:
import random
import pandas as pd
import numpy as np


# **DataFrames**

In [4]:
dataframe_jokes = pd.read_csv("/content/jokes_dataframe.csv")
dataframe_ratings = pd.read_csv("/content/rating_dataframe.csv")

In [5]:
print(dataframe_jokes.head())
print(dataframe_ratings.head())

   joke_id                                               joke
0        1  A man visits the doctor. The doctor says "I ha...
1        2  This couple had an excellent relationship goin...
2        3  Q. What's 200 feet long and has 4 teeth? A. Th...
3        4  Q. What's the difference between a man and a t...
4        5  Q.\tWhat's O. J. Simpson's Internet address? A...
   user_id  number_of_jokes_rated  joke_1  joke_2  joke_3  joke_4  joke_5  \
0        1                     74   -7.82    8.79   -9.66   -8.16   -7.52   
1        2                    100    4.08   -0.29    6.36    4.37   -2.38   
2        3                     49   99.00   99.00   99.00   99.00    9.03   
3        4                     48   99.00    8.35   99.00   99.00    1.80   
4        5                     91    8.50    4.61   -4.17   -5.39    1.36   

   joke_6  joke_7  joke_8  ...  joke_91  joke_92  joke_93  joke_94  joke_95  \
0   -8.50   -9.85    4.17  ...     2.82    99.00    99.00    99.00    99.00   
1   -9.

Note: Currently working with non-normalized data.

# **Model**

Note: Currently training on entire dataset, no train test split.

## **Preprocessing**

In [6]:
dataframe_dense_rating = dataframe_ratings[dataframe_ratings['number_of_jokes_rated'] == 100] # Dense
dataframe_sparse_rating = dataframe_ratings[dataframe_ratings['number_of_jokes_rated'] < 100] # Sparse

In [11]:
print("Shape of dense ratings dataframe: ", dataframe_dense_rating.shape)
print("Users with all ratings: ", dataframe_dense_rating.shape[0])
print(dataframe_dense_rating.head(2))

print()
print()

print("Shape of sparse ratings dataframe: ", dataframe_sparse_rating.shape)
print("Users with incomplete ratings: ", dataframe_sparse_rating.shape[0])
print(dataframe_sparse_rating.head(2))

Shape of dense ratings dataframe:  (14116, 102)
Users with all ratings:  14116
   user_id  number_of_jokes_rated  joke_1  joke_2  joke_3  joke_4  joke_5  \
1        2                    100    4.08   -0.29    6.36    4.37   -2.38   
5        6                    100   -6.17   -3.54    0.44   -8.50   -7.09   

   joke_6  joke_7  joke_8  ...  joke_91  joke_92  joke_93  joke_94  joke_95  \
1   -9.66   -0.73   -5.34  ...     2.82    -4.95    -0.29     7.86    -0.19   
5   -4.32   -8.69   -0.87  ...    -3.54    -6.89    -0.68    -2.96    -2.18   

   joke_96  joke_97  joke_98  joke_99  joke_100  
1    -2.14     3.06     0.34    -4.32      1.07  
5    -3.35     0.05    -9.08    -5.05     -3.45  

[2 rows x 102 columns]


Shape of sparse ratings dataframe:  (59305, 102)
Users with incomplete ratings:  59305
   user_id  number_of_jokes_rated  joke_1  joke_2  joke_3  joke_4  joke_5  \
0        1                     74   -7.82    8.79   -9.66   -8.16   -7.52   
2        3                     49 

## **Prediction User**

In [13]:
# Picking a prediction User

prediction_user_id = random.choice(dataframe_sparse_rating['user_id'].values)
prediction_user_known_ratings = dataframe_sparse_rating[dataframe_sparse_rating['user_id'] == prediction_user_id].iloc[:, 2:]

In [15]:
print("User ID of predictive user:", prediction_user_id)
print("Rating Dataframe for predictive user: \n", prediction_user_known_ratings)

User ID of predictive user: 68388
Rating Dataframe for predictive user: 
        joke_1  joke_2  joke_3  joke_4  joke_5  joke_6  joke_7  joke_8  joke_9  \
68387    99.0    99.0    99.0    99.0    2.82    99.0    2.67    5.05    99.0   

       joke_10  ...  joke_91  joke_92  joke_93  joke_94  joke_95  joke_96  \
68387     99.0  ...     99.0     99.0     99.0     99.0     99.0     99.0   

       joke_97  joke_98  joke_99  joke_100  
68387     99.0     99.0     99.0      99.0  

[1 rows x 100 columns]


In [20]:
# All unrated jokes from the prediction user

prediction_user_unknown_jokes = []
for column in dataframe_sparse_rating.columns:
  if (dataframe_sparse_rating[column].values[0] == 99): # 99 == Jokes with no ratings
    prediction_user_unknown_jokes.append(column)

print("There are {} jokes unrated by user {}.".format(len(prediction_user_unknown_jokes), prediction_user_id))

There are 26 jokes unrated by user 68388.


### **Similarity (requires explaination and changes)**

We will be finding Similarity between prediction user and other dense users.

**CHANGES**

In [23]:
prediction_user_known_ratings_list = prediction_user_known_ratings.values.ravel()


def weight_factor(x, y):
    '''
    Weight factor implies relationship between user x and user y
    Also know as similarity between user x and user y
    We are using Pearson correlation coefficient here.
    '''
    t1, t2, t3 = 0, 0, 0
    for i, j in zip(x, y):
        t1+=i*j
        t2+=i*i
        t3+=j*j
    return t1/(np.sqrt(t2) * np.sqrt(t3))

similarity = np.array([(dataframe_dense_rating.iloc[i, 0], \
                        weight_factor(prediction_user_known_ratings_list, dataframe_dense_rating.iloc[i, 2:])) \
                       for i in range(dataframe_dense_rating.shape[0])])

In [25]:
print(similarity.size)
print(similarity)

28232
[[ 2.00000000e+00  5.16968547e-01]
 [ 6.00000000e+00 -7.73191952e-01]
 [ 8.00000000e+00 -4.10495688e-01]
 ...
 [ 4.84580000e+04  5.05438251e-02]
 [ 4.84800000e+04  3.44932880e-02]
 [ 4.84820000e+04  6.55219676e-01]]


**CHANGES**

In [29]:
# Sorting the neighbours using similarity

similarity = similarity[np.argsort(similarity[:, 1])]
print("Sorted Neighbours based on Similarity (neighbour, similarity): \n", similarity)

Sorted Neighbours based on Similarity (neighbour, similarity): 
 [[ 1.95680000e+04 -9.03424824e-01]
 [ 9.99000000e+02 -8.83088302e-01]
 [ 7.53500000e+03 -8.63711195e-01]
 ...
 [ 3.16850000e+04  8.97358214e-01]
 [ 4.49880000e+04  9.05677957e-01]
 [ 3.85090000e+04  9.18734042e-01]]


## **Neighbours of Predictive User**

In [38]:
## Number of Neighbours of Predictive User

number_of_neighbours = 30 # For now choose 30 neighbours
neigbours_similarity_set = similarity[similarity[:, 1] > 0.1] # Thresholding to similarity of 0.1
print("We have {} neighbours.".format(neigbours_similarity_set.shape[0]))

random_neigbhour_indexes = np.random.choice(neigbours_similarity_set.shape[0], number_of_neighbours, replace=False)
predictive_user_neigbours = neigbours_similarity_set[random_neigbhour_indexes]

print("Shape of neigbours array (Neighbour Index, Similarity):", predictive_user_neigbours.shape)

We have 7915 neighbours.
Shape of neigbours array (Neighbour Index, Similarity): (30, 2)


In [39]:
# Ratings given by neigbhours to unrated_predictive_user_jokes

neighbours_user_ids = neigbours_similarity_set[:, 0].astype(int)
dataframe_neighbours = dataframe_dense_rating[dataframe_dense_rating['user_id'].isin(neighbours_user_ids)]
dataframe_neighbours_unrated_jokes = dataframe_neighbours[prediction_user_unknown_jokes]

print("Shape of neighbours rating for unknown jokes of predictive user:", dataframe_neighbours_unrated_jokes.shape)
# 26 unrated jokes with 7915 users rating of them

print(dataframe_neighbours_unrated_jokes)

Shape of neighbours rating for unknown jokes of predictive user: (7915, 26)
       joke_71  joke_72  joke_73  joke_74  joke_75  joke_76  joke_77  joke_79  \
1        -5.00     2.77     8.30     7.77     7.33     6.21     7.72     8.64   
13       -7.57     8.93     0.15     8.93    -6.21     8.35     3.35    -8.45   
17        0.58     5.92     3.88     0.78     2.62     7.48     7.38     7.86   
23        6.65     7.28     0.34     3.30    -0.92     7.43     2.77     6.46   
30       -0.34     2.91    -6.07    -6.07    -5.97     7.28    -7.09    -7.18   
...        ...      ...      ...      ...      ...      ...      ...      ...   
48430     8.59     2.72     1.55    -6.75    -1.89     2.82    -6.46    -7.48   
48433    -2.77     6.17     5.19     2.09    -1.55    -0.49     2.43    -2.82   
48439    -7.43     5.10    -0.34     7.18     7.23     7.82     6.89     6.84   
48451    -2.33     8.59     3.50     2.23     1.89     7.09     7.72     2.86   
48481     4.47     2.82     2.82 

**CHANGES**

In [42]:
# Let's predict rating for one joke!

neighbours_similarity = neigbours_similarity_set[:, 1]
first_unrated_joke = prediction_user_unknown_jokes[0]
print("We are predicting score for joke {} for sparse user {}".format((first_unrated_joke), (prediction_user_id)))


# No mean rating yet, so 0
def score_user_item(item_id, neighbours_df,neighbour_user_similarity, active_user_mean_rating = 0):
    item_rating = neighbours_df[item_id]
    t1, t2 = 0, 0
    for similarity, norm_rating in zip(neighbour_user_similarity, item_rating):
        t1+= norm_rating * similarity
        t2+= similarity
    score = (t1 + active_user_mean_rating)/t2
    return score

unrated_joke_score = score_user_item(first_unrated_joke, dataframe_neighbours_unrated_jokes, neighbours_similarity, 0)
print("The rating we found for joke {} for sparse user {} is: {}".format((first_unrated_joke), (prediction_user_id), (unrated_joke_score)))



We are predicting score for joke joke_71 for sparse user 68388
The rating we found for joke joke_71 for sparse user 68388 is: 1.2257220925787329


In [44]:
# From all unrated jokes, calculate of their scores, and get the highest score and reveal the corresponding joke

max_score = -np.inf
max_score_joke = None

joke_score = []

for column in dataframe_neighbours_unrated_jokes.columns:
    score = score_user_item(column, dataframe_neighbours_unrated_jokes, neighbours_similarity, 0)
    joke_score.append([column, score])
    if score > max_score:
        max_score = score
        max_score_joke = column

print("We are recommending {} with highest score {} \n \n".format(max_score_joke, max_score))
print("The list for unrated jokes and their ratings: \n", joke_score)

We are recommending joke_89 with highest score 5.124806284808203 
 

The list for unrated jokes and their ratings: 
 [['joke_71', 1.2257220925787329], ['joke_72', 4.577008242658604], ['joke_73', 3.1435248091336505], ['joke_74', 0.8329404171928685], ['joke_75', 1.7855913421750929], ['joke_76', 4.273766717820955], ['joke_77', 3.010575528999967], ['joke_79', 2.422915977457838], ['joke_80', 3.204162360398013], ['joke_81', 3.8431236610956163], ['joke_83', 4.024977080517668], ['joke_84', 2.903342427274737], ['joke_85', 2.9447939703747434], ['joke_86', 2.345377631113283], ['joke_87', 3.765443932346663], ['joke_88', 4.056456384260266], ['joke_89', 5.124806284808203], ['joke_90', 2.7582301522472736], ['joke_92', 3.4256682527527063], ['joke_93', 4.2464078739141655], ['joke_94', 3.174933889579803], ['joke_95', 3.0973997105820583], ['joke_96', 3.5092856686068066], ['joke_98', 2.9438557348215646], ['joke_99', 2.22036734370685], ['joke_100', 3.0484729377648567]]
