# **User-User Collaborative Filtering**

In [1]:
# Imports

import pandas as pd
import numpy as np

In [5]:
# Datasets

dataframe_normalized_ratings = pd.read_csv('Preprocessed/normalized_ratings.csv')
numpy_ratings_normalized = dataframe_normalized_ratings.to_numpy()

dataframe_non_normalized_ratings = pd.read_csv('Preprocessed/non_normalized_ratings.csv')
numpy_ratings_non_normalized = dataframe_non_normalized_ratings.to_numpy()

dataframe_jokes = pd.read_csv('Preprocessed/dataframe_jokes.csv')

## **Functions**

In [12]:
def dataframe_split(dataframe, threshold):
    """
    Splits df['user_id] in two different parts based on a threshold of number of jokes rated
    Args:
        df (pd.DataFrame): joke rating dataframe
        threshold (int): threshold of number of jokes rated
    Returns:
        df_dense (pd.DataFrame): dataframe having user_id's of only those users which have rated jokes above or equal to threshold
        df_sparse (pd.DataFrame): dataframe having user_id's of only those users which have rated jokes below threshold
    """
    dataframe_dense = (dataframe[dataframe['number_of_jokes_rated'] >= threshold])['user_id']
    dataframe_sparse = (dataframe[dataframe['number_of_jokes_rated'] < threshold])['user_id']
    return dataframe_dense, dataframe_sparse

def generate_correlation_row(active_user, passive_user_dataset, n1):
    """
    Generates a sorted correlation matrix for an active user for all given passive users
    matrix row structure => 0th index: pasive user_id 1st index: its pearson's corelation coefficient with active user
    Args:
        active_user (int): Contains the user_id of the active user
        passive_user_dataset (list<int>): Contains the list of user_ids of the passive users
        n1 (np.array): matrix of database containing joke rattings for every user
    Returns:
        list_corr_i (np.Array): matrix where first ellement represents user id of passive user and second element represets the pearson coefficient of similarity sorted in increasing order
    """
    correlation_list = []

    for j in passive_user_dataset:

        cv=0
        vi=0
        vj=0
        
        for k in range(2,102):
            cv+=n1[active_user-1][k]*n1[j-1][k]
            vi+=n1[active_user-1][k]*n1[active_user-1][k]
            vj+=n1[j-1][k]*n1[j-1][k]
        correlation_list.append([j,(cv/((vi**0.5)*(vj**0.5)))])
    
    correlation_list = np.array(correlation_list)
    correlation_list = correlation_list[correlation_list[:,1].argsort()]
    return correlation_list

def score_user_item(item_id, dataframe_neighbours, neighbour_user_similarity, active_user_mean_rating = 0):
    """
    Predicts the joke rating for the joke id 
    Args:
        neighbours_df (pd.DataFrame): Dataframe of neighbours which willl be used to predict the user ratings
        neighbour_user_similarity (np.array):  array of correlation where 0th index: pasive user_id 1st index: its pearson's corelation coefficient with active user
        active_user_mean_rating (float): current joke rating of user -> No mean rating yet, so 0
    Returns:
        score (float): new joke rating
    """
    item_rating = dataframe_neighbours[item_id]
    
    t1, t2 = 0, 0
    for similarity, norm_rating in zip(neighbour_user_similarity, item_rating):
        t1+= norm_rating * similarity
        t2+= similarity
    
    score = (t1 + active_user_mean_rating)/t2
    
    return score

def predict_ratings(user_id, dataframe_ratings, similarity_threshold, no_of_neighbours, matrix_row, passive_user_dataset):
    """
    Predicts the missing joke ratings for the given user and returns the final ratings (of all 100 jokes) for further processing
    Args:
        user_id (int): id of active user
        dataframe_ratings (pd.DataFrame): Dataframe of user ratings
        similarity_threshold (float): Desiered threshold for similar neigbours
        no_of_neighbours(int): Number of neighbours to be considered for rating (nan value indicates that all users of )
        matrix_row (np.array): matrix of database containing joke rattings for every user
        passive_user_dataset (list<int>): Contains the list of user_ids of the passive users
    Returns:
        joke_score (pd.DataFrame): new joke ratings for user
    """
    user_rating = dataframe_ratings.loc[user_id-1]
    ratings_to_predict = [column for column in dataframe_ratings.columns[2:] if(user_rating[column]==0)]
    
    correlation_row = generate_correlation_row(user_id,passive_user_dataset,matrix_row)
    
    # Find index for first usable correlation row
    index = 0
    while(index<correlation_row.shape[0] and correlation_row[index,1]<similarity_threshold):
        index+=1
    
    neighbours = correlation_row[index:] if(no_of_neighbours!=no_of_neighbours) else(correlation_row[index:])[np.random.choice(correlation_row.shape[0]-index,no_of_neighbours,replace=False)]
    dataframe_neighbours_unrated_jokes = (dataframe_ratings[dataframe_ratings['user_id'].isin(neighbours[:, 0])])

    
    # mechanism of filling joke ratings
    joke_score = user_rating.copy()
    for column in ratings_to_predict:
        score = score_user_item(column, dataframe_neighbours_unrated_jokes, neighbours[:, 1], user_rating[column])
        joke_score[column]=score
    
    
    # converting to dataframe
    dataframe_joke_score = pd.DataFrame(joke_score)
    dataframe_joke_score = joke_score.transpose()
    dataframe_joke_score['user_id'] = user_id
    dataframe_joke_score['number_of_jokes_rated'] = len(ratings_to_predict)
    display(dataframe_joke_score)
    
    return joke_score

def print_recommended_jokes(user_unrated, num_recommend_joke, dataframe_joke):
    """
    Prints the recommended jokes useing setted parameters

    Args:
        joke_score (pd.DataFrame): row of dataframe from which we need to recommend the jokes
        no_jokes (int): no  of jokes to recommend
        joke_df (dict): dataframe of jokes
    Returns:
        None
    """

    print("Recommending Jokes for user {}".format(user_unrated['user_id']))
    
    for i,k in zip(range(num_recommend_joke),(np.argsort((user_unrated.to_numpy())[2:])[::-1])[:num_recommend_joke]):
        print("Joke {}: {}".format(i+1,dataframe_joke.loc[k]['joke']))

def uccf_recommender(user_id, dataframe_rating, dataframe_jokes, matrix_row):
    """
    Recommends jokes for the given user
    Args:
        user_id (int): id of active user
        df (pd.DataFrame): Dataframe of user ratings
        n1 (np.array): matrix of database containing joke rattings for every user
        joke_df (dict): Contains the data of all jokes
    Returns:
        None
    """
    threshold_no_of_ratings = 100
    similarity_threshold = 0.1
    # could be nan as well for considering all possible neighbours
    no_of_neighbours = 30
    dataframe_dense, dataframe_sparse = dataframe_split(dataframe_rating, threshold_no_of_ratings)
    print_recommended_jokes(predict_ratings(user_id, dataframe_rating, similarity_threshold, no_of_neighbours, matrix_row, dataframe_dense), 20, dataframe_jokes)

## **Non-Normalized Data**

In [13]:
user_id = 68388
uccf_recommender(user_id, dataframe_non_normalized_ratings, dataframe_jokes, numpy_ratings_non_normalized)

user_id                  68388.000000
number_of_jokes_rated       72.000000
joke_1                       2.714499
joke_2                       1.398703
joke_3                       2.590480
                             ...     
joke_96                      3.045859
joke_97                      4.111090
joke_98                      2.110463
joke_99                      1.942481
joke_100                     2.579359
Name: 68387, Length: 102, dtype: float64

Recommending Jokes for user 68388.0
Joke 1: A woman has twins, and gives them up for adoption.  One ofthem goes to a family in Egypt and is named "Amal."  The other goes toa  family in Spain; they name him "Juan."  Years later, Juan sends apicture of himself to his mom.  Upon receiving the picture, she tellsher husband that she wishes she also had a picture of Amal.  Her husband responds, "But they are twins-if you've seen Juan, you'veseen   Amal.
Joke 2: A neutron walks into a bar and orders a drink."How much do I owe you?" the neutron asks.The bartender replies, "for you, no charge."
Joke 3: A man piloting a hot air balloon discovers he has wandered off course andis hopelessly lost. He descends to a lower altitude and locates a mandown on the ground. He lowers the balloon further and shouts "Excuse me,can you tell me where I am?"The man below says: "Yes, you're in a hot air balloon, about 30 feetabove this field.""You must work in Information Technology," says the balloonist."Yes I d

## **Normalized**

In [7]:
uccf_recommender(user_id, dataframe_normalized_ratings, dataframe_jokes, numpy_ratings_normalized)

Recommending Jokes for user 68388.0
Joke 1: A woman has twins, and gives them up for adoption.  One ofthem goes to a family in Egypt and is named "Amal."  The other goes toa  family in Spain; they name him "Juan."  Years later, Juan sends apicture of himself to his mom.  Upon receiving the picture, she tellsher husband that she wishes she also had a picture of Amal.  Her husband responds, "But they are twins-if you've seen Juan, you'veseen   Amal.
Joke 2: A neutron walks into a bar and orders a drink."How much do I owe you?" the neutron asks.The bartender replies, "for you, no charge."
Joke 3: A man piloting a hot air balloon discovers he has wandered off course andis hopelessly lost. He descends to a lower altitude and locates a mandown on the ground. He lowers the balloon further and shouts "Excuse me,can you tell me where I am?"The man below says: "Yes, you're in a hot air balloon, about 30 feetabove this field.""You must work in Information Technology," says the balloonist."Yes I d

## **Evaluation**