# **Rating Prediction**

In [21]:
import pandas as pd
import numpy as np
import re
import ast
import pickle as pkl

# Natural Language Processing
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag

# Tools for topic modelling
from pprint import pprint
from operator import itemgetter

# LDA model
from gensim.models import LdaModel
from gensim.corpora import Dictionary

In [22]:
dataframe_non_normalized_ratings = pd.read_csv("Preprocessed/non_normalized_ratings.csv")
LDA_dataframe_jokes = pd.read_csv("Preprocessed/LDA_dataframe_jokes.csv")


numpy_ratings = dataframe_non_normalized_ratings.to_numpy()

In [23]:
# Reading Input File

new_jokes = []
i=1
with open('testfile.txt', 'r') as f:
    while(i!= 21):
        new_jokes.append([i,f.readline()])
        i+=1
new_jokes_df = pd.DataFrame(np.array(new_jokes))
new_jokes_df.columns=['new_joke_id','joke']

In [24]:
common_words = ["---","---|---","i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]

## **Model Testing**

### **Functions**

#### **Text Based**

In [25]:
def clean_jokeText(text,common_words):
    """
    This function cleans joke text for further preprocesing.
    It first gets individual words from only lower case , alphanumeric text with no extra whitespaces
    then it removes all common words and joins the left out words to form new cleaned text of joke

    Args:
        text (string): input joke text
        common_words (list): list of common words to be removed from joke text
    Returns:
        string: cleaned text
    """
    words= re.sub(r"\s+", ' ', re.sub(r'[^\w+\s]', ' ', text.lower())).strip().split(' ')
    new_words = [w for w in words if w not in common_words]
    return ' '.join(new_words)

def lemenize_jokeText(text,common_words):
    """
    This function lemenize joke text for further preprocesing.
    It first gets individual words from text
    then it lemenizes every word and joins the new words together to form new text of joke

    Args:
        text (string): input joke text
        common_words (list): list of common words to be removed from joke text
    Returns:
        string: lemenized joke text
    """
    lemeniza=WordNetLemmatizer()
    new_words = [lemeniza.lemmatize(w) for w in word_tokenize(text) if w not in common_words]
    return ' '.join(new_words)

def posTag_jokes(text):
    """
    This function generates postags from joke text for further preprocesing.
    It first gets individual words from text
    then it finds postag for every word
    and then it filters out only nouns and verbs from them and joins them in a string

    Args:
        text (string): input joke text
    Returns:
        (sting):  a chain of words conatining only nouns and verbs
    """
    
    # creating posTags
    text = pos_tag(word_tokenize(text), tagset='universal')
    # update posTags and filter by posTags
    new_word=[]
    for i in text:
        if len(i[0]) > 1 and (i[1] == 'NOUN' or i[1] == 'VERB'):
            new_word.append(i[0])
    return ' '.join(new_word)

def preprocess_joke(text,common_words):
    """
    This function preprocess joke text to make it ready for Topic modeling

    Args:
        text (string): input joke text
        common_words (list): list of common words to be removed from joke text

    Returns:
        (sting):  a chain of words conatining only nouns and verbs
    """
    return posTag_jokes(lemenize_jokeText(clean_jokeText(text,common_words),common_words))

def generate_topics(joke_df,common_words):
    """
        Topic modeling process to be done using LDA for new input jokes
    Args:
        joke_df (pd.DataFrame):  dataframe of jokes
        common_words (list): list of common words to be removed from joke text
    Returns:
        joke_new (pd.DataFrame): dataframe of jokes after topic modeling
    """
    # Preprcessing jokes
    joke_new=joke_df.copy()
    joke_new['Processed_joke']=joke_new['joke'].apply(lambda x: preprocess_joke(x,common_words))
    
    tokens = [d.split() for d in joke_new['Processed_joke'].tolist()]
    dictionary = Dictionary(tokens)
    corpus=[dictionary.doc2bow(token) for token in tokens]
    
    model = LdaModel.load('Models/lda_model')
    # model.update(corpus)
    # model.save('lda_model/model1')
    
    # making clusters
    verbs_and_nouns = joke_new['Processed_joke'].tolist()
    cluster = [model.get_document_topics(dictionary.doc2bow(d.split()), minimum_probability = 0.0) for d in verbs_and_nouns]
    joke_new['cluster'] = pd.Series(cluster)
    
    # Sorting topics by probabilities
    sorted_topics = [(sorted(joke_new['cluster'][i],key=itemgetter(1),  reverse=True)) for i in range(len(joke_df))]
    joke_new['sorted_topics'] = pd.Series(sorted_topics)
    
    # Finding Main Topic for each Joke (finding max Probability topic)
    maximum_probability_topic = [joke_new['sorted_topics'][i][0][0] for i in range(len(joke_df))]
    joke_new['main_topic'] = maximum_probability_topic
    return joke_new


#### **Rating Based**

In [26]:
def score_joke_item(item_rating,corr_joke_df, active_user_mean_rating = 0):
    """
    Predicts the joke rating for the joke id 
    Args:
        item_rating (pd.DataFrame): average rating of every joke to be considered
        corr_joke_df (np.array):  array of correlation for every related  joke to be considered
        active_user_mean_rating (float): current joke rating of user -> No mean rating yet, so 0
    Returns:
        score (float): new joke rating
    """
    t1, t2 = 0, 0
    for similarity, norm_rating in zip(corr_joke_df, item_rating):
        t1+= norm_rating * similarity
        t2+= similarity
        
    score =active_user_mean_rating if t2==0 else (t1 + active_user_mean_rating)/t2
    return score

def generate_item_rating(user_ratings,joke_list):
    """
    Generates average rating of every joke to be considered
    Args:
        user_ratings (np.array): numpy array of user ratings
        joke_list (list<int>): list of jokes to be considered

    Returns:
        item_rating (list<float.)>: list of average ratings of jokes
    """
    item_rating=[]
    bool_to_check= (user_ratings.ndim==1)
    for joke in joke_list:
        if joke not in range(100):
            item_rating.append(0)
        else:
            item_rating.append(user_ratings[int(2+joke)] if (bool_to_check) else np.mean([i for i in user_ratings[:,int(2+joke)] if i!=0]))
    return item_rating

def generate_corelationMatrix(input_row,joke_df):
    """
    Generates a sorted correlation matrix for an input joke row for all given jokes in the dataset
    Args:
        input_row (pd.DataFrame): row of the input joke dataframe
        joke_df (pd.DataFrame):  dataframe of jokes
    Returns:
        list_corr_i (np.Array): matrix where first element represents joke_id and second element represets the pearson coefficient of similarity sorted in increasing order
    """
    v1=0
    cluster=input_row['cluster']
    for i in cluster:
        v1+=i[1]*i[1]
    list_corr_i=[]
    for _,row in joke_df.iterrows():
        cv=0
        vj=0
        row_cluster=ast.literal_eval(row['cluster'])
        for k in range(len(cluster)):
            val=row_cluster[k][1]
            cv+=cluster[k][1]*val
            vj+=val*val
        if vj==0:
            list_corr_i.append([row['joke_id'],0])
        else:
            list_corr_i.append([row['joke_id'],(cv/((v1**0.5)*(vj**0.5)))])
    list_corr_i=np.array(list_corr_i)
    list_corr_i=list_corr_i[list_corr_i[:,1].argsort()]
    return list_corr_i

def predict_new_rating(user_ratings,input_df1,joke_df,common_words,joke_coreelation_threshold=0):
    """
    Generates predicted ratings for new input jokes
    Args:
        user_ratings (np.array): numpy array of user ratings
        input_df1 (pd.DataFrame): dataframe of input jokes
        joke_df (pd.DataFrame):  dataframe of jokes
        common_words (list): list of common words to be removed from joke 
        joke_coreelation_threshold (float) : threshold for joke corelation
    Returns:
        input_df1 (pd.DataFrame): dataframe of input jokes with new ratings
    """
    # Generating topics for input jokes
    new_joke_df=generate_topics(input_df1,common_words)
    list_new_ratings=[]
    for i in range(len(new_joke_df)):
        # Generating correlation matrix for every joke
        matrix=generate_corelationMatrix(new_joke_df.loc[i],joke_df)
        index=0
        while(index<matrix.shape[0] and matrix[index,1]<joke_coreelation_threshold):
            index+=1
        matrix=matrix[index:]
        # calculating Rating for every joke using correlation matrix with threshold
        item_ratings=generate_item_rating(user_ratings.to_numpy(),matrix[:,0])
        list_new_ratings.append(score_joke_item(item_ratings,matrix[:,1]))
        
    # adding Raings to input dataframe
    input_df1['new_rating']=list_new_ratings
    
    return input_df1


##### **User History Based**

In [27]:
def generate_corelationrow(active_user,passive_user_dataset,n1):
    """
    Generates a sorted correlation matrix for an active user for all given passive users
    matrix row structure => 0th index: pasive user_id 1st index: its pearson's corelation coefficient with active user
    Args:
        active_user (int): Contains the user_id of the active user
        passive_user_dataset (list<int>): Contains the list of user_ids of the passive users
        n1 (np.array): matrix of database containing joke rattings for every user
    Returns:
        list_corr_i (np.Array): matrix where first ellement represents user id of passive user and second element represets the pearson coefficient of similarity sorted in increasing order
    """
    list_corr_i=[]
    for j in passive_user_dataset:
        # calculating pearson's co-relation coefficient between active_user and j (A passive_user)
        cv=0
        vi=0
        vj=0
        for k in range(2,102):
            cv+=n1[active_user-1][k]*n1[j-1][k]
            vi+=n1[active_user-1][k]*n1[active_user-1][k]
            vj+=n1[j-1][k]*n1[j-1][k]
        list_corr_i.append([j,(cv/((vi**0.5)*(vj**0.5)))])
    list_corr_i=np.array(list_corr_i)
    list_corr_i=list_corr_i[list_corr_i[:,1].argsort()]
    return list_corr_i

def df_split(df,threshold):
    """
    Splits df['user_id] in two different parts based on a threshold of number of jokes rated
    Args:
        df (pd.DataFrame): joke rating dataframe
        threshold (int): threshold of number of jokes rated
    Returns:
        df_dense (pd.DataFrame): dataframe having user_id's of only those users which have rated jokes above or equal to threshold
        df_sparse (pd.DataFrame): dataframe having user_id's of only those users which have rated jokes below threshold
    """
    df_dense=(df[df['number_of_jokes_rated']>=threshold])['user_id']
    df_sparse=(df[df['number_of_jokes_rated']<threshold])['user_id']
    return df_dense, df_sparse

def score_user_item(item_id, neighbours_df,neighbour_user_similarity, active_user_mean_rating = 0):
    """
    Predicts the joke rating for the joke id 
    Args:
        neighbours_df (pd.DataFrame): Dataframe of neighbours which willl be used to predict the user ratings
        neighbour_user_similarity (np.array):  array of correlation where 0th index: pasive user_id 1st index: its pearson's corelation coefficient with active user
        active_user_mean_rating (float): current joke rating of user -> No mean rating yet, so 0
    Returns:
        score (float): new joke rating
    """
    item_rating = neighbours_df[item_id]
    t1, t2 = 0, 0
    for similarity, norm_rating in zip(neighbour_user_similarity, item_rating):
        t1+= norm_rating * similarity
        t2+= similarity
    score = (t1 + active_user_mean_rating)/t2
    return score

def predict_ratings(user_id,df,similarity_threshold,no_of_neighbours,n1,passive_user_dataset):
    """
    Predicts the missing joke ratings for the given user and returns the final ratings (of all 100 jokes) for further processing
    Args:
        user_id (int): id of active user
        df (pd.DataFrame): Dataframe of user ratings
        similarity_threshold (float): Desiered threshold for similar neigbours
        no_of_neighbours(int): Number of neighbours to be considered for rating (nan value indicates that all users of )
        n1 (np.array): matrix of database containing joke rattings for every user
        passive_user_dataset (list<int>): Contains the list of user_ids of the passive users
    Returns:
        joke_score (pd.DataFrame): new joke ratings for user
    """
    user_rating=df.loc[user_id-1]
    ratings_to_predict=[column for column in df.columns[2:] if(user_rating[column]==0)]
    correlation_row=generate_corelationrow(user_id,passive_user_dataset,n1)
    
    # Find index for first usable corelation row
    index=0
    while(index<correlation_row.shape[0] and correlation_row[index,1]<similarity_threshold):
        index+=1
    
    neighbours=correlation_row[index:] if(no_of_neighbours!=no_of_neighbours) else(correlation_row[index:])[np.random.choice(correlation_row.shape[0]-index,no_of_neighbours,replace=False)]
    dataframe_neighbours_unrated_jokes =(df[df['user_id'].isin(neighbours[:, 0])])

    
    # mechanism of filling joke rattings
    joke_score = user_rating.copy()
    for column in ratings_to_predict:
        score = score_user_item(column, dataframe_neighbours_unrated_jokes, neighbours[:, 1], user_rating[column])
        joke_score[column]=score
    return joke_score

def rpub_recommender(user_id,df,n1,input_df1,joke_df,common_words):
    """
    Recommends new jokes to the user based on his previous ratings
    Args:
        user_id (int): id of active user
        df (pd.DataFrame): Dataframe of user ratings
        n1 (np.array): matrix of database containing joke rattings for every user
        input_df1 (pd.DataFrame): dataframe of input jokes
        joke_df (pd.DataFrame):  dataframe of jokes
        common_words (list): list of common words to be removed from joke
    Returns:
        input_df1 (pd.DataFrame): dataframe of input jokes with new ratings
    """
    threshold_no_of_ratings=100
    similarity_threshold=0.1
    # could be nan as well for considering all possible neighbours
    no_of_neighbours=30
    joke_coreelation_threshold=0.8
    df_dense,_=df_split(df,threshold_no_of_ratings)
    new_joke_score=predict_ratings(user_id,df,similarity_threshold,no_of_neighbours,n1,df_dense)
    return predict_new_rating(new_joke_score,input_df1,joke_df,common_words,joke_coreelation_threshold=joke_coreelation_threshold)

### **Testing**

In [28]:
display(dataframe_non_normalized_ratings[['joke_'+str(i) for i in range(1,21)]][:2])

Unnamed: 0,joke_1,joke_2,joke_3,joke_4,joke_5,joke_6,joke_7,joke_8,joke_9,joke_10,joke_11,joke_12,joke_13,joke_14,joke_15,joke_16,joke_17,joke_18,joke_19,joke_20
0,-7.82,8.79,-9.66,-8.16,-7.52,-8.5,-9.85,4.17,-8.98,-4.76,-8.5,-6.75,-7.18,8.45,-7.18,-7.52,-7.43,-9.81,-9.85,-9.85
1,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,9.22,6.75,8.64,4.42,7.43,4.56,-0.97,4.66,-0.68,3.3,-1.21


In [29]:
display(predict_new_rating(dataframe_non_normalized_ratings, new_jokes_df, LDA_dataframe_jokes, common_words))

Unnamed: 0,new_joke_id,joke,new_rating
0,1,"What falls, but never needs a bandage? The rai...",0.759473
1,2,I was going to tell you a joke about boxing bu...,0.760047
2,3,I'm not a fan of spring cleaning. Let's be hon...,0.622488
3,4,Why did the egg hide? It was a little chicken.\n,0.325672
4,5,What did the dirt say to the rain? If you keep...,0.511384
5,6,Why couldn't the sunflower ride its bike? It l...,0.279256
6,7,What's an egg's favorite vacation spot? New Yo...,0.268576
7,8,I ate a sock yesterday. It was very time-consu...,0.352633
8,9,What kind of candy do astronauts like? Mars ba...,0.257238
9,10,I wanted to buy some camo pants but couldn't f...,0.614257


In [33]:
user_id = 10
display(rpub_recommender(user_id,dataframe_non_normalized_ratings,numpy_ratings,new_jokes_df,LDA_dataframe_jokes,common_words))

Unnamed: 0,new_joke_id,joke,new_rating
0,1,"What falls, but never needs a bandage? The rai...",4.013175
1,2,I was going to tell you a joke about boxing bu...,4.013181
2,3,I'm not a fan of spring cleaning. Let's be hon...,0.0
3,4,Why did the egg hide? It was a little chicken.\n,3.845758
4,5,What did the dirt say to the rain? If you keep...,3.845132
5,6,Why couldn't the sunflower ride its bike? It l...,3.845866
6,7,What's an egg's favorite vacation spot? New Yo...,3.845888
7,8,I ate a sock yesterday. It was very time-consu...,4.392884
8,9,What kind of candy do astronauts like? Mars ba...,3.477642
9,10,I wanted to buy some camo pants but couldn't f...,0.0
