# **Rating Prediction**

In [75]:
import pandas as pd
import numpy as np
import re
import ast
import pickle as pkl

# Natural Language Processing
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag

# Tools for topic modelling
from pprint import pprint
from operator import itemgetter

# LDA model
from gensim.models import LdaModel
from gensim.corpora import Dictionary

In [76]:
merged_dataframe_ratings = pd.read_csv("Preprocessed/non_normalized_ratings.csv")
LDA_dataframe_jokes = pd.read_csv("Preprocessed/LDA_dataframe_jokes.csv")

In [77]:
# Reading Input File

new_jokes = []
i=1
with open('testfile.txt', 'r') as f:
    while(i!= 21):
        new_jokes.append([i,f.readline()])
        i+=1
new_jokes_df = pd.DataFrame(np.array(new_jokes))
new_jokes_df.columns=['new_joke_id','joke']

In [78]:
common_words = ["---","---|---","i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]

## **Model Testing**

### **Functions**

#### **Text Based**

In [79]:
def clean_jokeText(text,common_words):
    """
    This function cleans joke text for further preprocesing.
    It first gets individual words from only lower case , alphanumeric text with no extra whitespaces
    then it removes all common words and joins the left out words to form new cleaned text of joke

    Args:
        text (string): input joke text
        common_words (list): list of common words to be removed from joke text
    Returns:
        string: cleaned text
    """
    words= re.sub(r"\s+", ' ', re.sub(r'[^\w+\s]', ' ', text.lower())).strip().split(' ')
    new_words = [w for w in words if w not in common_words]
    return ' '.join(new_words)

def lemenize_jokeText(text,common_words):
    """
    This function lemenize joke text for further preprocesing.
    It first gets individual words from text
    then it lemenizes every word and joins the new words together to form new text of joke

    Args:
        text (string): input joke text
        common_words (list): list of common words to be removed from joke text
    Returns:
        string: lemenized joke text
    """
    lemeniza=WordNetLemmatizer()
    new_words = [lemeniza.lemmatize(w) for w in word_tokenize(text) if w not in common_words]
    return ' '.join(new_words)

def posTag_jokes(text):
    """
    This function generates postags from joke text for further preprocesing.
    It first gets individual words from text
    then it finds postag for every word
    and then it filters out only nouns and verbs from them and joins them in a string

    Args:
        text (string): input joke text
    Returns:
        (sting):  a chain of words conatining only nouns and verbs
    """
    
    # creating posTags
    text = pos_tag(word_tokenize(text), tagset='universal')
    # update posTags and filter by posTags
    new_word=[]
    for i in text:
        if len(i[0]) > 1 and (i[1] == 'NOUN' or i[1] == 'VERB'):
            new_word.append(i[0])
    return ' '.join(new_word)

def preprocess_joke(text,common_words):
    """
    This function preprocess joke text to make it ready for Topic modeling

    Args:
        text (string): input joke text
        common_words (list): list of common words to be removed from joke text

    Returns:
        (sting):  a chain of words conatining only nouns and verbs
    """
    return posTag_jokes(lemenize_jokeText(clean_jokeText(text,common_words),common_words))

def generate_topics(joke_df,common_words):
    """
        Topic modeling process to be done using LDA for new input jokes
    Args:
        joke_df (pd.DataFrame):  dataframe of jokes
        common_words (list): list of common words to be removed from joke text
    Returns:
        joke_new (pd.DataFrame): dataframe of jokes after topic modeling
    """
    # Preprcessing jokes
    joke_new=joke_df.copy()
    joke_new['Processed_joke']=joke_new['joke'].apply(lambda x: preprocess_joke(x,common_words))
    
    tokens = [d.split() for d in joke_new['Processed_joke'].tolist()]
    dictionary = Dictionary(tokens)
    corpus=[dictionary.doc2bow(token) for token in tokens]
    
    model = LdaModel.load('Models/lda_model')
    # model.update(corpus)
    # model.save('lda_model/model1')
    
    # making clusters
    verbs_and_nouns = joke_new['Processed_joke'].tolist()
    cluster = [model.get_document_topics(dictionary.doc2bow(d.split()), minimum_probability = 0.0) for d in verbs_and_nouns]
    joke_new['cluster'] = pd.Series(cluster)
    
    # Sorting topics by probabilities
    sorted_topics = [(sorted(joke_new['cluster'][i],key=itemgetter(1),  reverse=True)) for i in range(len(joke_df))]
    joke_new['sorted_topics'] = pd.Series(sorted_topics)
    
    # Finding Main Topic for each Joke (finding max Probability topic)
    maximum_probability_topic = [joke_new['sorted_topics'][i][0][0] for i in range(len(joke_df))]
    joke_new['main_topic'] = maximum_probability_topic
    return joke_new


#### **Rating Based**

In [80]:
def score_user_item(item_rating,corr_joke_df, active_user_mean_rating = 0):
    """
    Predicts the joke rating for the joke id 
    Args:
        item_rating (pd.DataFrame): average rating of every joke to be considered
        corr_joke_df (np.array):  array of correlation for every related  joke to be considered
        active_user_mean_rating (float): current joke rating of user -> No mean rating yet, so 0
    Returns:
        score (float): new joke rating
    """
    t1, t2 = 0, 0
    for similarity, norm_rating in zip(corr_joke_df, item_rating):
        t1+= norm_rating * similarity
        t2+= similarity
        
    score =active_user_mean_rating if t2==0 else (t1 + active_user_mean_rating)/t2
    return score

def generate_item_rating(user_ratings,joke_list):
    """
    Generates average rating of every joke to be considered
    Args:
        user_ratings (np.array): numpy array of user ratings
        joke_list (list<int>): list of jokes to be considered

    Returns:
        item_rating (list<float.)>: list of average ratings of jokes
    """
    item_rating=[]
    for joke in joke_list:
        if joke not in range(100):
            item_rating.append(0)
        else:
            item_rating.append(np.mean([i for i in user_ratings[:,int(2+joke)] if i!=0]))
    return item_rating

def generate_corelationMatrix(input_row,joke_df):
    """
    Generates a sorted correlation matrix for an input joke row for all given jokes in the dataset
    Args:
        input_row (pd.DataFrame): row of the input joke dataframe
        joke_df (pd.DataFrame):  dataframe of jokes
    Returns:
        list_corr_i (np.Array): matrix where first element represents joke_id and second element represets the pearson coefficient of similarity sorted in increasing order
    """
    v1=0
    cluster=input_row['cluster']
    for i in cluster:
        v1+=i[1]*i[1]
    list_corr_i=[]
    for _,row in joke_df.iterrows():
        cv=0
        vj=0
        row_cluster=ast.literal_eval(row['cluster'])
        for k in range(len(cluster)):
            val=row_cluster[k][1]
            cv+=cluster[k][1]*val
            vj+=val*val
        if vj==0:
            list_corr_i.append([row['joke_id'],0])
        else:
            list_corr_i.append([row['joke_id'],(cv/((v1**0.5)*(vj**0.5)))])
    list_corr_i=np.array(list_corr_i)
    list_corr_i=list_corr_i[list_corr_i[:,1].argsort()]
    return list_corr_i

def predict_new_rating(user_ratings,input_df,joke_df,common_words,joke_coreelation_threshold=0.2):
    """
    Generates predicted ratings for new input jokes
    Args:
        user_ratings (np.array): numpy array of user ratings
        input_df (pd.DataFrame): dataframe of input jokes
        joke_df (pd.DataFrame):  dataframe of jokes
        common_words (list): list of common words to be removed from joke 
        joke_coreelation_threshold (float) : threshold for joke corelation
    Returns:
        input_df (pd.DataFrame): dataframe of input jokes with new ratings
    """
    # Generating topics for input jokes
    new_joke_df=generate_topics(input_df,common_words)
    list_new_ratings=[]
    for i in range(len(new_joke_df)):
        # Generating correlation matrix for every joke
        matrix=generate_corelationMatrix(new_joke_df.loc[i],joke_df)
        index=0
        while(index<matrix.shape[0] and matrix[index,1]<joke_coreelation_threshold):
            index+=1
        matrix=matrix[index:]
        # calculating Rating for every joke using correlation matrix with threshold
        item_ratings=generate_item_rating(user_ratings.to_numpy(),matrix[:,0])
        list_new_ratings.append(score_user_item(item_ratings,matrix[:,1]))
        
    # adding Raings to input dataframe
    input_df['new_rating']=list_new_ratings
    
    return input_df


### **Testing**

In [81]:
display(predict_new_rating(merged_dataframe_ratings, new_jokes_df, LDA_dataframe_jokes, common_words))

Unnamed: 0,new_joke_id,joke,new_rating
0,1,"What does an atheist say during an orgasm? ""O...",1.404265
1,2,Two men are discussing the age old question: w...,0.579673
2,3,Arnold Swartzeneger and Sylvester Stallone are...,0.624062
3,4,"A horse walks into a bar. Bartender says: ""So...",1.198681
4,5,A boy comes home from school and tells his mot...,0.599147
5,6,A couple has been married for 75 years. For th...,0.561238
6,7,There was an engineer who had an exceptional g...,0.412504
7,8,"The graduate with a Science degree asks, ""Why ...",0.89385
8,9,Three engineering students were gathered toget...,0.703156
9,10,A guy goes into confession and says to the pri...,0.669508
