In [112]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup
import re
import os
import codecs
import html2text
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from textblob import TextBlob
from nltk.corpus import stopwords
from wordcloud import WordCloud
import seaborn
from nltk import word_tokenize, pos_tag
import ast
from operator import itemgetter
from gensim.models import LdaModel
from scipy.sparse.linalg import svds
from gensim.models.phrases import Phrases, Phraser
from gensim.corpora import Dictionary
from gensim.models.nmf import Nmf
from gensim.models.coherencemodel import CoherenceModel
from sklearn.decomposition import NMF


Approach: 

In this recommendation system approach, we employ a hybrid model that combines collaborative filtering (CF) and content-based filtering (CBF) to enhance the accuracy and explainability of our predictions. We start by addressing the sparsity issue in the user-item matrix through collaborative filtering, identifying similar users based on historical ratings to fill in missing values. Subsequently, we incorporate content-based filtering by extracting features from jokes, such as keywords or genres, to capture the essence of each joke. During testing, we calculate the similarity between target jokes and others using a similarity measure like cosine similarity. We then select a set of most similar jokes based on content features and provide a prediction for the target joke's rating, determined by the average or weighted average rating of the selected similar jokes. This hybrid approach leverages both user preferences and content characteristics, offering a well-rounded recommendation system that effectively addresses the cold start problem and provides clear explanations for its predictions.

In [113]:
dataset1 = pd.read_excel("jester-data-1.xls")
dataset2 = pd.read_excel("jester-data-2.xls")
dataset3 = pd.read_excel("jester-data-3.xls")

def insert_return(frame):
    ret_lst = []
    for index,row in frame.iterrows():
        ret_lst.append(list(row))
    
    return ret_lst

def combine_dataframe(frame1, frame2, frame3):
    joke_lst = ["Number of jokes rated"]
    for i in range(100):
        joke_lst.append(f"joke-{i}")

    rating_lst = []

    rating_lst.extend(insert_return(frame1))
    rating_lst.extend(insert_return(frame2))
    rating_lst.extend(insert_return(frame3))

    return pd.DataFrame(data=rating_lst,columns=joke_lst)

In [114]:
ratings = combine_dataframe(dataset1, dataset2,dataset3)

In [115]:
ratings.replace(99.0, np.nan, inplace=True)
ratings.head()


Unnamed: 0,Number of jokes rated,joke-0,joke-1,joke-2,joke-3,joke-4,joke-5,joke-6,joke-7,joke-8,...,joke-90,joke-91,joke-92,joke-93,joke-94,joke-95,joke-96,joke-97,joke-98,joke-99
0,100.0,4.08,-0.29,6.36,4.37,-2.38,-9.66,-0.73,-5.34,8.88,...,2.82,-4.95,-0.29,7.86,-0.19,-2.14,3.06,0.34,-4.32,1.07
1,49.0,,,,,9.03,9.27,9.03,9.27,,...,,,,9.08,,,,,,
2,48.0,,8.35,,,1.8,8.16,-2.82,6.21,,...,,,,0.53,,,,,,
3,91.0,8.5,4.61,-4.17,-5.39,1.36,1.6,7.04,4.61,-0.44,...,5.19,5.58,4.27,5.19,5.73,1.55,3.11,6.55,1.8,1.6
4,100.0,-6.17,-3.54,0.44,-8.5,-7.09,-4.32,-8.69,-0.87,-6.65,...,-3.54,-6.89,-0.68,-2.96,-2.18,-3.35,0.05,-9.08,-5.05,-3.45


In [164]:
user_item_matrix = ratings.drop(columns=["Number of jokes rated"]).to_numpy()

user_item_matrix = np.nan_to_num(user_item_matrix)

mask_matrix = np.where(user_item_matrix != 0, 1, 0)

U, S, Vt = svds(user_item_matrix, k=8)

predicted_matrix = np.dot(np.dot(U, np.diag(S)), Vt)

predicted_matrix = np.clip(predicted_matrix, -10, 10)

mse = np.sum((predicted_matrix - user_item_matrix)**2) / np.sum(mask_matrix)

print(f'Mean Squared Error: {mse}')

Mean Squared Error: 15.098547712450365


In [165]:
CF_matrix = predicted_matrix

In [166]:
jokes_dataframe = pd.DataFrame()

In [167]:
def load_clean_joke():

    ret_jokes = []

    for i in range(1,101):
        file = 'init'+str(i)+'.html'
        data = codecs.open('jokes/' + file, 'r', encoding="cp1252")
        joke_html = data.read()

        # Extracting joke
        joke = html2text.html2text(joke_html)
        # Extracting joke_id
        joke_id = int(file.split('init')[1].split('.html')[0])
        cleaned_string = re.sub(r'[\|]+|[-]+', '', joke)

        ret_jokes.append(cleaned_string.strip())
    
    return ret_jokes


In [168]:
Jokes = load_clean_joke()

In [169]:
test_Jokes = Jokes[90:]
Jokes = Jokes[:90]

In [170]:
matrix_split1 = CF_matrix[:, :90]
matrix_split2 = CF_matrix[:, 90:]

In [171]:
jokes_dataframe["Original Jokes"] = Jokes

In [172]:
from nltk.corpus import stopwords

def remove_tags_puntuatuions_tags(Joke):
    special_char_patterns = r'[^a-zA-Z0-9\s]'
    urlPatterns = r'http\S+|www\S+'

    text = Joke

    text = re.sub(special_char_patterns, ' ', text)
    text = re.sub(urlPatterns, ' ', text)
    text = re.sub(r'\n', ' ', text)

    return text

def remove_stop_words_Tokenization(Joke):
    
    stopwords_ = set(stopwords.words('english'))
    Tokenize_Joke = []
    for word in Joke.split():
        if word not in stopwords_:
            Tokenize_Joke.append(word)

    return Tokenize_Joke

def lower(Joke):
    ret_lst = [word.lower() for word in Joke]
    return ret_lst

def preprocess_clean_jokes(uncleaned_Jokes):

    Cleaned_Jokes = []

    for Joke in uncleaned_Jokes:
        Joke = remove_tags_puntuatuions_tags(Joke)
        Joke = TextBlob(Joke)
        Joke = remove_stop_words_Tokenization(Joke)

        Cleaned_Jokes.append(lower(Joke))

    return Cleaned_Jokes



In [173]:
preprocessed_jokes = preprocess_clean_jokes(Jokes)

In [174]:
jokes_dataframe["PreProcessed Jokes"] = preprocessed_jokes

In [175]:
from nltk.stem import PorterStemmer

def stem_tokens(tokenized_text):
    stemmer = PorterStemmer()
    stemmed_tokens = []
    for preprocessed_joke in tokenized_text:
        stemmed_tokens.append([stemmer.stem(token) for token in preprocessed_joke])
    
    return stemmed_tokens

In [176]:
import nltk
from nltk.stem import WordNetLemmatizer

def lemmatize_text(text):
    lemmatizer = WordNetLemmatizer()
        
    lemmatized_text = " ".join([lemmatizer.lemmatize(word) for word in text])
    
    return lemmatized_text

In [177]:
lemmitized_jokes = [lemmatize_text(text) for text in preprocessed_jokes]

In [178]:
jokes_dataframe["lemmitized_jokes"] = lemmitized_jokes 

In [213]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,3))
tfidf_matrix = tfidf_vectorizer.fit_transform(lemmitized_jokes)
feature_names = tfidf_vectorizer.get_feature_names_out()
tfidf_array = tfidf_matrix.toarray()


In [180]:
TF_IDF_lemma = tfidf_array

In [216]:
jokes_dataframe["TF_IDF_Lemma"] = [i for i in TF_IDF_lemma] 

In [217]:
cleaned_list = list(jokes_dataframe["PreProcessed Jokes"])

merged_lst = []

for joke in cleaned_list:
    merged_lst.append(" ".join(joke))



In [183]:
jokes_dataframe["merged cleaned"] = merged_lst

In [184]:
jokes_dataframe["TF_IDF_Lemma"][0]

array([0., 0., 0., ..., 0., 0., 0.])

In [218]:
import nltk

def posTagging(text):
    return nltk.pos_tag(text, tagset='universal')

jokes_dataframe['pos_tags'] = jokes_dataframe['PreProcessed Jokes'].apply(lambda x: posTagging(x))
jokes_dataframe.head(2)

Unnamed: 0,Original Jokes,PreProcessed Jokes,lemmitized_jokes,TF_IDF_Lemma,merged cleaned,pos_tags,ADJ,ADP,ADV,CONJ,DET,NOUN,NUM,PRT,PRON,VERB,PUNC,OTHERS
0,"A man visits the doctor. The doctor says ""I ha...","[a, man, visits, doctor, the, doctor, says, i,...",a man visit doctor the doctor say i bad news y...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",a man visits doctor the doctor says i bad news...,"[(a, DET), (man, NOUN), (visits, VERB), (docto...",,,,,,,,,,,,
1,This couple had an excellent relationship goin...,"[this, couple, excellent, relationship, going,...",this couple excellent relationship going one d...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",this couple excellent relationship going one d...,"[(this, DET), (couple, ADJ), (excellent, NOUN)...",,,,,,,,,,,,


In [219]:
pd.options.mode.chained_assignment = None

In [220]:
jokes_dataframe["pos_tags"][0]

[('a', 'DET'),
 ('man', 'NOUN'),
 ('visits', 'VERB'),
 ('doctor', 'VERB'),
 ('the', 'DET'),
 ('doctor', 'NOUN'),
 ('says', 'VERB'),
 ('i', 'NOUN'),
 ('bad', 'ADJ'),
 ('news', 'NOUN'),
 ('you', 'PRON'),
 ('cancer', 'NOUN'),
 ('alzheimer', 'VERB'),
 ('disease', 'ADP'),
 ('the', 'DET'),
 ('man', 'NOUN'),
 ('replies', 'VERB'),
 ('well', 'ADV'),
 ('thank', 'ADJ'),
 ('god', 'NOUN'),
 ('i', 'NOUN'),
 ('cancer', 'NOUN')]

In [221]:
# jokes_dataframe['ADJ'] = pd.Series(dtype=str)
# jokes_dataframe['ADP'] = pd.Series(dtype=str)
# jokes_dataframe['ADV'] = pd.Series(dtype=str)
# jokes_dataframe['CONJ'] = pd.Series(dtype=str)
# jokes_dataframe['DET'] = pd.Series(dtype=str)
# jokes_dataframe['NOUN'] = pd.Series(dtype=str)
# jokes_dataframe['NUM'] = pd.Series(dtype=str)
# jokes_dataframe['PRT'] = pd.Series(dtype=str)
# jokes_dataframe['PRON'] = pd.Series(dtype=str)
# jokes_dataframe['PRT'] = pd.Series(dtype=str)
# jokes_dataframe['PRON'] = pd.Series(dtype=str)
# jokes_dataframe['VERB'] = pd.Series(dtype=str)
# jokes_dataframe['PUNC'] = pd.Series(dtype=str)
# jokes_dataframe['OTHERS'] = pd.Series(dtype=str)

In [222]:
def aggregate_tags(col_tags, tag_columns = {
    'ADJ': 'ADJ',
    'ADP': 'ADP',
    'ADV': 'ADV',
    'CONJ': 'CONJ',
    'DET': 'DET',
    'NOUN': 'NOUN',
    'NUM': 'NUM',
    'PRT': 'PRT',
    'PRON': 'PRON',
    'VERB': 'VERB',
    '.': '.',
    'X': 'X'}):
    aggregated_tags = {col: [] for col in tag_columns.values()}
    
    ret_frame = pd.DataFrame(columns=aggregated_tags)

    for i,joke_tags in enumerate(col_tags):
        temp_dict = {val:[] for val in aggregated_tags.keys()}
        for tag in joke_tags:
            temp_dict[tag[1]].append(tag[0])

        ret_frame = pd.concat([ret_frame, pd.DataFrame([temp_dict])], ignore_index=True)

    return ret_frame

aggregate_tags_frame = aggregate_tags(jokes_dataframe["pos_tags"])


In [223]:
aggregate_tags_frame

Unnamed: 0,ADJ,ADP,ADV,CONJ,DET,NOUN,NUM,PRT,PRON,VERB,.,X
0,"[bad, thank]",[disease],[well],[],"[a, the, the]","[man, doctor, i, news, cancer, man, god, i, ca...",[],[],[you],"[visits, doctor, says, alzheimer, replies]",[],[]
1,"[couple, girlfriend, awful, big, old]",[that],"[told, possibly, awfully]",[],[this],"[excellent, relationship, day, home, work, pac...","[one, ten]",[],"[he, what, they, he]","[going, came, find, asked, leaving, heard, cou...",[],[]
2,"[long, nelson]","[teeth, willie]",[],[],"[a, the]","[feet, front, row, concert]","[200, 4]",[],[what],[q],[],[]
3,[],[around],[],[],"[a, a]","[difference, man, toilet, follow, use]",[],[],[what],"[q, toilet]",[],[]
4,[slash],[o],[],[],[a],"[j, simpson, internet, address, slash, backsla...",[],[],[what],[q],[],[]
...,...,...,...,...,...,...,...,...,...,...,...,...
85,[much],[],[how],[],"[a, the]","[neutron, walks, bar, orders, i, neutron, bart...",[],[],[],"[drink, owe, asks, charge]",[],[]
86,"[routine, physical, good, bad, ok, good, good,...",[that],"[recently, i, first, then]",[],"[a, the, the, the, the, the, the, the]","[man, examination, phone, call, doctor, doctor...",[24],[],[],"[completing, receives, says, says, give, says,...",[],[]
87,"[czechoslovakian, eyesight, worse, see, simple...",[optometrist],[steadily],[],"[a, the]","[man, time, doctor, testing, eye, chart, lette...",[],[],[],"[felt, growing, felt, go, started, showed, dim...",[],[]
88,"[naval, canadian, please, divert, avoid, avoid...","[navy, lighthouse]","[north, south, north]",[],"[a, this, no, this, this]","[radio, conversation, ship, authorities, cours...","[15, 15, three, three, 15, one, five]",[],"[us, your, us, your, your, we, your]","[americans, recommend, divert, americans, say,...",[],[]


pos tagging: https://212digital.medium.com/an-introduction-to-part-of-speech-tagging-what-it-is-and-how-you-can-use-it-in-natural-language-9723f4696f78

sentiment analysis — By identifying words with positive or negative connotations, POS tagging can be used to calculate the overall sentiment of a piece of text.

topic identification — By looking at which words are most commonly used together, POS tagging can help automatically identify the main topics of a document.

In [224]:
def filterByPOS(tags):
    try:
        tags = ast.literal_eval(tags)
    except ValueError as e:
        print("Error during literal_eval:", e)
        return None
    
    
    txt = []
    
    for word, pos in tags:
        if pos in ['ADJ', 'NOUN'] and len(word) > 1:
            txt.append(word)
            
    return ' '.join(txt)

helper = [filterByPOS(str(tags)) for tags in jokes_dataframe["pos_tags"]]

Applying topic modelling


In [225]:
from gensim.models.phrases import Phrases, Phraser
from gensim.corpora import Dictionary

docs = [d.split() for d in helper if d]

bigram_model = Phrases(docs, min_count=5, threshold=15)
trigram_model = Phrases(bigram_model[docs], min_count=5, threshold=15)

docs_with_ngrams = trigram_model[bigram_model[docs]]

dictionary = Dictionary(docs_with_ngrams)
corpus = [dictionary.doc2bow(doc) for doc in docs_with_ngrams]


In [226]:
temp = dictionary[0]
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=500,
    alpha='auto',
    eta='auto',
    iterations=700,
    num_topics=12,
    passes=15,
    eval_every=None
    ,random_state=42
)

In [227]:
top_topics = model.top_topics(corpus)
top_topics

[([(0.030605953, 'water'),
   (0.030605953, 'fire'),
   (0.023110582, 'sleep'),
   (0.023110578, 'room'),
   (0.023110578, 'wakes'),
   (0.023110576, 'polish'),
   (0.015615202, 'mathematician'),
   (0.0156152, 'time'),
   (0.0156152, 'physicist'),
   (0.0156152, 'engineer'),
   (0.015615198, 'douses'),
   (0.0081197955, 'sees'),
   (0.008119795, 'amount'),
   (0.008119795, 'optimal'),
   (0.008119795, 'minimum'),
   (0.008119795, 'trajectory'),
   (0.008119795, 'solution'),
   (0.008119795, 'looks'),
   (0.008119795, 'sleeping'),
   (0.008119795, 'fills')],
  -2.778047128167708),
 ([(0.012594368, 'pope'),
   (0.012594368, 'original'),
   (0.012594368, 'guardsman'),
   (0.012594367, 'steps'),
   (0.01259436, 'patio'),
   (0.012594355, 'difference'),
   (0.0065489626, 'read'),
   (0.0065489626, 'asks'),
   (0.0065489626, 'scream'),
   (0.0065489626, 'available'),
   (0.0065489626, 'letter'),
   (0.0065489626, 'problem'),
   (0.006548962, 'angels'),
   (0.006548962, 'master'),
   (0.0065

This above code extracts the top topics from the trained LDA model using the top_topics method. Each topic is 
represented as a list of tuples, where each tuple contains a word and its associated weight in the topic.

In [228]:
topic_proportions_matrix = np.zeros((len(corpus), 12)) 

for i, doc_bow in enumerate(corpus):
    topic_distribution = model[doc_bow]
    
    for topic, proportion in topic_distribution:
        topic_proportions_matrix[i, topic] = proportion



In [229]:
topic_word_dists = model.get_topics()
doc_topic_dists = model.get_document_topics(corpus)

most_dominant_topics = [max(doc, key=lambda x: x[1])[0] for doc in doc_topic_dists]


Features extraction for applying algorithm

In [230]:
from string import punctuation
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def text_analysis(text):

    # blob = TextBlob(text)

    # sentiment_polarity = blob.sentiment.polarity
    # sentiment_subjectivity = blob.sentiment.subjectivity

    sid = SentimentIntensityAnalyzer()

    sentiment_scores = sid.polarity_scores(text)

    text_length = len(text)

    words = word_tokenize(text)

    punctuation_count = sum(1 for char in words if char in punctuation)

    stop_words = set(stopwords.words('english'))
    words = [word.lower() for word in words if word.isalpha() and word.lower() not in stop_words]

    word_count = len(words)

    unique_words = len(set(words))

    pos_tags = pos_tag(words)

    verb_count = sum(1 for word, pos in pos_tags if pos.startswith('VB'))
    noun_count = sum(1 for word, pos in pos_tags if pos.startswith('NN'))

    return {
        'text_length': text_length,
        'punctuation_count': punctuation_count,
        'word_count': word_count,
        'unique_words': unique_words,
        'verb_count': verb_count,
        'noun_count': noun_count,
        'joke_sentiment_neg': sentiment_scores['neg'],
        'joke_sentiment_neu': sentiment_scores['neu'],
        'joke_sentiment_pos': sentiment_scores['pos']
        }

def extract_Featrues(raw_jokes, topic_proportion, num_topics, TF_Vector):

    columns=['text_length', 'punctuation_count', 'word_count',
                                      'unique_words','verb_count','noun_count', 'joke_sentiment_neg', 
                                       'joke_sentiment_neu', 'joke_sentiment_pos'
                                    ]

    lst = [f'topic-{i+1}' for i in range(num_topics)]

    columns.extend(lst)
    ret_frame = pd.DataFrame(columns=columns)
    

    for id,joke in enumerate(raw_jokes):
        joke = text_analysis(joke)

        for topic,proportion in zip(lst, topic_proportion[id]):
            joke[topic] = proportion

        ret_frame = pd.concat([ret_frame, pd.DataFrame([joke])], ignore_index=True)

    return ret_frame

In [231]:
features_jokes = extract_Featrues(list(jokes_dataframe["Original Jokes"]), topic_proportions_matrix, 12)

In [235]:
jokes_dataframe["Original Jokes"]

0     A man visits the doctor. The doctor says "I ha...
1     This couple had an excellent relationship goin...
2     Q. What's 200 feet long and has 4 teeth?\n\nA....
3     Q. What's the difference between a man and a t...
4     Q. What's O. J. Simpson's Internet address?\n\...
                            ...                        
85    A neutron walks into a bar and orders a drink....
86    A man, recently completing a routine physical ...
87    A Czechoslovakian man felt his eyesight was gr...
88    _A radio conversation of a US naval ship with ...
89    Q: How many programmers does it take to change...
Name: Original Jokes, Length: 90, dtype: object

In [236]:
features_jokes['joke_id'] = [i for i in range(1,91)]

In [237]:
features_jokes

Unnamed: 0,text_length,punctuation_count,word_count,unique_words,verb_count,noun_count,joke_sentiment_neg,joke_sentiment_neu,joke_sentiment_pos,topic-1,...,topic-4,topic-5,topic-6,topic-7,topic-8,topic-9,topic-10,topic-11,topic-12,joke_id
0,162,4,16,13,3,10,0.246,0.691,0.063,0.478537,...,0.0,0.00000,0.0,0.000000,0.0,0.000000,0.0,0.483437,0.0,1
1,378,6,32,31,9,12,0.041,0.909,0.050,0.000000,...,0.0,0.00000,0.0,0.974758,0.0,0.000000,0.0,0.000000,0.0,2
2,86,4,9,9,0,4,0.000,1.000,0.000,0.000000,...,0.0,0.00000,0.0,0.000000,0.0,0.000000,0.0,0.937528,0.0,3
3,109,4,8,7,1,6,0.000,1.000,0.000,0.000000,...,0.0,0.00000,0.0,0.923837,0.0,0.000000,0.0,0.000000,0.0,4
4,94,9,10,7,0,10,0.440,0.471,0.089,0.000000,...,0.0,0.00000,0.0,0.000000,0.0,0.000000,0.0,0.957356,0.0,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,137,8,12,11,3,7,0.095,0.905,0.000,0.000000,...,0.0,0.00000,0.0,0.000000,0.0,0.950732,0.0,0.000000,0.0,86
86,413,17,42,22,10,20,0.120,0.741,0.140,0.986567,...,0.0,0.00000,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,87
87,340,15,32,29,11,12,0.066,0.903,0.030,0.000000,...,0.0,0.97859,0.0,0.000000,0.0,0.000000,0.0,0.000000,0.0,88
88,790,25,71,47,12,34,0.116,0.773,0.111,0.000000,...,0.0,0.00000,0.0,0.000000,0.0,0.000000,0.0,0.991322,0.0,89


In [238]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
features_scaled = scaler.fit_transform(features_jokes.drop(columns=["joke_id"]))

In [250]:
tf_idf_vector = TF_IDF_lemma

In [253]:
features_scaled.shape

(90, 21)

In [254]:
merged_array = np.hstack((features_scaled, tf_idf_vector))

In [260]:
from sklearn.metrics.pairwise import cosine_similarity

def transform_joke(joke):
    tokenized_joke = joke.split()

    ngram_joke = trigram_model[bigram_model[tokenized_joke]]

    joke_bow = dictionary.doc2bow(ngram_joke)

    return joke_bow

def find_closest_vectors(target_vector, list_of_vectors, top_n=40):
    target_vector = target_vector.reshape(1, -1)

    similarities = cosine_similarity(target_vector, list_of_vectors)

    top_indices = np.argsort(similarities[0])[-top_n:][::-1]

    return top_indices, similarities[0][top_indices]

def testing(test_jokes, train_jokes_frame, train_scaler, topic_model, num_topics, user_item_imputed):
    preprocess_test = preprocess_clean_jokes(test_jokes)

    testing_ratings = []

    for joke in preprocess_test:
        lemma_test = lemmatize_text(joke)
        pos_tag_test = posTagging(lemma_test.split())
        filter_Pos = filterByPOS(str(pos_tag_test))
        bow_joke = transform_joke(filter_Pos)
        topic_distribution = list(topic_model.get_document_topics(bow_joke))
        features_extracted = text_analysis(' '.join(joke))
        features = list(features_extracted.values())
        topic_proportions_matrix = np.zeros((1, 12)) 

        for i, doc_bow in enumerate([bow_joke]):
            topic_distribution = topic_model[doc_bow]
            for topic, proportion in topic_distribution:
                topic_proportions_matrix[i, topic] = proportion

        features.extend(topic_proportions_matrix[0])
        # scaled_test = train_scaler.transform(np.array(features).reshape(1, -1))
        features = np.array(features)
        top_i_closest, similarities = find_closest_vectors(features, train_jokes_frame)

        # Calculate weighted average using cosine similarities as weights
        weighted_average_ratings = np.average(user_item_imputed[:, top_i_closest], axis=1, weights=similarities)

        testing_ratings.append(weighted_average_ratings.tolist())

    return testing_ratings


In [261]:
predictions = testing(test_Jokes,features_jokes.drop(columns=["joke_id"]), scaler, model,10,matrix_split1)

In [262]:
jokes_dataframe

Unnamed: 0,Original Jokes,PreProcessed Jokes,lemmitized_jokes,TF_IDF_Lemma,merged cleaned,pos_tags,ADJ,ADP,ADV,CONJ,DET,NOUN,NUM,PRT,PRON,VERB,PUNC,OTHERS
0,"A man visits the doctor. The doctor says ""I ha...","[a, man, visits, doctor, the, doctor, says, i,...",a man visit doctor the doctor say i bad news y...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",a man visits doctor the doctor says i bad news...,"[(a, DET), (man, NOUN), (visits, VERB), (docto...",,,,,,,,,,,,
1,This couple had an excellent relationship goin...,"[this, couple, excellent, relationship, going,...",this couple excellent relationship going one d...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",this couple excellent relationship going one d...,"[(this, DET), (couple, ADJ), (excellent, NOUN)...",,,,,,,,,,,,
2,Q. What's 200 feet long and has 4 teeth?\n\nA....,"[q, what, 200, feet, long, 4, teeth, a, the, f...",q what 200 foot long 4 teeth a the front row w...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",q what 200 feet long 4 teeth a the front row w...,"[(q, VERB), (what, PRON), (200, NUM), (feet, N...",,,,,,,,,,,,
3,Q. What's the difference between a man and a t...,"[q, what, difference, man, toilet, a, a, toile...",q what difference man toilet a a toilet follow...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",q what difference man toilet a a toilet follow...,"[(q, VERB), (what, PRON), (difference, NOUN), ...",,,,,,,,,,,,
4,Q. What's O. J. Simpson's Internet address?\n\...,"[q, what, o, j, simpson, internet, address, a,...",q what o j simpson internet address a slash sl...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",q what o j simpson internet address a slash sl...,"[(q, VERB), (what, PRON), (o, ADP), (j, NOUN),...",,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,A neutron walks into a bar and orders a drink....,"[a, neutron, walks, bar, orders, drink, how, m...",a neutron walk bar order drink how much i owe ...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",a neutron walks bar orders drink how much i ow...,"[(a, DET), (neutron, NOUN), (walks, NOUN), (ba...",,,,,,,,,,,,
86,"A man, recently completing a routine physical ...","[a, man, recently, completing, routine, physic...",a man recently completing routine physical exa...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",a man recently completing routine physical exa...,"[(a, DET), (man, NOUN), (recently, ADV), (comp...",,,,,,,,,,,,
87,A Czechoslovakian man felt his eyesight was gr...,"[a, czechoslovakian, man, felt, eyesight, grow...",a czechoslovakian man felt eyesight growing st...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",a czechoslovakian man felt eyesight growing st...,"[(a, DET), (czechoslovakian, ADJ), (man, NOUN)...",,,,,,,,,,,,
88,_A radio conversation of a US naval ship with ...,"[a, radio, conversation, us, naval, ship, cana...",a radio conversation u naval ship canadian aut...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",a radio conversation us naval ship canadian au...,"[(a, DET), (radio, NOUN), (conversation, NOUN)...",,,,,,,,,,,,


In [263]:
np.array(predictions).T.shape

(73418, 10)

In [264]:
matrix_split2.shape

(73418, 10)

In [265]:
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(matrix_split2, np.array(predictions).T)


In [266]:
mse

2.716337910189438

In [267]:
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Transpose predictions to match the shape of matrix_split2
array_x = matrix_split2
array_y = np.array(predictions).T

# Initialize lists to store scores for each column
rmse_values = []
mse_values = []
mae_values = []
r_squared_values = []

for col_x, col_y in zip(array_x.T, array_y):
    common_length = min(len(col_x), len(col_y))
    
    rmse = np.sqrt(mean_squared_error(col_x[:common_length], col_y[:common_length]))
    rmse_values.append(rmse)

    mae = mean_absolute_error(col_x[:common_length], col_y[:common_length])
    mae_values.append(mae)

    r_squared = r2_score(col_x[:common_length], col_y[:common_length])
    r_squared_values.append(r_squared)

print(f"RMSE: {np.mean(rmse)}, MAE: {np.mean(mae)}, R-squared: {np.mean(r_squared)}")


RMSE: 3.1006921656900333, MAE: 2.4228822061383517, R-squared: -0.3432699075499501
