# **Imports**

In [None]:
import re
import ast
import random
import nltk
import pandas as pd
import numpy as np
import seaborn as sns
from pprint import pprint
from operator import itemgetter
from gensim.models import LdaModel
from matplotlib import pyplot as plt
from gensim.corpora import Dictionary
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize, pos_tag
!python -m nltk.downloader all

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Package basque_grammars is already up-to-date!
[nltk_data]    | Downloading package bcp47 to /root/nltk_data...
[nltk_data]    |   Package bcp47 is already up-to-dat

# **DataFrames**

In [None]:
# Load the joke texts and the user-rating table from CSV.
# NOTE(review): the absolute /content/... paths look Colab-specific — confirm or
# make them configurable before running this notebook elsewhere.
dataframe_jokes = pd.read_csv("/content/jokes_dataframe.csv")
dataframe_ratings = pd.read_csv("/content/rating_dataframe.csv")

In [None]:
# Preview the first rows of both frames to check that the CSVs loaded as expected.
print(dataframe_jokes.head())
print(dataframe_ratings.head())

   joke_id                                               joke
0        1  A man visits the doctor. The doctor says "I ha...
1        2  This couple had an excellent relationship goin...
2        3  Q. What's 200 feet long and has 4 teeth? A. Th...
3        4  Q. What's the difference between a man and a t...
4        5  Q.\tWhat's O. J. Simpson's Internet address? A...
   user_id  number_of_jokes_rated  joke_1  joke_2  joke_3  joke_4  joke_5  \
0        1                     74   -7.82    8.79   -9.66   -8.16   -7.52   
1        2                    100    4.08   -0.29    6.36    4.37   -2.38   
2        3                     49   99.00   99.00   99.00   99.00    9.03   
3        4                     48   99.00    8.35   99.00   99.00    1.80   
4        5                     91    8.50    4.61   -4.17   -5.39    1.36   

   joke_6  joke_7  joke_8  ...  joke_91  joke_92  joke_93  joke_94  joke_95  \
0   -8.50   -9.85    4.17  ...     2.82    99.00    99.00    99.00    99.00   
1   -9.

Note: Currently working with non-normalized data.

# **Model**

## **Preprocessing**

In [None]:
def cleanText(text):
    """Return a lower-cased, punctuation-free, whitespace-normalised copy of ``text``.

    Every character that is not a word character, a literal ``+`` or whitespace
    is replaced by a space; runs of whitespace are then collapsed to a single
    space and the result is stripped at both ends.
    """
    lowered = text.lower()
    # Note: inside the character class the '+' is a literal plus sign, so '+' survives.
    without_punctuation = re.sub(r'[^\w+\s]', ' ', lowered)
    single_spaced = re.sub(r"\s+", ' ', without_punctuation)
    return single_spaced.strip()

# Words dropped by removeStopwords(). Entries are compared against lower-cased,
# space-separated tokens, so they must be lower-case single tokens.
# NOTE(review): "---" and "---|---" look unreachable when the input has been
# through cleanText(), since that replaces '-' and '|' with spaces — confirm
# whether they are still needed.
stopwords = ["---","---|---","i", "me", "my", "myself", "we",
             "our", "ours", "ourselves", "you", "your", "yours",
             "yourself", "yourselves", "he", "him", "his", "himself",
             "she", "her", "hers", "herself", "it", "its", "itself",
             "they", "them", "their", "theirs", "themselves", "what",
             "which", "who", "whom", "this", "that", "these", "those",
             "am", "is", "are", "was", "were", "be", "been", "being",
             "have", "has", "had", "having", "do", "does", "did", "doing",
             "a", "an", "the", "and", "but", "if", "or", "because", "as",
             "until", "while", "of", "at", "by", "for", "with", "about",
             "against", "between", "into", "through", "during", "before",
             "after", "above", "below", "to", "from", "up", "down", "in",
             "out", "on", "off", "over", "under", "again", "further", "then",
             "once", "here", "there", "when", "where", "why", "how", "all",
             "any", "both", "each", "few", "more", "most", "other", "some",
             "such", "no", "nor", "not", "only", "own", "same", "so", "than",
             "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]

def removeStopwords(text):
    """Drop every token of ``text`` that appears in the module-level ``stopwords`` list.

    ``text`` is split on single spaces and the surviving tokens are joined back
    with single spaces, in their original order.
    """
    kept_words = [word for word in text.split(' ') if word not in stopwords]
    return ' '.join(kept_words)

# Stage 1: normalise case/punctuation/whitespace; stage 2: strip stopwords from the stage-1 text.
dataframe_jokes['cleanedText1'] = dataframe_jokes['joke'].apply(cleanText)
dataframe_jokes['cleanedText2'] = dataframe_jokes['cleanedText1'].apply(removeStopwords)
print(dataframe_jokes.head())

   joke_id                                               joke  \
0        1  A man visits the doctor. The doctor says "I ha...   
1        2  This couple had an excellent relationship goin...   
2        3  Q. What's 200 feet long and has 4 teeth? A. Th...   
3        4  Q. What's the difference between a man and a t...   
4        5  Q.\tWhat's O. J. Simpson's Internet address? A...   

                                        cleanedText1  \
0  a man visits the doctor the doctor says i have...   
1  this couple had an excellent relationship goin...   
2  q what s 200 feet long and has 4 teeth a the f...   
3  q what s the difference between a man and a to...   
4  q what s o j simpson s internet address a slas...   

                                        cleanedText2  
0  man visits doctor doctor says bad news havecan...  
1  couple excellent relationship going one day ca...  
2  q 200 feet long 4 teeth front row willie nelso...  
3  q difference man toilet toilet doesn follow ar...

### **Lemmatization**

In [None]:
lemmatizer = WordNetLemmatizer()

def performLemmatization(text):
    """Tokenize ``text`` and return its lemmatized tokens joined by single spaces.

    Each token is passed to ``lemmatizer.lemmatize`` without a part-of-speech
    argument, so the lemmatizer's default applies to every word.
    """
    lemmas = [lemmatizer.lemmatize(token) for token in word_tokenize(text)]
    return ' '.join(lemmas)

def posTagging(text):
    """Tokenize ``text`` and tag every token with the 'universal' tagset.

    Returns the result of ``pos_tag`` for the token list; the notebook output
    shows it as a list of (token, tag) pairs.
    """
    tokens = nltk.word_tokenize(text)
    tagged_tokens = pos_tag(tokens, tagset='universal')
    return tagged_tokens

# Both the lemmas and the POS tags are derived from the stopword-free text (cleanedText2).
dataframe_jokes['cleanedText3'] = dataframe_jokes['cleanedText2'].apply(performLemmatization)
dataframe_jokes['pos_tags'] = dataframe_jokes['cleanedText2'].apply(posTagging)

In [None]:
# Inspect the first five rows, now including the cleanedText3 and pos_tags columns.
dataframe_jokes.head(5)

Unnamed: 0,joke_id,joke,cleanedText1,cleanedText2,cleanedText3,pos_tags
0,1,"A man visits the doctor. The doctor says ""I ha...",a man visits the doctor the doctor says i have...,man visits doctor doctor says bad news havecan...,man visit doctor doctor say bad news havecance...,"[(man, NOUN), (visits, NOUN), (doctor, VERB), ..."
1,2,This couple had an excellent relationship goin...,this couple had an excellent relationship goin...,couple excellent relationship going one day ca...,couple excellent relationship going one day ca...,"[(couple, ADJ), (excellent, NOUN), (relationsh..."
2,3,Q. What's 200 feet long and has 4 teeth? A. Th...,q what s 200 feet long and has 4 teeth a the f...,q 200 feet long 4 teeth front row willie nelso...,q 200 foot long 4 teeth front row willie nelso...,"[(q, ADV), (200, NUM), (feet, NOUN), (long, AD..."
3,4,Q. What's the difference between a man and a t...,q what s the difference between a man and a to...,q difference man toilet toilet doesn follow ar...,q difference man toilet toilet doesn follow ar...,"[(q, NOUN), (difference, NOUN), (man, NOUN), (..."
4,5,Q.\tWhat's O. J. Simpson's Internet address? A...,q what s o j simpson s internet address a slas...,q o j simpson internet address slash slash bac...,q o j simpson internet address slash slash bac...,"[(q, NOUN), (o, VERB), (j, VERB), (simpson, AD..."


In [None]:
# Create one column per POS tag, each holding the distinct words of the joke
# that were tagged with it (first-seen order).

# NOTE(review): this disables pandas' chained-assignment warning for the whole
# session; kept so later cells behave as before — confirm it is still wanted.
pd.options.mode.chained_assignment = None

pos_tags_columns = ['ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRT', 'PRON', 'VERB', 'PUNC', 'OTHERS']

# The universal tagset emitted by pos_tag(..., tagset='universal') labels
# punctuation as '.' and the catch-all class as 'X'. Comparing against the
# column names 'PUNC' / 'OTHERS' directly can never match, so translate them.
column_to_tag = {'PUNC': '.', 'OTHERS': 'X'}

for column in pos_tags_columns:
    wanted_tag = column_to_tag.get(column, column)
    # dict.fromkeys de-duplicates while keeping first-seen order, so the cell
    # contents are reproducible (list(set(...)) ordering varies between runs).
    # wanted_tag is bound as a default argument so each lambda keeps its own tag.
    dataframe_jokes[column] = dataframe_jokes['pos_tags'].apply(
        lambda tags, wanted_tag=wanted_tag: list(
            dict.fromkeys(word for word, tag in tags if tag == wanted_tag)
        )
    )

print(dataframe_jokes.head(5))

   joke_id                                               joke  \
0        1  A man visits the doctor. The doctor says "I ha...   
1        2  This couple had an excellent relationship goin...   
2        3  Q. What's 200 feet long and has 4 teeth? A. Th...   
3        4  Q. What's the difference between a man and a t...   
4        5  Q.\tWhat's O. J. Simpson's Internet address? A...   

                                        cleanedText1  \
0  a man visits the doctor the doctor says i have...   
1  this couple had an excellent relationship goin...   
2  q what s 200 feet long and has 4 teeth a the f...   
3  q what s the difference between a man and a to...   
4  q what s o j simpson s internet address a slas...   

                                        cleanedText2  \
0  man visits doctor doctor says bad news havecan...   
1  couple excellent relationship going one day ca...   
2  q 200 feet long 4 teeth front row willie nelso...   
3  q difference man toilet toilet doesn follow a

In [None]:
# Total number of words stored in each POS-tag column, summed over all jokes

pos_tags_columns = ['ADJ', 'ADP', 'ADV', 'CONJ', 'DET', 'NOUN', 'NUM', 'PRT', 'PRON', 'VERB', 'PUNC', 'OTHERS']

counts = {}
for col in pos_tags_columns:
    # Each cell holds a list of words; add up the list lengths for this column.
    counts[col] = sum(len(words) for words in dataframe_jokes[col])

for col, count in counts.items():
    print(f"{col}: {count}")

ADJ: 437
ADP: 37
ADV: 155
CONJ: 1
DET: 13
NOUN: 1158
NUM: 111
PRT: 2
PRON: 5
VERB: 633
PUNC: 0
OTHERS: 0


In [None]:
# Keep only nouns and verbs: work on a copy so the new column is not added to dataframe_jokes

dataframe_jokes_pos_filter = dataframe_jokes.copy()

def filter_by_POS(tags):
    '''
    Build a space-separated string of the words tagged NOUN or VERB.

    ``tags`` is an iterable of (word, tag) pairs. Words of length 1 are
    dropped; the remaining nouns and verbs keep their original order.
    '''
    wanted_tags = ('NOUN', 'VERB')
    return ' '.join(
        pair[0]
        for pair in tags
        if len(pair[0]) > 1 and pair[1] in wanted_tags
    )


# cleanedText4: the noun/verb-only text that feeds the topic model.
dataframe_jokes_pos_filter['cleanedText4'] = dataframe_jokes_pos_filter['pos_tags'].apply(filter_by_POS)


In [None]:
# Preview the copy that now carries the cleanedText4 column.
print(dataframe_jokes_pos_filter.head(5))

   joke_id                                               joke  \
0        1  A man visits the doctor. The doctor says "I ha...   
1        2  This couple had an excellent relationship goin...   
2        3  Q. What's 200 feet long and has 4 teeth? A. Th...   
3        4  Q. What's the difference between a man and a t...   
4        5  Q.\tWhat's O. J. Simpson's Internet address? A...   

                                        cleanedText1  \
0  a man visits the doctor the doctor says i have...   
1  this couple had an excellent relationship goin...   
2  q what s 200 feet long and has 4 teeth a the f...   
3  q what s the difference between a man and a to...   
4  q what s o j simpson s internet address a slas...   

                                        cleanedText2  \
0  man visits doctor doctor says bad news havecan...   
1  couple excellent relationship going one day ca...   
2  q 200 feet long 4 teeth front row willie nelso...   
3  q difference man toilet toilet doesn follow a

## **Topic Modelling with LDA**

In [None]:
# Build the gensim dictionary and the bag-of-words corpus from the noun/verb text.
# NOTE(review): despite the original "remove rare and common tokens" heading, no
# frequency-based filtering is applied in this cell — every token is kept.

tokens = [joke_text.split() for joke_text in dataframe_jokes_pos_filter['cleanedText4'].tolist()]


dictionary = Dictionary(tokens)
# One bag-of-words vector per joke, in the same order as the DataFrame rows.
corpus = list(map(dictionary.doc2bow, tokens))

print(dictionary)

Dictionary<1160 unique tokens: ['alzheimer', 'cancer', 'disease', 'doctor', 'god']...>


In [None]:
# LDA Model

# Set training parameters.
num_topics = 10
chunksize = 2000
passes = 20
iterations = 400
eval_every = None  # Don't evaluate model perplexity, takes too much time.
random_state = 42  # Fixed seed: without it every run yields different topics.

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

model = LdaModel(
    corpus=corpus,
    id2word=id2word,
    chunksize=chunksize,
    alpha='auto',
    eta='auto',
    iterations=iterations,
    num_topics=num_topics,
    passes=passes,
    eval_every=eval_every,
    random_state=random_state
)

# Each entry pairs a topic's (probability, word) list with a score for that
# topic, as shown in the output of this cell.
top_topics = model.top_topics(corpus)
pprint(top_topics)

[([(0.026701683, 'wow'),
   (0.026701678, 'bill'),
   (0.021466032, 'says'),
   (0.016230375, 'throw'),
   (0.016230375, 'could'),
   (0.0109947175, 'andmake'),
   (0.010994716, 'bow'),
   (0.010994716, 'send'),
   (0.010994716, 'plane'),
   (0.010994716, 'dollar'),
   (0.010994716, 'dog'),
   (0.010994716, 'pope'),
   (0.010994711, 'shoes'),
   (0.005759031, 'sitting'),
   (0.005759031, 'make'),
   (0.005759031, 'bills'),
   (0.005759031, 'hillary'),
   (0.005759031, 'country'),
   (0.005759031, 'airplane'),
   (0.005759031, 'clinton')],
  -9.681668433000866),
 ([(0.024397047, 'news'),
   (0.017150356, 'says'),
   (0.014734796, 'man'),
   (0.012319232, 'said'),
   (0.012319232, 'goes'),
   (0.007488103, 'dad'),
   (0.007488103, 'pope'),
   (0.007488103, 'william'),
   (0.0074881017, 'doctor'),
   (0.0074881017, 'teller'),
   (0.0074881017, 'go'),
   (0.0074881017, 'engineer'),
   (0.0050725318, 'years'),
   (0.0050725318, 'mother'),
   (0.0050725318, 'half'),
   (0.0050725318, 'marrie

### **Main Topic for each Joke**

In [None]:
# For every joke: infer its topic distribution, rank the topics by probability
# and record the most probable one.
verbs_and_nouns = dataframe_jokes_pos_filter['cleanedText4'].tolist()

cluster = []
sorted_topics = []
maximum_probability_topic = []

for d in verbs_and_nouns:
    bow = dictionary.doc2bow(d.split())
    # (topic_id, probability) pairs for this joke.
    topics = model.get_document_topics(bow, minimum_probability = 0.0)
    ranked_topics = sorted(topics, key=itemgetter(1), reverse=True)

    cluster.append(topics)
    sorted_topics.append(ranked_topics)
    # Take the winner from the local ranking. The previous version read the
    # 'sorted_topics' column before it had been created, which raises KeyError
    # on a fresh kernel.
    maximum_probability_topic.append(ranked_topics[0][0])

# Pass the frame's own index so the values line up row by row even if the
# index is not the default 0..n-1 range.
row_index = dataframe_jokes_pos_filter.index
dataframe_jokes_pos_filter['cluster'] = pd.Series(cluster, index=row_index)
dataframe_jokes_pos_filter['sorted_topics'] = pd.Series(sorted_topics, index=row_index)
dataframe_jokes_pos_filter['main_topic'] = maximum_probability_topic

In [None]:
# Preview the frame after the cluster, sorted_topics and main_topic columns were added.
print(dataframe_jokes_pos_filter.head(5))

   joke_id                                               joke  \
0        1  A man visits the doctor. The doctor says "I ha...   
1        2  This couple had an excellent relationship goin...   
2        3  Q. What's 200 feet long and has 4 teeth? A. Th...   
3        4  Q. What's the difference between a man and a t...   
4        5  Q.\tWhat's O. J. Simpson's Internet address? A...   

                                        cleanedText1  \
0  a man visits the doctor the doctor says i have...   
1  this couple had an excellent relationship goin...   
2  q what s 200 feet long and has 4 teeth a the f...   
3  q what s the difference between a man and a to...   
4  q what s o j simpson s internet address a slas...   

                                        cleanedText2  \
0  man visits doctor doctor says bad news havecan...   
1  couple excellent relationship going one day ca...   
2  q 200 feet long 4 teeth front row willie nelso...   
3  q difference man toilet toilet doesn follow a

### **Normalization**

In [None]:
# Per-user normalization of the joke ratings: z-score first, then rescale by
# the inter-quartile range of the z-scored ratings.

# Ratings equal to 99.0 are turned into NaN so the statistics below skip them
# (99.0 appears to be the "not rated" placeholder — confirm against the dataset docs).
# Only the rating columns are touched: replacing on the whole frame would also
# blank out a user_id or number_of_jokes_rated that happens to equal 99.
rating_columns = dataframe_ratings.columns[2:102]
dataframe_ratings[rating_columns] = dataframe_ratings[rating_columns].replace(99.0, np.nan)

raw_ratings = dataframe_ratings[rating_columns]

# Row statistics are taken from the rating columns only and kept in separate
# Series, so a helper column (e.g. the mean) can never leak into the next statistic.
mean_rating = raw_ratings.mean(axis=1)
std_dev = raw_ratings.std(axis=1)
z_scores = raw_ratings.sub(mean_rating, axis=0).div(std_dev, axis=0)

quantile1 = z_scores.quantile(q=0.25, axis=1)
quantile3 = z_scores.quantile(q=0.75, axis=1)

# The arithmetic yields a brand-new frame, so the assignments below do not
# write into a slice of dataframe_ratings (no SettingWithCopyWarning).
dataframe_ratings_only = z_scores.sub(quantile1, axis=0).div(quantile3 - quantile1, axis=0)

dataframe_ratings_only['mean_rating'] = mean_rating
dataframe_ratings_only['std_dev'] = std_dev
dataframe_ratings_only['quantile1'] = quantile1
dataframe_ratings_only['quantile3'] = quantile3

dataframe_ratings_only['user_id'] = dataframe_ratings.iloc[:,0]
dataframe_ratings_only['number_of_jokes_rated'] = dataframe_ratings.iloc[:,1]

  dataframe_ratings_only['quantile1'] = dataframe_ratings_only.quantile(q=0.25, axis=1)
  dataframe_ratings_only['quantile3'] = dataframe_ratings_only.quantile(q=0.75, axis=1)
  dataframe_ratings_only['user_id'] = dataframe_ratings.iloc[:,0]
  dataframe_ratings_only['number_of_jokes_rated'] = dataframe_ratings.iloc[:,1]


In [None]:
# Preview the normalized ratings together with the per-user statistics columns.
dataframe_ratings_only.head()

Unnamed: 0,joke_1,joke_2,joke_3,joke_4,joke_5,joke_6,joke_7,joke_8,joke_9,joke_10,...,joke_97,joke_98,joke_99,joke_100,mean_rating,std_dev,quantile1,quantile3,user_id,number_of_jokes_rated
0,0.081373,1.515433,-0.077488,0.052018,0.107274,0.022664,-0.093892,1.116555,-0.018778,0.345564,...,0.270451,,,,-3.431892,6.384781,-0.834893,0.979187,1.0,74.0
1,0.586516,0.065036,0.858592,0.621122,-0.184368,-1.053103,0.01253,-0.537589,1.159308,1.199881,...,0.464797,0.140215,-0.415871,0.227327,2.7463,4.857253,-0.73731,0.987945,2.0,100.0
2,,,,,0.992908,1.129078,0.992908,1.129078,,,...,,,,,7.099388,3.752537,0.048131,0.517813,3.0,49.0
3,,1.213763,,,0.254758,1.185944,-0.421669,0.900439,,0.260615,...,,,,,2.658125,4.933237,-0.526657,0.857829,4.0,48.0
4,1.411043,0.615542,-1.179959,-1.429448,-0.04908,0.0,1.112474,0.615542,-0.417178,0.844581,...,0.308793,1.01227,0.0409,0.0,3.252637,4.337773,-0.380987,0.746319,5.0,91.0


Continue from "Per user stable topics"