# Topic Modeling and Recommending Similar Stories

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

random_state = 42  # fixed seed so every stochastic step is reproducible

from glob import glob
# glob() returns paths in arbitrary, filesystem-dependent order; sort them so
# that datafiles[0] refers to the same file on every machine and every run
datafiles = sorted(glob('data/*'))

In [2]:
# packages for preprocessing text
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
import string
from nltk.corpus import stopwords

# packages for topic modeling
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity

# loading custom function to display topic vocab
from custom_functions import display_topics

In [3]:
# read the stories from the first entry of the discovered data files
stories_data = pd.read_csv(filepath_or_buffer=datafiles[0])

## Cleaning the column names to make them easier to work with

In [4]:
# keep only the first occurrence of each distinct story text
stories_data.drop_duplicates(subset=['Story texts'], inplace=True)

# columns to discard: every column with 'link' in its name, plus a fixed list
drop_columns = [
    *(name for name in stories_data.columns if 'link' in name.lower()),
    'Title', 'Lede', 'Cringey', 'Haha', 'Me too', 'Interesting', 'Phone', 'Like',
]
# errors='ignore' skips any listed column that is not present in the file
stories_data.drop(columns=drop_columns, inplace=True, errors='ignore')

# snake_case the remaining column names, then give the id column a clearer name
stories_data.columns = [name.lower().replace(' ', '_') for name in stories_data.columns]
stories_data.rename(columns={'id': 'story_id'}, inplace=True)
stories_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2014 entries, 0 to 2060
Data columns (total 8 columns):
story_id        2014 non-null int64
perspective     2014 non-null object
age             1548 non-null float64
lgbtq           1548 non-null object
race            1548 non-null object
topic           2014 non-null object
published_at    819 non-null object
story_texts     2013 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 141.6+ KB


In [5]:
# number of missing values in each column
missing_per_column = stories_data.isna().sum()
missing_per_column

story_id           0
perspective        0
age              466
lgbtq            466
race             466
topic              0
published_at    1195
story_texts        1
dtype: int64

In [6]:
# the text column is used heavily from here on, so discard rows where it is missing
stories_data.dropna(subset=['story_texts'], inplace=True)

# rows were dropped above: renumber the index from 0 and discard the old one
stories_data.reset_index(drop=True, inplace=True)

## Cleaning the Text

1. lemmatizing the text
2. removing stop words
3. creating n-grams
4. vectorizing the words to feed to the algorithm

In [7]:
# strip every character that is neither a word character nor whitespace
# (i.e. punctuation).
# - raw string: '\w' and '\s' are not valid Python string escapes
# - regex=True: newer pandas versions treat the pattern as a literal string by
#   default, which would silently leave the text unchanged
stories_data['cleaned'] = stories_data.story_texts.str.replace(r'[^\w\s]', '', regex=True)

# tokenize and part-of-speech tag each story; the tags are fed to the lemmatizer
stories_data['pos_tagged'] = stories_data.cleaned.map(word_tokenize).map(pos_tag)

In [8]:
lem = WordNetLemmatizer()

# Lemmatize each (word, tag) pair, using the lower-cased first letter of the tag
# as the lemmatizer's part of speech; words whose tag does not start with one of
# a / v / r / n are dropped. The surviving lemmas are joined back into one string.
# NOTE(review): nltk's default English tagger appears to emit Penn Treebank tags,
# where adjectives are JJ*; if so, 'a' never matches and adjectives are dropped
# here -- confirm whether that is intended before changing it, since the topic
# labels assigned later depend on this preprocessing.
stories_data['lemmed'] = stories_data['pos_tagged'].map(
    lambda tagged: ' '.join(
        lem.lemmatize(token, pos[0].lower())
        for token, pos in tagged
        if pos[0].lower() in ['a', 'v', 'r', 'n']
    )
)

## Topic Modeling with different algorithms

In [9]:
# strip punctuation from the stop words so they follow the same pattern as the
# processed text
strip_punctuation = str.maketrans('', '', string.punctuation)
stop_words = [word.translate(strip_punctuation) for word in stopwords.words('english')]

# additional words to remove from the corpus
additional_stop_words = [
    'im', 'idk', 'friend', 'nothing', 'meh',
    'ha', 'hey', 'hi', 'ive', 'vjfjfjfc', 'umm',
]

In [10]:
# TF-IDF vectorizer over unigrams and bigrams, ignoring the combined stop-word list
tfidf = TfidfVectorizer(
    stop_words=stop_words + additional_stop_words,
    lowercase=True,
    ngram_range=(1, 2),
)

# document-term matrix built from the lemmatized stories
tfidf_dtm = tfidf.fit_transform(stories_data['lemmed'])

In [11]:
# note: tfidf_nmf.components_ (available once the model is fit in the next cell) is the H matrix of the factorization: one row per topic, one weight per vocabulary term

In [12]:
# number of topics to extract
n_components = 9

# NMF model, seeded so the factorization is repeatable
tfidf_nmf = NMF(n_components=n_components, random_state=random_state)

# fit on the document-term matrix and keep the transformed (per-story) output
tfidf_nmf_data = tfidf_nmf.fit_transform(tfidf_dtm)

In [13]:
# note to self: it seems like stop words are removed before n-grams are created
# need to check the source code to make sure
# i wonder if the silhouette score can be modified for topic modeling

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this installation provides so the
# cell runs on both old and new versions
if hasattr(tfidf, 'get_feature_names_out'):
    feature_names = list(tfidf.get_feature_names_out())
else:
    feature_names = tfidf.get_feature_names()

display_topics(tfidf_nmf, feature_names)


Topic  0
tell, say, want, go, school, mom, tell tell, come, day, anyone

Topic  1
sex, sex sex, want, want sex, boyfriend, sex want, sex say, sexy, bad, keyon

Topic  2
good, good girl, room good, room, girl good, good act, dick, hello, home good, good guy

Topic  3
get, period, go, start, puberty, get period, mom, go puberty, hair, bathroom

Topic  4
make, feel, felt, people, make feel, really, well, think, look, way

Topic  5
know, know know, want, want know, know feel, know relationship, know say, feel, bye, really know

Topic  6
ask, say, consent, ask consent, yes, fuck, say yes, boyfriend, question, ask say

Topic  7
girl, like, like girl, girl school, gender, boy, dress, question, good girl, act

Topic  8
date, talk, like, love, guy, start, break, relationship, want, still


In [14]:
# m rows, n columns in the NMF output; one 'topic_<k>' column per output column
m, n = tfidf_nmf_data.shape
topic_columns = [f'topic_{num}' for num in range(n)]
tfidf_nmf_df = pd.DataFrame(tfidf_nmf_data, columns=topic_columns)

# position of the highest-scoring topic in each row
tfidf_nmf_df['max_topic'] = tfidf_nmf_data.argmax(axis=1)
# tfidf_nmf_df['max_topic'].value_counts().sort_values()

In [15]:
# put each story's topic scores next to its data by joining on the row index
combined = stories_data.merge(tfidf_nmf_df, left_index=True, right_index=True)

## Figuring out what the topics actually mean by looking at the stories

Sorting the stories by their score on each topic and reading the highest-ranked ones to figure out what each topic means.

In [None]:
# dictionary to map the topic column names to the defined topic names
topics = {}

# n_top was never defined at notebook level (it is only a parameter of
# top_n_docs), so this cell raised NameError on a fresh kernel; define it here
# with the same value as that parameter's default
n_top = 1
combined.sort_values(by='topic_0', ascending=False).head(n_top)['story_texts'].tolist()

In [None]:
# helper so the same lookup is not copied and pasted for every topic
def top_n_docs(df, sorting_column, text_column, n_top=1):
    '''
    Return the text of the highest-scoring documents.

    df: dataframe holding the documents
    sorting_column: name of the column to rank by, largest value first
    text_column: name of the column whose values are returned
    n_top: how many of the top-ranked rows to return (default 1)

    returns a list with the text_column values of the n_top highest-ranked rows
    '''
    ranked = df.sort_values(by=sorting_column, ascending=False)
    top_rows = ranked.head(n_top)
    return top_rows[text_column].tolist()

In [None]:
# label for topic 0: venting and advice
topics['topic_0'] = 'interpersonal_relationships_conversations'

# highest-scoring story for topic 1
top_n_docs(combined, sorting_column='topic_1', text_column='story_texts')

In [None]:
# label for topic 1
topics['topic_1'] = 'the word sex'

# highest-scoring story for topic 2
top_n_docs(combined, sorting_column='topic_2', text_column='story_texts')

In [None]:
# label for topic 2
topics['topic_2'] = 'app_confusion'

# highest-scoring story for topic 3
top_n_docs(combined, sorting_column='topic_3', text_column='story_texts')

In [None]:
# label for topic 3: first period
topics['topic_3'] = 'puberty'

# highest-scoring story for topic 4
top_n_docs(combined, sorting_column='topic_4', text_column='story_texts')

In [None]:
# label for topic 4: social relationships
topics['topic_4'] = 'feeling_judged'

# highest-scoring story for topic 5
top_n_docs(combined, sorting_column='topic_5', text_column='story_texts')

In [None]:
# label for topic 5
topics['topic_5'] = 'i_dont_know'

# highest-scoring story for topic 6
top_n_docs(combined, sorting_column='topic_6', text_column='story_texts')

In [None]:
# label for topic 6
topics['topic_6'] = 'consent_and_sex'

# highest-scoring story for topic 7
top_n_docs(combined, sorting_column='topic_7', text_column='story_texts')

In [None]:
# label for topic 7
topics['topic_7'] = 'being_a_girl'

# highest-scoring story for topic 8
top_n_docs(combined, sorting_column='topic_8', text_column='story_texts')

In [None]:
# label for topic 8
topics['topic_8'] = 'dating_and_relationships'

# replace the generic topic_N column names with the labels collected in `topics`
combined.rename(topics, axis='columns', inplace=True)

## Recommending similar stories based on their topic modeling scores

Now that we have different features to compare the stories along different dimensions, can we generate recommendations based on how similar stories are to each other?

In [None]:
# pairwise cosine similarity between the topic-score rows of all stories
cosine_scores = cosine_similarity(tfidf_nmf_data, tfidf_nmf_data)

# A story must never be recommended to itself. Taking argsort()[:, -2] assumes
# each story is the unique best match for itself, which breaks when another
# story has an identical topic vector (tie for first place) or when a story's
# topic vector is all zeros (it can then score 0 against everything, itself
# included). Instead, exclude the diagonal explicitly and take the best of the
# remaining scores. A copy is masked so cosine_scores itself stays intact.
masked_scores = cosine_scores.copy()
np.fill_diagonal(masked_scores, -np.inf)
combined['most_similar_index'] = masked_scores.argmax(axis=1)
# combined.head()

In [None]:
# self join: put the text of each story's most similar story on the same row to
# make the pair easy to compare; the two text columns come out as
# story_texts_x and story_texts_y
self_joined = combined.merge(
    combined['story_texts'],
    left_on='most_similar_index',
    right_index=True,
)

In [None]:
# a few stories are the closest match for a large number of other stories:
# plot the distribution of how often each story is picked as most similar
times_recommended = combined['most_similar_index'].value_counts()
sns.distplot(times_recommended);

In [None]:
# draw 10 rows, with a fixed seed, from the stories that have a published_at value
has_publish_date = self_joined['published_at'].notna()
random_sample = self_joined[has_publish_date].sample(n=10, random_state=random_state)
# random_sample

In [None]:
# checking a recommended story: first sampled row, story text next to its match
text_pair_columns = ['story_texts_x', 'story_texts_y']
random_sample.iloc[:1][text_pair_columns].values

## Are there patterns with these topics?

In [None]:
# pairwise plots of the topic scores to look for correlation between topics
# NOTE(review): tfidf_nmf_df also carries the max_topic column, so it is passed
# to the plot as well -- confirm that is wanted
sns.pairplot(tfidf_nmf_df)

There seems to be some correlation between topic 7 and topic 2, and between topic 2 and topic 8. It would be interesting to find out why.