In [1]:
import pandas as pd
import gensim
from gensim import corpora

In [2]:
# Load the preprocessed dialogue table; the first CSV column is used as the
# row index rather than as a data column.
df = pd.read_csv('preprocessed_df.csv', index_col=0)
# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,season,episode,scene,line,character,line_preprocessed
0,1,1,1,All right Jim. Your quarterlies look very good...,Michael,jim quarterlies library
1,1,1,1,"Oh, I told you. I couldn't close it. So...",Jim,tell close
2,1,1,1,So you've come to the master for guidance? Is ...,Michael,master guidance grasshopper
3,1,1,1,"Actually, you called me in here, but yeah.",Jim,call
5,1,1,2,"[on the phone] Yes, I'd like to speak to your ...",Michael,speak office manager michael scott regional ma...
...,...,...,...,...,...,...
59904,9,23,112,It all seems so very arbitrary. I applied for ...,Creed,arbitrary apply job company hiring desk empty ...
59905,9,23,113,I just feel lucky that I got a chance to share...,Meredith,feel lucky chance share crummy story dump pape...
59906,9,23,114,I'm happy that this was all filmed so I can re...,Phyllis,happy filmed remember paper company write
59907,9,23,115,I sold paper at this company for 12 years. My ...,Jim,sell paper company job speak client phone quan...


In [3]:
def combine_season_episode(df):
    """Collapse line-level dialogue into one row per scene.

    A ``season_episode`` key of the form ``sNNeMM`` (both numbers zero-padded
    to two digits) is built from the ``season`` and ``episode`` columns, and
    the rows are then grouped by ``(season_episode, scene)``.

    Args:
        df: DataFrame with integer ``season`` and ``episode`` columns plus
            ``scene``, ``character`` and string-valued ``line_preprocessed``
            columns. The frame is NOT modified.

    Returns:
        A new DataFrame with columns ``season_episode``, ``scene``,
        ``character`` (the set of speakers in the scene) and
        ``line_preprocessed`` (the scene's lines joined by single spaces, in
        their original row order).
    """
    season_tag = df['season'].apply(lambda number: f's{number:02d}')
    episode_tag = df['episode'].apply(lambda number: f'e{number:02d}')

    # assign() returns a new frame, so the caller's DataFrame does not
    # silently gain a 'season_episode' column as a side effect of this call.
    keyed = df.assign(season_episode=season_tag + episode_tag)

    aggregation_functions = {
        'character': lambda speakers: set(speakers),
        'line_preprocessed': ' '.join,
    }

    grouped_data = (
        keyed.groupby(['season_episode', 'scene'])
        .agg(aggregation_functions)
        .reset_index()
    )

    return grouped_data

In [4]:
# Collapse the line-level frame into one row per (season_episode, scene).
combined_df = combine_season_episode(df)
# Bare last expression so the notebook renders the scene-level frame.
combined_df

Unnamed: 0,season_episode,scene,character,line_preprocessed
0,s01e01,1,"{Jim, Michael}",jim quarterlies library tell close master guid...
1,s01e01,2,{Michael},speak office manager michael scott regional ma...
2,s01e01,3,"{Pam, Michael}",dunder mifflin regional manager entire floor k...
3,s01e01,4,{Michael},people boss god hilarious pretty sum find spen...
4,s01e01,5,{Dwight},play rum pump pum gift rum pump pum
...,...,...,...,...
8694,s09e23,112,{Creed},arbitrary apply job company hiring desk empty ...
8695,s09e23,113,{Meredith},feel lucky chance share crummy story dump pape...
8696,s09e23,114,{Phyllis},happy filmed remember paper company write
8697,s09e23,115,{Jim},sell paper company job speak client phone quan...


## LDA

In [5]:
# Whitespace-split every scene's text into its list of tokens.
tokenized_data = [
    scene_text.split()
    for scene_text in combined_df['line_preprocessed']
]

In [6]:
# Build the token <-> id vocabulary from the tokenized scenes.
dictionary = corpora.Dictionary(tokenized_data)
# Vocabulary size before pruning.
print(len(dictionary))
# Prune the vocabulary IN PLACE: per the gensim documentation, no_below=2
# drops tokens found in fewer than 2 documents and no_above=0.5 drops tokens
# found in more than 50% of documents.
dictionary.filter_extremes(no_below=2, no_above=0.5)
# Vocabulary size after pruning.
print(len(dictionary))

15047
7658


In [13]:
from gensim.models import Phrases

num_topics = 10

# The dictionary is already pruned with filter_extremes(no_below=2,
# no_above=0.5) in the cell that builds it, so the same in-place call is not
# repeated here; this cell only reads the shared vocabulary.
corpus = [dictionary.doc2bow(doc) for doc in tokenized_data]

# Symmetric priors: alpha scales with the number of topics, eta is fixed.
# random_state pins the run so the topics are reproducible.
# NOTE(review): passes=1000 means 1000 sweeps over the corpus and makes this
# cell slow to re-run — confirm that many passes are needed.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    passes=1000,
    alpha=(0.01 * num_topics),
    eta=0.01,
    num_topics=num_topics,
    random_state=42,
)

In [14]:
# Request as many topics as the model actually holds instead of a hard-coded
# count, so the listing stays complete if the number of topics is changed.
topics = lda_model.print_topics(num_topics=lda_model.num_topics, num_words=10)
# Each entry is a (topic_id, "weight*word + ...") pair; print one per line.
for topic in topics:
    print(topic)

(0, '0.031*"dwight" + 0.019*"jim" + 0.016*"guy" + 0.016*"question" + 0.010*"desk" + 0.009*"farm" + 0.008*"people" + 0.008*"ryan" + 0.008*"schrute" + 0.007*"office"')
(1, '0.036*"party" + 0.019*"fun" + 0.019*"eat" + 0.015*"christmas" + 0.014*"dog" + 0.013*"day" + 0.012*"god" + 0.012*"food" + 0.011*"phyllis" + 0.010*"night"')
(2, '0.029*"talk" + 0.027*"day" + 0.025*"andy" + 0.024*"guy" + 0.024*"call" + 0.020*"time" + 0.018*"date" + 0.014*"people" + 0.014*"week" + 0.011*"idea"')
(3, '0.030*"paper" + 0.023*"company" + 0.023*"dwight" + 0.021*"manager" + 0.020*"dunder" + 0.020*"sale" + 0.020*"mifflin" + 0.019*"job" + 0.014*"michael" + 0.013*"sell"')
(4, '0.128*"michael" + 0.019*"woman" + 0.019*"jan" + 0.017*"boss" + 0.017*"happy" + 0.016*"scott" + 0.013*"time" + 0.012*"talk" + 0.009*"people" + 0.008*"birthday"')
(5, '0.028*"stop" + 0.018*"guy" + 0.013*"hate" + 0.011*"gay" + 0.009*"lot" + 0.008*"move" + 0.008*"car" + 0.008*"life" + 0.008*"call" + 0.008*"dwight"')
(6, '0.033*"baby" + 0.024*"dw

In [15]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

# Build the pyLDAvis visualisation data from the trained model, the
# bag-of-words corpus and the dictionary.
prepared_data = gensimvis.prepare(lda_model, corpus, dictionary)
# Render the interactive visualisation inline in the notebook.
pyLDAvis.display(prepared_data)

In [16]:
# Write the visualisation to a standalone HTML file in the working directory.
# NOTE(review): the file name suggests alpha 0.01 with 10 topics, whereas the
# model cell passes alpha=(0.01 * num_topics) — confirm the name matches the
# intended run.
pyLDAvis.save_html(prepared_data, 'lda_alfa_001_10.html')