In [5]:
%load_ext autoreload
%autoreload 2
import pandas as pd
 
import P3_helpers as hlp
from P3_helpers import get_dataframe, get_parsed_comment, get_LDA_model, tokenize_one_comment, get_LDA_model_from_saved_file

import json

## Loading and preprocessing DF

In [2]:
df = get_dataframe()
df.columns

Index(['Source', 'Target', 'Vote', 'Results', 'Year', 'Date', 'Comment'], dtype='object')

In [5]:
df['Parsed_Comment'] = df['Comment'].apply(get_parsed_comment)
df['Tokenized_Comment_not_filtered'] = df['Parsed_Comment'].apply(tokenize_one_comment)
df['BoW'] = hlp.get_bow_column(df['Tokenized_Comment_not_filtered'])


## Computing and saving topic models

In [16]:
isTrainSession = False

if isTrainSession:
    comments_series = df['Parsed_Comment'].copy(deep=True)
    topic_range = range(3, 10, 2)
    for nb_topics in topic_range:
        print(f"nb_topics: {nb_topics}")
        lda = get_LDA_model(comments_series=comments_series, num_topics=nb_topics)
        print(f"Saving model for {nb_topics} topics")
        lda.save(fname=f"lda_model_{nb_topics}_all_true_lda")
        print(f"Topic_{nb_topics} done")

nb_topics: 3
Saving model for 3 topics
Topic_3 done
nb_topics: 5
Saving model for 5 topics
Topic_5 done
nb_topics: 7
Saving model for 7 topics
Topic_7 done
nb_topics: 9
Saving model for 9 topics
Topic_9 done


## Loading pre-computed models

In [6]:
models = []
nb_topics = range(3, 10, 2)

for nb_topic in nb_topics:
    models.append(get_LDA_model_from_saved_file(f"./topic_model_states_dict/lda_model_{nb_topic}_all_true_lda"))

## Infer Topics by Comment

In [14]:
for i, model in enumerate(models):
    df[f'Topics_from_{nb_topics[i]}'] = df['BoW'].apply(model.get_document_topics)


In [15]:
df.to_csv('df_with_topics.csv', index=False)

In [8]:
for i, model in enumerate(models):
    d = model.print_topics(num_words=15)
    d = {topic[0]:topic[1] for topic in d}
    with open(f"topics_{nb_topics[i]}_dict.json", 'w') as f:
        json.dump(d,f)

TypeError: object of type 'int' has no len()

In [18]:
df.Topics_from_3 = df.Topics_from_3.apply(lambda themes_list: [(models[0].show_topic(item[0],topn=15), item[1]) for item in themes_list])

In [20]:
df.Topics_from_5 = df.Topics_from_5.apply(lambda themes_list: [(models[1].show_topic(item[0],topn=15), item[1]) for item in themes_list])

In [1]:
df.Topics_from_7 = df.Topics_from_7.apply(lambda themes_list: [(models[2].show_topic(item[0],topn=15), item[1]) for item in themes_list])

NameError: name 'df' is not defined

In [21]:
df.Topics_from_9 = df.Topics_from_9.apply(lambda themes_list: [(models[3].show_topic(item[0],topn=15), item[1]) for item in themes_list])

In [19]:
df.to_csv('df_with_topics_v2.csv', index=False)

[([('support', 0.26539585),
   ('good', 0.052981544),
   ('admin', 0.0238177),
   ('user', 0.021110583),
   ('editor', 0.018061612),
   ('great', 0.017362958),
   ('strong', 0.015362159),
   ('—', 0.013216087),
   ('excellent', 0.010478184),
   ('contributor', 0.009782508),
   ('seen', 0.008929965),
   ('work', 0.008561838),
   ('like', 0.008408844),
   ('tools', 0.007776866),
   ('looks', 0.0075895917)],
  0.7776697),
 ([('oppose', 0.03404599),
   ('edits', 0.03332886),
   ('wikipedia', 0.019860066),
   ('edit', 0.018645601),
   ('experience', 0.014715313),
   ('good', 0.012932773),
   ('admin', 0.0126834065),
   ('months', 0.012108943),
   ('user', 0.011974941),
   ('work', 0.010490522),
   ('articles', 0.010187494),
   ('time', 0.010005021),
   ('need', 0.008791911),
   ('like', 0.008295264),
   ('article', 0.008207716)],
  0.11116561),
 ([('oppose', 0.025571983),
   ('vote', 0.010133443),
   ('admin', 0.009774776),
   ('user', 0.009733658),
   ('think', 0.008607526),
   ('neutral',

## Draft 2

In [7]:
topic_range = range(3, 10, 2)
nb_words = 15

for nb_topics in topic_range:
    print(f"nb_topics: {nb_topics}, nb_words: {nb_words}")
    current_topics = get_LDA_topics_pipeline(comments_series, num_topics=nb_topics)
    with open(f"nbTopics_{nb_topics}_nbWords_{nb_words}.json", "w") as f:
        json.dump(current_topics, f)

nb_topics: 3, nb_words: 5
nb_topics: 3, nb_words: 10
nb_topics: 3, nb_words: 15
nb_topics: 5, nb_words: 5
nb_topics: 5, nb_words: 10
nb_topics: 5, nb_words: 15
nb_topics: 7, nb_words: 5
nb_topics: 7, nb_words: 10
nb_topics: 7, nb_words: 15
nb_topics: 9, nb_words: 5
nb_topics: 9, nb_words: 10
nb_topics: 9, nb_words: 15


In [2]:
with open("./topic_raw_resuls/nbTopics_2_nbWords_10.json", "r") as file:
    topics = json.load(file)
topics

[[0,
  '0.031*"oppose" + 0.015*"edits" + 0.011*"admin" + 0.010*"user" + 0.010*"wikipedia" + 0.009*"edit" + 0.008*"time" + 0.008*"neutral" + 0.007*"think" + 0.007*"like"'],
 [1,
  '0.223*"support" + 0.049*"good" + 0.022*"admin" + 0.020*"user" + 0.017*"great" + 0.016*"editor" + 0.014*"strong" + 0.014*"—" + 0.009*"contributor" + 0.009*"work"']]

## Draft 1


In [5]:
tokenize_comments = hlp.tokenize_comments(comments_series)
tokenize_comments[:2]

[['support', 'as', 'co-nom', '.'], ['support', 'as', 'nominator.', '--']]

In [6]:
from gensim.parsing.preprocessing import STOPWORDS

In [7]:
STOPWORDS = list(STOPWORDS)
tokenize_comments = [[word for word in comment if word not in STOPWORDS] for comment in tokenize_comments]

In [8]:
import string
PONCTUATION = string.punctuation
tokenize_comments = [[word for word in comment if word not in PONCTUATION] for comment in tokenize_comments]

In [9]:
tokenize_comments[:2]

[['support', 'co-nom'], ['support', 'nominator.', '--']]

In [10]:
d_r = hlp.get_dict_representation(tokenize_comments)

In [11]:
bow = hlp.get_bow_representation(tokenize_comments,d_r)

In [12]:
model = hlp.init_LDA_model(bow, d_r)
topics = hlp.get_LDA_topics(model)

In [27]:
topics_3t = topics
topics_3t[0]

(0,
 '0.221*"support" + 0.086*"--" + 0.051*"good" + 0.024*"admin" + 0.021*"user" + 0.017*"editor" + 0.017*"great" + 0.014*"strong" + 0.010*"work" + 0.010*"contributor"')

In [18]:
model = hlp.init_LDA_model(bow, d_r, num_topics=6)
topics_6t = hlp.get_LDA_topics(model)

In [19]:
for topic in topics_6t:
    print(topic)

(0, '0.045*"n\'t" + 0.039*"\'s" + 0.028*"\'m" + 0.022*"\'ve" + 0.022*"time" + 0.021*"support" + 0.021*"oppose" + 0.018*"neutral" + 0.017*"think" + 0.017*"vote"')
(1, '0.243*"support" + 0.104*"--" + 0.062*"good" + 0.023*"editor" + 0.022*"admin" + 0.021*"great" + 0.020*"user" + 0.018*"strong" + 0.013*"work" + 0.012*"contributor"')
(2, '0.024*"\'s" + 0.019*"oppose" + 0.014*"``" + 0.012*"user" + 0.011*"page" + 0.011*"wikipedia" + 0.007*"articles" + 0.007*"talk" + 0.006*"admin" + 0.005*"users"')
(3, '0.255*"\'\'" + 0.058*"font" + 0.036*"support" + 0.029*"color=" + 0.028*"style=" + 0.026*"small" + 0.019*"vandal" + 0.018*"—" + 0.018*"span" + 0.014*"``"')
(4, '0.069*"support" + 0.040*"tools" + 0.037*"admin" + 0.035*"user" + 0.031*"reason" + 0.027*"n\'t" + 0.024*"use" + 0.023*"--" + 0.019*"abuse" + 0.014*"big"')
(5, '0.051*"oppose" + 0.046*"edits" + 0.026*"edit" + 0.020*"--" + 0.020*"experience" + 0.016*"wikipedia" + 0.014*"n\'t" + 0.012*"months" + 0.011*"user" + 0.011*"admin"')


In [20]:
model = hlp.init_LDA_model(bow, d_r, num_topics=9)
topics_9t = hlp.get_LDA_topics(model)

In [21]:
for topic in topics_9t:
    print(topic)

(0, '0.284*"support" + 0.110*"--" + 0.073*"good" + 0.025*"editor" + 0.023*"great" + 0.021*"user" + 0.020*"strong" + 0.019*"admin" + 0.014*"excellent" + 0.013*"work"')
(1, '0.059*"edits" + 0.050*"oppose" + 0.033*"edit" + 0.021*"user" + 0.021*"months" + 0.020*"--" + 0.020*"talk" + 0.020*"n\'t" + 0.017*"neutral" + 0.014*"page"')
(2, '0.042*"wikipedia" + 0.024*"good" + 0.024*"admin" + 0.023*"work" + 0.021*"articles" + 0.016*"experience" + 0.015*"need" + 0.013*"admins" + 0.013*"policy" + 0.011*"contributions"')
(3, '0.065*"yes" + 0.060*"vandal" + 0.031*"wikipedian" + 0.025*"nominate" + 0.020*"fighter" + 0.017*"sans" + 0.016*"p" + 0.015*"ms" + 0.013*"comic" + 0.012*"worthy"')
(4, '0.296*"\'\'" + 0.067*"font" + 0.047*"support" + 0.034*"color=" + 0.033*"summaries" + 0.032*"style=" + 0.021*"span" + 0.020*"--" + 0.016*"—" + 0.015*"s"')
(5, '0.128*"oppose" + 0.045*"--" + 0.019*"afd" + 0.018*"wp" + 0.017*"namespace" + 0.014*"reasons" + 0.014*"concerns" + 0.013*"deletion" + 0.011*"agree" + 0.011*"d

In [22]:
model = hlp.init_LDA_model(bow, d_r, num_topics=15)
topics_9t = hlp.get_LDA_topics(model)

In [23]:
for topic in topics_9t:
    print(topic)

(0, '0.219*"good" + 0.155*"support" + 0.058*"work" + 0.041*"yes" + 0.037*"editor" + 0.034*"contributions" + 0.029*"nom" + 0.019*"happy" + 0.018*"record" + 0.017*"luck"')
(1, '0.034*"agree" + 0.029*"strongly" + 0.025*"pov" + 0.023*"absolutely" + 0.019*"personal" + 0.018*"extreme" + 0.017*"changed" + 0.017*"comment" + 0.014*"possible" + 0.014*"nominee"')
(2, '0.078*"edit" + 0.063*"summaries" + 0.060*"questions" + 0.056*"answers" + 0.027*"use" + 0.023*"sup" + 0.022*"history" + 0.018*"policy" + 0.011*"fair" + 0.011*"sense"')
(3, '0.056*"vandal" + 0.046*"wikipedia" + 0.043*"new" + 0.027*"speedy" + 0.025*"deletion" + 0.022*"articles" + 0.022*"criteria" + 0.018*"fighter" + 0.017*"image" + 0.016*"deleted"')
(4, '0.113*"—" + 0.108*"support" + 0.079*"\'m" + 0.071*"sure" + 0.040*"\'ll" + 0.026*"–" + 0.017*"lucky" + 0.017*"2005" + 0.014*"impressed" + 0.012*"soon"')
(5, '0.079*"n\'t" + 0.067*"\'s" + 0.032*"support" + 0.022*"admin" + 0.022*"think" + 0.016*"vote" + 0.015*"like" + 0.015*"..." + 0.014*

#### Pipeline Test

In [25]:
topcis_3t_pipeline = get_LDA_topics_pipeline(comments_series, num_topics=3, ponctuation=True, stopwords=True)

In [26]:
for topic in topcis_3t_pipeline:
    print(topic)

(0, '0.159*"\'\'" + 0.085*"support" + 0.045*"--" + 0.036*"font" + 0.025*"``" + 0.024*"—" + 0.018*"color=" + 0.018*"style=" + 0.016*"looks" + 0.016*"small"')
(1, '0.033*"oppose" + 0.024*"n\'t" + 0.020*"\'s" + 0.014*"edits" + 0.009*"user" + 0.009*"wikipedia" + 0.009*"time" + 0.009*"admin" + 0.008*"\'m" + 0.008*"neutral"')
(2, '0.173*"support" + 0.066*"--" + 0.056*"good" + 0.027*"admin" + 0.023*"user" + 0.020*"editor" + 0.015*"great" + 0.013*"work" + 0.012*"strong" + 0.010*"seen"')


In [28]:
topcis_3t_pipeline = get_LDA_topics_pipeline(comments_series, num_topics=3, ponctuation=True, stopwords=True, fine_tune_stopwords=True)

In [29]:
for topic in topcis_3t_pipeline:
    print(topic)

(0, '0.024*"oppose" + 0.016*"edits" + 0.012*"wikipedia" + 0.011*"admin" + 0.011*"user" + 0.009*"edit" + 0.009*"time" + 0.008*"\'m" + 0.008*"think" + 0.007*"like"')
(1, '0.063*"oppose" + 0.022*"—" + 0.020*"questions" + 0.019*"small" + 0.019*"answers" + 0.013*"span" + 0.013*"neutral" + 0.013*"answer" + 0.011*"solid" + 0.011*"s"')
(2, '0.266*"support" + 0.057*"good" + 0.027*"admin" + 0.024*"user" + 0.020*"great" + 0.020*"editor" + 0.016*"strong" + 0.011*"work" + 0.011*"contributor" + 0.010*"excellent"')


In [30]:
topcis_3t_pipeline = get_LDA_topics_pipeline(comments_series, num_topics=3, ponctuation=True, stopwords=True, fine_tune_stopwords=True)

In [31]:
for topic in topcis_3t_pipeline:
    print(topic)

(0, '0.035*"oppose" + 0.024*"edits" + 0.018*"admin" + 0.014*"time" + 0.014*"wikipedia" + 0.014*"edit" + 0.012*"neutral" + 0.012*"good" + 0.011*"experience" + 0.010*"user"')
(1, '0.020*"oppose" + 0.012*"page" + 0.011*"user" + 0.010*"vote" + 0.008*"talk" + 0.006*"..." + 0.006*"articles" + 0.005*"people" + 0.005*"wikipedia" + 0.005*"comments"')
(2, '0.269*"support" + 0.053*"good" + 0.025*"admin" + 0.021*"user" + 0.018*"editor" + 0.018*"great" + 0.016*"strong" + 0.013*"—" + 0.011*"excellent" + 0.010*"work"')


In [32]:
topics_6t_pipeline = get_LDA_topics_pipeline(comments_series, num_topics=6, ponctuation=True, stopwords=True, fine_tune_stopwords=True)

In [33]:
for topic in topics_6t_pipeline:
    print(topic)

(0, '0.074*"oppose" + 0.057*"edits" + 0.042*"edit" + 0.028*"talk" + 0.028*"user" + 0.025*"page" + 0.012*"months" + 0.010*"vandalism" + 0.009*"vandal" + 0.009*"count"')
(1, '0.270*"support" + 0.071*"good" + 0.032*"user" + 0.032*"admin" + 0.025*"editor" + 0.021*"great" + 0.014*"contributor" + 0.013*"excellent" + 0.012*"seen" + 0.012*"like"')
(2, '0.036*"yes" + 0.018*"color" + 0.016*"agree" + 0.015*"red" + 0.013*"green" + 0.012*"sam" + 0.012*"oh" + 0.009*"withdraw" + 0.008*"background" + 0.007*"e"')
(3, '0.032*"wikipedia" + 0.020*"admin" + 0.017*"good" + 0.016*"time" + 0.015*"experience" + 0.014*"work" + 0.014*"oppose" + 0.014*"articles" + 0.013*"need" + 0.012*"like"')
(4, '0.028*"oppose" + 0.015*"vote" + 0.011*"neutral" + 0.011*"admin" + 0.010*"think" + 0.009*"user" + 0.009*"rfa" + 0.008*"adminship" + 0.007*"like" + 0.007*"comments"')
(5, '0.171*"support" + 0.038*"—" + 0.030*"..." + 0.028*"strong" + 0.021*"thought" + 0.021*"course" + 0.016*"answers" + 0.015*"questions" + 0.015*"nominator

In [34]:
print(type(topics_6t_pipeline))
print(type(topics_6t_pipeline[0]))

<class 'list'>
<class 'tuple'>


In [35]:
import json

In [37]:
with open("test.json", "w") as file:
    json.dump(topics_6t_pipeline, file)

In [38]:
with open("test.json", "r") as file:
    topics_6t_read = json.load(file)

In [39]:
topics_6t_read

[[0,
  '0.074*"oppose" + 0.057*"edits" + 0.042*"edit" + 0.028*"talk" + 0.028*"user" + 0.025*"page" + 0.012*"months" + 0.010*"vandalism" + 0.009*"vandal" + 0.009*"count"'],
 [1,
  '0.270*"support" + 0.071*"good" + 0.032*"user" + 0.032*"admin" + 0.025*"editor" + 0.021*"great" + 0.014*"contributor" + 0.013*"excellent" + 0.012*"seen" + 0.012*"like"'],
 [2,
  '0.036*"yes" + 0.018*"color" + 0.016*"agree" + 0.015*"red" + 0.013*"green" + 0.012*"sam" + 0.012*"oh" + 0.009*"withdraw" + 0.008*"background" + 0.007*"e"'],
 [3,
  '0.032*"wikipedia" + 0.020*"admin" + 0.017*"good" + 0.016*"time" + 0.015*"experience" + 0.014*"work" + 0.014*"oppose" + 0.014*"articles" + 0.013*"need" + 0.012*"like"'],
 [4,
  '0.028*"oppose" + 0.015*"vote" + 0.011*"neutral" + 0.011*"admin" + 0.010*"think" + 0.009*"user" + 0.009*"rfa" + 0.008*"adminship" + 0.007*"like" + 0.007*"comments"'],
 [5,
  '0.171*"support" + 0.038*"—" + 0.030*"..." + 0.028*"strong" + 0.021*"thought" + 0.021*"course" + 0.016*"answers" + 0.015*"questi

#### Done with old version

In [45]:
topics_3t = topics
topics_3t[0]

(0,
 '0.144*"." + 0.135*"support" + 0.052*"--" + 0.033*"," + 0.029*"good" + 0.022*"a" + 0.019*"-" + 0.017*"!" + 0.015*"and" + 0.013*"admin"')

#### Done with old version

In [44]:
topics_pipeline_6t = get_LDA_topics_pipeline(comments_series, num_topics=6)

In [46]:
topics_pipeline_6t[0]

(0,
 '0.065*"." + 0.043*"," + 0.040*"and" + 0.035*"he" + 0.033*"a" + 0.028*"the" + 0.026*"i" + 0.025*"support" + 0.023*"to" + 0.020*"his"')

#### Done with old version

In [47]:
topics_pipeline_9t = get_LDA_topics_pipeline(comments_series, num_topics=10)

In [48]:
topics_pipeline_9t[0]

(0,
 '0.079*"." + 0.049*"of" + 0.048*"," + 0.041*"edits" + 0.039*"oppose" + 0.035*"and" + 0.023*"in" + 0.017*"a" + 0.016*"wikipedia" + 0.016*"experience"')

In [51]:
topics_pipeline_3t_ws = get_LDA_topics_pipeline(comments_series, num_topics=3)

Unexpected exception formatting exception. Falling back to standard exception


Traceback (most recent call last):
  File "/home/thetorf/anaconda3/envs/ada/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3526, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/tmp/ipykernel_72223/1189724256.py", line 1, in <module>
    topics_pipeline_3t_ws = get_LDA_topics_pipeline(comments_series, num_topics=3)
  File "/home/thetorf/Documents/epfl/MA1.2/ada/project-wikiRfA/ada-2023-project-abracadabra/P3_helpers.py", line 374, in get_LDA_topics_pipeline
    dictionary.filter_tokens(bad_ids=[dictionary.token2id[word] for word in STOPWORDS])
  File "/home/thetorf/Documents/epfl/MA1.2/ada/project-wikiRfA/ada-2023-project-abracadabra/P3_helpers.py", line 374, in <listcomp>
    dictionary.filter_tokens(bad_ids=[dictionary.token2id[word] for word in STOPWORDS])
KeyError: 'ltd'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/thetorf/anaconda3/envs/ada/lib/python3.9/site-pa

In [53]:
topics_pipeline_3t_ws[0]

NameError: name 'topics_pipeline_3t_ws' is not defined