# 0.0 imports

In [1]:
#imports
import json
import datetime
import pandas as pd
import json
import math
import string
import io

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk import *

from gensim import corpora, models

%matplotlib inline 

# 1.0 Reading data

In [2]:
# Create empty lists for each column
src_list = []
tgt_list = []
vot_list = []
res_list = []
yea_list = []
dat_list = []
txt_list = []


# Read the text file line by line
with open('wiki-RfA.txt', 'r', encoding = "utf-8") as f:
    for line in f:
        
        # Extract the SRC, TGT, and TXT columns from each line
        if line.startswith('SRC:'):
            src_list.append(line.strip()[4:])
        elif line.startswith('TGT:'):
            tgt_list.append(line.strip()[4:])
        elif line.startswith('VOT:'):
            vot_list.append(line.strip()[4:])
        elif line.startswith('RES:'):
            res_list.append(line.strip()[4:])
        elif line.startswith('YEA:'):
            yea_list.append(line.strip()[4:])
        elif line.startswith('DAT:'):
            dat_list.append(line.strip()[4:])
        elif line.startswith('TXT:'):
            txt_list.append(line.strip()[4:])

# Create a Pandas dataframe with the extracted columns
df = pd.DataFrame({'SRC': src_list, 'TGT': tgt_list, 'VOT':vot_list, 'RES':res_list, 'YEA':yea_list, 'DAT':dat_list,'TXT': txt_list})

# 2.0 PreProcessing

## 2.1 Tokenizing

In [4]:
def tokenize_fn(all_docs):
    exclude = set(string.punctuation)
    tokenized = []
    tokenizer = TweetTokenizer()
    for doc in all_docs:
        tokens = tokenizer.tokenize(doc.lower())
        tokenized.append(''.join([ch for ch in ' '.join(tokens) if ch not in exclude]).split())
    return tokenized

In [5]:
win_df = pd.DataFrame()

win_df['tokenized'] = tokenize_fn(all_docs = df[df['RES'] == '1']['TXT'])
win_df['body'] = df['TXT']

los_df = pd.DataFrame()

los_df['tokenized'] = tokenize_fn(all_docs = df[df['RES'] == '-1']['TXT'])
los_df['body'] = df['TXT']

## 2.2 Stopwords removal

In [9]:
def stopwords_removal(tokens):
    sws = set(stopwords.words('english'))
    sws.add("…") 
    sws_removed = []
    for j,sent in enumerate(tokens):
        sws_removed.append([i for i in sent if i not in sws and len(i) > 2])
    return sws_removed

In [10]:
win_tokenized_stpwrd = stopwords_removal(win_df['tokenized'])
los_tokenized_stpwrd = stopwords_removal(los_df['tokenized'])

# 3.0 Analysis

## 3.1 Topic Modeling

In [11]:
def topic_model(tokens):
    
    dictionary = corpora.Dictionary(tokens)
    dictionary.filter_extremes(no_below=5, no_above=0.3)
    dictionary.compactify()
    
    corpus = [dictionary.doc2bow(text) for text in tokens]
    ldamodel = models.ldamodel.LdaModel(corpus, num_topics=3, id2word=dictionary, passes=20)
    
    model_topics = ldamodel.print_topics(num_topics = 5, num_words = 5)
    topics_arr=[]
    
    for i in range(0, len(corpus)):  
        topics = ldamodel.get_document_topics(corpus[i])
        topics = sorted(topics, key=lambda x: -x[1])
        topics_arr.append(topics[0][0])
        
    return topics_arr, model_topics

In [12]:
topic_tweet_win, win_lda_topics = topic_model(tokens = win_tokenized_stpwrd)
topic_tweet_los, los_lda_topics = topic_model(tokens = los_tokenized_stpwrd)

In [13]:
win_lda_topics

[(0,
  '0.071*"good" + 0.037*"admin" + 0.026*"editor" + 0.022*"great" + 0.018*"work"'),
 (1,
  '0.016*"oppose" + 0.016*"edits" + 0.012*"user" + 0.011*"see" + 0.009*"time"'),
 (2,
  '0.152*"font" + 0.093*"user" + 0.089*"color" + 0.028*"green" + 0.026*"per"')]

In [14]:
los_lda_topics

[(0,
  '0.129*"font" + 0.091*"user" + 0.072*"color" + 0.054*"per" + 0.028*"style"'),
 (1,
  '0.030*"edits" + 0.021*"good" + 0.019*"experience" + 0.016*"edit" + 0.014*"admin"'),
 (2,
  '0.012*"admin" + 0.012*"user" + 0.009*"would" + 0.008*"wikipedia" + 0.008*"good"')]

In [15]:
win_df['topic'] = topic_tweet_win
los_df['topic'] = topic_tweet_los

In [16]:
win_df.head()

Unnamed: 0,tokenized,body,topic
0,"[support, as, conom]",'''Support''' as co-nom.,2
1,"[support, as, nominator]",'''Support''' as nominator.--,2
2,"[support, per, noms]",'''Support''' per noms.,2
3,"[support, per, noms, bdd, is, a, strong, contr...",'''Support''' per noms. BDD is a strong contri...,1
4,"[support, with, great, pleasure, i, work, with...","'''Support''', with great pleasure. I work wit...",0


In [17]:
los_df.head()

Unnamed: 0,tokenized,body,topic
0,"[i, am, supporting, you, for, our, country, ev...",'''Support''' as co-nom.,2
1,"[your, advocacy, is, good, and, i, support, it...",'''Support''' as nominator.--,2
2,"[sorry, but, you, dont, have, enough, document...",'''Support''' per noms.,1
3,"[strong, oppose, sorry, but, the, fact, you, d...",'''Support''' per noms. BDD is a strong contri...,2
4,"[strong, oppose, occasional, editor, with, bar...","'''Support''', with great pleasure. I work wit...",1


## 3.2 Sentiment Analysis for winner topics

In [18]:
analyzer = SentimentIntensityAnalyzer()

def sentiment_analysis(filtered_tweets):
    labels_vader=[]
    for tweet in filtered_tweets:
        vs = analyzer.polarity_scores(tweet)
        if vs['compound'] > 0.5:
            labels_vader.append("Positive")
        elif vs['compound'] > -0.5 and vs['compound'] < 0.5:
            labels_vader.append("Neutral")
        else:
            labels_vader.append("Negative")
    return labels_vader

In [19]:
win_df['sentiment_label'] = sentiment_analysis(win_df['body'])

In [20]:
win_df.head()

Unnamed: 0,tokenized,body,topic,sentiment_label
0,"[support, as, conom]",'''Support''' as co-nom.,2,Neutral
1,"[support, as, nominator]",'''Support''' as nominator.--,2,Neutral
2,"[support, per, noms]",'''Support''' per noms.,2,Neutral
3,"[support, per, noms, bdd, is, a, strong, contr...",'''Support''' per noms. BDD is a strong contri...,1,Neutral
4,"[support, with, great, pleasure, i, work, with...","'''Support''', with great pleasure. I work wit...",0,Positive


In [21]:
win_df.groupby(['sentiment_label']).size()

sentiment_label
Negative     9621
Neutral     67236
Positive    46431
dtype: int64

In [22]:
win_df.groupby(['topic','sentiment_label']).size()

topic  sentiment_label
0      Negative            4621
       Neutral            32559
       Positive           22387
1      Negative            3549
       Neutral            23950
       Positive           16844
2      Negative            1451
       Neutral            10727
       Positive            7200
dtype: int64

## 3.3 Sentiment Analysis for losers topics

In [27]:
los_df['sentiment_label'] = sentiment_analysis(los_df['body'])

In [28]:
los_df.head()

Unnamed: 0,tokenized,body,topic,sentiment_label
0,"[i, am, supporting, you, for, our, country, ev...",'''Support''' as co-nom.,2,Neutral
1,"[your, advocacy, is, good, and, i, support, it...",'''Support''' as nominator.--,2,Neutral
2,"[sorry, but, you, dont, have, enough, document...",'''Support''' per noms.,1,Neutral
3,"[strong, oppose, sorry, but, the, fact, you, d...",'''Support''' per noms. BDD is a strong contri...,2,Neutral
4,"[strong, oppose, occasional, editor, with, bar...","'''Support''', with great pleasure. I work wit...",1,Positive


In [29]:
los_df.groupby(['sentiment_label']).size()

sentiment_label
Negative     5831
Neutral     39917
Positive    29239
dtype: int64

In [30]:
los_df.groupby(['topic','sentiment_label']).size()

topic  sentiment_label
0      Negative            1201
       Neutral             8297
       Positive            5964
1      Negative            1859
       Neutral            12967
       Positive            9457
2      Negative            2771
       Neutral            18653
       Positive           13818
dtype: int64