# Full pipeline Reddit analysis    

In [None]:
import pandas as pd
from collections import Counter
import nltk
import nltk.collocations
from nltk.util import ngrams
#nltk.download('stopwords')
#nltk.download('punkt')
from nltk.corpus import stopwords
import string
import re
import numpy as np
import spacy

# 1 Data gathering


See `pushshift_data_collection.ipynb`.

Result: four CSV files, two for the manosphere corpus and two for the female-strategy corpus.

# 2 Processing

### 2.1 Reading csv 

In [None]:
# The target corpus is read from two CSV dumps. Swap in the commented
# pair below to analyse the other corpus instead.
#data_1 = pd.read_csv('data_1/fds_all.csv')
#data_2 = pd.read_csv('data_1/flus_all.csv')
data_1, data_2 = (
    pd.read_csv(csv_path)
    for csv_path in ('data/mgtow2_all.csv', 'data/mr_all.csv')
)

In [None]:
# Stack both frames into a single target-corpus frame. The original row
# labels are kept (no ignore_index), so labels may repeat.
data = pd.concat([data_1, data_2])

### 2.2 Cleaning and lowercasing   
Remove the placeholders `[removed]` and `[deleted]`. For comments nothing is left; for submissions the title of the post is still there.
Then drop the rows whose text is empty and lowercase the remaining text.



In [None]:
# '[removed]' and '[deleted]' are matched as whole cell values (plain-string
# replace, no regex). Together with empty strings they become NaN so the
# rows can be dropped. The column is reassigned instead of being modified
# through a chained inplace call, because a chained inplace call does not
# write back to `data` when pandas copy-on-write is active.
data['Text'] = data['Text'].replace(['[removed]', '[deleted]', ''], np.nan)
data.dropna(subset=['Text'], inplace=True)
# astype(str) guards against non-string cells (e.g. numbers) before
# lowercasing; NaN rows are already gone, so no 'nan' strings are created.
data['Text'] = data['Text'].astype(str).str.lower()

### 2.3 Joining all text    



In [None]:
# Flatten the cleaned 'Text' column into one long string, with the
# individual texts separated by a single space.
text_list = list(map(str, data['Text'].tolist()))
text_string = ' '.join(text_list)

### 2.4 Punctuation

In [None]:
# Delete every character that is neither a word character (\w) nor
# whitespace. Apostrophes vanish too, so "don't" becomes "dont".
text_string = re.sub(r'[^\w\s]','',text_string)

### 2.5 Tokenization and stopword removal


In [None]:
# NLTK's English stopword list, held as a set for fast membership tests.
stopset = set(stopwords.words('english'))

In [None]:
# Punctuation was stripped before tokenisation, so contractions reach the
# tokenizer without their apostrophe ("don't" -> "dont"). These forms are
# added to the stopword set by hand.
# NOTE(review): 'thall' may have been meant as 'thatll' -- confirm.
extra_stop = {
    'arent', 'couldnt', 'didnt', 'doesnt', 'dont', 'gon', 'hadnt',
    'hasnt', 'havent', 'ill', 'im', 'isnt', 'ive', 'mustnt', 'na',
    'neednt', 'shant', 'shes', 'shouldnt', 'shouldve', 'thall', 'wasnt',
    'werent', 'weve', 'wont', 'wouldnt', 'youll', 'youre', 'youve',
}
stopset |= extra_stop

In [None]:
# Split the punctuation-free corpus string into word tokens.
tokens = nltk.word_tokenize(text_string)

In [None]:
# Keep only the tokens that are not in the (extended) stopword set.
filtered_tokens = [token for token in tokens if token not in stopset]

# 3 Corpus linguistics

### 3.1 Word list  
The word list is only used for the keyword analysis (3.2) and the lexicon analysis (4).

In [None]:
# Frequency of every non-stopword token in the target corpus.
word_freq = Counter(filtered_tokens)

In [None]:
# Two-column table: one row per distinct word with its target-corpus count.
word_freq_df = pd.DataFrame({
    'Word': list(word_freq.keys()),
    'Freq_target': list(word_freq.values()),
})

### 3.2 Keywords   
https://alvinntnu.github.io/NTNU_ENC2036_LECTURES/keyword-analysis.html  

#### 3.2.1 Load reference corpus and preprocess it like the other data

In [None]:
# Reference corpus for the keyword comparison. It is deliberately NOT also
# bound to `data`: that name holds the target-corpus frame and must not be
# replaced by the reference corpus.
df_askreddit = pd.read_csv('data/askreddit_all.csv')

In [None]:
# Stringify and lowercase every reference text in one pass.
# NOTE(review): unlike the target corpus, '[removed]' / '[deleted]' and
# missing values are not dropped here; str() turns a missing value into the
# literal text 'nan'. Confirm this is intended.
df_askreddit['Text'] = [str(entry).lower() for entry in df_askreddit['Text']]

In [None]:
# Notebook display of the reference frame as a quick visual check.
df_askreddit

In [None]:
# Join all reference texts into one space-separated string.
text_list_rf = list(map(str, df_askreddit['Text'].tolist()))
text_string_rf = ' '.join(text_list_rf)

In [None]:
# punctuation: drop every character that is neither a word character nor
# whitespace (same pattern as used for the target corpus)
text_string_rf = re.sub(r'[^\w\s]','',text_string_rf)

## Tokens 1-grams

In [None]:
# tokenization of the reference corpus string into word tokens
tokens_rf = nltk.word_tokenize(text_string_rf)

In [None]:
# Stopword removal, using the same stopword set as for the target corpus.
filtered_tokens_rf = [token for token in tokens_rf if token not in stopset]

In [None]:
# Reference-corpus token count before stopword removal.
len(tokens_rf)

In [None]:
# Reference-corpus token count after stopword removal.
len(filtered_tokens_rf)

#### 3.2.2 Word frequencies of reference corpus

In [None]:
# Count the non-stopword tokens of the reference corpus and tabulate them,
# one row per distinct word.
word_freq_rf = Counter(filtered_tokens_rf)
word_freq_reference = pd.DataFrame({
    'Word': list(word_freq_rf.keys()),
    'Freq_reference': list(word_freq_rf.values()),
})

#### 3.2.3 Contingency table

In [None]:
# Outer join on the shared 'Word' column so that words occurring in only one
# corpus are kept; their missing count in the other corpus becomes 0.
contingency_table = word_freq_df.merge(word_freq_reference, how="outer").fillna(0)

In [None]:
# Add the "rest" columns: the corpus size minus the word's own frequency.
# Both totals are taken from the token lists themselves, so they stay
# correct when the input data changes.
# NOTE(review): the frequencies are counted on stopword-filtered tokens,
# while the totals use the unfiltered token lists -- confirm this is intended.
contingency_table['Rest_target'] = len(tokens) - contingency_table.Freq_target
contingency_table['Rest_reference'] = len(tokens_rf) - contingency_table.Freq_reference

In [None]:
# Notebook display of the contingency table as a quick visual check.
contingency_table

In [None]:
# Short cell names keep the formulas below readable:
#   a = frequency of the word in the target corpus
#   b = frequency of the word in the reference corpus
#   c = all other words in the target corpus
#   d = all other words in the reference corpus
contingency_table = contingency_table.rename(
    columns=dict(zip(
        ("Freq_target", "Freq_reference", "Rest_target", "Rest_reference"),
        "abcd",
    ))
)

#### 3.2.4 Statistics

In [None]:
# Expected frequency of each cell of the 2x2 table:
# (row total * column total) / grand total.
contingency_table = contingency_table.assign(
    a_exp=contingency_table.eval('((a+b)*(a+c))/(a+b+c+d)'),
    b_exp=contingency_table.eval('((a+b)*(b+d))/(a+b+c+d)'),
    c_exp=contingency_table.eval('((c+d)*(a+c))/(a+b+c+d)'),
    d_exp=contingency_table.eval('((c+d)*(b+d))/(a+b+c+d)'),
)

# Chi-squared: sum over the four cells of (observed - expected)^2 / expected.
contingency_table = contingency_table.assign(
    Chi2=contingency_table.eval('((a-a_exp)**2/a_exp)+((b-b_exp)**2/b_exp)+((c-c_exp)**2/c_exp)+((d-d_exp)**2/d_exp)')
)

In [None]:
# Rank the words by keyness, largest chi-squared value first.
contingency_table = contingency_table.sort_values(by='Chi2', ascending=False) # highest chi2 is most key 

In [None]:
# Output path for the keyword table. It has to match the corpus loaded in
# section 2.1: the active input files there (mgtow2 / mr) and the bigram
# output path both belong to the manosphere corpus, so the manosphere path is
# the active one. Swap the two lines when analysing the other corpus.
#filename_keywords = 'output/keywords_female.csv'
filename_keywords = 'output/keywords_mano.csv'

In [None]:
# Write the ranked keyword table (including the row index) to disk.
contingency_table.to_csv(filename_keywords)

## Bigrams

#### Target corpus

In [None]:
# Adjacent token pairs of the target corpus. Built from the unfiltered
# `tokens`, so stopwords are still part of the bigrams.
bigrams = list(nltk.bigrams(tokens)) 

In [None]:
# Total number of bigrams in the target corpus.
len(bigrams)

In [None]:
# Frequency of every distinct bigram in the target corpus.
bigram_freq = Counter(bigrams)

In [None]:
# One row per distinct bigram: the 'Bigram' column holds the (word, word)
# tuple, 'Freq_target' its count in the target corpus.
bigram_freq_target = pd.DataFrame(list(bigram_freq.items()),columns = ['Bigram','Freq_target'])

#### Reference corpus

In [None]:
# Adjacent token pairs of the reference corpus (unfiltered tokens).
bigrams_rf = list(nltk.bigrams(tokens_rf))

In [None]:
# Total number of bigrams in the reference corpus.
len(bigrams_rf)

In [None]:
# Frequency of every distinct bigram in the reference corpus.
bigram_freq_counter = Counter(bigrams_rf)

In [None]:
# One row per distinct bigram: the 'Bigram' column holds the (word, word)
# tuple, 'Freq_reference' its count in the reference corpus.
bigram_freq_rf = pd.DataFrame(list(bigram_freq_counter.items()),columns = ['Bigram','Freq_reference'])

#### Contingency table

In [None]:
# Outer join on the shared 'Bigram' column so that bigrams occurring in only
# one corpus are kept; their missing count in the other corpus becomes 0.
bi_contingency_table = bigram_freq_target.merge(bigram_freq_rf, how="outer").fillna(0)

In [None]:
# Add the "rest" columns: the total number of bigrams in a corpus minus the
# bigram's own frequency. The totals come from the bigram lists themselves
# instead of hard-coded numbers, and the reference column is read from
# 'Freq_reference' (the table has no column named 'Freq').
bi_contingency_table['Rest_target'] = len(bigrams) - bi_contingency_table.Freq_target
bi_contingency_table['Rest_reference'] = len(bigrams_rf) - bi_contingency_table.Freq_reference

In [None]:
# Short cell names keep the formulas below readable:
#   a = frequency of the bigram in the target corpus
#   b = frequency of the bigram in the reference corpus
#   c = all other bigrams in the target corpus
#   d = all other bigrams in the reference corpus
bi_contingency_table = bi_contingency_table.rename(
    columns={
        "Freq_target": "a",
        "Freq_reference": "b",
        "Rest_target": "c",
        "Rest_reference": "d",
    }
)

In [None]:
# Expected frequency of each cell of the 2x2 table:
# (row total * column total) / grand total.
bi_contingency_table = bi_contingency_table.assign(**{
    column: bi_contingency_table.eval(formula)
    for column, formula in (
        ('a_exp', '((a+b)*(a+c))/(a+b+c+d)'),
        ('b_exp', '((a+b)*(b+d))/(a+b+c+d)'),
        ('c_exp', '((c+d)*(a+c))/(a+b+c+d)'),
        ('d_exp', '((c+d)*(b+d))/(a+b+c+d)'),
    )
})

# Chi-squared: sum over the four cells of (observed - expected)^2 / expected.
bi_contingency_table = bi_contingency_table.assign(
    Chi2=bi_contingency_table.eval('((a-a_exp)**2/a_exp)+((b-b_exp)**2/b_exp)+((c-c_exp)**2/c_exp)+((d-d_exp)**2/d_exp)')
)

In [None]:
# Rank the bigrams by keyness, largest chi-squared value first.
bi_contingency_table = bi_contingency_table.sort_values(by='Chi2', ascending=False) # highest chi2 is most key 

In [None]:
# Output path for the key-bigram table; keep it in line with the corpus
# that was loaded in section 2.1 (swap the two lines for the other corpus).
#filename_bigrams = 'output/keybigrams_female.csv'
filename_bigrams = 'output/keybigrams_mano.csv'

In [None]:
# Write the ranked key-bigram table (including the row index) to disk.
bi_contingency_table.to_csv(filename_bigrams)

### 3.3 Gendered words

I made smaller files that contain only the sentences in which a given word occurs (men, women, male, female, or an abusive term).

In [None]:
# Small English spaCy pipeline; the cells below use its dependency parse
# (dep_ / head) and part-of-speech tags.
nlp = spacy.load('en_core_web_sm')
nlp.remove_pipe('ner') # prevents memory error

#### Adjectives for men/women

In [None]:
# Input for the adjective extraction. Exactly one of these four lines should
# be active; it decides which corpus / target word the adjectives() calls
# below operate on.
data_adj = pd.read_csv('data/female_strategy_word_women.csv') 
#data_adj = pd.read_csv('data/female_strategy_word_men.csv')
#data_adj = pd.read_csv('data/manosphere_word_men.csv')
#data_adj = pd.read_csv('data/manosphere_word_women.csv')

In [None]:
# The sentences are read from the column named '0'.
text_list = data_adj['0'].tolist()

In [None]:
# Remove duplicates: a sentence can occur several times when the target word
# occurred more than once in it. dict.fromkeys keeps the first-seen order,
# whereas converting through a set reorders the sentences differently on
# every run (string hashing is randomised) and so makes the row order of the
# output files non-reproducible.
text_list = list(dict.fromkeys(text_list))
text_list_s = set(text_list)  # still defined, in case other cells use it

In [None]:
def adjectives(target_word, filename_adj):
    """Count the adjectives that modify ``target_word`` and save them as CSV.

    Every sentence in the module-level ``text_list`` is parsed with the
    module-level spaCy pipeline ``nlp``. A token is counted when its
    dependency label is ``amod`` and the text of its head equals
    ``target_word`` exactly.

    The table is built and written once, after all sentences have been
    processed. If nothing matches, a CSV containing only the header is
    written.

    Args:
        target_word: Text of the head word to look for, e.g. ``'women'``.
        filename_adj: Path of the CSV file to write (columns ``ADJ`` and
            ``Freq``).
    """
    adj_count = Counter()
    for sentence in text_list:
        doc = nlp(sentence)
        adj_count.update(
            token.text
            for token in doc
            if token.dep_ == 'amod' and token.head.text == target_word
        )
    adj_df = pd.DataFrame(list(adj_count.items()), columns=['ADJ', 'Freq'])
    adj_df.to_csv(filename_adj)

In [None]:
# Adjectives modifying "women"; text_list must hold the matching sentences.
adjectives('women', 'output/female_strategy_adj_women.csv')

In [None]:
# Before running: presumably data_adj / text_list have to be rebuilt from
# the matching input file first (see the commented read_csv options).
adjectives('men', 'output/female_strategy_adj_men.csv') # CHANGE FILE FIRST

In [None]:
# Before running: presumably data_adj / text_list have to be rebuilt from
# the matching input file first (see the commented read_csv options).
adjectives('women', 'output/manosphere_adj_women.csv') # CHANGE FILE FIRST

In [None]:
# Before running: presumably data_adj / text_list have to be rebuilt from
# the matching input file first (see the commented read_csv options).
adjectives('men', 'output/manosphere_adj_men.csv') # CHANGE FILE FIRST

#### Nouns for male/female

In [None]:
# Input for the noun extraction. Exactly one of these four lines should be
# active; it decides which corpus / target word the nouns() calls below
# operate on.
data_noun = pd.read_csv('data/female_strategy_word_male.csv')
#data_noun = pd.read_csv('data/female_strategy_word_female.csv')
#data_noun = pd.read_csv('data/manosphere_word_male.csv')
#data_noun = pd.read_csv('data/manosphere_word_female.csv')

In [None]:
# The sentences come from data_noun, the frame loaded for this subsection
# (not from `data`, which holds a whole corpus). Duplicates are removed with
# dict.fromkeys, which keeps the first-seen order so that the output row
# order is reproducible between runs.
text_list = list(dict.fromkeys(data_noun['0'].tolist()))
text_list_s = set(text_list)  # still defined, in case other cells use it

In [None]:
def nouns(target_word, filename_nouns):
    """Count the nouns that ``target_word`` attaches to and save them as CSV.

    Every sentence in the module-level ``text_list`` is parsed with the
    module-level spaCy pipeline ``nlp``. For each token whose text equals
    ``target_word`` exactly and whose head is tagged ``NOUN``, the text of
    that head is counted.

    The table is built and written once, after all sentences have been
    processed. If nothing matches, a CSV containing only the header is
    written.

    Args:
        target_word: Text of the token to look for, e.g. ``'male'``.
        filename_nouns: Path of the CSV file to write (columns ``NOUN`` and
            ``Freq``).
    """
    nouns_count = Counter()
    for sentence in text_list:
        doc = nlp(sentence)
        nouns_count.update(
            token.head.text
            for token in doc
            if token.text == target_word and token.head.pos_ == 'NOUN'
        )
    nouns_df = pd.DataFrame(list(nouns_count.items()), columns=['NOUN', 'Freq'])
    nouns_df.to_csv(filename_nouns)

In [None]:
# Nouns that "male" attaches to; text_list must hold the matching sentences.
nouns('male', 'output/female_strategy_noun_male.csv') 

In [None]:
# Before running: presumably data_noun / text_list have to be rebuilt from
# the matching input file first (see the commented read_csv options).
nouns('female', 'output/female_strategy_noun_female.csv') # CHANGE FILE FIRST

In [None]:
# Before running: presumably data_noun / text_list have to be rebuilt from
# the matching input file first (see the commented read_csv options).
nouns('male', 'output/manosphere_noun_male.csv') # CHANGE FILE FIRST

In [None]:
# Before running: presumably data_noun / text_list have to be rebuilt from
# the matching input file first (see the commented read_csv options).
nouns('female', 'output/manosphere_noun_female.csv') # CHANGE FILE FIRST

# 4 Lexicon

### 4.1 read lexicon and find abuse in text
Each lexicon is a plain-text (.txt) file   
with one word on each line.

In [None]:
def lexicon(filename_lex, dict_abuse, filename_abuse):
    """Look up lexicon words in the target word list and save the hits.

    Reads a lexicon file with one entry per line, then copies the frequency
    of every word of the module-level ``word_freq`` counter that appears in
    the lexicon into ``dict_abuse``. The contents of ``dict_abuse`` are
    finally written to a CSV file.

    Args:
        filename_lex: Path of the UTF-8 lexicon file (one word per line).
        dict_abuse: Dictionary that receives ``word -> frequency``; it is
            modified in place, and entries it already holds are written to
            the CSV as well.
        filename_abuse: Path of the CSV file to write (columns ``Word`` and
            ``Freq``).
    """
    with open(filename_lex, 'r', encoding='utf-8') as infile:
        # Some lexicon entries carry trailing whitespace, which would stop
        # them from matching, so every line is stripped. A set gives
        # constant-time lookups in the loop below.
        lexicon_words = {line.strip() for line in infile}
    lexicon_words.discard('')  # blank lines are not lexicon entries

    for word, freq in word_freq.items():
        if word in lexicon_words:
            dict_abuse[word] = freq

    df_abuse = pd.DataFrame(list(dict_abuse.items()), columns=['Word', 'Freq'])
    df_abuse.to_csv(filename_abuse)

In [None]:
# Misogyny lexicon hits; lexicon() fills misogyny_dict in place. Swap the
# output path to match the corpus that word_freq was built from.
misogyny_dict = {}
lexicon('../lexicons/abuse_misogyny.txt', misogyny_dict, 'output/manosphere_misogyny.csv')
#lexicon('../lexicons/abuse_misogyny.txt', misogyny_dict, 'output/female_strategy_misogyny.csv')

In [None]:
# General-abuse lexicon hits; lexicon() fills general_dict in place. Swap
# the output path to match the corpus that word_freq was built from.
general_dict = {}
lexicon('../lexicons/abuse_general.txt', general_dict, 'output/manosphere_general.csv')
#lexicon('../lexicons/abuse_general.txt', general_dict, 'output/female_strategy_general.csv')

In [None]:
# Misandry lexicon hits; lexicon() fills misandry_dict in place. Swap the
# output path to match the corpus that word_freq was built from.
misandry_dict = {}
lexicon('../lexicons/abuse_misandry.txt', misandry_dict, 'output/manosphere_misandry.csv')
#lexicon('../lexicons/abuse_misandry.txt', misandry_dict, 'output/female_strategy_misandry.csv')

### 4.2 reduce to short phrases
Only shown for the female-strategy corpus and the misogynistic terms.

In [None]:
# Input: only the sentences that contain a misogynistic term. This is an
# input file under data/, not the lexicon() output written under output/.
data_miso = pd.read_csv('data/female_strategy_misogyny.csv') #not the same file as above, this is subset of the data with only sentences that contain misogynistic terms


In [None]:
# Unique sentences, in first-seen order so that the output row order is
# reproducible between runs.
text_list = list(dict.fromkeys(data_miso['0'].tolist()))
text_list_s = set(text_list)  # still defined, in case other cells use it

# The cells below test `word.text in stripped`, but `stripped` otherwise
# exists only as a local variable inside lexicon(). Load the misogyny
# lexicon here (one word per line, surrounding whitespace removed) so that
# the name is defined at module level.
with open('../lexicons/abuse_misogyny.txt', 'r', encoding='utf-8') as infile:
    stripped = {line.strip() for line in infile} - {''}

#### direct object

In [None]:
# Collect (subject, verb, abusive term) triples: the term is a direct object
# and the subject is an 'nsubj' child of the same head.
dobj_abuse = []
for parsed in map(nlp, text_list):
    for token in parsed:
        if token.dep_ != 'dobj' or token.text not in stripped:
            continue
        governor = token.head
        dobj_abuse.extend(
            (dependent.text, governor.text, token.text)
            for dependent in governor.children
            if dependent.dep_ == 'nsubj'
        )

In [None]:
# Count identical triples and save them: column '0' holds the triple,
# column '1' its frequency.
dobj_abuse_count = Counter(dobj_abuse)
dobj_abuse_df = pd.DataFrame(
    data=list(dobj_abuse_count.items()),
    columns=['0', '1'],
)
dobj_abuse_df.to_csv(path_or_buf='output/female_strategy_miso_dobj.csv')

#### attribute

In [None]:
# Collect (subject, head, abusive term) triples: the term has the dependency
# label 'attr' and the subject is an 'nsubj' child of the same head.
attr_abuse = []
for sentence in text_list:
    for token in nlp(sentence):
        is_hit = token.text in stripped and token.dep_ == 'attr'
        if is_hit:
            subjects = [kid for kid in token.head.children if kid.dep_ == 'nsubj']
            for subject in subjects:
                attr_abuse.append((subject.text, token.head.text, token.text))

In [None]:
# Count identical triples and save them: column '0' holds the triple,
# column '1' its frequency.
attr_abuse_count = Counter(attr_abuse)
attr_abuse_df = pd.DataFrame(
    data=list(attr_abuse_count.items()),
    columns=['0', '1'],
)
attr_abuse_df.to_csv(path_or_buf='output/female_strategy_miso_attr.csv')

#### compound

In [None]:
# Collect (abusive term, head) pairs where the term has the dependency
# label 'compound'.
comp_abuse = [
    (token.text, token.head.text)
    for sentence in text_list
    for token in nlp(sentence)
    if token.text in stripped and token.dep_ == 'compound'
]

In [None]:
# Count identical pairs and save them: column '0' holds the pair,
# column '1' its frequency.
comp_abuse_count = Counter(comp_abuse)
comp_abuse_df = pd.DataFrame(
    data=list(comp_abuse_count.items()),
    columns=['0', '1'],
)
comp_abuse_df.to_csv(path_or_buf='output/female_strategy_miso_compound.csv')

#### attributive adj

In [None]:
# Collect (abusive term, head) pairs where the term has the dependency
# label 'amod' (attributive adjective).
amod_abuse = []
for parsed in (nlp(sentence) for sentence in text_list):
    amod_abuse += [
        (token.text, token.head.text)
        for token in parsed
        if token.dep_ == 'amod' and token.text in stripped
    ]

In [None]:
# Count identical pairs and save them: column '0' holds the pair,
# column '1' its frequency.
amod_abuse_count = Counter(amod_abuse)
amod_abuse_df = pd.DataFrame(
    data=list(amod_abuse_count.items()),
    columns=['0', '1'],
)
amod_abuse_df.to_csv(path_or_buf='output/female_strategy_miso_amod.csv')

#### predicative adj

In [None]:
# Collect (subject, head, abusive term) triples: the term has the dependency
# label 'acomp' (predicative adjective) and the subject is an 'nsubj' child
# of the same head.
acomp_abuse = []
for sentence in text_list:
    for token in nlp(sentence):
        if token.text not in stripped:
            continue
        if token.dep_ != 'acomp':
            continue
        for sibling in token.head.children:
            if sibling.dep_ != 'nsubj':
                continue
            acomp_abuse.append((sibling.text, token.head.text, token.text))

In [None]:
# Count identical triples and save them: column '0' holds the triple,
# column '1' its frequency.
acomp_abuse_count = Counter(acomp_abuse)
acomp_abuse_df = pd.DataFrame(
    data=list(acomp_abuse_count.items()),
    columns=['0', '1'],
)
acomp_abuse_df.to_csv(path_or_buf='output/female_strategy_miso_acomp.csv')