In [None]:
import pandas as pd
import pickle
import os

#preprocessing
import re
import contractions
import unidecode
from gensim.parsing.preprocessing import remove_stopwords, STOPWORDS
import nltk
from nltk import word_tokenize, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk.corpus import wordnet

In [78]:
#load subreddit files into dictionary of dfs
csvs = [x for x in os.listdir('.') if x.endswith('.pkl')]
fns = [os.path.splitext(os.path.basename(x))[0] for x in csvs]

d = {}
for i in range(len(fns)):
    d[fns[i]] = pd.read_pickle(csvs[i])

In [46]:
KotakuInAction = pd.DataFrame(d['KotakuInAction'], columns=['comment_text'])
MensRights = pd.DataFrame(d['MensRights'], columns=['comment_text'])
NoFap = pd.DataFrame(d['NoFap'], columns=['comment_text'])
TrollXChromosomes = pd.DataFrame(d['TrollXChromosomes'], columns=['comment_text'])
TumblrInAction = pd.DataFrame(d['TumblrInAction'], columns=['comment_text'])

In [47]:
pd.set_option('display.max_colwidth', None)
subreddits = [KotakuInAction, MensRights, NoFap, TumblrInAction, TrollXChromosomes]

for subreddit in subreddits:
    print(subreddit.head(5))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     comment_text
0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     

In [48]:
#remove rows with comments that are deleted
for subreddit in subreddits:
    pattern_del = "\[deleted\]"
    filter = subreddit['comment_text'].str.contains(pattern_del)
    subreddit = subreddit[~filter]
    subreddit = subreddit.sample(100000)

In [59]:
#update clean_text function
def clean_text(raw_text):
    
    #convert all characters to lowercase
    text = raw_text.lower()
    
    #remove http url links
    text = re.sub('http.*.com', '',text)
    
    #remove subreddit references
    text = re.sub('r\/.*', '', text)
    
    #remove new line '\n'
    text = re.sub('\n', ' ', text)
        
    #convert accented characters to ASCII characters
    text = unidecode.unidecode(text)
    
    #expand contractions
    text = contractions.fix(text)
    
    # Fix other contractions / possessive
    text = text.replace("'s", '')
    
    #remove special characters and punctuations
    text = re.sub('[^A-Za-z0-9]+', ' ', text)
    
    #remove numbers
    text = re.sub('\d+', '', text)
    
    #remove extra white space
    text = re.sub('\s+', ' ', text)
    
    #remove stop words
    cleaned_text = remove_stopwords(text)
    
    return cleaned_text

In [60]:
lemma = WordNetLemmatizer()
  
# Define function to lemmatize each word with its POS tag
  
def lemmatize_word(tagged_token):
    """ Returns lemmatized word given its tag"""
    root = []
    for token in tagged_token:
        tag = token[1][0]
        word = token[0]
        if tag.startswith('J'):
            root.append(lemma.lemmatize(word, wordnet.ADJ))
        elif tag.startswith('V'):
            root.append(lemma.lemmatize(word, wordnet.VERB))
        elif tag.startswith('N'):
            root.append(lemma.lemmatize(word, wordnet.NOUN))
        elif tag.startswith('R'):
            root.append(lemma.lemmatize(word, wordnet.ADV))
        else:          
            root.append(word)
    return root

def lemmatize_doc(document):
    """ Tags words then returns sentence with lemmatized words"""
    lemmatized_list = []
    tokenized_sent = sent_tokenize(document)
    for sentence in tokenized_sent:
        no_punctuation = re.sub(r"[`'\",.!?()]", " ", sentence)
        tokenized_word = word_tokenize(no_punctuation)
        tagged_token = pos_tag(tokenized_word)
        lemmatized = lemmatize_word(tagged_token)
        lemmatized_list.extend(lemmatized)
    return " ".join(lemmatized_list)

In [63]:
#combine clean_text and lemmatize_doc functions
def preprocess(data):
    data['cleaned_text'] = data['comment_text'].apply(lambda x: clean_text(x))
    data['lemmatized_text'] = data['cleaned_text'].apply(lambda x: lemmatize_doc(x))
    cleaned_data = data
    return(cleaned_data)

for subreddit in subreddits:
    preprocess(subreddit)

# Score subreddits

In [69]:
#load model
lr_model = pickle.load(open('lr_model.sav', 'rb'))

def subreddit_toxicity_percent(df):
    sub_text= df.lemmatized_text.values
    sub_preds = lr_model.predict(sub_text)
    return round(sub_preds.sum()/sub_preds.shape[0],2)*100

In [83]:
data = {'Subreddits': ['KotakuInAction', 'MensRights', 'NoFap', 'TrollXChromosomes', 'TumblrInAction'], 
     'Toxicity Percentage': [subreddit_toxicity_percent(KotakuInAction), subreddit_toxicity_percent(MensRights), subreddit_toxicity_percent(NoFap), subreddit_toxicity_percent(TrollXChromosomes), subreddit_toxicity_percent(TumblrInAction)]}
subreddit_score = pd.DataFrame(data=data)
subreddit_score

Unnamed: 0,Subreddits,Toxicity Percentage
0,KotakuInAction,38.0
1,MensRights,41.0
2,NoFap,31.0
3,TrollXChromosomes,40.0
4,TumblrInAction,41.0
