In [117]:
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import subjectivity
from nltk.sentiment import SentimentAnalyzer
from nltk.sentiment.util import *
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk import tokenize

import pandas as pd
from nltk import WordNetLemmatizer
from nltk.stem import PorterStemmer

In [135]:
def read_dataset(subreddit, end_point, run):
    return pd.read_csv("./datasets/"+subreddit+"_"+ end_point + "_"+ run+ ".csv")

def clean_words(s):
    #remove english
    # Import stopwords.
    if type(s) is not str:
        return str(" ")
    
    s = s.lower()
    # Use regular expressions to do a find-and-replace
    words = re.sub("[^a-zA-Z]",           # The pattern to search for
                          " ",            # The pattern to replace it with
                      s)   # The text to search
    list_words = words.split(" ")
    remove_more_words = ["https", "edu", "com", "http", "org", 'www', 'amp']
    return " ".join([w for w in list_words if w not in remove_more_words])

def lemmatize(s):
    list_words = s.split(",")
    lemmatizer = WordNetLemmatizer()
    return ','.join([lemmatizer.lemmatize(word) for word in list_words])

def stemmer(s):
    list_words = s.split(",")
    stemmer = PorterStemmer()
    return ','.join([stemmer.stem(word) for word in list_words])

def negative_tone_percentage(sercol):
    sid = SentimentIntensityAnalyzer()
    total_neg = 0
    total_neu = 0
    total_pos = 0
    total_compound = 0 
    total_count = 0
    total_negative_sentiments = 0
    for sentence in sercol:
    #     print(sentence)
        if type(sentence) is str:
            ss = sid.polarity_scores(sentence)
    #         print(ss)
            total_neg += ss["neg"]
            total_neu += ss["neu"]
            total_pos += ss["pos"]
            total_compound += ss["compound"]
            total_count += 1
            if ss["compound"] < 0:
                total_negative_sentiments += 1

    return (total_negative_sentiments/total_count)* 100

In [155]:
evo_sub_df = read_dataset("evolution", "submission", "3")
crea_sub_df = read_dataset("creation", "submission", "3")
debate_sub_df = read_dataset("DebateEvolution", "submission", "3")

In [156]:
evo_sub_df["full_row"] = evo_sub_df["title"] + evo_sub_df["selftext"]
crea_sub_df["full_row"] = crea_sub_df["title"] + crea_sub_df["selftext"]
debate_sub_df["full_row"] = debate_sub_df["title"] + debate_sub_df["selftext"]

In [157]:
evo_sub_df["full_row"] = evo_sub_df["full_row"].map(clean_words)
crea_sub_df["full_row"] = crea_sub_df["full_row"].map(clean_words)
debate_sub_df["full_row"] = debate_sub_df["full_row"].map(clean_words)

evo_sub_df["full_row_lemma"] = evo_sub_df["full_row"].map(lemmatize)
crea_sub_df["full_row_lemma"] = crea_sub_df["full_row"].map(lemmatize)
debate_sub_df["full_row_lemma"] = debate_sub_df["full_row"].map(lemmatize)

evo_sub_df["full_row_stemmer"] = evo_sub_df["full_row"].map(stemmer)
crea_sub_df["full_row_stemmer"] = crea_sub_df["full_row"].map(stemmer)
debate_sub_df["full_row_stemmer"] = debate_sub_df["full_row"].map(stemmer)

In [158]:
negative_tone_percentage(evo_sub_df["full_row"])

11.600000000000001

In [159]:
negative_tone_percentage(crea_sub_df["full_row"])

10.672853828306264

In [165]:
negative_tone_percentage(crea_sub_df["full_row"])

10.672853828306264

In [164]:
negative_tone_percentage(debate_sub_df["full_row"])

22.6

In [161]:
negative_tone_percentage(crea_sub_df["full_row_lemma"])

10.672853828306264

In [166]:
negative_tone_percentage(debate_sub_df["full_row_lemma"])

22.6

In [162]:
negative_tone_percentage(evo_sub_df["full_row_stemmer"])

11.55

In [163]:
negative_tone_percentage(crea_sub_df["full_row_stemmer"])

10.672853828306264

In [168]:
negative_tone_percentage(debate_sub_df["full_row_stemmer"])

22.650000000000002

# Conclusion : Looks like moderators did a pretty good job. Could not find any big negative scored discussions

Next [Page_3](./Page_3.ipynb)