In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re

from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer

# Lift pandas' display limits so neither columns nor rows are truncated
# when a frame is rendered.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [2]:
# Read the raw CSV exports. Each source is split across two files
# (suffixes _00 and _01); "comm" files hold comments, "subm" files hold submissions.
df_geo_comm_00, df_geo_comm_01 = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/raw/df_geo_comm_00.csv', '../data/raw/df_geo_comm_01.csv')
)

df_geo_subm_00, df_geo_subm_01 = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/raw/df_geo_subm_00.csv', '../data/raw/df_geo_subm_01.csv')
)

df_wor_comm_00, df_wor_comm_01 = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/raw/df_wor_comm_00.csv', '../data/raw/df_wor_comm_01.csv')
)

df_wor_subm_00, df_wor_subm_01 = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/raw/df_wor_subm_00.csv', '../data/raw/df_wor_subm_01.csv')
)

In [3]:
# Stack the _00 and _01 parts of every export row-wise and renumber the
# rows so each combined frame has a fresh 0..n-1 index.
df_geo_comm = pd.concat((df_geo_comm_00, df_geo_comm_01), ignore_index=True)
df_geo_subm = pd.concat((df_geo_subm_00, df_geo_subm_01), ignore_index=True)
df_wor_comm = pd.concat((df_wor_comm_00, df_wor_comm_01), ignore_index=True)
df_wor_subm = pd.concat((df_wor_subm_00, df_wor_subm_01), ignore_index=True)

In [4]:
# Add the target column in place: 1 marks rows from r/worldnews ("wor"
# frames), 0 marks rows from the other source ("geo" frames).
# Submissions are labelled first, then comments.
for frame, label in (
    (df_geo_subm, 0),
    (df_wor_subm, 1),
    (df_geo_comm, 0),
    (df_wor_comm, 1),
):
    frame['is_news'] = label

In [5]:
# Build one frame of submissions and one of comments across both sources,
# keeping only the text column and the label, with a fresh 0..n-1 index.
subm_columns = ['title', 'is_news']
comm_columns = ['body', 'is_news']

df_all_subm = pd.concat(
    [df_geo_subm[subm_columns], df_wor_subm[subm_columns]],
    ignore_index=True,
)
df_all_comm = pd.concat(
    [df_geo_comm[comm_columns], df_wor_comm[comm_columns]],
    ignore_index=True,
)

In [6]:
# Discard comments whose body is exactly '[removed]' (the post was taken
# down from the subreddit for a rules violation), then renumber the rows.
is_removed = df_all_comm['body'].eq('[removed]')
removed_labels = df_all_comm.index[is_removed]

df_all_comm.drop(index=removed_labels, inplace=True)
df_all_comm.reset_index(drop=True, inplace=True)

In [7]:
# Normalise the text columns with vectorised string operations:
#   * comment bodies: turn line breaks into spaces, then replace every digit with ' @ '
#   * submission titles: replace every digit with ' @ '
# Whole-column assignment replaces the former row-by-row .loc loop, so it
# does not depend on the index being 0..n-1 and raises no copy warnings.
# The digit pattern is a raw string; '\d' in a plain string literal is an
# invalid escape sequence that newer Python versions warn about.
# NOTE: missing values pass through the .str methods as NaN instead of raising.
df_all_comm['body'] = (
    df_all_comm['body']
    .str.replace('\n', ' ', regex=False)
    .str.replace(r'\d', ' @ ', regex=True)
)

df_all_subm['title'] = df_all_subm['title'].str.replace(r'\d', ' @ ', regex=True)

In [8]:
# Drops rows whose share of non-ASCII characters reaches a caller-chosen threshold.
#     Intended to remove non-English postings and bot-generated advertisements
#     that make heavy use of emojis.

def non_ascii_dropper(dataframe, column, threshold_ratio):
    '''
    Replace non-ASCII characters in dataframe[column] with '_' and drop
    rows that are mostly non-ASCII. The dataframe is modified in place.

    For every row the ratio (non-ASCII characters) / (total characters) is
    computed on the original text. Rows whose ratio is greater than or equal
    to threshold_ratio are dropped; in the remaining rows every non-ASCII
    character is replaced with a single underscore.

    Only genuine non-ASCII characters are counted, so underscores that were
    already present in the text do not push a row towards being dropped.
    An empty string has a ratio of 0.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to clean. Any index is accepted; it is NOT reset after dropping.
    column : str
        Name of the text column. Every value is assumed to be a str.
    threshold_ratio : float
        Drop cutoff, as a fraction of the string length (e.g. 0.06 for 6%).

    Returns
    -------
    None
    '''
    cleaned_texts = []
    labels_to_drop = []

    for label, text in dataframe[column].items():
        non_ascii_count = sum(1 for char in text if not char.isascii())
        # Guard the empty string so it cannot cause a division by zero.
        ratio = non_ascii_count / len(text) if text else 0.0
        if ratio >= threshold_ratio:
            labels_to_drop.append(label)
        # One pass per string instead of one full-string replace per character.
        cleaned_texts.append(
            ''.join(char if char.isascii() else '_' for char in text)
        )

    # Positional assignment: cleaned_texts is in the same row order as the column.
    dataframe[column] = cleaned_texts
    dataframe.drop(labels_to_drop, axis = 0, inplace = True)

In [9]:
# Run the non-ASCII filter over titles first, then comment bodies,
# using a 0.06 cutoff for both.
for frame, text_column in ((df_all_subm, 'title'), (df_all_comm, 'body')):
    non_ascii_dropper(frame, text_column, 0.06)

In [10]:
# Strip the '_' placeholders that non_ascii_dropper put in place of
# non-ASCII characters, so the model never trains on them.
# Every underscore is removed, including any that were in the original text.

def _strip_underscores(text):
    '''Return text with all underscore characters removed.'''
    return text.replace('_', '')

df_all_subm['title'] = df_all_subm['title'].map(_strip_underscores)
df_all_comm['body'] = df_all_comm['body'].map(_strip_underscores)

In [11]:
# Renumber both frames 0..n-1 in place, discarding the old index
# (rows were dropped above, leaving gaps).
for frame in (df_all_subm, df_all_comm):
    frame.reset_index(drop=True, inplace=True)

In [12]:
# Add a 'compound_si' column holding the VADER compound sentiment score
# of each title / comment body.
#
# The texts are iterated directly rather than fetched with df[col][i]:
# a label-based lookup combined with positional list assignment only lines
# up when the index is exactly 0..n-1 in order, and is slower per row.

sid = SentimentIntensityAnalyzer()

df_all_subm['compound_si'] = [
    sid.polarity_scores(title)['compound'] for title in df_all_subm['title']
]
df_all_comm['compound_si'] = [
    sid.polarity_scores(body)['compound'] for body in df_all_comm['body']
]

In [13]:
# Write the cleaned submissions and comments to CSV, without the index column.
for frame, csv_path in (
    (df_all_subm, '../data/df_all_subm.csv'),
    (df_all_comm, '../data/df_all_comm.csv'),
):
    frame.to_csv(csv_path, index=False)

In [15]:
# Combine titles and comment bodies under one shared 'text' column
# (submissions first, then comments) and export the result as a single CSV.
subm_as_text = df_all_subm.rename(columns={'title': 'text'})
comm_as_text = df_all_comm.rename(columns={'body': 'text'})

df_all = pd.concat([subm_as_text, comm_as_text], ignore_index=True)
df_all.to_csv('../data/df_all_text.csv', index=False)