# Data Processing for AskReddit dataset

In [2]:
#Mount drive containing dataset
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
#Import datasets
import shutil

shutil.copy('/content/drive/My Drive/AskReddit_comments.csv', '/content/')
shutil.copy('/content/drive/My Drive/AskReddit_posts.csv', '/content/')
shutil.copy('/content/drive/My Drive/Colab Notebooks/lexicon.json', '/content/')

'/content/lexicon.json'

In [4]:
import pandas as pd
pd.options.display.max_columns = None
import numpy as np
import json
import re
import nltk
nltk.download('stopwords')
import math
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
import ast
import collections
import time
import datetime as dt
from matplotlib import pyplot as plt
import matplotlib.dates as mdates

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
#Function for data pre-processing

def process_and_display_data(file):


  data = pd.read_csv(file)

  #Drop useless columns
  data = data.drop(['author_flair_background_color',
                    'author_flair_css_class',
                    'author_flair_text',
                    'brand_safe',
                    'contest_mode',
                    'gilded',
                    'is_reddit_media_domain',
                    'is_video',
                    'link_flair_richtext',
                    'link_flair_text_color',
                    'link_flair_type',
                    'locked',
                    'media_embed',
                    'no_follow',
                    'num_crossposts',
                    'parent_whitelist_status',
                    'secure_media_embed',
                    'send_replies',
                    'spoiler',
                    'stickied',
                    'subreddit',
                    'subreddit_id',
                    'subreddit_type',
                    'suggested_sort'
                    ], axis = 1)

  #Convert utc to datetime
  for i in range(len(data['author_created_utc'])):
    if data['author_created_utc'][i] != data['author_created_utc'][i]:
      continue
    data['author_created_utc'][i] = dt.datetime.fromtimestamp(int(float(data['author_created_utc'][i])))

  for i in range(len(data['author_created_utc'])):
    if data['created_utc'][i] != data['created_utc'][i]:
      continue
    data['created_utc'][i] = dt.datetime.fromtimestamp(int(float(data['created_utc'][i]) ))

  #Sort rows by date of post created
  data.sort_values(by="created_utc", inplace = True)

  vocab_size = []
  avg_words = 0
  data['num_words'] = 0

  #Process comments
  for i in range(len(data['comments'])):
      print("Process",i)
      clean = re.sub("[^a-zA-Z]+"," ",str(data['comments'][i]))
      clean = clean.lower().split()
      data['num_words'][i] = len(clean)
      avg_words += len(clean)
      vocab_size = vocab_size + clean
      data['comments'][i] = clean

  #Drop existing index
  data = data.reset_index(drop=True)

  #Add year and week columns
  data['week'] = data['created_utc'].map(lambda x: x.isocalendar()[1])
  data['year'] = data['created_utc'].map(lambda x: x.isocalendar()[0])


  #Add required columns
  data['is_misogynist'] = 0
  data['is_cat_1'] = 0
  data['is_cat_2'] = 0
  data['is_cat_3'] = 0
  data['is_cat_4'] = 0
  data['is_cat_5'] = 0
  data['is_cat_6'] = 0
  data['is_cat_7'] = 0
  data['is_cat_8'] = 0
  data['is_cat_9'] = 0
  data['num_cat_1'] = 0
  data['num_cat_2'] = 0
  data['num_cat_3'] = 0
  data['num_cat_4'] = 0
  data['num_cat_5'] = 0
  data['num_cat_6'] = 0
  data['num_cat_7'] = 0
  data['num_cat_8'] = 0
  data['num_cat_9'] = 0
  data['total_mis_words'] = 0

  #Display data
  
  display(data.head())

  print("Num posts : {}".format(len(data['comments'])))
  print("Mindate : {}".format(min(data['created_utc'])))
  print("Maxdate : {}".format(max(data['created_utc'])))
  print("Avg num comments : {}".format(data['num_comments'].mean()))
  print("Max num comments : {}".format(data['num_comments'].max()))
  print("Min num comments : {}".format(data['num_comments'].min()))
  print("Vocab size : {}".format(len(set(vocab_size))))
  print("Avg num words per comment: {}".format(avg_words/len(data['comments'])))
  print("Avg score : {}".format(data['score'].mean()))
  print("Max score : {}".format(data['score'].max()))
  print("Min score : {}".format(data['score'].min()))

  return data

In [0]:
#Function to load lexicon
def load_lexicon():
  with open('lexicon.json', 'r') as fp:
    lexicon = json.load(fp)
  return lexicon

In [0]:
#Function to fill table with number of mis. words per category for each post

def fill_table(data, lexicon):
  mis_words_used = {1:[],2:[],3:[],4:[],
                    5:[],6:[],7:[], 8:[],9:[]}

  for i in range(len(data['comments'])):

    for category in range(1,10):

      list_mis_words = lexicon[str(category)]
      common_words = [x for x in list_mis_words if x in data['comments'][i]]

      if len(common_words) != 0:
        data['is_misogynist'][i] = 1
        column_name = "is_cat_{}".format(category)
        data[column_name][i] = 1
        column_name = "num_cat_{}".format(category)
        data[column_name][i] += len(common_words)
        data['total_mis_words'][i] += len(common_words)
        
        mis_words_used[category] = mis_words_used[category] + common_words

    # data['total_mis_words'][i] =  data.apply(lambda row: row.num_cat_1 + row.num_cat_2 + row.num_cat_3 +
                                                    # row.num_cat_4 + row.num_cat_5 + row.num_cat_6 +
                                                    # row.num_cat_7 + row.num_cat_8 + row.num_cat_9 , axis = 1)
  

  return data, mis_words_used

In [0]:
#Function to plot date vs freq of mis. words per category graph

def plot_graph(data):

  fig, ax = plt.subplots(figsize = (12,8))
  data.plot(x='created_utc', y='num_cat_1', ax=ax, title = 'AskReddit')
  data.plot(x='created_utc', y='num_cat_2', ax=ax)
  data.plot(x='created_utc', y='num_cat_3', ax=ax)
  data.plot(x='created_utc', y='num_cat_4', ax=ax)
  data.plot(x='created_utc', y='num_cat_5', ax=ax)
  data.plot(x='created_utc', y='num_cat_6', ax=ax)
  data.plot(x='created_utc', y='num_cat_7', ax=ax)
  data.plot(x='created_utc', y='num_cat_8', ax=ax)
  data.plot(x='created_utc', y='num_cat_9', ax=ax )

  ax.legend(['Belitting', 'Flipping the narrative', 'Homophobia', 'Hostility', 
            'Patriarchy', 'Physical Violence', 'Racism', 'Sexual Violence', 'Stoicism' ])
  ax.set_xlabel('Year-Week')
  ax.set_ylabel('Frequency of misogynist posts')

  # Format the x-ticks
  myFmt = mdates.DateFormatter('%Y - %U')
  ax.xaxis.set_major_formatter(myFmt)

In [0]:
#Function to display most commonly used words as a table

def display_most_common_words(data, mis_words_used):

  dfs = []

  for i in range(9):
    counter = collections.Counter(mis_words_used[i+1])
    x = counter.most_common()[:5]
    df = pd.DataFrame(data = x, columns = ['Word','Freq'])
    dfs.append(df)

  cols = ['Belitting','FlippingNarr','Homophobia', 'Hostility', 
                                       'Patriarchy', 'P. Violence', 'Racism', 'S. Violence', 'Stoicism']

  result = {}

  for df, name in zip(dfs, cols):
      d = df.to_dict()
      for key, value in d.items():
          result[(name, key)] = value

  dff = pd.DataFrame(result)
  display(dff)
    


In [0]:
#Helper function

def perc(values, total):
  for i in range(len(values)):
    values[i] = "{:.0f}%".format(values[i]*100/total)
  return values


In [0]:
#Function to find perc. of mis words table

def add_to_percentage_table(data, name, cols):
  total_posts = len(data['comments'])
  total_words = np.sum([len(x) for x in list(data['comments'])])
  total_mis_posts = np.sum(data['is_misogynist'])
  total_mis_posts_by_cat = [np.sum(data['is_cat_1']), np.sum(data['is_cat_2']), np.sum(data['is_cat_3']),
                            np.sum(data['is_cat_4']), np.sum(data['is_cat_5']), np.sum(data['is_cat_6']),
                            np.sum(data['is_cat_7']), np.sum(data['is_cat_8']), np.sum(data['is_cat_9'])]

  total_mis_words_by_cat = [np.sum(data['num_cat_1']), np.sum(data['num_cat_2']), np.sum(data['num_cat_3']),
                            np.sum(data['num_cat_4']), np.sum(data['num_cat_5']), np.sum(data['num_cat_6']),
                            np.sum(data['num_cat_7']), np.sum(data['num_cat_8']), np.sum(data['num_cat_9'])]
  total_mis_posts_by_cat = perc(total_mis_posts_by_cat, total_posts)

  print("Table required. Percentage of mis words used in comments")
  print(np.log(np.asarray(total_mis_words_by_cat)*100/total_words))
  df = pd.DataFrame([[name, total_posts, total_mis_posts] + total_mis_posts_by_cat], columns = cols )

  return df

In [0]:
#Function to display perc table

def display_perc_table(table):
  fig = plt.figure(figsize = (20, 1))
  ax = fig.add_subplot(111)

  ax.table(cellText = table.values,
            rowLabels = table.index,
            colLabels = table.columns,
            loc = "center"
          )
  ax.set_title("Percentage of misogynist posts according to category")
  ax.axis("off")

In [0]:
#Function to find barplot of mis. words for over 18 or under 18

def plot_above18_graph(data):
  list1 = list(map(str, set(data['over_18'])))
  list2 = list(data.groupby('over_18')['total_mis_words'].mean())
  plt.bar(list1, list2)
  # plt.title('Average number of misogynous words used per age group')
  plt.xlabel('Over 18')
  plt.ylabel('Number of misogynous words')
  plt.show()

In [0]:
#Function of mis. words vs score plot
def plot_score_negativity(data):
  data2 = data.sort_values(by = 'score')
  list1 = np.asarray(data2['score'])
  list2 = np.asarray(data2['total_mis_words'])/np.asarray(data2['num_words'])
  plt.plot(list1, list2)
  plt.xlabel('Score')
  plt.ylabel('Ratio of misogynous words used in comments')
  plt.show()

In [0]:
#Add csv files here
csv_files = ['AskReddit_posts.csv']

community_names = ['AskReddit']

cols = ['Subreddit', 'Total Posts', 'Mis. Posts','Belitting','FlippingNarr','Homophobia', 'Hostility', 
                                       'Patriarchy', 'P. Violence', 'Racism', 'S. Violence', 'Stoicism']

percentage_table = pd.DataFrame(columns =cols) 

#Fill the table with freq of mis. words
for i in range(len(csv_files)):

  data = process_and_display_data(csv_files[i])
  lexicon = load_lexicon()
  data, mis_words_used = fill_table(data, lexicon)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Commentmap 2840/4670


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Saved!
Commentmap 2841/4670


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Commentmap 2842/4670
Commentmap 2843/4670
Commentmap 2844/4670
Commentmap 2845/4670
Commentmap 2846/4670
Commentmap 2847/4670
Commentmap 2848/4670
Commentmap 2849/4670
Commentmap 2850/4670
Saved!
Commentmap 2851/4670
Commentmap 2852/4670
Commentmap 2853/4670
Commentmap 2854/4670
Commentmap 2855/4670
Commentmap 2856/4670
Commentmap 2857/4670
Commentmap 2858/4670
Commentmap 2859/4670
Commentmap 2860/4670
Saved!
Commentmap 2861/4670
Commentmap 2862/4670
Commentmap 2863/4670
Commentmap 2864/4670
Commentmap 2865/4670
Commentmap 2866/4670
Commentmap 2867/4670
Commentmap 2868/4670
Commentmap 2869/4670
Commentmap 2870/4670
Saved!
Commentmap 2871/4670
Commentmap 2872/4670
Commentmap 2873/4670
Commentmap 2874/4670
Commentmap 2875/4670
Commentmap 2876/4670
Commentmap 2877/4670
Commentmap 2878/4670
Commentmap 2879/4670
Commentmap 2880/4670
Saved!
Commentmap 2881/4670
Commentmap 2882/4670
Commentmap 2883/4670
Commentmap 2884/4670
Commentmap 2885/4670
Commentmap 2886/4670
Commentmap 2887/4670
Commen

KeyboardInterrupt: ignored

In [0]:
#Uncomment to get log of mis. words used
# new_row = add_to_percentage_table(data, community_names, cols)

#Uncomment to get percentage of mis. words table
# percentage_table = pd.DataFrame(columns =cols) 
# percentage_table = percentage_table.append(new_row, ignore_index= True)
# display_perc_table(percentage_table)

#Uncomment to display table for most commonly used words
# display_most_common_words(data, mis_words_used)


#Uncomment to get date-time vs word frequency plot
# plot_graph(data)

#Uncomment to get over_18 plot
# plot_above18_graph(data)

#uncomment to get score vs negativity plot
# plot_score_negativity(data)

# Comment Processing to find the users making the most hate comments

In [88]:
file = '/content/drive/My Drive/AskReddit_comments.csv'
data = pd.read_csv(file, lineterminator= '\n')

with open('lexicon.json', 'r') as fp:
    lexicon = json.load(fp)

all_words = []
for category in range(1,10):
  list_mis_words = lexicon[str(category)]
  all_words += list_mis_words

vocab_size = []
avg_words = 0
data['Hurtful'] = 0
data['Freq hurtful words'] = 0

#Process comments
for i in range(len(data['comment_body'])):
    print("Process {}/{}".format(i, len(data['comment_body'])))
    clean = re.sub("[^a-zA-Z]+"," ",str(data['comment_body'][i]))
    clean = clean.lower().split()
    data['comment_body'][i] = clean

    common_words = [x for x in list_mis_words if x in data['comment_body'][i]]

    if len(common_words) != 0:
      data['Hurtful'][i] = 1
      data['Freq hurtful words'][i] += len(common_words)


Process 0/354434
Process 1/354434
Process 2/354434
Process 3/354434
Process 4/354434
Process 5/354434
Process 6/354434
Process 7/354434
Process 8/354434
Process 9/354434
Process 10/354434
Process 11/354434
Process 12/354434
Process 13/354434
Process 14/354434
Process 15/354434
Process 16/354434
Process 17/354434
Process 18/354434
Process 19/354434
Process 20/354434
Process 21/354434
Process 22/354434
Process 23/354434
Process 24/354434
Process 25/354434
Process 26/354434
Process 27/354434
Process 28/354434
Process 29/354434
Process 30/354434
Process 31/354434
Process 32/354434
Process 33/354434
Process 34/354434
Process 35/354434
Process 36/354434
Process 37/354434
Process 38/354434
Process 39/354434
Process 40/354434
Process 41/354434
Process 42/354434
Process 43/354434
Process 44/354434
Process 45/354434
Process 46/354434


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Process 47/354434
Process 48/354434
Process 49/354434
Process 50/354434
Process 51/354434
Process 52/354434
Process 53/354434
Process 54/354434
Process 55/354434
Process 56/354434
Process 57/354434
Process 58/354434
Process 59/354434
Process 60/354434
Process 61/354434
Process 62/354434
Process 63/354434
Process 64/354434
Process 65/354434
Process 66/354434
Process 67/354434
Process 68/354434
Process 69/354434
Process 70/354434
Process 71/354434
Process 72/354434
Process 73/354434
Process 74/354434
Process 75/354434
Process 76/354434
Process 77/354434
Process 78/354434
Process 79/354434
Process 80/354434
Process 81/354434
Process 82/354434
Process 83/354434
Process 84/354434
Process 85/354434
Process 86/354434
Process 87/354434
Process 88/354434
Process 89/354434
Process 90/354434
Process 91/354434
Process 92/354434
Process 93/354434
Process 94/354434
Process 95/354434
Process 96/354434
Process 97/354434
Process 98/354434
Process 99/354434
Process 100/354434
Process 101/354434
Process 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Process 349435/354434
Process 349436/354434
Process 349437/354434
Process 349438/354434
Process 349439/354434
Process 349440/354434
Process 349441/354434
Process 349442/354434
Process 349443/354434
Process 349444/354434
Process 349445/354434
Process 349446/354434
Process 349447/354434
Process 349448/354434
Process 349449/354434
Process 349450/354434
Process 349451/354434
Process 349452/354434
Process 349453/354434
Process 349454/354434
Process 349455/354434
Process 349456/354434
Process 349457/354434
Process 349458/354434
Process 349459/354434
Process 349460/354434
Process 349461/354434
Process 349462/354434
Process 349463/354434
Process 349464/354434
Process 349465/354434
Process 349466/354434
Process 349467/354434
Process 349468/354434
Process 349469/354434
Process 349470/354434
Process 349471/354434
Process 349472/354434
Process 349473/354434
Process 349474/354434
Process 349475/354434
Process 349476/354434
Process 349

In [0]:
data.to_csv('AskReddit_comments_processed.csv')

In [90]:
#Display authors using the most hurtful words 

data = pd.read_csv('AskReddit_comments_processed.csv')
data.drop(['url', 'comment_map', 'comment_body', 'Unnamed: 0'], axis = 1, inplace= True)
data = data.groupby(['comment_author']).sum()
data.sort_values(by =['Freq hurtful words', 'Hurtful'], ascending = False, inplace = True)
display(data)
data.to_csv('AskReddit_list_comment_authors.csv')

Unnamed: 0_level_0,Hurtful,Freq hurtful words
comment_author,Unnamed: 1_level_1,Unnamed: 2_level_1
nefariousmango,7,7
smacksaw,7,7
suninabox,5,5
throwaway-o,4,4
Akseba,3,3
...,...,...
zzephyr,0,0
zzork_,0,0
zzyy,0,0
zzzeumph,0,0
