In [2]:
import pandas as pd
import requests
import time
import numpy as np

import regex as re
import nltk
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer

In [4]:
# A function to scrape Reddit comments, adapted from code by Tim Book, GA instructor

# Endpoint queried by get_comments for comment search.
url = 'https://api.pushshift.io/reddit/search/comment/'

def get_comments(subreddit, length, before=1587430567):
    """Download successive batches of comments from one subreddit.

    Each request asks for comments created before ``current_time``; after a
    successful batch, ``current_time`` moves back to the smallest
    ``created_utc`` seen, so the next request continues further into the past.

    Parameters
    ----------
    subreddit : str
        Value sent as the ``subreddit`` query parameter.
    length : int
        Maximum number of requests to issue.
    before : int, optional
        Starting value for the ``before`` query parameter (presumably epoch
        seconds, matching ``created_utc`` -- confirm against the API docs).
        Defaults to the value that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Rows from every successful batch with the columns ``created_utc``,
        ``body``, ``subreddit`` and ``author``. An empty frame with those
        columns is returned when no batch succeeded.
    """
    columns = ["created_utc", "body", "subreddit", "author"]
    df_list = []
    current_time = before
    for _ in range(length):
        res = requests.get(
            url,
            params={
                "subreddit": subreddit,
                "size": 1000,
                "lang": True,
                "before": current_time
            }
        )
        try:
            data = res.json()['data']
        except (ValueError, KeyError) as err:
            # Undecodable body or missing 'data' key: skip this attempt and
            # retry the same time window on the next iteration instead of
            # re-using the previous batch (or an undefined name).
            print(f"Skipping bad response for before={current_time}: {err!r}")
            continue
        if not data:
            # Nothing older is available; a column select on an empty frame
            # would raise, so stop paging here.
            break
        batch = pd.DataFrame(data)[columns]
        df_list.append(batch)
        current_time = batch["created_utc"].min()
    if not df_list:
        # pd.concat raises on an empty list; hand back an empty, typed shell.
        return pd.DataFrame(columns=columns)
    return pd.concat(df_list, axis=0)

In [5]:
# A function to convert the comments into one list of tokenized words

def convert_to_tokens(series):
    """Flatten a series of comments into one list of lower-cased word tokens.

    Each comment is lower-cased and tokenized with the pattern ``[a-z]\\w+``
    (a letter followed by at least one more word character). English stop
    words are then removed token by token.

    Parameters
    ----------
    series : object with ``tolist()``
        Collection of comment strings (e.g. a pandas Series).

    Returns
    -------
    list of str
        All remaining tokens, in comment order, concatenated into one list.
    """
    # Build these once: the stop-word corpus lookup is loop-invariant, and a
    # set gives constant-time membership tests.
    tokenizer = RegexpTokenizer(r'[a-z]\w+')
    stop_words = set(stopwords.words('english'))

    big_list = []
    for comment in series.tolist():
        # Filter individual tokens; comparing a whole token list against the
        # stop-word list never matches and removes nothing.
        big_list.extend(
            token
            for token in tokenizer.tokenize(comment.lower())
            if token not in stop_words
        )
    return big_list

In [30]:
# A function to convert the comments into tokenized and lemmatized words

# Expansions for the fragment left behind when a contraction is split at its
# apostrophe (clean_to_lemmas tokenizes on \w+, so "they're" -> "they", "re").
# NOTE: 's' and 'd' are ambiguous in English ("is"/possessive, "would"/"had");
# a single expansion is chosen for each.
appo = {
    're' : 'are', 's' : 'is', 't' : 'not',
    'd' : 'would', 've': 'have', 'll': 'will'  # "'ve" contracts "have", not "had"
}

def clean_to_lemmas(text):
    """Normalize one comment into a space-separated string of lemmas.

    Steps: lower-case the text, tokenize on runs of word characters, expand
    contraction fragments via ``appo``, lemmatize each token with
    ``WordNetLemmatizer`` and join the results with single spaces.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned, lemmatized comment.
    """
    # Newlines are left in place for the tokenizer to treat as separators;
    # deleting them first would fuse the words on either side of a line break.
    tokens = RegexpTokenizer(r'\w+').tokenize(text.lower())
    expanded = [appo.get(token, token) for token in tokens]
    lemmer = WordNetLemmatizer()
    return ' '.join(lemmer.lemmatize(token) for token in expanded)

# Data Mining and Variable Preparation

In [7]:
# Pull 20 batches of comments from r/conservatives ...
df_con = get_comments(subreddit='conservatives', length=20)

# ... and 20 batches from r/socialism.
df_lib = get_comments(subreddit='socialism', length=20)

In [9]:
# Persist each raw scrape to disk (row index omitted from the files).
df_con.to_csv(path_or_buf='./data/df_con.csv', index=False)
df_lib.to_csv(path_or_buf='./data/df_lib.csv', index=False)

In [31]:
# Stack the two subreddit frames (socialism first) under a fresh 0..n-1 index.
df = pd.concat(
    objs=[df_lib, df_con],
    ignore_index=True,
)

In [32]:
# Count missing values per column.

df.isnull().sum()

created_utc    0
body           0
subreddit      0
author         0
dtype: int64

In [33]:
# Encode the target: conservatives -> 0, socialism -> 1.
# Any other subreddit value would become NaN under .map.

df['subreddit'] = df['subreddit'].map(dict(conservatives=0, socialism=1))

In [34]:
# Keep only rows whose author is not '[deleted]' and whose body is not
# '[removed]'; the original index labels are retained.

df = df.loc[df['author'].ne('[deleted]') & df['body'].ne('[removed]')]

In [16]:
# Save the combined, filtered frame (row index omitted from the file).

df.to_csv(path_or_buf='./data/data.csv', index=False)

In [35]:
# Features are the comment text; the target is the encoded subreddit label.

X, y = df['body'], df['subreddit']

In [36]:
# Lemmatizing the comments by mapping the conversion function over the raw
# comment bodies. Reading from df['body'] (rather than X itself) keeps this
# cell idempotent: re-running it never re-processes already-cleaned text.

X = df['body'].map(clean_to_lemmas)

In [37]:
# Spot-check the cleaned text stored under index label 0.

X.loc[0]

'teen vogue is actually laying it out man the kid have said this shit is broken and not working and instead of taking the just get a better job line teen vogue is like you are right here is specifically why you can get into the capitalism sell the rope to hang itself line if you want but we should be treating teen vogue a an ally'

In [40]:
# Assemble the export frame: label column first, lemmatized text second,
# aligned on their shared index.

df_amazon_export = pd.concat(objs=[y, X], axis='columns')

In [41]:
# Write the lemmatized data to disk for modeling on AWS (row index omitted).

df_amazon_export.to_csv(path_or_buf='./df_export.csv', index=False)