## Imports

In [1402]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import requests
import time
import re
import httpimport
toolkit = 'https://raw.githubusercontent.com/zach-brown-18/class-toolkit/main/eda/'
with httpimport.remote_repo('nan', toolkit):
    import nan

Importing a GA DSI 1019 course toolkit above, started courtesy of [Zach B](https://github.com/zach-brown-18/class-toolkit).

## Data Scraping

In [1403]:
url = 'https://api.pushshift.io/reddit/search/submission' # Setting the Pushshift API URL

In [1404]:
# Function that takes the subreddit name and the number of posts wanted (in returns of 100 posts).
# Function structure partially developed with help from Will Hanley.

def scan_posts(subreddit, num_posts):
    total = []
    new_epoch = 1606799061 
    for i in range(num_posts):
        params = {'subreddit' : subreddit,
             'size' : 100,
             'before': new_epoch}
        get = requests.get(url, params).json()
        new_epoch = get['data'][-1]['created_utc']
        total.append(pd.DataFrame(get['data'])[['subreddit','title', 'selftext']])
        time.sleep(2)
    return pd.concat(total).reset_index()

In [1405]:
# Setting data for 1200 posts from sudoku subreddit
sud_posts = scan_posts('sudoku', 12)

In [1406]:
# Setting data for 1200 posts from crossword subreddit
cw_posts = scan_posts('crossword', 12)

In [1449]:
cw_posts.to_csv('./Data/cw_posts.csv', index=False)
cw_posts.drop(columns='index', inplace=True)

In [1450]:
sud_posts.to_csv('./Data/sud_posts.csv', index=False)
sud_posts.drop(columns='index', inplace=True)

In [1451]:
cw_posts.head()

Unnamed: 0,subreddit,title,selftext
0,crossword,Milestone: 500!,"Could’ve been more like 800, but I missed a fe..."
1,crossword,NYT Tuesday 12/01/2020 Discussion,"Spoilers are welcome in here, beware!\n\nHow w..."
2,crossword,NYT Tuesday 12/01/2020 Discussion,"Spoilers are welcome in here, beware!\n\nHow w..."
3,crossword,NYT Tuesday 12/01/2020 Discussion,"Spoilers are welcome in here, beware!\n\nHow w..."
4,crossword,NYT Tuesday 12/01/2020 Discussion,"Spoilers are welcome in here, beware!\n\nHow w..."


In [1452]:
sud_posts.head()

Unnamed: 0,subreddit,title,selftext
0,sudoku,Need help and want to learn advanced technique...,
1,sudoku,Can someone help me with this?,
2,sudoku,Need help,
3,sudoku,Kingdom Battle Sudoku (3 PUZZLES INCLUDED),Check all the puzzles in here!\n\n[https://dri...
4,sudoku,This 1-star sudoku puzzle in a Chinese Sudoku ...,


In [1454]:
# Each dataframe has 1200 rows and three columns
print(cw_posts.shape)
print(sud_posts.shape)

(1200, 3)
(1200, 3)


In [1455]:
# Combining Sudoku and Crossword reddit post datasets
posts_both = pd.concat([cw_posts, sud_posts], axis = 0)

In [1456]:
posts_both.shape

(2400, 3)

## Data Cleaning

In [1457]:
# There are four null rows in the selftext column
nulls = nan.investigate_null(posts_both)
nulls

Unnamed: 0,column,null_count
0,selftext,4


In [1458]:
# First, I'm creating two dataframes, one with the crossword/sudoku titles and matching subreddits, and the other
# with the selftexts and matching subreddits. I'm then concatenating those two dataframes so that the title and selftext
# data is combined to simply be 'text'

titles_df = posts_both[['title', 'subreddit']]
titles_df.rename(columns={'title' : 'text'}, inplace=True)

selftext_df = posts_both[['selftext', 'subreddit']]
selftext_df.rename(columns={'selftext' : 'text'}, inplace=True)

titles_and_text = pd.concat([titles_df, selftext_df], axis=0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [1459]:
titles_and_text.head()

Unnamed: 0,text,subreddit
0,Milestone: 500!,crossword
1,NYT Tuesday 12/01/2020 Discussion,crossword
2,NYT Tuesday 12/01/2020 Discussion,crossword
3,NYT Tuesday 12/01/2020 Discussion,crossword
4,NYT Tuesday 12/01/2020 Discussion,crossword


In [1460]:
titles_and_text.tail()

Unnamed: 0,text,subreddit
1195,I love the website sudokuexchange.com that /u/...,sudoku
1196,,sudoku
1197,"Hey everyone! I recently got into sudokus, and...",sudoku
1198,,sudoku
1199,,sudoku


In [1430]:
# Dropping the four rows with NA values in the text column
titles_and_text.dropna(inplace=True)

In [1431]:
titles_and_text.shape

(4796, 2)

In [1432]:
# Binarizing subreddit column
titles_and_text['subreddit'] = titles_and_text['subreddit'].map({'sudoku' : 0, 'crossword' : 1})

In [1434]:
# This function makes all text lower case and makes unwanted rows blank

def drop_noise(column):
    clean = []
    for text in column:
        # Make text lower case
        text = text.lower()
        
        # Remove urls from text
        text = re.sub('https?://\S+\www\.\S+', '', str(text)) 
        
        # Remove ampersand references
        text = re.sub('&amp;\S*', '', text)
        
        # Remove text enclosed by brackets
        text = re.sub('\[.*?\]', '', text)
        
        # Keep text containing letters A-z and digits that represent dollar amounts
        tokenizer = RegexpTokenizer(r'[A-z]+|\$[0-9]*\S*\d')
        text = ' '.join(tokenizer.tokenize(text))
        
        clean.append(str(text))
        
    return clean

In [1435]:
# This function removes all instances of the words 'crossword' and 'sudoku' from
# the column of combined text, with the intent to make a model's subreddit predictions less obvious

def drop_obvious(column):
    no_crossword_or_sudoku = []
    
    for text in column:
        text = re.sub('crossword', '', text)
        text = re.sub('sudoku', '', text)
        no_crossword_or_sudoku.append(str(text))
        
    return no_crossword_or_sudoku

Above function partially taken from [stack overflow](https://stackoverflow.com/questions/11331982/how-to-remove-any-url-within-a-string-in-python) and this [site](https://kanoki.org/2019/11/12/how-to-use-regex-in-pandas/). Code formatting developed with some help from classmate Aziz Maredia.

In [1436]:
# Running drop_noise on the column of subreddit text
titles_and_text['text'] = drop_noise(titles_and_text['text'])

In [1438]:
# Dropping all blank text rows
titles_and_text = titles_and_text[titles_and_text['text'] != '']

# Many rows had nothing but '[', so removing all of those as well
titles_and_text = titles_and_text[titles_and_text['text'] != '[']

In [1439]:
# There were 1268 blank text rows after running drop_noise() on titles_and_text['text'], so now 1241 have been 
# dropped. 
titles_and_text.shape

(3528, 2)

In [1440]:
# Making a separate DataFrame, which is the same as titles_and_text, except all instances of 
# the words 'crossword' and 'sudoku' are omitted from the text

data = {
    'text' : drop_obvious(titles_and_text['text']),
    'subreddit' : titles_and_text['subreddit']
}

titles_and_text_no_cw_or_sudoku = pd.DataFrame(data) 

titles_and_text_no_cw_or_sudoku = titles_and_text_no_cw_or_sudoku[titles_and_text_no_cw_or_sudoku['text'] != '']

# Many rows had nothing but '[', so removing all of those
titles_and_text_no_cw_or_sudoku = titles_and_text_no_cw_or_sudoku[titles_and_text_no_cw_or_sudoku['text'] != '[']

## Lemmatize data 

In [1441]:
# This function iterates through text documents, lemmatizes every word in each document, 
# and returns the entire lemmatized column 

def lemmatize(column):
    lemmatizer = WordNetLemmatizer()
    lemmatized = []
    for text in column:
        lemmatized_text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
        lemmatized.append(lemmatized_text)
    return lemmatized

In [1442]:
# Making a separate column in my DataFrames with lemmatized versions of the text
titles_and_text['lemmatized_text'] = lemmatize(titles_and_text['text'])
titles_and_text_no_cw_or_sudoku['lemmatized_text'] = lemmatize(titles_and_text_no_cw_or_sudoku['text'])

Above function formatting developed with some help from Aziz Maredia. 

In [1443]:
# Resetting index for DataFrames, since there were duplicate index values present
titles_and_text = titles_and_text.reset_index(drop=True)
titles_and_text_no_cw_or_sudoku = titles_and_text_no_cw_or_sudoku.reset_index(drop=True)

In [1444]:
titles_and_text.to_csv('./Data/titles_and_text.csv', index=False)
titles_and_text_no_cw_or_sudoku.to_csv('./Data/titles_and_text_no_cw_or_sud.csv', index=False)