# Project 3: Web scraping, NLP and classification modelling

# Contents:

1) Fresh Reddit pull  
2) Pre-processing of the new DataFrame  

In [32]:
# library imports
import requests
import time
import pandas as pd
import numpy as np
import ast
import re
from tqdm import tqdm

# preprocessing imports
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import stopwords 


In [2]:
# custom User-agent header that is sent along with every API request
headers_dict = {
    'User-agent': 'hello-reddit-i-am-still-totally-not-a-bot',
}

In [4]:
# instantiate API variables
url = 'https://reddit.com/'
sub1_url = url + 'r/talesfromcallcenters'    # setting sub1
sub2_url = url + 'r/talesfromtechsupport'    # setting sub2

limit_num = 100        # API 'limit' parameter
num_pulls = 4          # maximum number of requests made per subreddit
request_timeout = 30   # seconds to wait before giving up on a hung request
pause_seconds = 2      # pause after every request


def pull_listing_page(sub_url, after=None):
    """Request one listing page of a subreddit from the JSON API.

    Reads the module-level `limit_num`, `headers_dict` and `request_timeout`.

    Parameters
    ----------
    sub_url : str
        Subreddit URL without the '.json' suffix.
    after : str or None
        'after' value saved from the previous page; None asks for the
        first page.

    Returns
    -------
    dict or None
        The 'data' entry of the JSON response, or None when the request
        raised or came back with a status other than 200.
    """
    query = {'limit': limit_num}
    if after:
        query['after'] = after

    try:
        res = requests.get(sub_url + '.json', headers=headers_dict,
                           params=query, timeout=request_timeout)
    except requests.exceptions.RequestException as err:
        print('Connection failed.\n', err)
        return None

    print(res.url, res.status_code)
    if res.status_code != 200:
        print('Connection failed.\n')
        return None

    # parse the body once; the caller takes both the page and its cursor from it
    return res.json()['data']


# per-subreddit progress: cursor, saved pages, and whether the listing ran out
pulls = {
    'sub1': {'url': sub1_url, 'after': None, 'pages': [], 'done': False},
    'sub2': {'url': sub2_url, 'after': None, 'pages': [], 'done': False},
}

for i in range(num_pulls):
    for name, state in pulls.items():
        if state['done']:
            continue

        page = pull_listing_page(state['url'], state['after'])

        if page is not None:
            state['pages'].append(page)
            print(i, name + '_pages length: ', len(state['pages']))

            # set 'after' parameter for next run
            state['after'] = page['after']
            print(i, name + '_after: ', state['after'])

            # A successful page without an 'after' value means there is nothing
            # further to request; asking again without a cursor would fetch the
            # first page a second time and duplicate posts.
            state['done'] = state['after'] is None
        # a failed pull leaves the cursor untouched, so the next pass retries it

        time.sleep(pause_seconds)

    if all(state['done'] for state in pulls.values()):
        break

# expose the results under the names the following cells use
sub1_pages, sub1_after = pulls['sub1']['pages'], pulls['sub1']['after']
sub2_pages, sub2_after = pulls['sub2']['pages'], pulls['sub2']['after']

0 200
sub1_pages length:  1
sub01_after:  t3_jugbhz
0 200
sub2_pages length:  1
sub2_after:  t3_jnz4ft
https://reddit.com/r/talesfromcallcenters.json?limit=100&after=t3_jugbhz
https://reddit.com/r/talesfromtechsupport.json?limit=100&after=t3_jnz4ft
1 200
sub1_pages length:  2
sub01_after:  t3_jh4zah
1 200
sub2_pages length:  2
sub2_after:  t3_j7tcft
https://reddit.com/r/talesfromcallcenters.json?limit=100&after=t3_jh4zah
https://reddit.com/r/talesfromtechsupport.json?limit=100&after=t3_j7tcft
2 200
sub1_pages length:  3
sub01_after:  t3_j6atup
2 200
sub2_pages length:  3
sub2_after:  t3_it4p2o
https://reddit.com/r/talesfromcallcenters.json?limit=100&after=t3_j6atup
https://reddit.com/r/talesfromtechsupport.json?limit=100&after=t3_it4p2o
3 200
sub1_pages length:  4
sub01_after:  t3_iw3exa
3 200
sub2_pages length:  4
sub2_after:  t3_ie9ne4


In [5]:
# turn each list of saved API pages into its own DataFrame
test_sub1_df, test_sub2_df = (
    pd.DataFrame(pages) for pages in (sub1_pages, sub2_pages)
)

In [6]:
# write the raw API pages to disk, one CSV per subreddit
raw_outputs = {
    '../datasets/sub1_scrape_4dec.csv': test_sub1_df,
    '../datasets/sub2_scrape_4dec.csv': test_sub2_df,
}
for csv_path, frame in raw_outputs.items():
    frame.to_csv(csv_path, index=False)

# Checkpoint after scraping

In [10]:
# reload the saved scrape so the cells below can run without hitting the API
sub1_df, sub2_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../datasets/sub1_scrape_4dec.csv',
                     '../datasets/sub2_scrape_4dec.csv')
)

In [11]:
# 'children' is read back from the CSV as text; evaluate it into Python objects
for frame in (sub1_df, sub2_df):
    frame['children'] = frame['children'].map(ast.literal_eval)

In [12]:

# short names for the 'children' columns that hold the post dictionaries
sub1, sub2 = sub1_df['children'], sub2_df['children']

In [13]:
# flatten page -> post and keep each post's title
sub1_titles = [entry['data']['title'] for page in sub1 for entry in page]

sub2_titles = [entry['data']['title'] for page in sub2 for entry in page]

In [14]:
# flatten page -> post and keep each post's body text ('selftext')
sub1_posts = [entry['data']['selftext'] for page in sub1 for entry in page]

sub2_posts = [entry['data']['selftext'] for page in sub2 for entry in page]

In [15]:
# flatten page -> post and keep each post's upvote count ('ups')
sub1_ups = [entry['data']['ups'] for page in sub1 for entry in page]

sub2_ups = [entry['data']['ups'] for page in sub2 for entry in page]

In [16]:
# flatten page -> post and keep each post's 'gilded' value
sub1_gilded = [entry['data']['gilded'] for page in sub1 for entry in page]

sub2_gilded = [entry['data']['gilded'] for page in sub2 for entry in page]

In [17]:
# stack the four sub1 lists as labelled rows (one row per field)
sub1_df = pd.DataFrame(
    data=[sub1_titles, sub1_posts, sub1_ups, sub1_gilded],
    index=['title', 'post', 'upvotes', 'gilded'],
)

In [18]:
# flip rows and columns so each field becomes a column
# (re-running this cell on its own flips the frame back again)
sub1_df = sub1_df.transpose()

In [19]:
# preview the first five rows
sub1_df.head(n=5)

Unnamed: 0,title,post,upvotes,gilded
0,Your Son is Seven and He's Getting WHAT?,So last night I had a very bizarre call from s...,412,0
1,No. You don’t get to speak to a manager.,"So I had this call from a third party, which i...",755,0
2,I am losing faith in humanity!,Why people give their SSN and DOB to random pe...,18,0
3,"Thanks for being Racist, Susan.",I am 50% white but I have an uncommon first na...,8,0
4,Let me vent to you about a dumb call I had today,So I am a team leader. We will call the custom...,5,0


In [20]:
# stack the four sub2 lists as labelled rows (one row per field)
sub2_df = pd.DataFrame(
    data=[sub2_titles, sub2_posts, sub2_ups, sub2_gilded],
    index=['title', 'post', 'upvotes', 'gilded'],
)

In [21]:
# flip rows and columns so each field becomes a column
# (re-running this cell on its own flips the frame back again)
sub2_df = sub2_df.transpose()

In [22]:
# binary target 'belongs_to_sub2': 0 for sub1 rows, 1 for sub2 rows
for label, frame in enumerate((sub1_df, sub2_df)):
    frame['belongs_to_sub2'] = label

In [23]:
# save the flattened, labelled frame of each subreddit
labelled_outputs = {
    '../datasets/sub1_df_4_dec.csv': sub1_df,
    '../datasets/sub2_df_4_dec.csv': sub2_df,
}
for csv_path, frame in labelled_outputs.items():
    frame.to_csv(csv_path, index=False)

In [24]:
#combine the two subs
# ignore_index gives the stacked rows one fresh 0..n-1 index; without it the
# combined frame carries each input's own index labels, so a label can occur twice
df = pd.concat([sub1_df, sub2_df], ignore_index=True)

In [25]:
# Replace missing post bodies with a single space so the title/post
# concatenation further down never produces NaN.
# Assign the result back to the column: an in-place fillna on `df.post` acts on
# a selection of the frame and is not guaranteed to reach `df` itself.
df['post'] = df['post'].fillna(' ')

In [26]:
# join title and post body into one text field, separated by a space
title_with_gap = df['title'] + ' '
df['title_x_post'] = title_with_gap + df['post']

In [27]:
# number of rows per class
df['belongs_to_sub2'].value_counts()

1    401
0    400
Name: belongs_to_sub2, dtype: int64

In [28]:
# share of rows per class, to check how balanced the target is
df['belongs_to_sub2'].value_counts(normalize=True)

1    0.500624
0    0.499376
Name: belongs_to_sub2, dtype: float64

In [29]:
# write the combined, labelled frame to disk (index column left out)
df.to_csv(path_or_buf='../datasets/combined_test_4dec_df.csv', index=False)

In [33]:
def preprocessing(raw_text):
    """Clean one piece of text into a space-separated string of lemmatized words.

    The text is lowercased, every character outside a-z is replaced by a
    space, the result is split on whitespace, English stop words are dropped
    and the remaining words are lemmatized.

    Parameters
    ----------
    raw_text : str
        Text to clean.

    Returns
    -------
    str
        The cleaned words joined by single spaces; an empty string when no
        word survives.
    """
    # 1. Convert to lowercase
    lower_text = raw_text.lower()

    # 2. Replace everything that is not a lowercase letter (punctuation,
    #    digits, non-ASCII characters) with a space
    letters_only = re.sub("[^a-z]",     # The pattern to search for
                          " ",          # The pattern to replace it with
                          lower_text)   # The text to search

    # 3. Split into words
    words = letters_only.split()

    # 4. Remove stop words
    stops = set(stopwords.words('english'))
    meaningful_words = [w for w in words if w not in stops]

    # 5. Lemmatize the surviving words and return THOSE; previously the
    #    lemmatized list was built but never used, so the output was not
    #    lemmatized at all.
    #    Stop words are filtered first so they are compared in the form they
    #    have in the text.
    #    NOTE(review): lemmatizing first may rewrite some stop words (e.g.
    #    'was') into forms missing from the list - confirm before reordering.
    lemmatizer = WordNetLemmatizer()
    words_lem = [lemmatizer.lemmatize(w) for w in meaningful_words]

    # 6. Join the words back into one string separated by space,
    #    and return the result.
    return ' '.join(words_lem)

In [34]:
# run the cleaning function over every combined title/post string
df['cleaned_text'] = df['title_x_post'].apply(preprocessing)

In [36]:
# preview the first five rows, including the new cleaned_text column
df.head(n=5)

Unnamed: 0,title,post,upvotes,gilded,belongs_to_sub2,title_x_post,cleaned_text
0,Your Son is Seven and He's Getting WHAT?,So last night I had a very bizarre call from s...,412,0,0,Your Son is Seven and He's Getting WHAT? So la...,son seven getting last night bizarre call some...
1,No. You don’t get to speak to a manager.,"So I had this call from a third party, which i...",755,0,0,No. You don’t get to speak to a manager. So I ...,get speak manager call third party nothing new...
2,I am losing faith in humanity!,Why people give their SSN and DOB to random pe...,18,0,0,I am losing faith in humanity! Why people give...,losing faith humanity people give ssn dob rand...
3,"Thanks for being Racist, Susan.",I am 50% white but I have an uncommon first na...,8,0,0,"Thanks for being Racist, Susan. I am 50% white...",thanks racist susan white uncommon first name ...
4,Let me vent to you about a dumb call I had today,So I am a team leader. We will call the custom...,5,0,0,Let me vent to you about a dumb call I had tod...,let vent dumb call today team leader call cust...


In [37]:
# write the cleaned frame to disk (index column left out)
df.to_csv(path_or_buf='../datasets/fresh_test_data_cleaned_for_modelling_4dec_df.csv', index=False)