# I. Data Collection from Subreddits

### Import Relevant Libraries

In [2]:
#this code was used from the breakfast hour using pushshift api
import pandas as pd
import requests # Pushshift accesses Reddit via a url so this is needed
import json # JSON manipulation
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords

# nltk.download('stopwords')
# nltk.download('wordnet')

##### Show all rows and columns when displaying DataFrames

In [3]:
# Lift pandas' display limits so rendered frames are not truncated.
for _opt in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_opt, None)

### Function Using Pushshift API for Data Collection

In [4]:
#this code was used from Katie Sylvia's breakfast hour using pushshift api
def get_pushshift_data(start, end, subreddit, size=100, timeout=30):
    """Query the Pushshift submission-search endpoint for one subreddit.

    Parameters
    ----------
    start, end
        Sent unchanged as the ``after`` and ``before`` query parameters.
    subreddit : str
        Sent as the ``subreddit`` query parameter.
    size : int, default 100
        Sent as the ``size`` query parameter (requested number of results).
    timeout : float, default 30
        Seconds to wait for the server before giving up; without a timeout
        ``requests`` may block indefinitely.

    Returns
    -------
    pandas.DataFrame
        One row per item in the ``data`` list of the JSON response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    KeyError
        If the JSON response has no ``data`` key.
    """
    url = 'https://api.pushshift.io/reddit/search/submission/'

    # Query string; 'filter' asks for the listed fields, while
    # 'num_comments' and 'score' are server-side conditions.
    params = {'subreddit': subreddit,
              'size': size,
              'after': start,
              'before': end,
              'filter': ['subreddit', 'title'],
              'num_comments': '>0',
              'score': '>10',
              }

    res = requests.get(url, params=params, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to parse an error page.
    res.raise_for_status()

    # The submissions live under the top-level 'data' key of the JSON body.
    posts = res.json()['data']
    return pd.DataFrame(posts)


##### I pulled up to 100 post titles per year, for five years (2017–2021), from r/metalgearsolid and r/Skyrim

In [5]:
# One request per calendar window, newest first (2021 stops at 10 October).
_mgs_windows = [
    ('2021-01-01', '2021-10-10'),
    ('2020-01-01', '2020-12-31'),
    ('2019-01-01', '2019-12-31'),
    ('2018-01-01', '2018-12-31'),
    ('2017-01-01', '2017-12-31'),
]
mgs_2021, mgs_2020, mgs_2019, mgs_2018, mgs_2017 = [
    get_pushshift_data(window_start, window_end, 'metalgearsolid')
    for window_start, window_end in _mgs_windows
]

In [6]:
# One request per calendar window, newest first (2021 stops at 10 October).
_skyrim_windows = [
    ('2021-01-01', '2021-10-10'),
    ('2020-01-01', '2020-12-31'),
    ('2019-01-01', '2019-12-31'),
    ('2018-01-01', '2018-12-31'),
    ('2017-01-01', '2017-12-31'),
]
skyrim_2021, skyrim_2020, skyrim_2019, skyrim_2018, skyrim_2017 = [
    get_pushshift_data(window_start, window_end, 'Skyrim')
    for window_start, window_end in _skyrim_windows
]

##### I created a DataFrame for each subreddit

In [7]:
# Stack the yearly pulls, oldest year first, into one frame per subreddit.
_mgs_by_year = [mgs_2017, mgs_2018, mgs_2019, mgs_2020, mgs_2021]
_skyrim_by_year = [skyrim_2017, skyrim_2018, skyrim_2019, skyrim_2020, skyrim_2021]

mgs_df = pd.concat(_mgs_by_year)
skyrim_df = pd.concat(_skyrim_by_year)

##### I concatenated the two DataFrames into one large DataFrame

In [8]:
# Skyrim rows first, then Metal Gear Solid rows; each part keeps its own index labels.
df = pd.concat(objs=[skyrim_df, mgs_df], axis=0)
df

Unnamed: 0,subreddit,title
0,skyrim,Hired thugs make bad choices lol
1,skyrim,TIL you can find a madwoman out in the wildern...
2,skyrim,I Have All Skyrim Books in EPUB Format
3,skyrim,Foggy night
4,skyrim,First trophy was not a month after release. To...
5,skyrim,Tried mead for the first time tonight...
6,skyrim,Saw this while eating out for breakfast
7,skyrim,An apology from r/tf2
8,skyrim,&lt;Boyfriend looking at picture of The Mask&g...
9,skyrim,It's a social experiment


##### I created a column which will be the target feature

In [9]:
# Binary target: 1 for r/metalgearsolid rows, 0 for everything else.
df['is_mgs'] = (df['subreddit'] == 'metalgearsolid').astype('int64')

##### I used a function to clean the titles so they are easier for NLP models to work with

In [10]:
#from breakfast hour with Katie Sylvia
def cleaner_rev(review):
    """Lower-case, tokenize, drop English stop words, and lemmatize a text.

    Parameters
    ----------
    review : str
        Raw text to clean (in this notebook, a post title).

    Returns
    -------
    str
        The remaining tokens, lemmatized and joined by single spaces.
    """
    # A token is a run of word characters/apostrophes, or a '$' followed by
    # digits and dots. Raw string: same pattern, but without the invalid
    # escape sequences that newer Python versions warn about.
    my_tokenizer = RegexpTokenizer(r"[\w']+|\$[\d\.]+")
    lemmatizer = WordNetLemmatizer()

    words = my_tokenizer.tokenize(review.lower())

    # A set gives constant-time membership checks for every token.
    stop_words = set(stopwords.words('english'))
    no_stops = [word for word in words if word not in stop_words]

    rev_lem = [lemmatizer.lemmatize(word) for word in no_stops]
    return " ".join(rev_lem)

##### I added the new data to a column and reset the index so it all matches on one DataFrame

In [11]:
# Run every raw title through the cleaner and keep the result beside the original.
df['clean_title'] = df['title'].apply(cleaner_rev)

In [12]:
# reset_index returns a new frame; assign it back so the repeated index
# labels left over from concatenation are actually replaced.
df = df.reset_index(drop=True)
df

Unnamed: 0,subreddit,title,is_mgs,clean_title
0,skyrim,Hired thugs make bad choices lol,0,hired thug make bad choice lol
1,skyrim,TIL you can find a madwoman out in the wildern...,0,til find madwoman wilderness sheogorath's ques...
2,skyrim,I Have All Skyrim Books in EPUB Format,0,skyrim book epub format
3,skyrim,Foggy night,0,foggy night
4,skyrim,First trophy was not a month after release. To...,0,first trophy month release today completed ody...
5,skyrim,Tried mead for the first time tonight...,0,tried mead first time tonight
6,skyrim,Saw this while eating out for breakfast,0,saw eating breakfast
7,skyrim,An apology from r/tf2,0,apology r tf2
8,skyrim,&lt;Boyfriend looking at picture of The Mask&g...,0,lt boyfriend looking picture mask gt quest get...
9,skyrim,It's a social experiment,0,social experiment


In [13]:
# Plain assignment: final_df is a second name for the same DataFrame object as df (no copy is made).
final_df=df

In [15]:
# Show the (rows, columns) size of the frame.
final_df.shape

(1000, 4)

In [14]:
# Write the frame as CSV without the row index; note the target path has no file extension.
final_df.to_csv(
    path_or_buf='./data/final_df',
    index=False,
)