# Project 3: Web APIs & Classification
### Part 1: Data Collection

_Authors: Evonne_

## Import Libraries

In [1]:
import requests
import time
import pandas as pd

## Data Extraction - r/democrats

#### Initial Check

In [2]:
democrats_url = ('https://www.reddit.com/r/democrats/.json')
headers = {'User-agent': 'Evonne'}

In [3]:
democrats_res = requests.get(democrats_url, headers=headers)
democrats_res.status_code

200

The 200 code indicates that we have successfully accessed the API.

In [4]:
democrats_json = democrats_res.json()
sorted(democrats_json.keys())

['data', 'kind']

In [5]:
sorted(democrats_json['data'].keys())

['after', 'before', 'children', 'dist', 'modhash']

In [6]:
democrats_json['data']['children'][0]['data']

{'approved_at_utc': None,
 'subreddit': 'democrats',
 'selftext': '',
 'author_fullname': 't2_tkz7y',
 'saved': False,
 'mod_reason_title': None,
 'gilded': 0,
 'clicked': False,
 'title': 'Hey All! I made a documentary about Joe Biden. I hope my fellow Dems love this as much as I loved making it.',
 'link_flair_richtext': [],
 'subreddit_name_prefixed': 'r/democrats',
 'hidden': False,
 'pwls': 6,
 'link_flair_css_class': None,
 'downs': 0,
 'thumbnail_height': 105,
 'top_awarded_type': None,
 'hide_score': False,
 'name': 't3_hdge6q',
 'quarantine': False,
 'link_flair_text_color': 'dark',
 'upvote_ratio': 0.72,
 'author_flair_background_color': None,
 'subreddit_type': 'public',
 'ups': 25,
 'total_awards_received': 1,
 'media_embed': {'content': '&lt;iframe width="600" height="338" src="https://www.youtube.com/embed/5JRgue60YBo?feature=oembed&amp;enablejsapi=1" frameborder="0" allow="accelerometer; autoplay; encrypted-media; gyroscope; picture-in-picture" allowfullscreen&gt;&lt;/if

These are the different data features available.
Interested features to be extracted to be analyzed:
- id: comment unique id
- title: title of the post
- subreddit: subreddit that the comment was in. this will be the target variable.
- score: how many upvotes the comment has
- created_utc: When was post created (in unix timestamp format 1592572142)

#### Set up the code to scrape 1,000 posts from r/democrats at a time
Reddit only gives 25 posts per request, with a cap 1,000 posts for each subreddit. Hence a loop is set up of a range of 40 to get 1000 post. Furthermore, to prevent our script from overwhelming the API server (and potentially getting blocked), delay between requests of 2 seconds was introduced with time.sleep.

In [7]:
democrats_post = []
after = None
for i in range(40):
    print("(" + str(i) + ")" 'Successful')
    if after == None: 
        params = {}
    else:
        params = {'after': after}
    democrats_url = ('https://www.reddit.com/r/democrats/.json')
    democrats_res = requests.get(democrats_url, params=params, headers=headers)
    if democrats_res.status_code == 200:
        democrats_json = democrats_res.json()
        democrats_post.extend(democrats_json['data']['children'])
        after = democrats_json['data']['after']
    else:
        print(democrats_res.status_code)
        break
    time.sleep(2)

(0)Successful
(1)Successful
(2)Successful
(3)Successful
(4)Successful
(5)Successful
(6)Successful
(7)Successful
(8)Successful
(9)Successful
(10)Successful
(11)Successful
(12)Successful
(13)Successful
(14)Successful
(15)Successful
(16)Successful
(17)Successful
(18)Successful
(19)Successful
(20)Successful
(21)Successful
(22)Successful
(23)Successful
(24)Successful
(25)Successful
(26)Successful
(27)Successful
(28)Successful
(29)Successful
(30)Successful
(31)Successful
(32)Successful
(33)Successful
(34)Successful
(35)Successful
(36)Successful
(37)Successful
(38)Successful
(39)Successful


## Data Extraction - r/Republicans

#### Initial Check

In [8]:
republican_url = ('https://www.reddit.com/r/Republican/.json')
republican_res = requests.get(republican_url, headers=headers)
republican_res.status_code

200

The 200 code indicates that we have successfully accessed the API.

In [9]:
republican_json = republican_res.json()
sorted(republican_json.keys())

['data', 'kind']

In [10]:
republican_json['data']['children'][0]

{'kind': 't3',
 'data': {'approved_at_utc': None,
  'subreddit': 'Republican',
  'selftext': '',
  'author_fullname': 't2_1s4usext',
  'saved': False,
  'mod_reason_title': None,
  'gilded': 0,
  'clicked': False,
  'title': 'Just putting this out there.',
  'link_flair_richtext': [],
  'subreddit_name_prefixed': 'r/Republican',
  'hidden': False,
  'pwls': 6,
  'link_flair_css_class': None,
  'downs': 0,
  'thumbnail_height': 140,
  'top_awarded_type': None,
  'hide_score': False,
  'name': 't3_hf616r',
  'quarantine': False,
  'link_flair_text_color': 'dark',
  'upvote_ratio': 0.73,
  'author_flair_background_color': None,
  'subreddit_type': 'public',
  'ups': 2377,
  'total_awards_received': 0,
  'media_embed': {},
  'thumbnail_width': 140,
  'author_flair_template_id': None,
  'is_original_content': False,
  'user_reports': [],
  'secure_media': None,
  'is_reddit_media_domain': True,
  'is_meta': False,
  'category': None,
  'secure_media_embed': {},
  'link_flair_text': None,
  

#### Set up the code to scrape 1,000 posts from r/Republican at a time

In [11]:
# Set up the code to scrape 1,000 posts from r/Republican at a time:

republican_post = []
after = None
for i in range(40):
    print("(" + str(i) + ")" 'Successful')
    if after == None: 
        params = {}
    else:
        params = {'after': after}
    republican_url = ('https://www.reddit.com/r/Republican/.json')
    republican_res = requests.get(republican_url, params=params, headers=headers)
    if republican_res.status_code == 200:
        republican_json = republican_res.json()
        republican_post.extend(republican_json['data']['children'])
        after = republican_json['data']['after']
    else:
        print('Status error', republican_res.status_code)
        break
    time.sleep(2)

(0)Successful
(1)Successful
(2)Successful
(3)Successful
(4)Successful
(5)Successful
(6)Successful
(7)Successful
(8)Successful
(9)Successful
(10)Successful
(11)Successful
(12)Successful
(13)Successful
(14)Successful
(15)Successful
(16)Successful
(17)Successful
(18)Successful
(19)Successful
(20)Successful
(21)Successful
(22)Successful
(23)Successful
(24)Successful
(25)Successful
(26)Successful
(27)Successful
(28)Successful
(29)Successful
(30)Successful
(31)Successful
(32)Successful
(33)Successful
(34)Successful
(35)Successful
(36)Successful
(37)Successful
(38)Successful
(39)Successful


## Convert Webscrapped Data into a DataFrame

#### Democrats DataFrame

In [12]:
democrats_id = []
democrats_title = []
democrats_comment = []
democrats_time = []
democrats_score = []
democrats_subreddits=[]

for post in range(len(democrats_post)):
    democrats_id.append(democrats_post[post]['data']['id'])
    democrats_title.append(democrats_post[post]['data']['title'])
    democrats_comment.append(democrats_post[post]['data']['selftext'])
    democrats_time.append(democrats_post[post]['data']['created_utc'])
    democrats_score.append(democrats_post[post]['data']['score'])
    democrats_subreddits.append(democrats_post[post]['data']['subreddit'])


democrats_dict = {'id': democrats_id,
                  'title': democrats_title,
                  'comment': democrats_comment,
                  'date_created': democrats_time,
                  'score': democrats_score,
                  'subreddit': democrats_subreddits}


democrats_df = pd.DataFrame(democrats_dict)
democrats_df.head()

Unnamed: 0,id,title,comment,date_created,score,subreddit
0,hdge6q,Hey All! I made a documentary about Joe Biden....,,1592780000.0,29,democrats
1,hff6bt,It’s Time for a Blue Wave to Restore America.,,1593055000.0,629,democrats
2,hf7pni,"In Scathing Letter, More Than 80-Percent of Fa...",,1593027000.0,1336,democrats
3,hfgpv1,Mark Cuban endorses Biden on Hannity: He 'actu...,,1593064000.0,70,democrats
4,hf6wes,Judiciary Democrat calls for House to pursue i...,,1593025000.0,201,democrats


In [13]:
print("Total number of posts: ", len(democrats_df['title']))
print("Unique number of posts: ", len(democrats_df['title'].unique()))

Total number of posts:  990
Unique number of posts:  979


#### Republican DataFrame

In [14]:
republican_id = []
republican_title = []
republican_comment = []
republican_time = []
republican_score = []
republican_subreddits=[]

for post in range(len(republican_post)):
    republican_id.append(republican_post[post]['data']['id'])
    republican_title.append(republican_post[post]['data']['title'])
    republican_comment.append(republican_post[post]['data']['selftext'])
    republican_time.append(republican_post[post]['data']['created_utc'])
    republican_score.append(republican_post[post]['data']['score'])
    republican_subreddits.append(republican_post[post]['data']['subreddit'])


republican_dict = {'id': republican_id,
                  'title': republican_title,
                   'comment': republican_comment,
                  'date_created': republican_time,
                  'score': republican_score,
                  'subreddit': republican_subreddits}


republican_df = pd.DataFrame(republican_dict)
republican_df.head()

Unnamed: 0,id,title,comment,date_created,score,subreddit
0,hf616r,Just putting this out there.,,1593022000.0,2381,Republican
1,hflqaf,New evidence shows it is time to charge Joe Bi...,,1593090000.0,15,Republican
2,hf68bl,Vroom vroom,,1593023000.0,223,Republican
3,hezlce,Pelosi says Republicans 'trying to get away wi...,,1592999000.0,659,Republican
4,hfkbyi,"Al Gore, UN Secretary-General, others now dema...",,1593084000.0,4,Republican


In [15]:
print("Total number of posts: ", len(republican_df['title']))
print("Unique number of posts: ", len(republican_df['title'].unique()))

Total number of posts:  988
Unique number of posts:  805


#### Save DataFrame as csv file

In [16]:
democrats_df.to_csv('./datasets/df_a.csv', index = False)
republican_df.to_csv('./datasets/df_b.csv', index = False)

----> Proceed to the next notebook for [Data cleaning and EDA](./02_Data_Cleaning_and_EDA.ipynb)