# Project 3 - 1. Data collecting using Web API (reddit.com)
## Collecting Data

### Import library

In [1]:
import requests
import time
import pandas as pd
import ast

### Collect data

In [2]:
url = 'https://www.reddit.com/r/FoodNYC.json'
headers = {'User-agent': 'Bleep blorp bot 0.1'}
res = requests.get(url, headers=headers)
res.status_code

200

In [3]:
the_json = res.json()

# Check the keys in data.
the_json.keys()

dict_keys(['kind', 'data'])

In [25]:
# Check the keys in data
the_json['data'].keys()

dict_keys(['modhash', 'dist', 'children', 'after', 'before'])

In [26]:
# Check the length of the data
len(the_json['data']['children'])

25

In [27]:
[post['data']['name'] for post in the_json['data']['children']]

['t3_a7gkn5',
 't3_a73swu',
 't3_a5vtbi',
 't3_a613q7',
 't3_a5xt9u',
 't3_a5ljr9',
 't3_a4m8hr',
 't3_a4fzo1',
 't3_a3pzql',
 't3_a3jpre',
 't3_a3ct3k',
 't3_a2ifnf',
 't3_a2lf71',
 't3_a2hwh4',
 't3_a0pym9',
 't3_9zjqpk',
 't3_9znhwr',
 't3_9ytotc',
 't3_9ysv5x',
 't3_9ylntz',
 't3_9y57jg',
 't3_9y79xe',
 't3_9xdqaq',
 't3_9x2mgt',
 't3_9x643u']

In [28]:
# Check the after key for iteration
the_json['data']['after']

't3_9x643u'

In [29]:
# Set the after key as a parameter
param = {'after': 't3_9x20h1'}

In [30]:
# Check the response if it is 200.
requests.get(url, params=param, headers=headers)

<Response [200]>

In [31]:
# Code to get the posts from reddit.com/r/FoodNYC
afters = []                   # List of the afters
posts_1 = []                  # List of the posts
after = None
for i in range(35):           # Iterate 40 times to get around 1000 posts.
    if after == None:
        params = {}
    else:
        params = {'after': after}
    
    url = 'http://www.reddit.com/r/FoodNYC.json'
    res = requests.get(url, params=params, headers=headers)
    if res.status_code == 200:
        the_json = res.json()
        posts_1.extend(the_json['data']['children'])
        after = the_json['data']['after']
        afters.append(after)
    else:
        print(res.status_code)
        break
    
    time.sleep(1)

In [32]:
# Check the number of unique posts.
len(set([p['data']['name'] for p in posts_1]))

856

> We expected to collect around 1000 post but got only 856 posts. Let's see if there is any problem in the previous process.

In [33]:
# Check the after values.
pd.Series(afters).value_counts()

t3_6t1zgz    1
t3_8bc0pf    1
t3_90z57q    1
t3_3o1k2e    1
t3_533yut    1
t3_3v6zt5    1
t3_5soh4n    1
t3_5zrjkw    1
t3_7pb9r7    1
t3_4w1906    1
t3_796b76    1
t3_6mbpbm    1
t3_8tfb2q    1
t3_66z211    1
t3_7jxwzm    1
t3_49sq84    1
t3_4j1w6s    1
t3_7d4751    1
t3_9rix4m    1
t3_4p524q    1
t3_4408cx    1
t3_8hx8o9    1
t3_80f1lz    1
t3_74sxsn    1
t3_5n1iep    1
t3_86p7zu    1
t3_9l2pzz    1
t3_6dw9kf    1
t3_4dxu1z    1
t3_9x643u    1
t3_3jjcao    1
t3_7vog3b    1
t3_99116t    1
t3_5hqm66    1
dtype: int64

> Some after key values are duplicated. It seems reddit provided me the duplicated keys and that cause me to get less than 1000 posts. We eliminate the duplicates.

In [34]:
# Check how many after keys used.
len(pd.Series(afters).value_counts())

34

> We iterate the web scraping 34 times to get no duplicates.

In [35]:
# Check the posts
posts_1

[{'kind': 't3',
  'data': {'approved_at_utc': None,
   'subreddit': 'FoodNYC',
   'selftext': "My boyfriend and I are going to NYC tomorrow through friday and are looking for some cool (and affordable) restaurant suggestions! We are staying in Koreatown but our schedule is wide open so we're open to places anywhere that we can get to via subway. One thing I'd really like to try is soul food in Harlem and would also like to find a cheap, generic new york pizza place for a quick bite. We're open to literally anything unless its crazy expensive. Thanks in advance!!! :)",
   'author_fullname': 't2_l0i2u8p',
   'saved': False,
   'mod_reason_title': None,
   'gilded': 0,
   'clicked': False,
   'title': 'First timer looking for suggestions!',
   'link_flair_richtext': [],
   'subreddit_name_prefixed': 'r/FoodNYC',
   'hidden': False,
   'pwls': None,
   'link_flair_css_class': None,
   'downs': 0,
   'thumbnail_height': None,
   'hide_score': False,
   'name': 't3_a7gkn5',
   'quarantine': 

In [36]:
posts_1[0]['data']['name']

't3_a7gkn5'

In [37]:
# Convert the data to pandas DataFrame
df_1 = pd.DataFrame(posts_1)

In [38]:
df_1.shape

(856, 2)

## We will run the whole same process with the other subreddit.

### Collect data

In [40]:
url = 'https://www.reddit.com/r/FoodNYC.json'
headers = {'User-agent': 'Bleep blorp bot 0.1'}
res = requests.get(url, headers=headers)
res.status_code

200

In [41]:
the_json = res.json()

# Check the keys in data.
the_json.keys()

dict_keys(['kind', 'data'])

In [42]:
# Check the keys in data
the_json['data'].keys()

dict_keys(['modhash', 'dist', 'children', 'after', 'before'])

In [43]:
# Check the length of the data
len(the_json['data']['children'])

25

In [44]:
[post['data']['name'] for post in the_json['data']['children']]

['t3_a7gkn5',
 't3_a73swu',
 't3_a5vtbi',
 't3_a613q7',
 't3_a5xt9u',
 't3_a5ljr9',
 't3_a4m8hr',
 't3_a4fzo1',
 't3_a3pzql',
 't3_a3jpre',
 't3_a3ct3k',
 't3_a2ifnf',
 't3_a2lf71',
 't3_a2hwh4',
 't3_a0pym9',
 't3_9zjqpk',
 't3_9znhwr',
 't3_9ytotc',
 't3_9ysv5x',
 't3_9ylntz',
 't3_9y57jg',
 't3_9y79xe',
 't3_9xdqaq',
 't3_9x2mgt',
 't3_9x643u']

In [45]:
# Check the after key for iteration
the_json['data']['after']

't3_9x643u'

In [46]:
# Set the after key as a parameter
param = {'after': 't3_9x643u'}

In [47]:
# Check the response if it is 200.
requests.get(url, params=param, headers=headers)

<Response [200]>

In [59]:
# Code to get the posts from reddit.com/r/FoodNYC
afters = []                   # List of the afters
posts_2 = []                  # List of the posts
after = None
for i in range(36):           # Iterate 40 times to get around 1000 posts.
    if after == None:
        params = {}
    else:
        params = {'after': after}
    
    url_2 = 'http://www.reddit.com/r/FoodLosAngeles.json'
    res = requests.get(url_2, params=params, headers=headers)
    if res.status_code == 200:
        the_json = res.json()
        posts_2.extend(the_json['data']['children'])
        after = the_json['data']['after']
        afters.append(after)
    else:
        print(res.status_code)
        break
    
    time.sleep(1)

In [60]:
# Check the number of unique posts.
len(set([p['data']['name'] for p in posts_2]))

882

> Some after key values are duplicated. It seems reddit provided me the duplicated keys and that cause me to get less than 1000 posts. We might need to eliminate the duplicate later.

In [61]:
# Check the after values.
pd.Series(afters).value_counts()

t3_2hx03d    1
t3_2djvhi    1
t3_9lcvb9    1
t3_8qyqvm    1
t3_6qkurs    1
t3_3i8thh    1
t3_329cy0    1
t3_3s8up5    1
t3_2lelzv    1
t3_766j0d    1
t3_a2quvp    1
t3_2fn4jd    1
t3_91p1cu    1
t3_9rekpy    1
t3_9x2sg8    1
t3_8jqmjy    1
t3_87c45q    1
t3_7l04fi    1
t3_4lqrtt    1
t3_99zgur    1
t3_43svx1    1
t3_2oefvf    1
t3_8e4zgp    1
t3_2uuk28    1
t3_57c2nt    1
t3_382gyo    1
t3_2ds7k3    1
t3_63v46j    1
t3_9fjm40    1
t3_4ci6fv    1
t3_4ym84j    1
t3_6hxx87    1
t3_5h2oau    1
t3_5sx01g    1
t3_9it04c    1
dtype: int64

In [62]:
# Check how many after keys used.
len(pd.Series(afters).value_counts())

35

> We iterate the web scraping 35 times to get no duplicates.

In [63]:
posts_2[0]['data']['name']

't3_a7duh5'

In [64]:
# Convert the data to pandas DataFrame
df_2 = pd.DataFrame(posts_2)

In [65]:
df_2.shape

(882, 2)

#### Combine two dataframes and then save as csv file

In [72]:
final_df = pd.concat([df_1, df_2], axis=0)

In [75]:
final_df.reset_index(drop=True, inplace=True)

In [78]:
final_df.shape

(1738, 2)

In [80]:
final_df.to_csv('./reddit_posts.csv', index=False)