# Collect Data
---

## Import libraries

In [3]:
import pandas as pd
import requests
import time

## Create function to fetch subreddit submissions

Credit: Daniel Kim

In [11]:
# Submission fields kept from each API record, in output column order.
features = [
    'author',
    'title',
    'selftext',
    'created_utc',
    'num_comments',
    'score',
]

In [39]:
def subreddit_submissions(subreddit, n):
    """Download up to ``n`` pages of submissions from one subreddit.

    Pages are requested from the Pushshift submission-search endpoint, newest
    first. Each successful page moves the ``before`` cursor back to the oldest
    ``created_utc`` seen, so the next request continues further into the past.

    Parameters
    ----------
    subreddit : str
        Name of the subreddit to query (sent as the ``subreddit`` parameter).
    n : int
        Maximum number of requests to issue. A failed request still counts
        towards ``n``; the same time window is retried on the next iteration
        because the cursor is only advanced after a successful page.

    Returns
    -------
    pandas.DataFrame
        The columns listed in the module-level ``features`` minus
        ``created_utc``. Pages are concatenated without resetting the index,
        so row labels repeat across pages; reset the index if unique labels
        are needed. An empty frame with the same columns is returned when no
        page could be collected.
    """
    base_url = 'https://api.pushshift.io/reddit/submission/search'
    pause_seconds = 3       # delay between consecutive requests
    timeout_seconds = 30    # without a timeout a stalled connection hangs the cell

    before_timestamp = round(time.time())

    all_dfs = []

    for request_number in range(n):
        # Pause before every request except the first, so failed requests are
        # throttled as well (previously `continue` skipped the sleep).
        if request_number:
            time.sleep(pause_seconds)

        print(before_timestamp)
        # 'after': None was dropped: requests omits None-valued parameters,
        # so it never reached the query string.
        params = {
            'subreddit': subreddit,
            'size': 1000,
            'before': before_timestamp,
            'lang': 'english',
        }

        try:
            res = requests.get(base_url, params=params, timeout=timeout_seconds)
        except requests.RequestException as err:
            # One network failure should not discard the pages already collected.
            print(f'REQUEST FAILED: {err}')
            continue

        if res.status_code != 200:
            print(f'ERROR CODE FOR RES: {res.status_code}')
            continue

        data = res.json().get('data')
        if not data:
            # No more submissions older than the cursor: stop paginating.
            break

        # reindex (instead of [features]) tolerates a page in which one of the
        # requested fields is absent; the column is filled with NaN.
        df = pd.DataFrame(data).reindex(columns=features)
        all_dfs.append(df)

        oldest = df['created_utc'].min()
        if pd.isna(oldest):
            # Without a timestamp the cursor cannot move; avoid refetching
            # the same page over and over.
            break
        # Plain int keeps the cursor a regular Python value in the query string.
        before_timestamp = int(oldest)

    if not all_dfs:
        # pd.concat([]) raises ValueError; hand back an empty, well-shaped frame.
        return pd.DataFrame(columns=features).drop(columns='created_utc')

    dfs = pd.concat(all_dfs)

    return dfs.drop(columns='created_utc')

## Fetch `AsianBeauty` subreddit

In [42]:
# Pull 50 pages of AsianBeauty submissions (prints the cursor before each request).
azn = subreddit_submissions(subreddit='AsianBeauty', n=50)

1587615962
1585341815
1582909416
1580581888
1578057044
1575226235
1572678142
1569870800
1567165281
1564576402
1562303170
1559990350
1557619375
1555326057
1553054759
1550652325
1548093980
1545572234
1542841292
1540566120
1538159501
1535577108
1532993457
1530456974
1528163049
1525962159
1523753142
1521230396
1519104892
1516861142
1514764981
1512571472
1510711317
1509036704
1506805986
1504371852
1501786443
1499736162
1497615675
1495500675
1493778594
1492539514
1491316549
1490045020
1488884245
1487618841
1486168255
1484567892
1482857782
1480731115


In [43]:
# Preview the first two AsianBeauty rows.
azn.head(n=2)

Unnamed: 0,author,title,selftext,num_comments,score
0,AutoModerator,"Anti-Haul Monthly April 23, 2020",Are you on a no buy? Trying to stick to a more...,0,1
1,BlueswEC,[Discussion] Does BHA (salicylic acid) work we...,[removed],0,1


## Fetch `SkincareAddiction` subreddit

In [44]:
# Pull 50 pages of SkincareAddiction submissions (prints the cursor before each request).
skin = subreddit_submissions(subreddit='SkincareAddiction', n=50)

1587616786
1587442452
1587278830
1587130869
1586963359
1586800518
1586637423
1586461053
1586263746
1586069033
1585877235
1585677081
1585478123
1585265645
1585042997
1584818624
1584561733
1584272414
1584011435
1583758282
1583517329
1583277219
1583063487
1582829064
1582590697
1582388597
1582158718
1581946405
1581691522
1581449726
1581261789
1581037273
1580825525
1580603179
1580408656
1580165218
1579899277
1579663333
1579460447
1579232066
1579002925
1578794409
1578595274
1578353161
1578159332
1577948226
1577684593
1577468564
1577157571
1576866857


In [45]:
# Preview the first two SkincareAddiction rows.
skin.head(n=2)

Unnamed: 0,author,title,selftext,num_comments,score
0,laurtay7166,[Routine Help] Suggestions for dehydrated to n...,,1,1
1,atrevz,[B&amp;A] Did the Fifty Shades of Snail sebace...,,1,1


## Label data

Label each submission with its source subreddit:
 - AsianBeauty as `is_ab` = 1
 - SkincareAddiction as `is_ab` = 0
 
NOTE: ab is short for AsianBeauty.

In [46]:
# Tag every AsianBeauty row with its source name and the positive class label.
azn = azn.assign(subreddit='asianbeauty', is_ab=1)

azn.head()

Unnamed: 0,author,title,selftext,num_comments,score,subreddit,is_ab
0,AutoModerator,"Anti-Haul Monthly April 23, 2020",Are you on a no buy? Trying to stick to a more...,0,1,asianbeauty,1
1,BlueswEC,[Discussion] Does BHA (salicylic acid) work we...,[removed],0,1,asianbeauty,1
2,Baumannb,Going to try Laneige,[removed],2,1,asianbeauty,1
3,invicktus7,Acne Scars and Sun Spots Before and After Photos!,[removed],0,1,asianbeauty,1
4,Jenny_0831,Thoughts on L'occitane skincare,[removed],0,1,asianbeauty,1


In [47]:
# Tag every SkincareAddiction row with its source name and the negative class label.
skin = skin.assign(subreddit='skincareaddiction', is_ab=0)
skin.head()

Unnamed: 0,author,title,selftext,num_comments,score,subreddit,is_ab
0,laurtay7166,[Routine Help] Suggestions for dehydrated to n...,,1,1,skincareaddiction,0
1,atrevz,[B&amp;A] Did the Fifty Shades of Snail sebace...,,1,1,skincareaddiction,0
2,nonstickswag,[MISC] AHA/BHA not working on closed comedones,"As the title says, I have closed comedones on ...",1,1,skincareaddiction,0
3,atrevz,Did the Fifty Shades of Snail sebaceous filame...,,1,1,skincareaddiction,0
4,unclepube,My legs have been like this ever since I was a...,,1,1,skincareaddiction,0


In [48]:
# Show (rows, columns) of the labeled SkincareAddiction frame.
skin.shape

(50000, 7)

## Combine DataFrames and reset index

In [49]:
# Stack SkincareAddiction rows on top of AsianBeauty rows and renumber the
# rows 0..N-1, discarding the repeating per-page labels.
frames_to_stack = [skin, azn]
skincare = pd.concat(frames_to_stack, axis=0, ignore_index=True)

skincare.head()

Unnamed: 0,author,title,selftext,num_comments,score,subreddit,is_ab
0,laurtay7166,[Routine Help] Suggestions for dehydrated to n...,,1,1,skincareaddiction,0
1,atrevz,[B&amp;A] Did the Fifty Shades of Snail sebace...,,1,1,skincareaddiction,0
2,nonstickswag,[MISC] AHA/BHA not working on closed comedones,"As the title says, I have closed comedones on ...",1,1,skincareaddiction,0
3,atrevz,Did the Fifty Shades of Snail sebaceous filame...,,1,1,skincareaddiction,0
4,unclepube,My legs have been like this ever since I was a...,,1,1,skincareaddiction,0


In [50]:
# Show (rows, columns) of the combined frame.
skincare.shape

(100000, 7)

## Save as .csv file

In [53]:
# Write the combined, labeled frame to disk without the row index column.
skincare.to_csv(
    '../data/skincare.csv',
    index=False,
)