# Data Collection

<!-- Promoted: <span class="_2oEYZXchPfHwcf9mTMGMg8 V0WjfoF5BV7_qbExmbmeR"><span style="color: rgb(120, 124, 126);">promoted</span></span>
User: <a class="_2tbHP6ZydRpjI44J3syuqC  _23wugcdiaj44hdfugIAlnX oQctV4n0yUb0uiHDdGnmE" data-click-id="user" data-testid="post_author_link" href="/user/pedrofski12/" style="color: rgb(120, 124, 126);">u/pedrofski12</a>
Timestamp: <span class="_2VF2J19pUIMSLJFky-7PEI" data-testid="post_timestamp" data-click-id="timestamp" style="color: rgb(120, 124, 126);">28 minutes ago</span>
Flair: <div class="_2X6EB3ZhEeXCh1eIVA64XM _2hSecp_zkPm_s5ddV2htoj _2VqfzH0dZ9dIl3XWNxs42y aJrgrewN9C8x1Fusdx4hh _1Dl-kvSxyJMWO9nuoTof8N " style="background-color: rgb(0, 91, 161); color: rgb(255, 255, 255);"><span>Media</span></div>
Text: <h3 class="_eYtD2XCVieq6emjKBH3m">Chapecoense 1-[2] Vila Nova - Kaio Nunes 62'</h3>
Comments: <span class="FHCV02u6Cp2zYL0fhQPsO"><span></span><span id="comment_t3_x14heb_count_anim"></span>4 comments</span> -->

### Imports

In [1]:
import pandas as pd

import requests
import requests.auth
import json

from time import sleep

### Exploration of Available Data

In [2]:
# set base url
url = 'https://old.reddit.com/r/soccer/new/.json'

In [3]:
# intial request
response = requests.get(url, headers = {'User-agent': 'yourbot'})

In [4]:
# post title
response.json()['data']['children'][0]['data']['title']

"Benfica [1]-1 Vizela - David Neres Great goal 76'"

In [5]:
# post number of comments
response.json()['data']['children'][0]['data']['num_comments']

9

- Issue: The newest posts have negatively-biased number of comments
    - Fix: Start on 2nd or 3rd page

In [6]:
# check number of children per page
response.json()['data']['dist']

25

- Note: Can get all necessary data from one page on Old Reddit

### Data Collection Testing

**Data to collect from each child:**
1. num_comments
2. title
3. score
4. author_flair_text
5. link_flair_text
6. created_utc

**Getting data from more than one page:**
1. Initialize df
2. Set base url
1. Get data from first page using base url and store in df
    - make request and store in response
    - extract data from response.json()
        - initialize a page list that will contain rows
        - iterate over page children
            - create row list for each child
            - append row list to page list
    - create page df from page list
    - concat page df to existing data df initialized outside loop
3. Update url using ?after=, Get data from next page using base url + new 'after' string
    - update url
    - make new request and store in response
    - extract data from new response.json()
        - initialize a list that will contain rows
        - intialize a list that will be a row
        - append each data value to row
        - append each row to list that will contain rows
    - create a df for the page out of list of rows
    - concat page df to existing data df initialized outside loop
4. Repeat 4 until enough data collected
5. Write data to csv

In [7]:
# initialize df
soccer_posts_df = pd.DataFrame(columns=['title', 'score', 'author_flair_text', 'link_flair_text', 'created_utc', 'num_comments'])

In [8]:
# set base url
url_base = 'https://www.reddit.com/r/soccer/new/.json'

In [9]:
# get data from first page using base url and store in df
response = requests.get(url, headers = {'User-agent': 'yourbot'})

page = []
for i in range(response.json()['data']['dist']):
    row = [response.json()['data']['children'][i]['data'][col_name] for col_name in soccer_posts_df.columns]
    page.append(row)
df = pd.DataFrame(page, columns=soccer_posts_df.columns)
soccer_posts_df = pd.concat([soccer_posts_df, df])

In [10]:
soccer_posts_df

Unnamed: 0,title,score,author_flair_text,link_flair_text,created_utc,num_comments
0,Benfica [1]-1 Vizela - David Neres Great goal 76',33,:r_soccer_user:,Media,1662147483.0,9
1,Edson Álvarez did NOT strike in order to make ...,16,,Transfers,1662147482.0,2
2,[Tariq Panja] Inter and Roma affected harder t...,14,,News,1662147323.0,6
3,Spain [2] - 0 Hungary | Irene Paredes 26',5,:Real_Madrid:,Media,1662147224.0,4
4,Fortuna Sittard 1-[2] FC Utrecht - Anastasios ...,10,:Ajax:,Media,1662147042.0,14
5,Spain [1] - 0 Hungary | Esther González 23',6,:Real_Madrid:,Media,1662146994.0,4
6,Fortuna Sittard [1]-1 FC Utrecht - Rodrigo Gut...,11,:Ajax:,Media,1662146867.0,1
7,[Italian Serie B]Cagliari 1-0 Modena - Marko R...,6,:Reading_FC:,Media,1662146600.0,2
8,Fortuna Sittard - FC Utrecht suspended for sho...,14,:Ajax:,Media,1662146654.0,10
9,[El Chiringuito TV] Exclusive: Marcelo is clos...,27,:Olympiacos:,Transfers,1662146447.0,8


In [11]:
soccer_posts_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 25 entries, 0 to 24
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   title              25 non-null     object
 1   score              25 non-null     object
 2   author_flair_text  22 non-null     object
 3   link_flair_text    25 non-null     object
 4   created_utc        25 non-null     object
 5   num_comments       25 non-null     object
dtypes: object(6)
memory usage: 1.4+ KB


- Note: Some nulls showing up in author_flair_text

In [12]:
# get data for the next three pages (collection method testing)
for i in range(3):
    url = f'{url_base}?after=' + response.json()['data']['after']

    response = requests.get(url, headers = {'User-agent': 'yourbot'})

    page = []
    for i in range(response.json()['data']['dist']):
        row = [response.json()['data']['children'][i]['data'][col_name] for col_name in soccer_posts_df.columns]
        page.append(row)
    df = pd.DataFrame(page, columns=soccer_posts_df.columns)
    soccer_posts_df = pd.concat([soccer_posts_df, df])

In [13]:
soccer_posts_df

Unnamed: 0,title,score,author_flair_text,link_flair_text,created_utc,num_comments
0,Benfica [1]-1 Vizela - David Neres Great goal 76',33,:r_soccer_user:,Media,1662147483.0,9
1,Edson Álvarez did NOT strike in order to make ...,16,,Transfers,1662147482.0,2
2,[Tariq Panja] Inter and Roma affected harder t...,14,,News,1662147323.0,6
3,Spain [2] - 0 Hungary | Irene Paredes 26',5,:Real_Madrid:,Media,1662147224.0,4
4,Fortuna Sittard 1-[2] FC Utrecht - Anastasios ...,10,:Ajax:,Media,1662147042.0,14
...,...,...,...,...,...,...
20,Daily Discussion,59,:RUFC:,Daily Discussion,1662112812.0,1705
21,QPR sign Tim Iroegbunam on loan from Aston Villa,26,:Haverfordwest_County_FC:,:Queens_Park_Rangers:Official Source,1662112802.0,0
22,Port Vale sign Dennis Politic on loan from Cre...,17,:Haverfordwest_County_FC:,:Port_Vale:Official Source,1662112682.0,0
23,Peterborough sign Kelland Watts on loan from N...,16,:Haverfordwest_County_FC:,:Peterborough_United:Official Source,1662112590.0,4


In [14]:
# fix df indices
soccer_posts_df.reset_index(inplace=True)
soccer_posts_df.drop(axis=1, columns='index', inplace=True)

In [15]:
soccer_posts_df

Unnamed: 0,title,score,author_flair_text,link_flair_text,created_utc,num_comments
0,Benfica [1]-1 Vizela - David Neres Great goal 76',33,:r_soccer_user:,Media,1662147483.0,9
1,Edson Álvarez did NOT strike in order to make ...,16,,Transfers,1662147482.0,2
2,[Tariq Panja] Inter and Roma affected harder t...,14,,News,1662147323.0,6
3,Spain [2] - 0 Hungary | Irene Paredes 26',5,:Real_Madrid:,Media,1662147224.0,4
4,Fortuna Sittard 1-[2] FC Utrecht - Anastasios ...,10,:Ajax:,Media,1662147042.0,14
...,...,...,...,...,...,...
95,Daily Discussion,59,:RUFC:,Daily Discussion,1662112812.0,1705
96,QPR sign Tim Iroegbunam on loan from Aston Villa,26,:Haverfordwest_County_FC:,:Queens_Park_Rangers:Official Source,1662112802.0,0
97,Port Vale sign Dennis Politic on loan from Cre...,17,:Haverfordwest_County_FC:,:Port_Vale:Official Source,1662112682.0,0
98,Peterborough sign Kelland Watts on loan from N...,16,:Haverfordwest_County_FC:,:Peterborough_United:Official Source,1662112590.0,4


In [16]:
# Write soccer_posts_df to csv:
soccer_posts_df.to_csv('data/first100.csv', index=False)

### Putting it all together to collect 10,000+ posts

1. Get and store data from first page
2. Get and store data from next 60 pages (1500 posts)
3. Sleep one minute (60 requests per minute is compliance max for clients)
4. Repeat steps 2 and 3 ten times (15,000 posts after first page)
5. Write data to csv

In [2]:
# request a token
client_auth = requests.auth.HTTPBasicAuth('WLnCkgzuzO15JrhCBVw32w', 'E_zur7VJmfugyRlP7cAwxssMGwQQpw')
post_data = {'grant_type': 'password', 'username': 'gtaylor38', 'password': '7teen4_2@'}
headers = {'User-Agent': 'ScraperClient/1.0 by gtaylor38'}
response_token = requests.post('https://www.reddit.com/api/v1/access_token', auth=client_auth, data=post_data, headers=headers)
response_token.json()

{'access_token': '60700017415-W1rhcIQwMo4V-a2LdiYQ1YUnSv_clQ',
 'token_type': 'bearer',
 'expires_in': 86400,
 'scope': '*'}

In [18]:
access_token = response_token.json()['access_token']
token_type = response_token.json()['token_type']

In [19]:
headers = {"Authorization": f'{token_type} {access_token}', 'User-Agent': 'ScraperClient/1.0 by gtaylor38'}

In [31]:
# initialize df
soccer_posts_df = pd.DataFrame(columns=['title', 'score', 'author_flair_text', 'link_flair_text', 'created_utc', 'num_comments'])

# set base url
url_base = 'https://oauth.reddit.com/r/soccer/new/.json'

# initialize count
#count = 0

# get data from first page using base url and store in df
response = requests.get(url_base, headers=headers)

page = []

for i in range(response.json()['data']['dist']):
    row = [response.json()['data']['children'][i]['data'][col_name] for col_name in soccer_posts_df.columns]
    page.append(row)
df = pd.DataFrame(page, columns=soccer_posts_df.columns)
soccer_posts_df = pd.concat([soccer_posts_df, df])

# collect data 60 pages at a time (sleeping in between), ten times
for i in range(3):
    # get and store data for the next sixty pages
    for j in range(30):
        # update count, after, url
        #count += 25
        after = response.json()['data']['after']
        url_current = f'{url_base}?after={after}'
        
        # make new request and reset page list
        response = requests.get(url_current, headers=headers)
        page_new = []

        # collect and store data in page list
        for k in range(response.json()['data']['dist']):
            row = [response.json()['data']['children'][k]['data'][col_name] for col_name in soccer_posts_df.columns]
            page_new.append(row)
        
        # store page list as df and concat df to soccer_posts_df
        df = pd.DataFrame(page_new, columns=soccer_posts_df.columns)
        soccer_posts_df = pd.concat([soccer_posts_df, df])

    # sleep the remainder of the minute
    sleep(60)
    
# Write soccer_posts_df to csv:
soccer_posts_df.to_csv('data/soccer_posts.csv', index=False)

KeyError: 'data'

In [28]:
type(after)

str

In [29]:
soccer_posts_df.shape

(2265, 6)

In [30]:
len(soccer_posts_df.created_utc.unique())

993

In [24]:
len(soccer_posts_df.title.unique())

993