In [13]:
import pandas as pd # require pandas 1.5.3 
import numpy as np
import decouple
import requests
import json
import warnings
import datetime as dt
import os
import re
warnings.filterwarnings("ignore")

import reddit_requests as rr


In [33]:
config = decouple.AutoConfig(' ')
key = config('APIKEY')
pub = config('PUBLICKEY')
user = config('USERNAME')
pw = config('PW')

In [34]:
# use 'kind' key t1_ = comment, t2=acct, t3=link, t4=message, t5=sub, t6=award
auth = requests.auth.HTTPBasicAuth(pub, key)
data = {
    'grant_type': 'password',
    'username': user,
    'password': pw
}

headers = {'User-Agent': 'MYAPI/0.0.1'}
res = requests.post('https://www.reddit.com/api/v1/access_token',
                    auth=auth, data=data, headers=headers)

TOKEN = res.json()['access_token']
headers = {**headers, **{'Authorization': f"bearer {TOKEN}"}}


In [None]:
### scrape lakers stats ###
# historical standings https://www.basketball-reference.com/friv/standings.fcgi?month=10&day=18&year=2022&lg_id=NBA
# current roster https://www.basketball-reference.com/teams/LAL/2023.html#all_roster
# win streak https://www.basketball-reference.com/teams/LAL/2023_games.html
# would be pretty easy to manually do roster changes so not too worried ab that
# otherwise just need game logs for each player and I can code up function to give stats up to that date

In [117]:
### Use pushshift directly ###
subreddit = 'lakers'
start_date = dt.datetime(2022, 10, 15)
end_date = dt.datetime(2022, 10, 19)
start = int(start_date.timestamp())
end = int(end_date.timestamp())

api_query = 'https://api.pushshift.io/reddit/submission/search/' \
            + '?subreddit={}&limit=200&after={}&before={}'.format(subreddit, start, end)

r = requests.get(api_query)
json= r.json()
df = pd.DataFrame(json['data'])


In [118]:
df.shape

(128, 99)

In [119]:
df.utc_datetime_str

0      2022-10-19 06:50:14
1      2022-10-19 06:40:13
2      2022-10-19 06:27:07
3      2022-10-19 06:25:07
4      2022-10-19 06:22:24
              ...         
123    2022-10-15 13:49:40
124    2022-10-15 13:08:29
125    2022-10-15 10:50:36
126    2022-10-15 10:44:58
127    2022-10-15 10:42:55
Name: utc_datetime_str, Length: 128, dtype: object

In [89]:
start_date = dt.datetime(2022, 10, 15)
end_date = dt.datetime(2022, 10, 18)
start = int(start_date.timestamp())
end = int(end_date.timestamp())

print(start, end)

1665990000 1666076400


In [4]:
def get_more_posts(start, end, subreddit='lakers', limit=50):
    """ Get reddit posts back to the passed start_date.  
        
        Parameters
        
        -start_date must be in YYYY-MM-DD string format
    
    """
    start = int(start.timestamp())
    end = int(end.timestamp())
    
    api_query = 'https://api.pushshift.io/reddit/submission/search/' \
                + '?subreddit={}&limit={}&after={}&before={}'.format(subreddit, limit, start, end)
    
    try:
        r = requests.get(api_query)
        json= r.json()
        df = pd.DataFrame(json['data'])
    
        df = df[['utc_datetime_str', 'id', 'title', 'author', 'selftext', 'upvote_ratio']]
        print("Successfully pulled data")
        return df
    except:
        print("Upload failed")



In [10]:
# run through a hacky for loop
base = dt.datetime(2023, 4, 12)
date_list = [base - dt.timedelta(days=x) for x in range(186)]

bad_dates = []
for date in date_list:
    start = date
    end = start + dt.timedelta(days=1)
    df = get_more_posts(start, end)
    try:
        start_string = start.strftime('%Y_%m_%d')
        df.to_csv('data/r_lakers_{}'.format(start_string))
        print("Successfully uploaded data for {}".format(start_string))
    except:
        print("Failed to Upload Data for {}".format(start_string))
        bad_dates.append(start_string.replace('_', '-'))


Successfully pulled data
Successfully uploaded data for 2022_10_19
Upload failed
Failed to Upload Data
Upload failed
Failed to Upload Data
Successfully pulled data
Successfully uploaded data for 2022_10_22
Successfully pulled data
Successfully uploaded data for 2022_10_23
Upload failed
Failed to Upload Data
Upload failed
Failed to Upload Data
Upload failed
Failed to Upload Data
Successfully pulled data
Successfully uploaded data for 2022_10_27
Successfully pulled data
Successfully uploaded data for 2022_10_28
Upload failed
Failed to Upload Data
Successfully pulled data
Successfully uploaded data for 2022_10_30
Upload failed
Failed to Upload Data
Successfully pulled data
Successfully uploaded data for 2022_11_01
Upload failed
Failed to Upload Data
Upload failed
Failed to Upload Data
Successfully pulled data
Successfully uploaded data for 2022_11_04
Successfully pulled data
Successfully uploaded data for 2022_11_05
Successfully pulled data
Successfully uploaded data for 2022_11_06
Upload

Upload failed
Failed to Upload Data
Upload failed
Failed to Upload Data
Upload failed
Failed to Upload Data
Successfully pulled data
Successfully uploaded data for 2023_03_22
Successfully pulled data
Successfully uploaded data for 2023_03_23
Upload failed
Failed to Upload Data
Successfully pulled data
Successfully uploaded data for 2023_03_25
Successfully pulled data
Successfully uploaded data for 2023_03_26
Successfully pulled data
Successfully uploaded data for 2023_03_27
Successfully pulled data
Successfully uploaded data for 2023_03_28
Upload failed
Failed to Upload Data
Upload failed
Failed to Upload Data
Successfully pulled data
Successfully uploaded data for 2023_03_31
Upload failed
Failed to Upload Data
Upload failed
Failed to Upload Data
Upload failed
Failed to Upload Data
Successfully pulled data
Successfully uploaded data for 2023_04_04
Upload failed
Failed to Upload Data
Successfully pulled data
Successfully uploaded data for 2023_04_06
Successfully pulled data
Successfully

In [33]:
# check for min files that need to be re-uploaded
def check_for_max_posts(limit = 50)
    directory = '/Users/dylanjorling/NBASA_reddit/data'
    files = os.listdir(directory)
    files = [x for x in files if len(re.findall(r"r_lakers", x)) == 1]
    min_files = []
    for file in files:
        try:
            x = pd.read_csv('data/' + file)
            if x.shape[0] == limit:
                min_files.append(file)
        except:
            continue
    min_files_date = [file[9:] for file in min_files]
    min_files_date = [file.replace('_', '-') for file in min_files_date]
    return min_files_date


['2023-02-26',
 '2023-02-16',
 '2022-10-23',
 '2023-04-06',
 '2023-03-11',
 '2022-11-15',
 '2023-03-18',
 '2022-12-04',
 '2022-11-23',
 '2023-04-09',
 '2022-12-02',
 '2023-03-26',
 '2023-03-10',
 '2023-04-07',
 '2023-03-17',
 '2022-12-11',
 '2022-11-09',
 '2022-12-18',
 '2022-11-06',
 '2022-12-21',
 '2023-03-02',
 '2023-03-05',
 '2022-11-30',
 '2022-12-28',
 '2022-10-30',
 '2023-01-15',
 '2023-01-23',
 '2023-02-03',
 '2022-10-19',
 '2022-10-28',
 '2023-02-15',
 '2022-10-27',
 '2023-02-23',
 '2022-11-29',
 '2023-03-12',
 '2022-11-11',
 '2022-11-18',
 '2022-11-27',
 '2023-03-23',
 '2022-11-20',
 '2023-03-22',
 '2022-11-26',
 '2023-03-14',
 '2022-11-10',
 '2022-11-28',
 '2023-04-04',
 '2023-03-31',
 '2023-04-19',
 '2023-04-17',
 '2022-11-04',
 '2023-04-11',
 '2023-03-01',
 '2022-12-13',
 '2023-02-01',
 '2023-01-29',
 '2023-02-09',
 '2023-01-20']

In [42]:
base = dt.datetime(2023, 4, 12)
date_list = [base - dt.timedelta(days=x) for x in range(186)]
date_list = [x.strftime('%Y-%m-%d') for x in date_list]
file_list = [file.replace('_', '-')[9:] for file in files]
file_list.remove('')

new_date_list = [x for x in date_list if x not in file_list]
print(len(new_date_list))
new_date_list.append(min_files_date)
print(len(new_date_list))

78
79


In [51]:
new_date_list = new_date_list[:78] 
for x in min_files_date:
    new_date_list.append(x)
print(len(new_date_list))

135


In [None]:
# run through a hacky for loop

bad_dates = []
for date in new_date_list:
    start = dt.datetime.strptime(date, "%Y-%m-%d")
    end = start + dt.timedelta(days=1)
    df = get_more_posts(start, end, limit=200)
    try:
        start_string = start.strftime('%Y_%m_%d')
        df.to_csv('data/r_lakers_{}'.format(start_string))
        print("Successfully uploaded data for {}".format(start_string))
    except:
        start_string2 = start.strftime('%Y-%m-%d')
        print("Failed to Upload Data for {}".format(start_string2))
        bad_dates.append(start_string2)


Upload failed
Failed to Upload Data for 2023-04-12
Successfully pulled data
Successfully uploaded data for 2023_04_10
Successfully pulled data
Successfully uploaded data for 2023_04_05
Upload failed
Failed to Upload Data for 2023-04-03
Successfully pulled data
Successfully uploaded data for 2023_04_02
Upload failed
Failed to Upload Data for 2023-04-01
Successfully pulled data
Successfully uploaded data for 2023_03_30
Upload failed
Failed to Upload Data for 2023-03-29
Upload failed
Failed to Upload Data for 2023-03-24
Successfully pulled data
Successfully uploaded data for 2023_03_21
Successfully pulled data
Successfully uploaded data for 2023_03_20
Successfully pulled data
Successfully uploaded data for 2023_03_19
Successfully pulled data
Successfully uploaded data for 2023_03_16
Successfully pulled data
Successfully uploaded data for 2023_03_15
Upload failed
Failed to Upload Data for 2023-03-09
Upload failed
Failed to Upload Data for 2023-03-07
Successfully pulled data
Successfully up

In [55]:
dt.datetime.strptime('2022-01-01', '%Y-%m-%d')

datetime.datetime(2022, 1, 1, 0, 0)