In [1]:
import pandas as pd
import warnings
import requests
import json
import datetime
from newspaper import Article

# silence every warning for the rest of the session; the 'ignore' action with no
# category/module filter applies to all warning categories, deprecations included
warnings.filterwarnings('ignore')

In [2]:
# time window for the reddit api pull: midnight on jan 1st 2022 up to midnight on jan 1st 2023,
# expressed as integer epoch seconds
# NOTE: the datetimes are naive, so .timestamp() interprets them in the machine's local timezone
start, end = (
    int(datetime.datetime(year, 1, 1).timestamp())
    for year in (2022, 2023)
)

# subreddit names to pull from
onion, nottheonion = 'theonion', 'nottheonion'


In [3]:
def getPushshiftData(after, before, sub, size=1000, timeout=30):
    ''' queries the pushshift submission search endpoint for one subreddit and returns the
        matching submissions, requested sorted by score in descending order
    args.
        after (int): starting timestamp (epoch seconds)
        before (int): ending timestamp (epoch seconds)
        sub (string): subreddit name
        size (int): maximum number of submissions to request (default 1000)
        timeout (float): seconds to wait for the server before giving up (default 30)
    returns.
        data['data'] (list): list of dictionaries
    raises.
        requests.exceptions.HTTPError: if the api answers with a 4xx/5xx status
        requests.exceptions.Timeout: if the api does not answer within `timeout` seconds
        KeyError: if the json payload has no 'data' key
    '''
    # imported locally so the function only relies on the notebook's existing top-level imports
    from urllib.parse import urlencode

    # build url; urlencode escapes the values instead of pasting them into the query verbatim
    query = urlencode({
        'size': size,
        'after': after,
        'before': before,
        'subreddit': sub,
        'sort': 'score',
        'order': 'desc',
    })
    url = 'https://api.pushshift.io/reddit/search/submission/?' + query
    print(url)

    # get data; without a timeout a stalled connection would block the kernel indefinitely
    r = requests.get(url, timeout=timeout)
    # surface rate limiting / server errors as an HTTPError instead of a confusing json failure
    r.raise_for_status()
    data = r.json()
    return data['data']

def get_text(url):
    ''' takes in a url and returns the text of the article
        fails silently (returns None) if the article cannot be downloaded or parsed
    args.
        url (string): url of article
    returns.
        article.text (string): text of article, or None when download/parsing raised an error
    '''
    try:
        article = Article(url)
        article.download()
        article.parse()
        return article.text
    # only swallow ordinary errors; a bare except would also trap KeyboardInterrupt/SystemExit,
    # making it impossible to interrupt a long scrape that maps this function over many urls
    except Exception:
        return None

def build_dataframe(data):
    ''' takes in a list of dictionaries and returns a dataframe restricted to the
        title, url and subreddit columns
    args.
        data: list of dictionaries
    returns.
        df: pandas dataframe with columns ['title', 'url', 'subreddit']; empty (zero rows,
            same columns) when data holds no records
    raises.
        KeyError: if data is non-empty but lacks one of the required columns
    '''
    columns = ['title', 'url', 'subreddit']
    df = pd.DataFrame(data)

    # an empty api result yields a frame without columns; selecting from it would raise
    # KeyError, so hand back an empty frame with the expected schema instead
    if df.empty:
        return pd.DataFrame(columns=columns)

    # filter only necessary columns
    return df[columns]

In [4]:
# one dataframe per subreddit; theonion is requested first, then nottheonion
df_onion, df_nottheonion = (
    build_dataframe(getPushshiftData(start, end, sub))
    for sub in (onion, nottheonion)
)

# stack both subreddits (nottheonion rows first) under a fresh 0..n-1 index
full_df = pd.concat([df_nottheonion, df_onion], ignore_index=True)

# download the article text for every url, then drop rows with any missing value
# (get_text returns None for urls it could not fetch)
full_df = full_df.assign(text=full_df['url'].map(get_text)).dropna()
full_df

https://api.pushshift.io/reddit/search/submission/?size=1000&after=1641013200&before=1672549200&subreddit=theonion&sort=score&order=desc
https://api.pushshift.io/reddit/search/submission/?size=1000&after=1641013200&before=1672549200&subreddit=nottheonion&sort=score&order=desc


Unnamed: 0,title,url,subreddit,text
0,Meta's threat to close down Facebook and Insta...,https://www.cityam.com/metas-threat-to-close-d...,nottheonion,Meta’s threat to close down Facebook and Insta...
1,Pregnant Texas woman driving in HOV lane told ...,https://www.chron.com/news/houston-texas/artic...,nottheonion,Is an unborn fetus a human being in the eyes o...
2,Mark Zuckerberg Says Meta Employees “Lovingly”...,https://consequence.net/2022/04/mark-zuckerber...,nottheonion,Mark Zuckerberg hasn’t always had a squeaky-cl...
3,Police didn't immediately confront the gunman ...,https://www.insider.com/texas-shooting-police-...,nottheonion,Law enforcement is getting slammed for its res...
4,Shaquille O'Neal says gorillas freak out when ...,https://www.insider.com/gorillas-afraid-of-sha...,nottheonion,Shaquille O'Neal says gorillas always freak ou...
...,...,...,...,...
1992,Elton John Awarded Medal By Joe Biden For Work...,https://www.theonion.com/elton-john-awarded-me...,TheOnion,President Biden has awarded Sir Elton John wit...
1993,What Republicans Are Saying About The Paul Pel...,https://www.theonion.com/what-republicans-are-...,TheOnion,“How many innocent people have to die before w...
1994,What To Say To Someone Struggling With Inflation,https://www.theonion.com/what-to-say-to-someon...,TheOnion,We may earn a commission from links on this pa...
1995,Herschel Walker Quietly Asking Around For D.C....,https://www.theonion.com/herschel-walker-quiet...,TheOnion,"WRIGHTSVILLE, GA—In a display of confidence ah..."


In [5]:
# write the scraped articles to disk; index=False leaves the row index out of the csv
full_df.to_csv(path_or_buf='articles.csv', index=False)