### Connecting to the Reddit API

In [6]:
# !pip install praw
# !pip install pandas

import praw

# temporary credentials file for local development
import secret
import pandas as pd

# This authentication provides read-only Reddit instances to the API
# To get credentials, need to make a bot here: https://www.reddit.com/prefs/apps/

# PRAW Documentation https://praw.readthedocs.io/en/stable/getting_started/quick_start.html#read-only-reddit-instances
reddit = praw.Reddit(
    client_id=secret.client_id,
    client_secret=secret.client_secret,
    user_agent=secret.user_agent
)

### Extracting The Top 10 Posts from r/computerscience (The first post is empty)

In [35]:
# this endpoint returns a submissions object
# https://praw.readthedocs.io/en/latest/code_overview/models/submission.html

submissions = reddit.subreddit("computerscience").hot(limit=100)

# one for posts
text = []
author = []
date = []
title = []
score = []
upvote_ratio = []
url = []
num_comments = []


for submission in submissions:
    text.append(submission.selftext)
    date.append(submission.created_utc)
    author.append(submission.author)
    score.append(submission.score)
    upvote_ratio.append(submission.upvote_ratio)
    title.append(submission.title)
    url.append(submission.url)
    num_comments.append(submission.num_comments)

    

In [36]:
submissions_df = pd.DataFrame(
    {
        'created_utc': date,
        'title': title,
        'text': text,
        'author': author,
        'score': score,
        'upvote_ratio': upvote_ratio,
        'num_comments': num_comments,
        'url': url
    }
)
submissions_df.head()

Unnamed: 0,created_utc,title,text,author,score,upvote_ratio,num_comments,url
0,1673829000.0,"Looking for books, videos, or other resources ...",,mobotsar,93,0.99,113,https://www.reddit.com/r/computerscience/comme...
1,1686431000.0,/r/ComputerScience will be going dark starting...,"## Update (June 16th, 2023):\n\nThis subreddit...",nuclear_splines,290,0.97,21,https://www.reddit.com/r/computerscience/comme...
2,1686512000.0,How computers measure time,Can someone explain this to me? I've been told...,RunDiscombobulated67,86,0.98,27,https://www.reddit.com/r/computerscience/comme...
3,1686514000.0,Question About Registers,Hello everyone. There is a misunderstanding I ...,mellowhorses,62,0.97,24,https://www.reddit.com/r/computerscience/comme...
4,1686507000.0,Learning a new skill,"Hey guys,\n\nWanted to ask what a good compute...",Haunting_Document142,30,0.9,38,https://www.reddit.com/r/computerscience/comme...


In [37]:
submissions_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   created_utc   100 non-null    float64
 1   title         100 non-null    object 
 2   text          100 non-null    object 
 3   author        97 non-null     object 
 4   score         100 non-null    int64  
 5   upvote_ratio  100 non-null    float64
 6   num_comments  100 non-null    int64  
 7   url           100 non-null    object 
dtypes: float64(2), int64(2), object(4)
memory usage: 6.4+ KB


In [38]:
submissions_df.to_csv('data/computerscience_hot_posts.csv')

### Extracting Data from Comments

In [42]:
body_markdown = []
body_html = []
comment_date = []
ids = []
submission_title = []
comment_author = []
comment_score = []


submissions = reddit.subreddit("computerscience").hot(limit=10)

for submission in submissions:
    for comment in submission.comments:
        body_markdown.append(comment.body)
        body_html.append(comment.body_html)
        comment_date.append(comment.created_utc)
        ids.append(comment.id)
        comment_author.append(comment.author)
        comment_score.append(comment.score)
        
        
    submission_title.append(title)

comments

In [43]:
comments_df = pd.DataFrame(
    {
        'id': ids,
        'created_utc': comment_date,
        'body_markdown': body_markdown,
        'body_html': body_html,
        'author': comment_author,
        'score': comment_score
    }
)
comments_df.head()

Unnamed: 0,id,created_utc,body_markdown,body_html,author,score
0,k3gl8a0,1696443000.0,https://chat.whatsapp.com/FE4klWAub8uJiORK3ehP...,"<div class=""md""><p><a href=""https://chat.whats...",Next_Construction888,1
1,k02uvjf,1694415000.0,"Uhh, i am looking for some free resources to s...","<div class=""md""><p>Uhh, i am looking for some ...",0xParthS,1
2,k11w28o,1694992000.0,I graduated with my bachelor's degree in compu...,"<div class=""md""><p>I graduated with my bachelo...",Frick-Fracker73,1
3,k36v1al,1696279000.0,\nHey im juniar secondery school major compute...,"<div class=""md""><p>Hey im juniar secondery sch...",Substantial_Dress223,1
4,jbo89ru,1678456000.0,Can someone recommend a good book on advanced ...,"<div class=""md""><p>Can someone recommend a goo...",haircut_giver,1


In [46]:
comments_df.to_csv('data/computerscience_hot_posts_comments.csv')

In [47]:
pd.read_csv("data/computerscience_hot_posts_comments.csv")

Unnamed: 0.1,Unnamed: 0,id,created_utc,body_markdown,body_html,author,score
0,0,k3gl8a0,1.696443e+09,https://chat.whatsapp.com/FE4klWAub8uJiORK3ehP...,"<div class=""md""><p><a href=""https://chat.whats...",Next_Construction888,1
1,1,k02uvjf,1.694415e+09,"Uhh, i am looking for some free resources to s...","<div class=""md""><p>Uhh, i am looking for some ...",0xParthS,1
2,2,k11w28o,1.694992e+09,I graduated with my bachelor's degree in compu...,"<div class=""md""><p>I graduated with my bachelo...",Frick-Fracker73,1
3,3,k36v1al,1.696279e+09,\nHey im juniar secondery school major compute...,"<div class=""md""><p>Hey im juniar secondery sch...",Substantial_Dress223,1
4,4,jbo89ru,1.678456e+09,Can someone recommend a good book on advanced ...,"<div class=""md""><p>Can someone recommend a goo...",haircut_giver,1
...,...,...,...,...,...,...,...
147,147,jnnwewp,1.686408e+09,> If you don't provide an internal input for t...,"<div class=""md""><blockquote>\n<p>If you don&#3...",finedesignvideos,2
148,148,jt2lsrj,1.690078e+09,"Imagine we claim to have a magical machine, le...","<div class=""md""><p>Imagine we claim to have a ...",Rorza0093,2
149,149,jnl4me4,1.686348e+09,[deleted],"<div class=""md""><p>[deleted]</p>\n</div>",,-7
150,150,jnnzftl,1.686410e+09,I think the confusion is stemming from the dif...,"<div class=""md""><p>I think the confusion is st...",Kroutoner,1
