# r/NTU Subreddit Scraper

In [2]:
import pandas as pd
import praw
import os
from datetime import datetime, timezone
from dotenv import load_dotenv

In [3]:
load_dotenv()

True

In [4]:
reddit_read_only = praw.Reddit(client_id = os.getenv('REDDIT_CLIENT_ID'),
                               client_secret = os.getenv('REDDIT_CLIENT_SECRET'),
                               user_agent = os.getenv('REDDIT_USER_AGENT'))

In [5]:
subreddit = reddit_read_only.subreddit("NTU")
 
# Display the name of the Subreddit
print("Display Name:", subreddit.display_name)
 
# Display the title of the Subreddit
print("Title:", subreddit.title)
 
# Display the description of the Subreddit
print("Description:", subreddit.description)

Display Name: NTU
Title: Nanyang Technological University Singapore
Description: **Nanyang Technological University, Singapore** 

The official subreddit for **NTU**

* Student? 
* Professor?
* Procrastinating PhD?
* RA?
* Camp Sec?
* Cleaning Auntie? 

Everyone is welcome here!
___________________________________

**Related Subreddits:**

[/r/Singapore](http://www.reddit.com/r/singapore/)
[/r/NUS](http://www.reddit.com/r/NUS)



## June DF

In [18]:
target_year = 2024
target_month = 6

posts = subreddit.top(time_filter="all", limit=1000)
data = []

for post in posts:
    post_timestamp = post.created_utc
    post_datetime = datetime.fromtimestamp(post_timestamp, tz=timezone.utc)
    if post_datetime.year == target_year and post_datetime.month == target_month:
        data.append({
            'Type': 'Post',
            'Post_id': post.id,
            'Title': post.title,
            'Author': post.author.name if post.author else 'Unknown',
            'Timestamp': post_datetime,
            'Text': post.selftext,
            'Score': post.score,
            'Total_comments': post.num_comments,
            'Post_URL': post.url
        })

        if post.num_comments > 0:
            post.comments.replace_more(limit=None)
            for comment in post.comments.list():
                data.append({
                    'Type': 'Comment',
                    'Post_id': post.id,
                    'Title': post.title,
                    'Author': comment.author.name if comment.author else 'Unknown',
                    'Timestamp': pd.to_datetime(comment.created_utc, unit='s'),
                    'Text': comment.body,
                    'Score': comment.score,
                    'Total_comments': 0, #Comments don't have this attribute
                    'Post_URL': None  #Comments don't have this attribute
                })


jun_df = pd.DataFrame(data)

In [19]:
jun_df.info()
jun_df.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 836 entries, 0 to 835
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Type            836 non-null    object
 1   Post_id         836 non-null    object
 2   Title           836 non-null    object
 3   Author          836 non-null    object
 4   Timestamp       836 non-null    object
 5   Text            836 non-null    object
 6   Score           836 non-null    int64 
 7   Total_comments  836 non-null    int64 
 8   Post_URL        28 non-null     object
dtypes: int64(2), object(7)
memory usage: 58.9+ KB


Unnamed: 0,Type,Post_id,Title,Author,Timestamp,Text,Score,Total_comments,Post_URL
0,Post,1de0fe8,An NTU student’s letter to Singapore’s Ministr...,EverySink,2024-06-12 07:00:14+00:00,,331,185,https://i.redd.it/jvmr0tk8a36d1.jpeg
1,Comment,1de0fe8,An NTU student’s letter to Singapore’s Ministr...,shuixian515,2024-06-12 08:12:48,"How is that Home affairs problem, shouldn't th...",118,0,
2,Comment,1de0fe8,An NTU student’s letter to Singapore’s Ministr...,vxrnnn_,2024-06-12 10:20:32,whats ntu gonna do,52,0,
3,Comment,1de0fe8,An NTU student’s letter to Singapore’s Ministr...,Smooth_Barnacle_4093,2024-06-12 07:30:34,![gif](giphy|VIPfTy8y1Lc5iREYDS|downsized)\n\n...,156,0,
4,Comment,1de0fe8,An NTU student’s letter to Singapore’s Ministr...,Fearless_Day528,2024-06-12 10:10:24,"Just curious, how is SG complicit?",36,0,
5,Comment,1de0fe8,An NTU student’s letter to Singapore’s Ministr...,sriracha_cucaracha,2024-06-12 07:12:54,When you can't get an internship during this v...,139,0,
6,Comment,1de0fe8,An NTU student’s letter to Singapore’s Ministr...,TotallyAuric,2024-06-12 12:10:08,I mean I love the sentiment but honestly what ...,47,0,
7,Comment,1de0fe8,An NTU student’s letter to Singapore’s Ministr...,Forsaken-Profit9312,2024-06-12 11:15:44,I’m pretty sure NTU is just waiting for E to g...,29,0,
8,Comment,1de0fe8,An NTU student’s letter to Singapore’s Ministr...,CaptainBroady,2024-06-12 08:20:48,"""fellows students"" \n\n""growing knowledge for ...",39,0,
9,Comment,1de0fe8,An NTU student’s letter to Singapore’s Ministr...,Unknown,2024-06-12 13:47:04,There is no university in Sudan too. Bunch of ...,17,0,


## July DF

In [20]:
target_year = 2024
target_month = 7

posts = subreddit.top(time_filter="all", limit=1000)
data = []

for post in posts:
    post_timestamp = post.created_utc
    post_datetime = datetime.fromtimestamp(post_timestamp, tz=timezone.utc)
    if post_datetime.year == target_year and post_datetime.month == target_month:
        data.append({
            'Type': 'Post',
            'Post_id': post.id,
            'Title': post.title,
            'Author': post.author.name if post.author else 'Unknown',
            'Timestamp': post_datetime,
            'Text': post.selftext,
            'Score': post.score,
            'Total_comments': post.num_comments,
            'Post_URL': post.url
        })

        if post.num_comments > 0:
            post.comments.replace_more(limit=None)
            for comment in post.comments.list():
                data.append({
                    'Type': 'Comment',
                    'Post_id': post.id,
                    'Title': post.title,
                    'Author': comment.author.name if comment.author else 'Unknown',
                    'Timestamp': pd.to_datetime(comment.created_utc, unit='s'),
                    'Text': comment.body,
                    'Score': comment.score,
                    'Total_comments': 0, #Comments don't have this attribute
                    'Post_URL': None  #Comments don't have this attribute
                })


jul_df = pd.DataFrame(data)

In [21]:
jul_df.info()
jul_df.head(10)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 277 entries, 0 to 276
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Type            277 non-null    object
 1   Post_id         277 non-null    object
 2   Title           277 non-null    object
 3   Author          277 non-null    object
 4   Timestamp       277 non-null    object
 5   Text            277 non-null    object
 6   Score           277 non-null    int64 
 7   Total_comments  277 non-null    int64 
 8   Post_URL        11 non-null     object
dtypes: int64(2), object(7)
memory usage: 19.6+ KB


Unnamed: 0,Type,Post_id,Title,Author,Timestamp,Text,Score,Total_comments,Post_URL
0,Post,1dvuzoj,BFFR,suspenz,2024-07-05 10:40:39+00:00,I cannot believe people like this get accepted...,364,35,https://i.redd.it/sumhycyiioad1.jpeg
1,Comment,1dvuzoj,BFFR,Relative-Parfait-385,2024-07-05 10:50:42,Bru speed run icebreaker,211,0,
2,Comment,1dvuzoj,BFFR,thesgtrends,2024-07-05 12:35:29,https://preview.redd.it/19x0mln03pad1.jpeg?wid...,124,0,
3,Comment,1dvuzoj,BFFR,Competitive_Pair7874,2024-07-05 11:16:03,I left the grp. It has no value add,92,0,
4,Comment,1dvuzoj,BFFR,Sharp_Appearance7212,2024-07-05 11:05:01,school haven’t start yet wtf,76,0,
5,Comment,1dvuzoj,BFFR,vajraadhvan,2024-07-05 11:33:44,Garry Tan biting the curb 4k,77,0,
6,Comment,1dvuzoj,BFFR,ramenrami22,2024-07-05 11:34:17,Rookie mistake. That one go ntu chatbot ask. T...,65,0,
7,Comment,1dvuzoj,BFFR,TOFU-area,2024-07-05 11:46:06,iq != eq,56,0,
8,Comment,1dvuzoj,BFFR,stateofbrave,2024-07-06 01:40:47,How is someone 20+ years old and still a dumb ...,34,0,
9,Comment,1dvuzoj,BFFR,Heblehblehbleh,2024-07-05 12:03:09,NTU does not accept people based on character.,53,0,


In [23]:
combined_df = pd.concat([jun_df, jul_df])
combined_df.info()
combined_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 1113 entries, 0 to 276
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Type            1113 non-null   object
 1   Post_id         1113 non-null   object
 2   Title           1113 non-null   object
 3   Author          1113 non-null   object
 4   Timestamp       1113 non-null   object
 5   Text            1113 non-null   object
 6   Score           1113 non-null   int64 
 7   Total_comments  1113 non-null   int64 
 8   Post_URL        39 non-null     object
dtypes: int64(2), object(7)
memory usage: 87.0+ KB


Unnamed: 0,Type,Post_id,Title,Author,Timestamp,Text,Score,Total_comments,Post_URL
0,Post,1de0fe8,An NTU student’s letter to Singapore’s Ministr...,EverySink,2024-06-12 07:00:14+00:00,,331,185,https://i.redd.it/jvmr0tk8a36d1.jpeg
1,Comment,1de0fe8,An NTU student’s letter to Singapore’s Ministr...,shuixian515,2024-06-12 08:12:48,"How is that Home affairs problem, shouldn't th...",118,0,
2,Comment,1de0fe8,An NTU student’s letter to Singapore’s Ministr...,vxrnnn_,2024-06-12 10:20:32,whats ntu gonna do,52,0,
3,Comment,1de0fe8,An NTU student’s letter to Singapore’s Ministr...,Smooth_Barnacle_4093,2024-06-12 07:30:34,![gif](giphy|VIPfTy8y1Lc5iREYDS|downsized)\n\n...,156,0,
4,Comment,1de0fe8,An NTU student’s letter to Singapore’s Ministr...,Fearless_Day528,2024-06-12 10:10:24,"Just curious, how is SG complicit?",36,0,
