# r/NTU Subreddit Scraper

In [1]:
import pandas as pd
import praw
import os
from datetime import datetime, timezone
from dotenv import load_dotenv

### Loading environment variables

In [2]:
# Pull variables from a local .env file into the process environment.
load_dotenv()

# Reddit API credentials are read from the environment rather than hard-coded.
reddit_credentials = {
    'client_id': os.getenv('REDDIT_CLIENT_ID'),
    'client_secret': os.getenv('REDDIT_CLIENT_SECRET'),
    'user_agent': os.getenv('REDDIT_USER_AGENT'),
}
reddit = praw.Reddit(**reddit_credentials)

### Accessing subreddit

In [3]:
subreddit = reddit.subreddit("NTU")

# Show the subreddit's display name, title and description, one per line.
# Attributes are looked up inside the loop so each value is read just before
# it is printed.
for label, attribute in (
    ("Display Name:", "display_name"),
    ("Title:", "title"),
    ("Description:", "description"),
):
    print(label, getattr(subreddit, attribute))

Display Name: NTU
Title: Nanyang Technological University Singapore
Description: **Nanyang Technological University, Singapore** 

The official subreddit for **NTU**

* Student? 
* Professor?
* Procrastinating PhD?
* RA?
* Camp Sec?
* Cleaning Auntie? 

Everyone is welcome here!
___________________________________

**Related Subreddits:**

[/r/Singapore](http://www.reddit.com/r/singapore/)
[/r/NUS](http://www.reddit.com/r/NUS)



### Scraping posts by year and month

In [4]:
def scrape_reddit_posts(subreddit, start_date, end_date, limit=1000, time_filter="all"):
    """Collect top posts created inside a date window, plus all of their comments.

    Only the *post* creation time is compared against the window; every comment
    of a matching post is kept regardless of when the comment was written.

    Args:
        subreddit: Subreddit handle exposing ``top(time_filter=..., limit=...)``.
        start_date: Timezone-aware lower bound (inclusive) for post creation.
        end_date: Timezone-aware upper bound (inclusive) for post creation.
        limit: Maximum number of posts requested from the ``top`` listing.
        time_filter: Time filter forwarded to ``subreddit.top``.

    Returns:
        pandas.DataFrame with one row per matching post, each followed by one
        row per comment of that post. Columns: Type, Post_id, Title, Author,
        Timestamp, Text, Score, Total_comments, Post_URL. When nothing matches,
        an empty DataFrame without columns is returned.
    """

    def _utc(epoch_seconds):
        # Posts and comments share one conversion so the Timestamp column is
        # uniformly timezone-aware (UTC) instead of mixing aware and naive values.
        return datetime.fromtimestamp(epoch_seconds, tz=timezone.utc)

    def _author_name(item):
        # The author is falsy when unavailable (e.g. a deleted account).
        return item.author.name if item.author else 'Unknown'

    data = []
    for post in subreddit.top(time_filter=time_filter, limit=limit):
        post_datetime = _utc(post.created_utc)
        if not (start_date <= post_datetime <= end_date):
            continue

        data.append({
            'Type': 'Post',
            'Post_id': post.id,
            'Title': post.title,
            'Author': _author_name(post),
            'Timestamp': post_datetime,
            'Text': post.selftext,
            'Score': post.score,
            'Total_comments': post.num_comments,
            'Post_URL': post.url
        })

        if post.num_comments > 0:
            # Expand every "load more comments" placeholder before flattening.
            post.comments.replace_more(limit=None)
            for comment in post.comments.list():
                data.append({
                    'Type': 'Comment',
                    'Post_id': post.id,
                    'Title': post.title,
                    'Author': _author_name(comment),
                    'Timestamp': _utc(comment.created_utc),
                    'Text': comment.body,
                    'Score': comment.score,
                    'Total_comments': 0,  # Comments don't have this attribute
                    'Post_URL': None  # Comments don't have this attribute
                })
    return pd.DataFrame(data)
    

### Building combined dataframe

In [6]:
# Inclusive (first day, last day) of every month to scrape, as ISO dates.
months = [
    ('2023-09-01', '2023-09-30'),
    ('2023-10-01', '2023-10-31'),
    ('2023-11-01', '2023-11-30'),
    ('2023-12-01', '2023-12-31'),
    ('2024-01-01', '2024-01-31'),
    ('2024-02-01', '2024-02-29'),  # 2024 is a leap year; ending on the 28th dropped 29 Feb
    ('2024-03-01', '2024-03-31'),
    ('2024-04-01', '2024-04-30'),
    ('2024-05-01', '2024-05-31'),
    ('2024-06-01', '2024-06-30'),
    ('2024-07-01', '2024-07-31'),
    ('2024-08-01', '2024-08-31')
]

# Scrape data for each month and combine into a single dataframe
dataframes = []
for start_str, end_str in months:
    start_date = datetime.strptime(start_str, '%Y-%m-%d').replace(tzinfo=timezone.utc)
    # Stretch the end bound to the very last microsecond of the day so posts
    # created during the final second of the month are still inside the window.
    end_date = datetime.strptime(end_str, '%Y-%m-%d').replace(
        hour=23, minute=59, second=59, microsecond=999999, tzinfo=timezone.utc
    )

    df = scrape_reddit_posts(subreddit, start_date, end_date)
    dataframes.append(df)

# Concatenate all dataframes; ignore_index gives every row a unique label
# instead of repeating each monthly frame's 0..n index.
yearly_data = pd.concat(dataframes, ignore_index=True)

In [8]:
# Persist the combined scrape. Create the output folder first so the write
# does not fail when 'data/' does not exist yet.
os.makedirs('data', exist_ok=True)
yearly_data.to_csv('data/reddit_data.csv')

In [7]:
# Normalise every timestamp to timezone-aware UTC, then derive the calendar
# year and month of each row from it.
timestamps_utc = pd.to_datetime(yearly_data['Timestamp'], utc=True)
yearly_data['Timestamp'] = timestamps_utc
yearly_data['Year'] = timestamps_utc.dt.year
yearly_data['Month'] = timestamps_utc.dt.month

# Count rows per (Year, Month). Each row is a post or a comment, so
# 'Post_Count' is the combined number of posts and comments in that month.
monthly_counts = (
    yearly_data
    .groupby(['Year', 'Month'])
    .size()
    .reset_index(name='Post_Count')
)

# Display the result
print(monthly_counts)

    Year  Month  Post_Count
0   2023      9        1811
1   2023     10        1391
2   2023     11        1774
3   2023     12         382
4   2024      1        1022
5   2024      2        1403
6   2024      3        1522
7   2024      4        1073
8   2024      5         662
9   2024      6         832
10  2024      7         298
11  2024      8        1568
12  2024      9          18


In [None]:
# NOTE(review): this cell has no execution count and calls `scrape_month_data`,
# which is not defined in any cell visible here — confirm it exists before running.
months_2024 = [1, 2, 3, 4, 5, 6, 7, 8]
dataframes = []  # rebinds the `dataframes` name used by the earlier scraping cell
for month in months_2024:
    month_df = scrape_month_data(subreddit, 2024, month)
    dataframes.append(month_df)

# NOTE(review): the result is named df_2023 although every call above passes 2024.
df_2023 = pd.concat(dataframes, ignore_index=True)

In [7]:
# Summarise column dtypes and non-null counts.
# NOTE(review): `all_data` is not assigned in any cell visible here — confirm where it is loaded.
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2666 entries, 0 to 2665
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Type            2666 non-null   object
 1   Post_id         2666 non-null   object
 2   Title           2666 non-null   object
 3   Author          2666 non-null   object
 4   Timestamp       2666 non-null   object
 5   Text            2666 non-null   object
 6   Score           2666 non-null   int64 
 7   Total_comments  2666 non-null   int64 
 8   Post_URL        95 non-null     object
dtypes: int64(2), object(7)
memory usage: 187.6+ KB


In [8]:
# Preview the first ten rows.
# NOTE(review): `all_data` is not assigned in any cell visible here — confirm where it is loaded.
all_data.head(10)

Unnamed: 0,Type,Post_id,Title,Author,Timestamp,Text,Score,Total_comments,Post_URL
0,Post,1de0fe8,An NTU student’s letter to Singapore’s Ministr...,EverySink,2024-06-12 07:00:14+00:00,,325,184,https://i.redd.it/jvmr0tk8a36d1.jpeg
1,Comment,1de0fe8,An NTU student’s letter to Singapore’s Ministr...,shuixian515,2024-06-12 08:12:48,"How is that Home affairs problem, shouldn't th...",118,0,
2,Comment,1de0fe8,An NTU student’s letter to Singapore’s Ministr...,vxrnnn_,2024-06-12 10:20:32,whats ntu gonna do,52,0,
3,Comment,1de0fe8,An NTU student’s letter to Singapore’s Ministr...,Smooth_Barnacle_4093,2024-06-12 07:30:34,![gif](giphy|VIPfTy8y1Lc5iREYDS|downsized)\n\n...,156,0,
4,Comment,1de0fe8,An NTU student’s letter to Singapore’s Ministr...,Fearless_Day528,2024-06-12 10:10:24,"Just curious, how is SG complicit?",35,0,
5,Comment,1de0fe8,An NTU student’s letter to Singapore’s Ministr...,sriracha_cucaracha,2024-06-12 07:12:54,When you can't get an internship during this v...,136,0,
6,Comment,1de0fe8,An NTU student’s letter to Singapore’s Ministr...,TotallyAuric,2024-06-12 12:10:08,I mean I love the sentiment but honestly what ...,43,0,
7,Comment,1de0fe8,An NTU student’s letter to Singapore’s Ministr...,Forsaken-Profit9312,2024-06-12 11:15:44,I’m pretty sure NTU is just waiting for E to g...,28,0,
8,Comment,1de0fe8,An NTU student’s letter to Singapore’s Ministr...,CaptainBroady,2024-06-12 08:20:48,"""fellows students"" \n\n""growing knowledge for ...",35,0,
9,Comment,1de0fe8,An NTU student’s letter to Singapore’s Ministr...,Unknown,2024-06-12 13:47:04,There is no university in Sudan too. Bunch of ...,18,0,


### Combine post and comment text by post title

In [17]:
# Join the text of every row (post body and comments) sharing a title into one
# space-separated string. Rows are keyed by 'Title', so distinct posts with an
# identical title end up in the same group.
# Missing text (e.g. NaN for an empty post body after a CSV round trip) is
# turned into an empty string first, because ' '.join raises TypeError on
# non-string values.
grouped_text = (
    all_data
    .assign(Text=all_data['Text'].fillna('').astype(str))
    .groupby('Title')['Text']
    .agg(' '.join)
    .reset_index()
)
grouped_text.head()

Unnamed: 0,Title,Text
0,1 Red bus running at 6pm,Been waiting for 20min+ and still no bus 🫠 6pm...
1,AMA: Nanyang Business School (NBS),"Hi incoming freshmen, congratulations to those..."
2,Academic Termination,I just got my results. My previous 2 sem was d...
3,Advice to freshers,This post is for freshmen joining this year! C...
4,Aight I’m bored let’s stir the pot,Ah yes\n\nPrimarily an engineer problem Thoug...


In [20]:
# Write the per-title text to disk without the row index. Create the output
# folder first so the write does not fail when 'data/' does not exist yet.
os.makedirs('data', exist_ok=True)
grouped_text.to_csv('data/subreddit_comments.csv', index=False)