# r/NTU Subreddit Scraper
Building the original dataset 

In [1]:
import pandas as pd
import praw
import os
from datetime import datetime, timezone
from dotenv import load_dotenv

### Loading environment variables

In [2]:
load_dotenv()
reddit = praw.Reddit(client_id = os.getenv('REDDIT_CLIENT_ID'),
                               client_secret = os.getenv('REDDIT_CLIENT_SECRET'),
                               user_agent = os.getenv('REDDIT_USER_AGENT'))

### Accessing subreddit

In [3]:
subreddit = reddit.subreddit("NTU")
 
# Display the name of the Subreddit
print("Display Name:", subreddit.display_name)
 
# Display the title of the Subreddit
print("Title:", subreddit.title)
 
# Display the description of the Subreddit
print("Description:", subreddit.description)

Display Name: NTU
Title: Nanyang Technological University Singapore
Description: **Nanyang Technological University, Singapore** 

The official subreddit for **NTU**

* Student? 
* Professor?
* Procrastinating PhD?
* RA?
* Camp Sec?
* Cleaning Auntie? 

Everyone is welcome here!
___________________________________

**Related Subreddits:**

[/r/Singapore](http://www.reddit.com/r/singapore/)
[/r/NUS](http://www.reddit.com/r/NUS)



### Scraping posts by year and month

In [4]:
def scrape_reddit_posts(subreddit, start_date, end_date):
    posts = subreddit.top(time_filter="all", limit=1000)
    data = []

    for post in posts:
        post_datetime = datetime.fromtimestamp(post.created_utc, tz=timezone.utc)
        if start_date <= post_datetime <= end_date:
            data.append({
                'Type': 'Post',
                'Post_id': post.id,
                'Title': post.title,
                'Author': post.author.name if post.author else 'Unknown',
                'Timestamp': post_datetime,
                'Text': post.selftext,
                'Score': post.score,
                'Total_comments': post.num_comments,
                'Post_URL': post.url
            })

            if post.num_comments > 0:
                post.comments.replace_more(limit=None)
                for comment in post.comments.list():
                    data.append({
                        'Type': 'Comment',
                        'Post_id': post.id,
                        'Title': post.title,
                        'Author': comment.author.name if comment.author else 'Unknown',
                        'Timestamp': pd.to_datetime(comment.created_utc, unit='s'),
                        'Text': comment.body,
                        'Score': comment.score,
                        'Total_comments': 0, #Comments don't have this attribute
                        'Post_URL': None  #Comments don't have this attribute
                    })
    return pd.DataFrame(data)
    

### Building combined dataframe

In [6]:
months = [
    ('2023-09-01', '2023-09-30'),
    ('2023-10-01', '2023-10-31'),
    ('2023-11-01', '2023-11-30'),
    ('2023-12-01', '2023-12-31'),
    ('2024-01-01', '2024-01-31'),
    ('2024-02-01', '2024-02-28'),
    ('2024-03-01', '2024-03-31'),
    ('2024-04-01', '2024-04-30'),
    ('2024-05-01', '2024-05-31'),
    ('2024-06-01', '2024-06-30'),
    ('2024-07-01', '2024-07-31'),
    ('2024-08-01', '2024-08-31')
]

# Scrape data for each month and combine into a single dataframe
dataframes = []
for start_str, end_str in months:
    start_date = datetime.strptime(start_str, '%Y-%m-%d').replace(tzinfo=timezone.utc)
    # Adjust the end date to the last second of the last day of the month
    end_date = datetime.strptime(end_str, '%Y-%m-%d').replace(tzinfo=timezone.utc).replace(hour=23, minute=59, second=59)
    
    df = scrape_reddit_posts(subreddit, start_date, end_date)
    dataframes.append(df)

# Concatenate all dataframes
yearly_data = pd.concat(dataframes)

In [7]:
yearly_data['Timestamp'] = pd.to_datetime(yearly_data['Timestamp'], utc=True)

# Create new columns for year and month
yearly_data['Year'] = yearly_data['Timestamp'].dt.year
yearly_data['Month'] = yearly_data['Timestamp'].dt.month

# Group by Year and Month and count the number of posts in each month
monthly_counts = yearly_data.groupby(['Year', 'Month']).size().reset_index(name='Post_Count')

# Display the result
print(monthly_counts)

    Year  Month  Post_Count
0   2023      9        1811
1   2023     10        1391
2   2023     11        1774
3   2023     12         382
4   2024      1        1022
5   2024      2        1403
6   2024      3        1522
7   2024      4        1073
8   2024      5         662
9   2024      6         832
10  2024      7         298
11  2024      8        1568
12  2024      9          18


In [9]:
yearly_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 13756 entries, 0 to 1571
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype              
---  ------          --------------  -----              
 0   Type            13756 non-null  object             
 1   Post_id         13756 non-null  object             
 2   Title           13756 non-null  object             
 3   Author          13756 non-null  object             
 4   Timestamp       13756 non-null  datetime64[ns, UTC]
 5   Text            13756 non-null  object             
 6   Score           13756 non-null  int64              
 7   Total_comments  13756 non-null  int64              
 8   Post_URL        470 non-null    object             
 9   Year            13756 non-null  int32              
 10  Month           13756 non-null  int32              
dtypes: datetime64[ns, UTC](1), int32(2), int64(2), object(6)
memory usage: 1.2+ MB


In [11]:
yearly_data.to_csv('data/reddit_data.csv')