# HW-Reddit API Data Pipeline




## Import Required Libraries

Import all necessary libraries for Reddit API access, CSV handling, and environment variable management.


In [None]:
!python --version

Python 3.12.12


In [None]:
# Install the third-party packages this notebook imports that are not assumed
# to be preinstalled (PRAW for the Reddit API, python-dotenv for the credentials file).
# NOTE(review): versions are unpinned, so a later run may pull different
# releases — consider pinning (e.g. praw==X.Y.Z) once known-good versions are confirmed.
%pip install praw python-dotenv



In [None]:
# -*- coding: utf-8 -*-
import praw
import csv
import os
import pandas as pd
from dotenv import load_dotenv

print("‚úÖ Libraries imported successfully!")


‚úÖ Libraries imported successfully!


## 3.1 Code Implementation

### 1. Secure API Initialization


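Create a `.env`-style file (`reddit_api_final.env`) containing the Reddit API credentials.

Upload that file to the Google Drive folder referenced by `env_file_path` in the code cell below.

Mount Google Drive so the notebook can read the credentials file from that folder.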

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the Reddit API credentials from a .env-style file stored in Google Drive.
# dotenv_values() returns the file's key/value pairs as a dict instead of
# exporting them into os.environ, so the secrets stay local to `config`.
from dotenv import dotenv_values
import os

# Define the path to your .env file in Google Drive
# IMPORTANT: Update this path to the actual location of your credentials file in your Google Drive
env_file_path = '/content/drive/MyDrive/assignmentfolder/reddit_api_final.env'

# Keys that the authentication cell reads from `config`.
required_env_keys = (
    'REDDIT_CLIENT_ID',
    'REDDIT_CLIENT_SECRET',
    'REDDIT_USERNAME',
    'REDDIT_PASSWORD',
    'REDDIT_USER_AGENT',
)

# Load environment variables from the credentials file if it exists
if os.path.exists(env_file_path):
    config = dotenv_values(env_file_path)
    print(f"‚úÖ Environment variables loaded from {env_file_path}!")

    # A file that exists but lacks a key would otherwise pass None to PRAW
    # without any hint here about which credential is missing.
    missing_env_keys = [key for key in required_env_keys if not config.get(key)]
    if missing_env_keys:
        print(f"Warning: missing or empty values in '{env_file_path}': {', '.join(missing_env_keys)}")
else:
    # Keep `config` defined so later cells fail on the credentials, not on a NameError.
    config = {}
    print(f"‚ùå Error: '{env_file_path}' not found. Environment variables not loaded.")
    print("Please ensure the 'reddit_api_final.env' file is in the specified Google Drive path.")

‚úÖ Environment variables loaded from /content/drive/MyDrive/Colab Notebooks/reddit_api_final.env!


Load Reddit API credentials from the environment file and establish connection to Reddit using PRAW.


In [None]:
# Build the PRAW client from the credentials loaded into `config`.
reddit = praw.Reddit(
    client_id=config.get('REDDIT_CLIENT_ID'),
    client_secret=config.get('REDDIT_CLIENT_SECRET'),
    username=config.get('REDDIT_USERNAME'),
    password=config.get('REDDIT_PASSWORD'),
    user_agent=config.get('REDDIT_USER_AGENT')
)

# Constructing praw.Reddit does not send a request; asking for the logged-in
# user is the first call that actually exercises the credentials. Report
# success only after it returns, so a bad credential raises before any
# "authenticated" message is printed.
current_user = reddit.user.me()

print("‚úÖ Reddit API authenticated successfully!")
print(f"Connected as: {current_user}")

It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



‚úÖ Reddit API authenticated successfully!
Connected as: dannehrd


## 2. Task 1: Fetching "Hot" Posts

Create a function to download "hot" posts from specified subreddit, with proper error handling and improvements.

https://praw.readthedocs.io/en/stable/code_overview/models/submission.html


In [None]:
def download_hot_posts(subreddit_name, limit=100, filename="hot_posts.csv"):
    """
    Download the "hot" posts of one subreddit and save them to a CSV file.

    Uses the notebook-level ``reddit`` PRAW client created in the
    authentication cell.

    Args:
        subreddit_name (str): Name of the subreddit to download from
        limit (int): Number of posts to download (default: 100)
        filename (str): Name of the output CSV file

    Returns:
        bool: True if the posts were fetched and the CSV was written,
        False if the Reddit request or the file write failed (the error
        is printed, not raised).

    Raises:
        ValueError: If any argument fails validation. Validation runs before
            the try block, so these errors reach the caller.
    """
    # Input validation
    if not subreddit_name or not isinstance(subreddit_name, str):
        raise ValueError("Subreddit name must be a non-empty string")
    # bool is a subclass of int, so True/False must be rejected explicitly.
    if isinstance(limit, bool) or not isinstance(limit, int) or limit <= 0:
        raise ValueError("Limit must be a positive integer")
    if not filename or not isinstance(filename, str):
        raise ValueError("Filename must be a non-empty string")

    header = ['Title', 'Score', 'Upvote Ratio',
              'Number of Comments', 'Author', 'Subreddit', 'URL',
              'Permalink', 'Post ID', 'Created UTC', 'Selfpost?',
              'Selftext', 'Post Flair', 'Domain', 'Search Query']

    try:
        print(f"Downloading {limit} hot posts from r/{subreddit_name} and saving to '{filename}'...\n")

        # The listing is lazy: requests happen while iterating. Collect every
        # row BEFORE opening the output file so a failed request cannot leave
        # a truncated or header-only CSV (or wipe a previous good file).
        rows = []
        for post in reddit.subreddit(subreddit_name).hot(limit=limit):
            # Handle None authors (deleted accounts)
            author_name = post.author.name if post.author else "[deleted]"

            # Cap the stored body at 1000 characters, marking the cut with "..."
            text_content = post.selftext
            if len(text_content) > 1000:
                text_content = text_content[:1000] + "..."

            rows.append([
                post.title,
                post.score,
                post.upvote_ratio,
                post.num_comments,
                author_name,
                subreddit_name,
                post.url,
                post.permalink,
                post.id,
                post.created_utc,
                post.is_self,
                text_content,
                post.link_flair_text,
                post.domain,
                None          # no search query here since sorted by hot
            ])

        # newline='' lets the csv module control line endings itself.
        with open(filename, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(header)
            writer.writerows(rows)

        post_count = len(rows)
        print(f"‚úÖ Successfully downloaded {post_count} posts!")
        print(f"üìÅ Data saved to '{filename}'")
        return True

    except praw.exceptions.PRAWException as e:
        print(f"‚ùå Reddit API error: {e}")
        return False
    except OSError as e:
        # Covers a missing directory, permission problems, path-is-a-directory, etc.
        print(f"‚ùå File error: {e}")
        return False
    except Exception as e:
        # Deliberate best-effort: anything else is reported and turned into False
        # so a loop over several subreddits keeps going.
        print(f"‚ùå Unexpected error: {e}")
        return False

print("‚úÖ Function defined successfully!")

‚úÖ Function defined successfully!



Configure the initial parameters for data collection

- We collect from multiple subreddits, so we define a list of subreddit names here; the function is then run once for each entry in the list.
- We want `limit = 50` posts per subreddit, so we set that here.
- The output filename depends on which subreddit we're on, so it is built at the time the function is called.


In [None]:
# Subreddits to collect from; the collection loop below calls the download
# function once per entry.
subreddit_list = ["space", "nasa", "Astronomy", "spaceflight", "spacex"]

limit = 50  # Number of posts to download


Run the data collection function to download posts from each subreddit.


In [None]:
# Fetch the hot listing of every configured subreddit into its own CSV file
# and report the outcome after each one.

for current_subreddit in subreddit_list:
  output_csv = f"hot_{current_subreddit}_posts_notebook.csv"
  success = download_hot_posts(current_subreddit, limit, output_csv)
  if not success:
    print("\n‚ùå Data collection failed. Please check the error messages above.")
  else:
    print("\nüéâ Data collection completed successfully!")





It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Downloading 50 hot posts from r/space and saving to 'hot_space_posts_notebook.csv'...



It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



‚úÖ Successfully downloaded 50 posts!
üìÅ Data saved to 'hot_space_posts_notebook.csv'

üéâ Data collection completed successfully!
Downloading 50 hot posts from r/nasa and saving to 'hot_nasa_posts_notebook.csv'...



It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



‚úÖ Successfully downloaded 50 posts!
üìÅ Data saved to 'hot_nasa_posts_notebook.csv'

üéâ Data collection completed successfully!
Downloading 50 hot posts from r/Astronomy and saving to 'hot_Astronomy_posts_notebook.csv'...



It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



‚úÖ Successfully downloaded 50 posts!
üìÅ Data saved to 'hot_Astronomy_posts_notebook.csv'

üéâ Data collection completed successfully!
Downloading 50 hot posts from r/spaceflight and saving to 'hot_spaceflight_posts_notebook.csv'...



It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



‚úÖ Successfully downloaded 50 posts!
üìÅ Data saved to 'hot_spaceflight_posts_notebook.csv'

üéâ Data collection completed successfully!
Downloading 50 hot posts from r/spacex and saving to 'hot_spacex_posts_notebook.csv'...

‚úÖ Successfully downloaded 50 posts!
üìÅ Data saved to 'hot_spacex_posts_notebook.csv'

üéâ Data collection completed successfully!


## 3. Task 2: Keyword-Based Search

Implement a second method that searches for posts
containing a specific keyword across one or more subreddits.

First create a function that performs like one above, but includes another parameter for search term, and instead of going through "hot" posts, extracts data from posts produced by entering search term into search bar inside the subreddit.

https://praw.readthedocs.io/en/stable/code_overview/models/subreddit.html#praw.models.Subreddit.search

In [None]:
def search_term_posts(subreddit_name, search_term, limit=100, filename="hot_posts.csv"):
    """
    Search one subreddit for a term and save the matching posts to a CSV file.

    The query is restricted to the post title and post body, so a post whose
    only match is in its comments is not returned. Results are sorted by
    relevance. Uses the notebook-level ``reddit`` PRAW client created in the
    authentication cell.

    Args:
        subreddit_name (str): Name of the subreddit to download from
        search_term (str): Term to use in the subreddit search. A term that
            contains whitespace is sent as a quoted phrase.
        limit (int): Number of posts to download (default: 100)
        filename (str): Name of the output CSV file. The default is the same
            "hot_posts.csv" used by the hot-post download, so pass an explicit
            name to avoid overwriting that file.

    Returns:
        bool: True if the posts were fetched and the CSV was written,
        False if the Reddit request or the file write failed (the error
        is printed, not raised).

    Raises:
        ValueError: If any argument fails validation. Validation runs before
            the try block, so these errors reach the caller.
    """
    # Input validation
    if not subreddit_name or not isinstance(subreddit_name, str):
        raise ValueError("Subreddit name must be a non-empty string")
    # bool is a subclass of int, so True/False must be rejected explicitly.
    if isinstance(limit, bool) or not isinstance(limit, int) or limit <= 0:
        raise ValueError("Limit must be a positive integer")
    if not filename or not isinstance(filename, str):
        raise ValueError("Filename must be a non-empty string")
    # A whitespace-only term would produce the empty query "title: OR selftext:".
    if not isinstance(search_term, str) or not search_term.strip():
        raise ValueError("Search term must be a non-empty string")

    # IMPORTANT: the field modifiers make the term match only in the post
    # title or the post body. Without quoting, a multi-word term such as
    # "falcon heavy" would become "title:falcon heavy", attaching the field
    # prefix to the first word only. Single-word terms are sent unchanged.
    # NOTE(review): assumes Reddit search accepts field:"phrase" — confirm
    # against the search syntax documentation.
    query_term = search_term.strip()
    if any(ch.isspace() for ch in query_term):
        query_term = '"' + query_term.replace('"', '') + '"'
    query = "title:" + query_term + " OR " + "selftext:" + query_term

    header = ['Title', 'Score', 'Upvote Ratio',
              'Number of Comments', 'Author', 'Subreddit', 'URL',
              'Permalink', 'Post ID', 'Created UTC', 'Selfpost?',
              'Selftext', 'Post Flair', 'Domain', 'Search Query']

    try:
        print(f"Downloading {limit} {search_term} posts from r/{subreddit_name} and saving to '{filename}'...\n")

        # The listing is lazy: requests happen while iterating. Collect every
        # row BEFORE opening the output file so a failed request cannot leave
        # a truncated or header-only CSV (or wipe a previous good file).
        rows = []
        matches = reddit.subreddit(subreddit_name).search(
            query=query, sort='relevance', limit=limit)
        for post in matches:
            # Handle None authors (deleted accounts)
            author_name = post.author.name if post.author else "[deleted]"

            # Cap the stored body at 1000 characters, marking the cut with "..."
            text_content = post.selftext
            if len(text_content) > 1000:
                text_content = text_content[:1000] + "..."

            rows.append([
                post.title,
                post.score,
                post.upvote_ratio,
                post.num_comments,
                author_name,
                subreddit_name,
                post.url,
                post.permalink,
                post.id,
                post.created_utc,
                post.is_self,
                text_content,
                post.link_flair_text,
                post.domain,
                search_term   # record the term as the caller supplied it
            ])

        # newline='' lets the csv module control line endings itself.
        with open(filename, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(header)
            writer.writerows(rows)

        post_count = len(rows)
        print(f"‚úÖ Successfully downloaded {post_count} posts!")
        print(f"üìÅ Data saved to '{filename}'")
        return True

    except praw.exceptions.PRAWException as e:
        print(f"‚ùå Reddit API error: {e}")
        return False
    except OSError as e:
        # Covers a missing directory, permission problems, path-is-a-directory, etc.
        print(f"‚ùå File error: {e}")
        return False
    except Exception as e:
        # Deliberate best-effort: anything else is reported and turned into False
        # so a loop over several subreddits keeps going.
        print(f"‚ùå Unexpected error: {e}")
        return False

print("‚úÖ Function defined successfully!")

‚úÖ Function defined successfully!


Configure the initial parameters for data collection

- We collect from multiple subreddits, so we define a list of subreddit names here; the function is then run once for each entry in the list.
- We want to search for the word "rocket", so we set that here.
- We want `limit = 50` posts per subreddit, so we set that here.
- The output filename depends on which subreddit we're on, so it is built at the time the function is called.

In [None]:
# Re-declares the same subreddit list and limit used for the hot-post run,
# and adds the keyword for the search run.
subreddit_list = ["space", "nasa", "Astronomy", "spaceflight", "spacex"]

# Keyword passed to the search function and used as the filename prefix.
search_term = "rocket"

limit = 50  # Number of posts to download

Run the data collection function to download posts from each subreddit.

In [None]:
# Run the keyword search against every configured subreddit, writing one CSV
# per subreddit and reporting the outcome after each one.

for current_subreddit in subreddit_list:
  output_csv = f"{search_term}_{current_subreddit}_posts_notebook.csv"
  success = search_term_posts(current_subreddit, search_term, limit, output_csv)
  if not success:
    print("\n‚ùå Data collection failed. Please check the error messages above.")
  else:
    print("\nüéâ Data collection completed successfully!")


It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



Downloading 50 rocket posts from r/space and saving to 'rocket_space_posts_notebook.csv'...



It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



‚úÖ Successfully downloaded 50 posts!
üìÅ Data saved to 'rocket_space_posts_notebook.csv'

üéâ Data collection completed successfully!
Downloading 50 rocket posts from r/nasa and saving to 'rocket_nasa_posts_notebook.csv'...



It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



‚úÖ Successfully downloaded 50 posts!
üìÅ Data saved to 'rocket_nasa_posts_notebook.csv'

üéâ Data collection completed successfully!
Downloading 50 rocket posts from r/Astronomy and saving to 'rocket_Astronomy_posts_notebook.csv'...



It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



‚úÖ Successfully downloaded 50 posts!
üìÅ Data saved to 'rocket_Astronomy_posts_notebook.csv'

üéâ Data collection completed successfully!
Downloading 50 rocket posts from r/spaceflight and saving to 'rocket_spaceflight_posts_notebook.csv'...



It is strongly recommended to use Async PRAW: https://asyncpraw.readthedocs.io.
See https://praw.readthedocs.io/en/latest/getting_started/multiple_instances.html#discord-bots-and-asynchronous-environments for more info.



‚úÖ Successfully downloaded 50 posts!
üìÅ Data saved to 'rocket_spaceflight_posts_notebook.csv'

üéâ Data collection completed successfully!
Downloading 50 rocket posts from r/spacex and saving to 'rocket_spacex_posts_notebook.csv'...

‚úÖ Successfully downloaded 50 posts!
üìÅ Data saved to 'rocket_spacex_posts_notebook.csv'

üéâ Data collection completed successfully!


## Task 3: Data Export to CSV

Using the csv's created above, create 10 separate dataframes


In [None]:
# Read the ten CSV files back in: five "hot" files first, then five "rocket"
# files, each group in the same subreddit order as the names on the left.
(
    hot_Astronomy_df, hot_nasa_df, hot_space_df, hot_spaceflight_df, hot_spacex_df,
    rocket_Astronomy_df, rocket_nasa_df, rocket_space_df, rocket_spaceflight_df, rocket_spacex_df,
) = (
    pd.read_csv(f"/content/{prefix}_{name}_posts_notebook.csv")
    for prefix in ("hot", "rocket")
    for name in ("Astronomy", "nasa", "space", "spaceflight", "spacex")
)

Take a look at one of the dataframes to make sure everything came out correctly.

In [None]:
hot_Astronomy_df.head()

Unnamed: 0,Title,Score,Upvote Ratio,Number of Comments,Author,Subreddit,URL,Permalink,Post ID,Created UTC,Selfpost?,Selftext,Post Flair,Domain,Search Query
0,"Call to Action (Again!): Americans, Call Your ...",39,0.85,2,SAUbjj,Astronomy,https://www.reddit.com/r/Astronomy/comments/1l...,/r/Astronomy/comments/1lxe2ze/call_to_action_a...,1lxe2ze,1752258000.0,True,**Good news for the astronomy research communi...,Astro Research,self.Astronomy,
1,Read the rules sub before posting!,860,0.98,0,VoijaRisa,Astronomy,https://www.reddit.com/r/Astronomy/comments/fq...,/r/Astronomy/comments/fq44oo/read_the_rules_su...,fq44oo,1585338000.0,True,"Hi all,\n\nFriendly mod warning here. In r/Ast...",Mod Post,self.Astronomy,
2,Tycho Crater,260,1.0,9,Slow_Contribution114,Astronomy,https://i.redd.it/vin83d9aiwyf1.jpeg,/r/Astronomy/comments/1omrh2a/tycho_crater/,1omrh2a,1762114000.0,False,Taken with a Skywatcher Skymax 127 and a Canon...,Astrophotography (OC),i.redd.it,
3,Jupiter‚Äôs Rotation for 1 Hour,205,0.99,12,Unlikely-Bee-985,Astronomy,https://v.redd.it/udvqnjv9jtyf1,/r/Astronomy/comments/1omdnn1/jupiters_rotatio...,1omdnn1,1762078000.0,False,Optics: Celestron Nexstar 6 SLT on nexstar Alt...,Astrophotography (OC),v.redd.it,
4,M27 - Dumbbell Nebula,114,0.99,2,corpsmoderne,Astronomy,https://i.redd.it/6xi6d1tv9uyf1.jpeg,/r/Astronomy/comments/1omgaki/m27_dumbbell_neb...,1omgaki,1762087000.0,False,A new processing of data taken last june\n\n48...,Astrophotography (OC),i.redd.it,


Create new dataframe called reddit_data_df with same column headings as above.

In [None]:
# Empty frame that only carries the shared column headings.
# NOTE(review): the next code cell reassigns reddit_data_df to the pd.concat
# result, so this empty frame is never used for anything but this preview.
reddit_data_df = pd.DataFrame(columns = hot_Astronomy_df.columns)

reddit_data_df.head()

Unnamed: 0,Title,Score,Upvote Ratio,Number of Comments,Author,Subreddit,URL,Permalink,Post ID,Created UTC,Selfpost?,Selftext,Post Flair,Domain,Search Query


Add rows from each of the original dataframes to new reddit_data_df

In [None]:
# Stack the ten frames row-wise: the five "hot" frames first, then the five
# "rocket" frames. ignore_index=True discards each frame's own row labels and
# numbers the combined rows consecutively from 0.
reddit_data_df = pd.concat([hot_Astronomy_df, hot_nasa_df, hot_space_df,
                           hot_spaceflight_df, hot_spacex_df, rocket_Astronomy_df,
                           rocket_nasa_df, rocket_space_df, rocket_spaceflight_df,
                           rocket_spacex_df], axis=0, ignore_index=True)

reddit_data_df.head()

Unnamed: 0,Title,Score,Upvote Ratio,Number of Comments,Author,Subreddit,URL,Permalink,Post ID,Created UTC,Selfpost?,Selftext,Post Flair,Domain,Search Query
0,"Call to Action (Again!): Americans, Call Your ...",39,0.85,2,SAUbjj,Astronomy,https://www.reddit.com/r/Astronomy/comments/1l...,/r/Astronomy/comments/1lxe2ze/call_to_action_a...,1lxe2ze,1752258000.0,True,**Good news for the astronomy research communi...,Astro Research,self.Astronomy,
1,Read the rules sub before posting!,860,0.98,0,VoijaRisa,Astronomy,https://www.reddit.com/r/Astronomy/comments/fq...,/r/Astronomy/comments/fq44oo/read_the_rules_su...,fq44oo,1585338000.0,True,"Hi all,\n\nFriendly mod warning here. In r/Ast...",Mod Post,self.Astronomy,
2,Tycho Crater,260,1.0,9,Slow_Contribution114,Astronomy,https://i.redd.it/vin83d9aiwyf1.jpeg,/r/Astronomy/comments/1omrh2a/tycho_crater/,1omrh2a,1762114000.0,False,Taken with a Skywatcher Skymax 127 and a Canon...,Astrophotography (OC),i.redd.it,
3,Jupiter‚Äôs Rotation for 1 Hour,205,0.99,12,Unlikely-Bee-985,Astronomy,https://v.redd.it/udvqnjv9jtyf1,/r/Astronomy/comments/1omdnn1/jupiters_rotatio...,1omdnn1,1762078000.0,False,Optics: Celestron Nexstar 6 SLT on nexstar Alt...,Astrophotography (OC),v.redd.it,
4,M27 - Dumbbell Nebula,114,0.99,2,corpsmoderne,Astronomy,https://i.redd.it/6xi6d1tv9uyf1.jpeg,/r/Astronomy/comments/1omgaki/m27_dumbbell_neb...,1omgaki,1762087000.0,False,A new processing of data taken last june\n\n48...,Astrophotography (OC),i.redd.it,


Check length of our final dataframe. Should be 50 rows for each original dataframe, total of 500

In [None]:
len(reddit_data_df)

500

Check for duplicate rows. First compare entire rows (every column must match); then check the Permalink and Post ID columns individually.

In [None]:
# duplicated() with no subset compares whole rows, so a row is flagged only
# when every column matches an earlier row.
dup = reddit_data_df.duplicated()

dup

Unnamed: 0,0
0,False
1,False
2,False
3,False
4,False
...,...
495,False
496,False
497,False
498,False


In [None]:
dup.unique()

array([False])

In [None]:
dup2 = reddit_data_df['Permalink'].duplicated()

dup2.value_counts()

Unnamed: 0_level_0,count
Permalink,Unnamed: 1_level_1
False,499
True,1


In [None]:
dup3 = reddit_data_df['Post ID'].duplicated()

dup3.value_counts()

Unnamed: 0_level_0,count
Post ID,Unnamed: 1_level_1
False,499
True,1


Results above suggest there is one duplicate entry under Permalink and under Post ID. We need to delete this row. When returning the index that shows True, the result should be the same for both dup2 and dup3

In [None]:
# Row labels flagged as repeats by the Permalink check and by the Post ID
# check; both lines are expected to show the same label.
print(dup2[dup2].index)
print(dup3[dup3].index)

Index([400], dtype='int64')
Index([400], dtype='int64')


In [None]:
reddit_data_df.iloc[400]

Unnamed: 0,400
Title,That time my grandfather blew up a NASA rocket.
Score,754
Upvote Ratio,1.0
Number of Comments,24
Author,walker1812
Subreddit,spaceflight
URL,https://www.reddit.com/gallery/1o8nbwk
Permalink,/r/spaceflight/comments/1o8nbwk/that_time_my_g...
Post ID,1o8nbwk
Created UTC,1760661828.0


In [None]:
reddit_data_df[reddit_data_df['Post ID'] == '1o8nbwk']

Unnamed: 0,Title,Score,Upvote Ratio,Number of Comments,Author,Subreddit,URL,Permalink,Post ID,Created UTC,Selfpost?,Selftext,Post Flair,Domain,Search Query
182,That time my grandfather blew up a NASA rocket.,756,1.0,24,walker1812,spaceflight,https://www.reddit.com/gallery/1o8nbwk,/r/spaceflight/comments/1o8nbwk/that_time_my_g...,1o8nbwk,1760662000.0,False,"It‚Äôs the evening of September 18, 1968 and the...",,reddit.com,
400,That time my grandfather blew up a NASA rocket.,754,1.0,24,walker1812,spaceflight,https://www.reddit.com/gallery/1o8nbwk,/r/spaceflight/comments/1o8nbwk/that_time_my_g...,1o8nbwk,1760662000.0,False,"It‚Äôs the evening of September 18, 1968 and the...",,reddit.com,rocket


Technically Reddit's API did not duplicate this since one was produced by the "hot" section and the other was produced by the "rocket" search. But according to instructions we need to remove any duplicate posts based on Post ID or Permalink.

In [None]:
# Keep one row per Permalink. drop_duplicates keeps the first occurrence by
# default, so the earlier row in the combined frame survives and the later
# repeat is removed. The result is a new frame; reddit_data_df is unchanged.
reddit_data_final_df = reddit_data_df.drop_duplicates(subset=['Permalink'])

len(reddit_data_final_df)

499

In [None]:
# just checking to make sure now "True" does not show up in value counts

dup4 = reddit_data_final_df['Post ID'].duplicated()

dup4.value_counts()

Unnamed: 0_level_0,count
Post ID,Unnamed: 1_level_1
False,499


In [None]:
dup5 = reddit_data_final_df['URL'].duplicated()

dup5.value_counts()

Unnamed: 0_level_0,count
URL,Unnamed: 1_level_1
False,494
True,5


There are a few duplicate URLs, but per the PRAW documentation a submission's URL is the link the post points to, so several different posts can share the same URL. These rows are distinct posts, so we are done removing duplicates.

Save the cleaned final dataframe as `reddit_data.csv`.

In [None]:
reddit_data_final_df.to_csv('reddit_data.csv', index=False)

---------------------------------------------------------------


--------------------------------------------------------

In [None]:
pip freeze > requirements.txt

--------------------------------------------------------------

----------------------------------------------------------------

In [202]:
'''
%cd /content/drive/MyDrive/

!mkdir assignmentfolder
%cd assignmentfolder

!touch reddit_code.py
!touch .env.example
!touch requirements.txt
!touch README.md
'''

# created these files manually in assignmentfolder

'\n%cd /content/drive/MyDrive/\n\n!mkdir assignmentfolder\n%cd assignmentfolder\n\n!touch reddit_code.py\n!touch .env.example\n!touch requirements.txt\n!touch README.md\n'

In [None]:
'''
%%writefile .gitignore

.env

venv/__pycache__
'''

# created this manually in folder

Writing .gitignore


In [None]:
%%writefile reddit.env

# Template only: copy this file, fill in real values, and keep the filled-in copy out of git.
# The authentication cell reads all five keys below.
REDDIT_CLIENT_ID="YOUR_CLIENT_ID_HERE"
REDDIT_CLIENT_SECRET="YOUR_CLIENT_SECRET_HERE"
REDDIT_USERNAME="YOUR_REDDIT_USERNAME_HERE"
REDDIT_PASSWORD="YOUR_REDDIT_PASSWORD_HERE"
REDDIT_USER_AGENT="YOUR_USER_AGENT_HERE"

Writing reddit.env


In [None]:
!git config --global user.name "danielwbanksjr"
!git config --global user.email "danielwbanksjr@gmail.com"

In [199]:
%cd /content/drive/MyDrive/assignmentfolder/

!git init

!git add .

!git commit -m "Initial commit from Google Colab"

/content/drive/MyDrive/assignmentfolder
Reinitialized existing Git repository in /content/drive/MyDrive/assignmentfolder/.git/
[main a99728b] Initial commit from Google Colab
 6 files changed, 3188 insertions(+)
 delete mode 100644 .env.example
 create mode 100644 HW-RedditApiDataPipeline.ipynb
 create mode 100644 reddit_api_final.env
 create mode 100644 reddit_data.csv


In [200]:
from google.colab import userdata
# Read the GitHub personal access token from Colab's user secrets instead of
# hard-coding it in the notebook.
github_token = userdata.get('GITHUB_PAT')
# GitHub account and repository that the push cell targets.
username = 'danielwbanksjr'
repo_name = 'MSBA212HW-RedditPipeline'

In [201]:
!git remote remove origin


remote_url = f"https://{username}:{github_token}@github.com/{username}/{repo_name}.git"

!git remote add origin {remote_url}
!git branch -M main
!git push -u origin main

Enumerating objects: 10, done.
Counting objects:  11% (1/9)Counting objects:  22% (2/9)Counting objects:  33% (3/9)Counting objects:  44% (4/9)Counting objects:  55% (5/9)Counting objects:  66% (6/9)Counting objects:  77% (7/9)Counting objects:  88% (8/9)Counting objects: 100% (9/9)Counting objects: 100% (9/9), done.
Delta compression using up to 2 threads
Compressing objects:  14% (1/7)Compressing objects:  28% (2/7)Compressing objects:  42% (3/7)Compressing objects:  57% (4/7)Compressing objects:  71% (5/7)Compressing objects:  85% (6/7)Compressing objects: 100% (7/7)Compressing objects: 100% (7/7), done.
Writing objects:  14% (1/7)Writing objects:  28% (2/7)Writing objects:  42% (3/7)Writing objects:  57% (4/7)Writing objects:  71% (5/7)Writing objects:  85% (6/7)Writing objects: 100% (7/7)Writing objects: 100% (7/7), 155.67 KiB | 2.47 MiB/s, done.
Total 7 (delta 0), reused 0 (delta 0), pack-reused 0
To https://github.com/danielwbanksjr/MSBA212HW-RedditPipelin

----------------------------------------------------------------------

--------------------------------------------------------------

-----------------------------------------------------------------

------------------------------------------------------------

--------------------------------------------------------------------

-------------------------------------------------------------

Ignore the 2 code blocks below. I am keeping them for my own notes.

In [None]:
# Create a method that takes the collected data, processes it, and saves it to a CSV file


'''
import pandas as pd

# Check if file exists and load data
if os.path.exists(filename):
    # Load the CSV file
    df = pd.read_csv(filename)

    print(f"üìä Dataset Overview:")
    print(f"   Total posts: {len(df)}")
    print(f"   Columns: {list(df.columns)}")
    print(f"   File size: {os.path.getsize(filename)} bytes")

    print(f"\nüìù Sample Posts:")
    print("=" * 50)

    # Display first 3 posts
    for i, row in df.head(3).iterrows():
        print(f"\nPost {i+1}:")
        print(f"Title: {row['Title'][:80]}...")
        print(f"Author: {row['Author']}")
        print(f"Score: {row['Score']}")
        print(f"URL: {row['URL']}")
        print("-" * 30)

    # Basic statistics
    print(f"\nüìà Basic Statistics:")
    print(f"   Average score: {df['Score'].mean():.2f}")
    print(f"   Highest score: {df['Score'].max()}")
    print(f"   Lowest score: {df['Score'].min()}")

else:
    print(f"‚ùå File '{filename}' not found!")
'''

'\nimport pandas as pd\n\n# Check if file exists and load data\nif os.path.exists(filename):\n    # Load the CSV file\n    df = pd.read_csv(filename)\n\n    print(f"üìä Dataset Overview:")\n    print(f"   Total posts: {len(df)}")\n    print(f"   Columns: {list(df.columns)}")\n    print(f"   File size: {os.path.getsize(filename)} bytes")\n\n    print(f"\nüìù Sample Posts:")\n    print("=" * 50)\n\n    # Display first 3 posts\n    for i, row in df.head(3).iterrows():\n        print(f"\nPost {i+1}:")\n        print(f"Title: {row[\'Title\'][:80]}...")\n        print(f"Author: {row[\'Author\']}")\n        print(f"Score: {row[\'Score\']}")\n        print(f"URL: {row[\'URL\']}")\n        print("-" * 30)\n\n    # Basic statistics\n    print(f"\nüìà Basic Statistics:")\n    print(f"   Average score: {df[\'Score\'].mean():.2f}")\n    print(f"   Highest score: {df[\'Score\'].max()}")\n    print(f"   Lowest score: {df[\'Score\'].min()}")\n\nelse:\n    print(f"‚ùå File \'{filename}\' not found!"

In [None]:
## 8. Optional: Data Analysis

# Perform some basic analysis on the collected cricket posts data.

# Additional analysis (optional)
'''
if 'df' in locals() and not df.empty:
    print("üîç Additional Analysis:")
    print("=" * 40)

    # Top scoring posts
    top_posts = df.nlargest(5, 'Score')[['Title', 'Score', 'Author']]
    print("\nüèÜ Top 5 Posts by Score:")
    for i, (_, row) in enumerate(top_posts.iterrows(), 1):
        print(f"{i}. {row['Title'][:60]}... (Score: {row['Score']}, Author: {row['Author']})")

    # Most active authors
    author_counts = df['Author'].value_counts().head(5)
    print("\nüë• Most Active Authors:")
    for author, count in author_counts.items():
        print(f"   {author}: {count} posts")

    # Posts with text content
    posts_with_text = df[df['Text'].str.len() > 0]
    print(f"\nüìù Posts with text content: {len(posts_with_text)} out of {len(df)}")

else:
    print("‚ùå No data available for analysis")
'''


'\nif \'df\' in locals() and not df.empty:\n    print("üîç Additional Analysis:")\n    print("=" * 40)\n\n    # Top scoring posts\n    top_posts = df.nlargest(5, \'Score\')[[\'Title\', \'Score\', \'Author\']]\n    print("\nüèÜ Top 5 Posts by Score:")\n    for i, (_, row) in enumerate(top_posts.iterrows(), 1):\n        print(f"{i}. {row[\'Title\'][:60]}... (Score: {row[\'Score\']}, Author: {row[\'Author\']})")\n\n    # Most active authors\n    author_counts = df[\'Author\'].value_counts().head(5)\n    print("\nüë• Most Active Authors:")\n    for author, count in author_counts.items():\n        print(f"   {author}: {count} posts")\n\n    # Posts with text content\n    posts_with_text = df[df[\'Text\'].str.len() > 0]\n    print(f"\nüìù Posts with text content: {len(posts_with_text)} out of {len(df)}")\n\nelse:\n    print("‚ùå No data available for analysis")\n'

## Summary

This notebook successfully:

1. ✅ **Authenticated** with the Reddit API using PRAW
2. ✅ **Downloaded** "hot" posts and "rocket" keyword-search results from five space-related subreddits (r/space, r/nasa, r/Astronomy, r/spaceflight, r/spacex)
3. ✅ **Saved** each result set to CSV format with UTF-8 encoding
4. ✅ **Validated** input parameters and handled errors
5. ✅ **Combined** the ten CSV files, removed the duplicate post, and exported the result as `reddit_data.csv`

The data is now ready for further analysis or processing! 🎉
