## Reddit Scraper Overview
A walkthrough of the different features of the `reddit_scraper` package.

You can read more about the process [here](https://towardsdatascience.com/predicting-reddit-flairs-using-machine-learning-and-deploying-the-model-on-heroku-part-1-574b69098d9a).

In [None]:
# Install praw, the Reddit client library imported as `praw` later in this notebook
!pip install praw

In [None]:
# Install the reddit_scraper package (imported as `rs` in the imports cell)
!pip install reddit_scraper

In [None]:
# Import the library
import reddit_scraper as rs
import praw
import pandas as pd

### Authenticating client

In [None]:
# Credentials generated from the reddit developers applications page.
# They are read from environment variables so that real secrets never have to
# be saved inside the notebook. Each value falls back to '' (an empty
# placeholder) when its variable is unset; you can also replace the ''
# fallback with your own info.
import os

my_client_id = os.environ.get('REDDIT_CLIENT_ID', '')
my_client_secret = os.environ.get('REDDIT_CLIENT_SECRET', '')
user = os.environ.get('REDDIT_USER_AGENT', '')  # passed to praw as user_agent

In [None]:
# Build the praw client from the credentials set in the credentials cell
reddit = praw.Reddit(
    client_id=my_client_id,
    client_secret=my_client_secret,
    user_agent=user,
)


In [None]:
# Subreddits to pull submissions from
subreddits = ['wearables', 'SmartWearables', 'Coros', 'Fittrack', 'WearableFitness', 'WearableDisplays']

# Limit passed to each subreddit listing request
num_of_posts = 100000

# One dict per submission, accumulated across every subreddit above
all_posts = []

for name in subreddits:
    # `.new` is the listing used here; 'hot', 'top', 'rising', etc. are the alternatives
    listing = reddit.subreddit(name).new(limit=num_of_posts)
    all_posts.extend(
        {'Subreddit': name, 'Title': entry.title, 'Text': entry.selftext, 'URL': entry.url}
        for entry in listing
    )

# Tabulate the rows with an explicit column order
column_order = ['Subreddit', 'Title', 'Text', 'URL']
df = pd.DataFrame(all_posts, columns=column_order)

# Show the result
print(df)

In [None]:

from datetime import datetime
import time

In [None]:
import praw
import pandas as pd
from datetime import datetime

# Assuming you have already initialized your Reddit instance as 'reddit'
# reddit = praw.Reddit(client_id='YOUR_CLIENT_ID', client_secret='YOUR_CLIENT_SECRET', user_agent='YOUR_USER_AGENT')

# Local import: this cell's import lines only bring in `datetime`, and the
# timezone-aware conversion below also needs `timezone`.
from datetime import timezone

# List of subreddits you want to scrape
subreddits = ['wearables', 'SmartWearables', 'Coros', 'Fittrack', 'WearableFitness', 'WearableDisplays']

# Limit passed to each subreddit listing request
num_of_posts = 100000

# Collecting posts from multiple subreddits (one dict per post)
all_posts = []

for subreddit_name in subreddits:
    subreddit = reddit.subreddit(subreddit_name)
    # Newest-first listing; you can use 'hot', 'top', 'rising', etc. instead of 'new'
    posts = subreddit.new(limit=num_of_posts)

    for post in posts:
        # created_utc is treated as a Unix timestamp. Convert it with an explicit
        # UTC tzinfo: datetime.utcfromtimestamp() is deprecated since Python 3.12.
        # The formatted string is the same as before.
        post_date = datetime.fromtimestamp(post.created_utc, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
        all_posts.append({'Subreddit': subreddit_name, 'Title': post.title, 'Text': post.selftext, 'URL': post.url, 'Posted Date': post_date, 'Comments': post.num_comments})

# Creating a Pandas DataFrame with specific column names
df = pd.DataFrame(all_posts, columns=['Subreddit', 'Title', 'Text', 'URL', 'Posted Date', 'Comments'])

# Printing the DataFrame
print(df)


In [None]:
# Keep only the rows of df whose 'Text' column contains "wearable"
# (case-insensitive; rows with missing text are treated as non-matches)
mentions_wearable = df['Text'].str.contains('wearable', case=False, na=False)
filtered_df = df[mentions_wearable]

# Show the matching rows
print(filtered_df)


In [None]:
# Scratch cell: evaluates the subreddit list as a bare expression; nothing is assigned
['wearables', 'SmartWearables', 'Coros', 'Fittrack', 'WearableFitness', 'WearableDisplays',]

In [None]:
import praw
import pandas as pd
from datetime import datetime

# Assuming you have already initialized your Reddit instance as 'reddit'
# reddit = praw.Reddit(client_id='YOUR_CLIENT_ID', client_secret='YOUR_CLIENT_SECRET', user_agent='YOUR_USER_AGENT')

# Local import: this cell's import lines only bring in `datetime`, and the
# timezone-aware conversion below also needs `timezone`.
from datetime import timezone

# List of subreddits you want to scrape (a single one in this run)
subreddits = ['WearableDisplays']

# Limit passed to each subreddit listing request
num_of_posts = 100000

# Collecting posts from the listed subreddits (one dict per post)
all_posts = []

for subreddit_name in subreddits:
    subreddit = reddit.subreddit(subreddit_name)
    # Newest-first listing; you can use 'hot', 'top', 'rising', etc. instead of 'new'
    posts = subreddit.new(limit=num_of_posts)

    for post in posts:
        # created_utc is treated as a Unix timestamp. Convert it with an explicit
        # UTC tzinfo: datetime.utcfromtimestamp() is deprecated since Python 3.12.
        # The formatted string is the same as before.
        post_date = datetime.fromtimestamp(post.created_utc, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
        all_posts.append({'Subreddit': subreddit_name, 'Title': post.title, 'Text': post.selftext, 'URL': post.url, 'Posted Date': post_date, 'Comments': post.num_comments})

# Creating a Pandas DataFrame with specific column names
df = pd.DataFrame(all_posts, columns=['Subreddit', 'Title', 'Text', 'URL', 'Posted Date', 'Comments'])

# Printing the DataFrame
print(df)


In [None]:
# Boolean mask: True where the 'Text' column contains "wearable"
# (case-insensitive; missing text counts as no match)
has_keyword = df['Text'].str.contains('wearable', case=False, na=False)

# Rows of df selected by the mask
filtered_df = df[has_keyword]

# Show the matching rows
print(filtered_df)


In [None]:
import praw
import pandas as pd
from datetime import datetime

# Assuming you have already initialized your Reddit instance as 'reddit'
# reddit = praw.Reddit(client_id='YOUR_CLIENT_ID', client_secret='YOUR_CLIENT_SECRET', user_agent='YOUR_USER_AGENT')

# Local import: this cell's import lines only bring in `datetime`, and the
# timezone-aware conversion below also needs `timezone`.
from datetime import timezone

# Search phrases; entries are joined with ' OR ' into a single query string
filter_words = ['health wearables']

# Limit passed to the search request
num_of_posts = 100000

# Collecting posts from the entire Reddit site (one dict per post)
all_posts = []

# Using Reddit's search on r/all, newest first, to find posts matching the query
results = reddit.subreddit('all').search(' OR '.join(filter_words), sort='new', limit=num_of_posts)

for post in results:
    # created_utc is treated as a Unix timestamp. Convert it with an explicit
    # UTC tzinfo: datetime.utcfromtimestamp() is deprecated since Python 3.12.
    # The formatted string is the same as before.
    post_date = datetime.fromtimestamp(post.created_utc, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

    all_posts.append({'Subreddit': post.subreddit.display_name, 'Title': post.title, 'Text': post.selftext, 'URL': post.url, 'Posted Date': post_date, 'Comments': post.num_comments})

# Creating a Pandas DataFrame with specific column names
df = pd.DataFrame(all_posts, columns=['Subreddit', 'Title', 'Text', 'URL', 'Posted Date', 'Comments'])

# Printing the DataFrame
print(df)


In [None]:
import praw
import pandas as pd
from datetime import datetime

# Assuming you have already initialized your Reddit instance as 'reddit'
# reddit = praw.Reddit(client_id='YOUR_CLIENT_ID', client_secret='YOUR_CLIENT_SECRET', user_agent='YOUR_USER_AGENT')

# Local import: this cell's import lines only bring in `datetime`, and the
# timezone-aware conversion below also needs `timezone`.
from datetime import timezone

# Search phrases; entries are joined with ' OR ' into a single query string
filter_words = ['health wearables']

# Limit passed to the search request
num_of_posts = 100000

# Collecting posts and comments from the entire Reddit site.
# Post rows fill 'Post_Text' and comment rows fill 'Comment_Text'; the other
# text column is simply absent from that row's dict.
all_posts = []

# Using Reddit's search on r/all, newest first, to find posts matching the query
results = reddit.subreddit('all').search(' OR '.join(filter_words), sort='new', limit=num_of_posts)

for submission in results:
    # Looked up once per submission and reused for each of its comment rows
    subreddit_name = submission.subreddit.display_name

    # created_utc is treated as a Unix timestamp. Convert it with an explicit
    # UTC tzinfo: datetime.utcfromtimestamp() is deprecated since Python 3.12.
    # The formatted string is the same as before.
    post_date = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

    all_posts.append({'Subreddit': subreddit_name, 'Title': submission.title, 'Post_Text': submission.selftext, 'URL': submission.url, 'Posted Date': post_date, 'Comments': submission.num_comments})

    # Adding comments to the list.
    # NOTE(review): replace_more(limit=None) presumably expands every collapsed
    # comment placeholder before flattening, which can be slow on big threads - confirm.
    submission.comments.replace_more(limit=None)
    for comment in submission.comments.list():
        comment_date = datetime.fromtimestamp(comment.created_utc, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
        # Comment rows reuse the parent post's title and URL; 'Comments' is fixed at 0
        all_posts.append({'Subreddit': subreddit_name, 'Title': submission.title, 'Comment_Text': comment.body, 'URL': submission.url, 'Posted Date': comment_date, 'Comments': 0})

# Creating a Pandas DataFrame with specific column names
df = pd.DataFrame(all_posts, columns=['Subreddit', 'Title', 'Post_Text', 'Comment_Text', 'URL', 'Posted Date', 'Comments'])

# Printing the DataFrame
print(df)


In [None]:
# Preview the first rows (at most 1000) of df
df.head(1000)

In [None]:
# Exporting the DataFrame to a CSV file (the row index is not written)
output_file = 'reddit_final_data.csv'
df.to_csv(output_file, index=False)

# Report the file that was actually written, so the message cannot drift
# from the real output path
print(f"DataFrame has been exported to '{output_file}'")


In [None]:
import praw
import pandas as pd
from datetime import datetime

# Assuming you have already initialized your Reddit instance as 'reddit'
# reddit = praw.Reddit(client_id='YOUR_CLIENT_ID', client_secret='YOUR_CLIENT_SECRET', user_agent='YOUR_USER_AGENT')

# Local import: this cell's import lines only bring in `datetime`, and the
# timezone-aware conversion below also needs `timezone`.
from datetime import timezone

# Search phrases; entries are joined with ' OR ' into a single query string.
# NOTE(review): the multi-word phrases are not quoted, so the search may not
# treat each entry as one exact phrase - confirm against Reddit's search syntax.
filter_words = ['health wearables', 'health sensors', 'healthcare wearable', 'healthcare sensors']

# Limit passed to the search request
num_of_posts = 100000

# Collecting posts and comments from the entire Reddit site.
# Post rows fill 'Post_Text' and comment rows fill 'Comment_Text'; the other
# text column is simply absent from that row's dict.
all_posts = []

# Using Reddit's search on r/all, newest first, to find posts matching the query
results = reddit.subreddit('all').search(' OR '.join(filter_words), sort='new', limit=num_of_posts)

for submission in results:
    # Looked up once per submission and reused for each of its comment rows
    subreddit_name = submission.subreddit.display_name

    # created_utc is treated as a Unix timestamp. Convert it with an explicit
    # UTC tzinfo: datetime.utcfromtimestamp() is deprecated since Python 3.12.
    # The formatted string is the same as before.
    post_date = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

    all_posts.append({'Subreddit': subreddit_name, 'Title': submission.title, 'Post_Text': submission.selftext, 'URL': submission.url, 'Posted Date': post_date, 'Comments': submission.num_comments})

    # Adding comments to the list.
    # NOTE(review): replace_more(limit=None) presumably expands every collapsed
    # comment placeholder before flattening, which can be slow on big threads - confirm.
    submission.comments.replace_more(limit=None)
    for comment in submission.comments.list():
        comment_date = datetime.fromtimestamp(comment.created_utc, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
        # Comment rows reuse the parent post's title and URL; 'Comments' is fixed at 0
        all_posts.append({'Subreddit': subreddit_name, 'Title': submission.title, 'Comment_Text': comment.body, 'URL': submission.url, 'Posted Date': comment_date, 'Comments': 0})

# Creating a Pandas DataFrame with specific column names
df = pd.DataFrame(all_posts, columns=['Subreddit', 'Title', 'Post_Text', 'Comment_Text', 'URL', 'Posted Date', 'Comments'])

# Printing the DataFrame
print(df)


In [None]:
# Exporting the DataFrame to a CSV file (the row index is not written)
output_file = 'multi_words_reddit_scrape.csv'
df.to_csv(output_file, index=False)

# Report the file that was actually written, so the message cannot drift
# from the real output path
print(f"DataFrame has been exported to '{output_file}'")

In [None]:
# Dimensions of df as a (rows, columns) tuple
df.shape

In [None]:
import praw
import pandas as pd
from datetime import datetime, timedelta

# Assuming you have already initialized your Reddit instance as 'reddit'
# reddit = praw.Reddit(client_id='YOUR_CLIENT_ID', client_secret='YOUR_CLIENT_SECRET', user_agent='YOUR_USER_AGENT')

# Local import: this cell's import line only brings in `datetime` and
# `timedelta`, and the timezone-aware values below also need `timezone`.
from datetime import timezone

# Search phrases; entries are comma-joined into the query text
filter_words = ['health wearables']

# Limit passed to each search request
num_of_posts = 100000

# Collecting posts and comments from the entire Reddit site.
# Post rows fill 'Post_Text' and comment rows fill 'Comment_Text'; the other
# text column is simply absent from that row's dict.
all_posts = []

# Overall time range for the search: the last ~10 years, ending now.
# The datetimes are timezone-aware UTC so that .timestamp() yields real epoch
# seconds; a naive utcnow() value would be interpreted as local time.
range_end = datetime.now(timezone.utc)
range_start = range_end - timedelta(days=3652)  # Roughly 10 years

# Width of each search window
chunk_size = timedelta(days=365)

# IDs of submissions already recorded, so a post returned by more than one
# window (e.g. on a shared boundary) is only stored once
seen_ids = set()

# Fetching posts in chunks, walking backwards from range_end. end_date strictly
# decreases and start_date is clamped to range_start, so the loop is
# guaranteed to stop once the whole range has been covered.
end_date = range_end
while end_date > range_start:
    start_date = max(end_date - chunk_size, range_start)

    # NOTE(review): the `timestamp:a..b` filter is sent as part of the query
    # text - confirm Reddit's search still honors it; if it does not, every
    # window returns the same unfiltered results (deduplicated via seen_ids).
    query = f'{",".join(filter_words)} timestamp:{int(start_date.timestamp())}..{int(end_date.timestamp())}'
    results = reddit.subreddit('all').search(query, sort='new', limit=num_of_posts)

    for submission in results:
        if submission.id in seen_ids:
            continue
        seen_ids.add(submission.id)

        # Looked up once per submission and reused for each of its comment rows
        subreddit_name = submission.subreddit.display_name

        # created_utc is treated as a Unix timestamp; datetime.utcfromtimestamp()
        # is deprecated since Python 3.12, and the formatted string is unchanged.
        post_date = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

        all_posts.append({'Subreddit': subreddit_name, 'Title': submission.title, 'Post_Text': submission.selftext, 'URL': submission.url, 'Posted Date': post_date, 'Comments': submission.num_comments})

        # Adding comments to the list
        submission.comments.replace_more(limit=None)
        for comment in submission.comments.list():
            comment_date = datetime.fromtimestamp(comment.created_utc, tz=timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
            # Comment rows reuse the parent post's title and URL; 'Comments' is fixed at 0
            all_posts.append({'Subreddit': subreddit_name, 'Title': submission.title, 'Comment_Text': comment.body, 'URL': submission.url, 'Posted Date': comment_date, 'Comments': 0})

    # Move the window one chunk further back for the next request
    end_date = start_date

# Creating a Pandas DataFrame with specific column names
df = pd.DataFrame(all_posts, columns=['Subreddit', 'Title', 'Post_Text', 'Comment_Text', 'URL', 'Posted Date', 'Comments'])

# Printing the DataFrame
print(df)


In [None]:
# Exporting the DataFrame to a CSV file (the row index is not written)
output_file = '10_years_reddit.csv'
df.to_csv(output_file, index=False)

# Report the file that was actually written, so the message cannot drift
# from the real output path
print(f"DataFrame has been exported to '{output_file}'")

In [None]:
# Dimensions of df as a (rows, columns) tuple
df.shape