In [1]:
import os
from scraper.reddit_scraper import get_reddit_object, get_qa_object, \
                                   get_trump_reddit_posts_and_comments, \
                                    get_kamala_reddit_posts_and_comments, \
                                    read_paths_create_df
from scraper.youtube_scraper import scrape_youtube
from utilities.util import create_folder_if_not_exists
from preprocessor.preprocess import rename_df_cols, set_post_title_as_parent_comment_if_na, \
                                    unify_youtube_and_reddit_comments, combine_reddit_comment_and_post_df,\
                                    preprocess_dataset, split_comments_to_sentence

import pandas as pd
import html

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Scrape Reddit

In [2]:
# Output locations for the whole pipeline; alter if necessary.
# Every *_DIR value must end with '/' because paths throughout this notebook are
# built by plain string concatenation (e.g. REDDIT_SAVE_DIR + file).
REDDIT_SAVE_DIR = './data/reddit/'
YOUTUBE_SAVE_DIR = './data/youtube/'
FINAL_FILE_SAVE_DIR = './data/'

# Consolidated Reddit outputs. The directory already carries its trailing slash,
# so no extra separator is inserted (previously this produced './data/reddit//...').
CONSOLIDATED_REDDIT_COMMENT_SAVE_FILE = f'{REDDIT_SAVE_DIR}consolidated_reddit_comments.csv'
CONSOLIDATED_REDDIT_POST_SAVE_FILE = f'{REDDIT_SAVE_DIR}consolidated_reddit_posts.csv'
# File name only; joined with YOUTUBE_SAVE_DIR where it is used.
YOUTUBE_FILE_NAME = 'youtube_comments.csv'

# File names only; joined with FINAL_FILE_SAVE_DIR where they are used.
COMBINED_COMMENT_LEVEL_FILE = 'comment_level.csv'
COMBINED_SENTENCE_LEVEL_FILE = 'sentence_level.csv'

In [3]:
# Pass each output directory to create_folder_if_not_exists (which, per its name,
# creates the folder only when it is missing) before any cell writes into it.
for save_dir in (REDDIT_SAVE_DIR, YOUTUBE_SAVE_DIR, FINAL_FILE_SAVE_DIR):
    create_folder_if_not_exists(save_dir)

In [3]:
# Build the two helper objects that the Reddit scraping cell passes to
# get_trump_reddit_posts_and_comments / get_kamala_reddit_posts_and_comments.
reddit = get_reddit_object()
# NOTE(review): this cell's output warns that a `Pipeline` was created without a
# `device` argument and will run on CPU - presumably inside get_qa_object; confirm
# whether a GPU device should be passed there.
qa_obj = get_qa_object()

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [5]:
# How many posts to scrape comments from; forwarded as max_posts_to_collect to
# each of the two scraping calls below. Change to alter the amount collected.
REDDIT_POST_TO_SCRAPE = 1

trump_post_df, trump_comment_df = get_trump_reddit_posts_and_comments(
    reddit,
    qa_obj,
    REDDIT_SAVE_DIR,
    max_posts_to_collect=REDDIT_POST_TO_SCRAPE,
)
kamala_post_df, kamala_comment_df = get_kamala_reddit_posts_and_comments(
    reddit,
    qa_obj,
    REDDIT_SAVE_DIR,
    max_posts_to_collect=REDDIT_POST_TO_SCRAPE,
)

In [22]:
# List the save directory once and sort it: os.listdir returns entries in
# arbitrary order, and the order of the input files matters downstream because
# de-duplication later in this notebook keeps the first occurrence of each id
# (presumably read_paths_create_df concatenates in list order - confirm).
reddit_save_dir_files = sorted(os.listdir(REDDIT_SAVE_DIR))

# Per-run comment files: name contains '_comment_data_' and is not a test file.
scraped_reddit_comment_file_paths = [
    REDDIT_SAVE_DIR + file_name
    for file_name in reddit_save_dir_files
    if '_comment_data_' in file_name and 'test' not in file_name
]
# Per-run post files: name contains '_post_data_' and is not a test file.
scraped_reddit_post_file_paths = [
    REDDIT_SAVE_DIR + file_name
    for file_name in reddit_save_dir_files
    if '_post_data_' in file_name and 'test' not in file_name
]

# Build one frame per kind from the matching files and persist each as a
# consolidated CSV (without the index column).
scraped_reddit_comment_df = read_paths_create_df(scraped_reddit_comment_file_paths)
scraped_reddit_comment_df.to_csv(CONSOLIDATED_REDDIT_COMMENT_SAVE_FILE, index=False)
scraped_reddit_post_df = read_paths_create_df(scraped_reddit_post_file_paths)
scraped_reddit_post_df.to_csv(CONSOLIDATED_REDDIT_POST_SAVE_FILE, index=False)


In [4]:
# Load the consolidated Reddit CSVs from disk, so the frames can be obtained
# without re-running the scraping cells.
scraped_reddit_comment_df = pd.read_csv(
    filepath_or_buffer=CONSOLIDATED_REDDIT_COMMENT_SAVE_FILE,
)
scraped_reddit_post_df = pd.read_csv(
    filepath_or_buffer=CONSOLIDATED_REDDIT_POST_SAVE_FILE,
)

# Scrape YouTube (TODO)

In [6]:
# Number of YouTube videos to scrape; named so it can be tuned the same way as
# REDDIT_POST_TO_SCRAPE is for the Reddit scrape.
YOUTUBE_VIDEOS_TO_SCRAPE = 1
# The first argument is the CSV path; the cell output reports the comments being
# saved to that location.
youtube_comment_df = scrape_youtube(
    YOUTUBE_SAVE_DIR + YOUTUBE_FILE_NAME,
    videos_to_scrape=YOUTUBE_VIDEOS_TO_SCRAPE,
)

YouTube comments have been saved to ./data/youtube/youtube_comments.csv


In [5]:
# Load the saved YouTube comments CSV from disk, so the frame can be obtained
# without re-running the scrape.
youtube_comment_df = pd.read_csv(f'{YOUTUBE_SAVE_DIR}{YOUTUBE_FILE_NAME}')

# Unifying Reddit + YouTube Data

In [7]:
# For each source frame: apply rename_df_cols with that source's key, then keep
# only the first row seen for each id.
df_reddit_comment = (
    rename_df_cols(scraped_reddit_comment_df, 'reddit_comments')
    .drop_duplicates(subset='comment_id', keep='first')
)
df_reddit_post = (
    rename_df_cols(scraped_reddit_post_df, 'reddit_posts')
    .drop_duplicates(subset='post_id', keep='first')
)
df_youtube = (
    rename_df_cols(youtube_comment_df, 'youtube')
    .drop_duplicates(subset='comment_id', keep='first')
)

# Combine the Reddit comment and post frames into a single Reddit frame.
df_reddit = combine_reddit_comment_and_post_df(df_reddit_comment, df_reddit_post)

# NOTE(review): the parent-comment backfill below is disabled; confirm whether it
# should be re-enabled or removed.
# df_reddit = set_post_title_as_parent_comment_if_na(df_reddit)
# df_youtube = set_post_title_as_parent_comment_if_na(df_youtube)

In [8]:
# Merge the Reddit and YouTube frames into one comment-level frame.
# NOTE(review): the sampled output further down shows a `platform` column with
# 'Reddit' / 'Youtube' values - presumably added by this helper; confirm.
combined_df = unify_youtube_and_reddit_comments(df_reddit, df_youtube)

In [12]:
# Clean up formatting problems (e.g. weird characters) with the html library.
# NOTE(review): combined_df is overwritten in place, so re-running this cell
# applies the preprocessing a second time.
combined_df = preprocess_dataset(combined_df)

# Fixed random_state so the same ten rows are shown on every run.
combined_df.sample(n=10, random_state=666)

Unnamed: 0,post_id,post_title,post_timestamp,parent_comment_id,parent_comment,comment_id,comment,comment_timestamp,number_of_comment_votes,platform
19074,1fzyjvi,Podcaster Andrew Schultz laughs in Trump's fa...,09-Oct-2024,lr4twbl,He either knows he’s lying and doesn’t care or...,lr4zan6,"Kim and Putin are his heroes, so...",09-Oct-2024,1,Reddit
4666,1g88apd,It was all STAGED!! Trump did not work. McDona...,20-Oct-2024,,,lsyjeod,Off topic but I keep thinking about Kamala wor...,21-Oct-2024,4,Reddit
5636,1g88apd,It was all STAGED!! Trump did not work. McDona...,20-Oct-2024,,,lsx56gb,Oh I thought those were AI pictures lmao,20-Oct-2024,1,Reddit
9651,1g88apd,It was all STAGED!! Trump did not work. McDona...,20-Oct-2024,,,lt2jwjs,I do think Kamal needs to spend a few hours do...,21-Oct-2024,1,Reddit
17089,1fzyjvi,Podcaster Andrew Schultz laughs in Trump's fa...,09-Oct-2024,,,lr6e20v,There couldn't be a funnier context to this ph...,10-Oct-2024,1,Reddit
4918,-5KWZL1blWc,Jon Stewart on Trump's McDonald's Shift & His ...,2024-10-22T03:00:11Z,,,UgyXYzxWr84-XDUhYnZ4AaABAg,trump will be making license plates soon,2024-10-22T03:37:04Z,7,Youtube
106,1g34hu5,Bryan Cranston campaigning for Kamala Harris i...,14-Oct-2024,,,lrtk1zt,Good for him. Only just recenetly started Bett...,14-Oct-2024,2,Reddit
18137,1fzyjvi,Podcaster Andrew Schultz laughs in Trump's fa...,09-Oct-2024,lr5zy5e,This is political propaganda. Know your histor...,lr6ehms,I don't think any of you knows what propaganda...,10-Oct-2024,1,Reddit
2310,1g62265,Kamala Harris to MAGA hecklers: “Oh you guys a...,17-Oct-2024,,,lsfji33,Did anyone get the number of that truck,17-Oct-2024,263,Reddit
13962,1g88apd,It was all STAGED!! Trump did not work. McDona...,20-Oct-2024,lsxrvjb,Their jobs didn't matter that day.,lsycd51,Their jobs don’t matter. *Jobs* in the abstrac...,21-Oct-2024,9,Reddit


In [3]:
# Optional comment-level checkpoint (disabled by default).
# Uncomment the first line to write combined_df to
# FINAL_FILE_SAVE_DIR + COMBINED_COMMENT_LEVEL_FILE, or the second line to load it
# back from that file (e.g. on a fresh kernel) instead of re-running the cells above.
# Both lines use the 'utf-8-sig' encoding, so they must stay paired.
# combined_df.to_csv(FINAL_FILE_SAVE_DIR+COMBINED_COMMENT_LEVEL_FILE, index=False, encoding='utf-8-sig')
# combined_df = pd.read_csv(FINAL_FILE_SAVE_DIR+COMBINED_COMMENT_LEVEL_FILE, encoding='utf-8-sig')

In [4]:
# Only rows that actually have comment text are split into sentences.
has_comment_text = combined_df['comment'].notna()
sentence_df = split_comments_to_sentence(combined_df[has_comment_text])

# Preview the first rows of the sentence-level frame.
sentence_df.head()

Unnamed: 0,comment_id,post_id,post_title,post_timestamp,parent_comment_id,parent_comment,comment,comment_timestamp,number_of_comment_votes,sentence,previous_sentence
0,lrt56j4,1g34hu5,Bryan Cranston campaigning for Kamala Harris i...,14-Oct-2024,,,It looks like this post is about Politics. Var...,14-Oct-2024,1,It looks like this post is about Politics.,
1,lrt56j4,1g34hu5,Bryan Cranston campaigning for Kamala Harris i...,14-Oct-2024,,,It looks like this post is about Politics. Var...,14-Oct-2024,1,Various methods of filtering out content relat...,It looks like this post is about Politics.
2,lrt56j4,1g34hu5,Bryan Cranston campaigning for Kamala Harris i...,14-Oct-2024,,,It looks like this post is about Politics. Var...,14-Oct-2024,1,"*I am a bot, and this action was performed aut...",Various methods of filtering out content relat...
3,lrt56j4,1g34hu5,Bryan Cranston campaigning for Kamala Harris i...,14-Oct-2024,,,It looks like this post is about Politics. Var...,14-Oct-2024,1,Please [contact the moderators of this subredd...,"*I am a bot, and this action was performed aut..."
4,lrt56j4,1g34hu5,Bryan Cranston campaigning for Kamala Harris i...,14-Oct-2024,,,It looks like this post is about Politics. Var...,14-Oct-2024,1,*,Please [contact the moderators of this subredd...


In [5]:
# Optional sentence-level checkpoint (disabled by default).
# Uncomment the first line to write sentence_df to
# FINAL_FILE_SAVE_DIR + COMBINED_SENTENCE_LEVEL_FILE, or the second line to load it
# back from that file instead of recomputing the sentence split.
# Both lines use the 'utf-8-sig' encoding, so they must stay paired.
# sentence_df.to_csv(FINAL_FILE_SAVE_DIR+COMBINED_SENTENCE_LEVEL_FILE, index=False, encoding='utf-8-sig')
# sentence_df = pd.read_csv(FINAL_FILE_SAVE_DIR+COMBINED_SENTENCE_LEVEL_FILE, encoding='utf-8-sig')