In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from datetime import datetime

# Relative path of the preprocessed tech-subreddit CSV export
REDDIT_TECH_CSV = '../reddit_tech_subs/data/reddit_ai_tech_20251102_1836.csv'

try:
    reddit_tech = pd.read_csv(REDDIT_TECH_CSV)
except FileNotFoundError as err:
    # Same exception type as before, but the message now names the file that was
    # looked up and the original error stays attached as the explicit cause.
    raise FileNotFoundError(
        f"Preprocessed CSV not found at {REDDIT_TECH_CSV!r}. Check your path."
    ) from err


# Preview the first rows of the loaded frame
reddit_tech.head()

Unnamed: 0.1,Unnamed: 0,subreddit,type,id,parent_id,created_utc,score,num_comments,url,content,neg,neu,pos,compound,sent_label
0,0,MachineLearning,post,1olehrk,,1761969000.0,53,14.0,https://www.reddit.com/r/MachineLearning/comme...,[D] Realized I like the coding and ML side of ...,0.013,0.815,0.172,0.9895,pos
1,1,MachineLearning,post,1oggr5l,,1761473000.0,45,37.0,https://www.reddit.com/r/MachineLearning/comme...,[D] Building low cost GPU compute in Africa ch...,0.057,0.874,0.07,0.5927,pos
2,2,MachineLearning,post,1ojqgq4,,1761799000.0,38,3.0,https://www.reddit.com/r/MachineLearning/comme...,[P] I made a tool to search papers from select...,0.0,0.909,0.091,0.5994,pos
3,3,MachineLearning,post,1okdq0s,,1761863000.0,26,14.0,https://www.reddit.com/r/MachineLearning/comme...,[R] We found LRMs look great…until the problem...,0.055,0.896,0.049,-0.674,neg
4,4,MachineLearning,post,1ojwyye,,1761823000.0,17,7.0,https://www.reddit.com/r/MachineLearning/comme...,[P] `triton_bwd`: Enabling Backpropagation for...,0.028,0.828,0.144,0.9397,pos


In [11]:
# Print the frame summary: one line per column with its non-null count and dtype,
# which makes columns containing missing values easy to spot.
reddit_tech.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2244 entries, 0 to 2243
Data columns (total 15 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    2244 non-null   int64  
 1   subreddit     2244 non-null   object 
 2   type          2244 non-null   object 
 3   id            2244 non-null   object 
 4   parent_id     1821 non-null   object 
 5   created_utc   2244 non-null   float64
 6   score         2244 non-null   int64  
 7   num_comments  423 non-null    float64
 8   url           2244 non-null   object 
 9   content       2244 non-null   object 
 10  neg           2244 non-null   float64
 11  neu           2244 non-null   float64
 12  pos           2244 non-null   float64
 13  compound      2244 non-null   float64
 14  sent_label    2244 non-null   object 
dtypes: float64(6), int64(2), object(7)
memory usage: 263.1+ KB


In [12]:
# Set visualization style
sns.set_style("whitegrid")
plt.rcParams['figure.figsize'] = (12, 6)
%matplotlib inline

In [None]:
# Summarise missing data: null count per column and its share of all rows (in percent)
print("Missing Values:")
missing_counts = reddit_tech.isnull().sum()
missing_pct = (missing_counts / len(reddit_tech)) * 100
missing_df = pd.DataFrame({
    'Missing Count': missing_counts,
    'Percentage': missing_pct
})
# Only report the columns that actually contain nulls
print(missing_df.loc[missing_counts > 0])

Missing Values:
              Missing Count  Percentage
parent_id               423   18.850267
num_comments           1821   81.149733




In [16]:
# Count how many rows fall under each distinct value of the 'type' column.
reddit_tech.groupby('type').size()

type
comment    1821
post        423
dtype: int64

In [None]:
# Replace null values in the parent_id column with the placeholder string 'None'.
# NOTE(review): the 423 null parent_id values match the 423 rows of type 'post', so
# these are presumably top-level posts without a parent — confirm.
reddit_tech['parent_id'] = reddit_tech['parent_id'].where(
    reddit_tech['parent_id'].notna(), 'None'
)

In [None]:
# Sanity check: show any parent_id entries that are still null (an empty Series means none)
reddit_tech.loc[reddit_tech['parent_id'].isnull(), 'parent_id']

Series([], Name: parent_id, dtype: object)

In [None]:
# Replace null values in the num_comments column with 0.
# NOTE(review): the 1821 null num_comments values match the 1821 rows of type 'comment',
# so 0 here presumably stands for "not applicable" rather than a measured count — confirm.
reddit_tech['num_comments'] = reddit_tech['num_comments'].where(
    reddit_tech['num_comments'].notna(), 0
)

In [None]:
# Sanity check: show any num_comments entries that are still null (an empty Series means none)
reddit_tech.loc[reddit_tech['num_comments'].isnull(), 'num_comments']

Series([], Name: num_comments, dtype: float64)

In [25]:
# Report cardinality / distinct values of the key categorical columns in one print call;
# sep="\n" puts each item on its own line.
print(
    "Unique values:",
    f"Subreddits: {reddit_tech['subreddit'].nunique()}",
    f"Content types: {reddit_tech['type'].unique()}",
    f"Sentiment labels: {reddit_tech['sent_label'].value_counts()}",
    sep="\n",
)

Unique values:
Subreddits: 5
Content types: ['post' 'comment']
Sentiment labels: sent_label
pos    1319
neg     614
neu     311
Name: count, dtype: int64


In [28]:
# Drop the unnamed index column.
# errors='ignore' keeps this cell idempotent: running it again after the column is
# already gone is a no-op instead of raising KeyError.
reddit_tech = reddit_tech.drop(columns='Unnamed: 0', errors='ignore')

In [52]:
# Convert the epoch-second values in 'created_utc' (unit='s') into pandas datetimes.
# No timezone is attached here, so the resulting column is tz-naive.
# NOTE(review): presumably these instants are UTC, going by the column name — confirm.
reddit_tech['created_datetime'] = pd.to_datetime(reddit_tech['created_utc'], unit='s')


In [53]:
# Preview the first rows to confirm the new datetime column.
# NOTE(review): the saved output of this cell also lists a 'created_date' column that no
# cell visible in this part of the notebook creates — confirm where it is defined,
# otherwise a fresh Restart & Run All will not reproduce this output.
reddit_tech.head()

Unnamed: 0,subreddit,type,id,parent_id,created_utc,score,num_comments,url,content,neg,neu,pos,compound,sent_label,created_datetime,created_date
0,MachineLearning,post,1olehrk,,1761969000.0,53,14.0,https://www.reddit.com/r/MachineLearning/comme...,[D] Realized I like the coding and ML side of ...,0.013,0.815,0.172,0.9895,pos,2025-11-01 03:56:20,2025-11-01 03:56:20
1,MachineLearning,post,1oggr5l,,1761473000.0,45,37.0,https://www.reddit.com/r/MachineLearning/comme...,[D] Building low cost GPU compute in Africa ch...,0.057,0.874,0.07,0.5927,pos,2025-10-26 09:57:53,2025-10-26 09:57:53
2,MachineLearning,post,1ojqgq4,,1761799000.0,38,3.0,https://www.reddit.com/r/MachineLearning/comme...,[P] I made a tool to search papers from select...,0.0,0.909,0.091,0.5994,pos,2025-10-30 04:30:28,2025-10-30 04:30:28
3,MachineLearning,post,1okdq0s,,1761863000.0,26,14.0,https://www.reddit.com/r/MachineLearning/comme...,[R] We found LRMs look great…until the problem...,0.055,0.896,0.049,-0.674,neg,2025-10-30 22:29:26,2025-10-30 22:29:26
4,MachineLearning,post,1ojwyye,,1761823000.0,17,7.0,https://www.reddit.com/r/MachineLearning/comme...,[P] `triton_bwd`: Enabling Backpropagation for...,0.028,0.828,0.144,0.9397,pos,2025-10-30 11:21:04,2025-10-30 11:21:04


In [4]:
# Relative path of the preprocessed non-tech-subreddit CSV export
REDDIT_NON_TECH_CSV = '../reddit_non_tech_subs/data/reddit_ai_nontech_20251102_1847.csv'

try:
    reddit_non_tech = pd.read_csv(REDDIT_NON_TECH_CSV)
except FileNotFoundError as err:
    # Same exception type as before, but the message now names the file that was
    # looked up and the original error stays attached as the explicit cause.
    raise FileNotFoundError(
        f"Preprocessed CSV not found at {REDDIT_NON_TECH_CSV!r}. Check your path."
    ) from err


# Preview the first rows of the loaded frame
reddit_non_tech.head()

Unnamed: 0.1,Unnamed: 0,subreddit,type,id,parent_id,created_utc,score,num_comments,url,content,neg,neu,pos,compound,sent_label
0,0,writing,post,1oi5jlx,,1761647000.0,1,1.0,https://www.reddit.com/r/writing/comments/1oi5...,"Book Club Scam (?) Hi everyone, I'm quite new ...",0.076,0.813,0.111,0.7301,pos
1,1,writing,post,1om3fol,,1762043000.0,0,13.0,https://www.reddit.com/r/writing/comments/1om3...,Concerns With Storytelling with My Art I am so...,0.055,0.836,0.109,0.9599,pos
2,2,writing,post,1ogrbxz,,1761502000.0,0,26.0,https://www.reddit.com/r/writing/comments/1ogr...,Let's discuss the paradox of Utopian fiction I...,0.074,0.825,0.101,0.8761,pos
3,3,Screenwriting,post,1okdm7s,,1761863000.0,13,25.0,https://www.reddit.com/r/Screenwriting/comment...,DEFCON ONE - FEATURE - 104 pages) appreciate f...,0.117,0.719,0.165,0.296,pos
4,4,Screenwriting,post,1ohb9po,,1761563000.0,9,35.0,https://www.reddit.com/r/Screenwriting/comment...,"What makes a script ""Lynchian""? My husband is ...",0.062,0.835,0.103,0.8743,pos


In [6]:
# Relative path of the preprocessed YouTube comments CSV export
YOUTUBE_COMMENTS_CSV = '../youtube_data/youtube_comments_20251103_0104.csv'

try:
    youtube_tech = pd.read_csv(YOUTUBE_COMMENTS_CSV)
except FileNotFoundError as err:
    # Same exception type as before, but the message now names the file that was
    # looked up and the original error stays attached as the explicit cause.
    raise FileNotFoundError(
        f"Preprocessed CSV not found at {YOUTUBE_COMMENTS_CSV!r}. Check your path."
    ) from err


# Preview the first rows of the loaded frame
youtube_tech.head()

Unnamed: 0.1,Unnamed: 0,video_id,id,parent_id,author,created_at,likes,content
0,0,sTeoEFzVNSc,UgyHAdzyGGobT8qiWph4AaABAg,,@programmingwithmosh,2023-01-13T17:56:38Z,1427,"Whether you like ChatGPT or not, remember: Cha..."
1,1,sTeoEFzVNSc,UgyHAdzyGGobT8qiWph4AaABAg.9kppxbFGGEc9kpw5I_SsiM,UgyHAdzyGGobT8qiWph4AaABAg,@RUFMUT,2023-01-13T18:50:15Z,5,@programmingwithmosh Thank you for video. But ...
2,2,sTeoEFzVNSc,UgykJ_-KODLohk9I77h4AaABAg,,@Innoodrr,2025-08-09T10:19:21Z,0,ChatGPT Tutorial for Developers - 38 Ways to 1...
3,3,sTeoEFzVNSc,UgyJ_YALNNX2nYdSCzh4AaABAg,,@arindamghosh3787,2025-03-29T10:00:10Z,0,I developed a custom Bot application and deplo...
4,4,sTeoEFzVNSc,UgwwKEPbrC66zBjpfwR4AaABAg,,@shawnz1,2025-03-12T04:55:23Z,0,and #39 use a powerful Chrome extension like c...
