# Metadata

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
from dateutil import parser
from datetime import datetime
import re

In [3]:
# Load the channel-level metadata export.
channel_df = pd.read_csv(filepath_or_buffer="metadata.csv")

In [4]:
# Preview the first five rows of the raw channel metadata.
channel_df.head(n=5)

Unnamed: 0,Channel_name,Subscribers,Views,PublishedAt,Country,Total_videos,playlist_id
0,Alex The Analyst,1130000,53403017,2020-01-08T05:04:24.970712Z,US,386,UU7cs8q-gJRlGwj4A8OmCmXg


In [5]:
# Parse the channel's PublishedAt timestamp, derive the calendar year and the
# HH:MM:SS time of day from it, then drop the raw timestamp column.
# NOTE: this cell consumes "PublishedAt", so re-running it without reloading
# metadata.csv raises a KeyError.
channel_df = (
    channel_df
    .assign(PublishedAt=lambda df: pd.to_datetime(df["PublishedAt"]))
    .assign(
        Year=lambda df: df["PublishedAt"].dt.year,
        Time=lambda df: df["PublishedAt"].dt.strftime("%H:%M:%S"),
    )
    .drop(columns=["PublishedAt"])
)

In [6]:
# Preview the channel frame with the derived Year and Time columns.
channel_df.head(n=5)

Unnamed: 0,Channel_name,Subscribers,Views,Country,Total_videos,playlist_id,Year,Time
0,Alex The Analyst,1130000,53403017,US,386,UU7cs8q-gJRlGwj4A8OmCmXg,2020,05:04:24


In [7]:
# ChannelAge is a difference of calendar years (current year minus creation
# year), not elapsed time, and it depends on the date the notebook is run.
current_year = datetime.today().year
channel_df = channel_df.assign(ChannelAge=current_year - channel_df["Year"])

In [8]:
# Preview the channel frame including the new ChannelAge column.
channel_df.head(n=5)

Unnamed: 0,Channel_name,Subscribers,Views,Country,Total_videos,playlist_id,Year,Time,ChannelAge
0,Alex The Analyst,1130000,53403017,US,386,UU7cs8q-gJRlGwj4A8OmCmXg,2020,05:04:24,5


# Videos Data

In [9]:
# Load the per-video statistics export.
videos_df = pd.read_csv(filepath_or_buffer="videos.csv")

In [10]:
# Preview the first five rows of the raw video data.
videos_df.head(n=5)

Unnamed: 0,video_id,video_title,published_at,view_count,like_count,comment_count,duration_seconds
0,kk5zEOQzTmQ,Data Visualization and Presentation in R | R f...,2025-08-19T12:01:22Z,1860,61,5,1264.0
1,yhlqKsYpzgE,Alex The Analyst Q/A Livestream | Come Ask Me ...,2025-08-18T01:01:35Z,0,8,0,0.0
2,TP2OJuZhbIQ,Things I Learned as a Data Analyst p1,2025-08-15T11:46:04Z,5823,203,12,38.0
3,Mi8st3hyMH8,Alex The Analyst Q/A Livestream | Come Ask Me ...,2025-08-14T14:18:24Z,2777,109,8,3967.0
4,vAKs1-EEJ38,How to Remove Duplicates in an R Dataframe | R...,2025-08-12T12:01:49Z,1752,59,2,488.0


In [11]:
# Count missing values per column before deriving features.
videos_df.isnull().sum()

Unnamed: 0,0
video_id,0
video_title,0
published_at,0
view_count,0
like_count,0
comment_count,0
duration_seconds,0


In [12]:
# Parse published_at in place, then append the calendar features derived
# from it: Year, Month, Day, Weekday name and HH:MM:SS time of day.
videos_df = (
    videos_df
    .assign(published_at=lambda df: pd.to_datetime(df["published_at"]))
    .assign(
        Year=lambda df: df["published_at"].dt.year,
        Month=lambda df: df["published_at"].dt.month,
        Day=lambda df: df["published_at"].dt.day,
        Weekday=lambda df: df["published_at"].dt.day_name(),
        Time=lambda df: df["published_at"].dt.strftime("%H:%M:%S"),
    )
)

def clean_title(title):
    """Normalise a video title to plain alphanumeric words.

    The input is coerced with ``str()``. Every character that is not an ASCII
    letter, a digit or whitespace is deleted (not replaced by a space, so
    "Q/A" becomes "QA"). Runs of whitespace are then collapsed to a single
    space and the ends are trimmed.

    Returns:
        The cleaned title as a ``str``.
    """
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', '', str(title))
    # split() with no argument drops leading/trailing whitespace and treats
    # any whitespace run as one separator.
    return ' '.join(alnum_only.split())

# Add a punctuation-free, whitespace-normalised copy of each title.
videos_df = videos_df.assign(
    Cleaned_Title=videos_df["video_title"].map(clean_title)
)


def classify_duration(seconds, short_max_minutes=5, medium_max_minutes=20):
    """Bucket a video duration into "Short", "Medium" or "Long".

    Args:
        seconds: Duration in seconds. Missing values (``None``/NaN) are
            accepted.
        short_max_minutes: Durations strictly below this many minutes are
            "Short". Defaults to 5.
        medium_max_minutes: Durations up to and including this many minutes
            (and not "Short") are "Medium". Defaults to 20.

    Returns:
        "Short", "Medium" or "Long", or ``None`` when the duration is missing.
    """
    # NaN fails both comparisons below and would otherwise be labelled "Long";
    # None would raise TypeError on the division. Keep missing as missing.
    if pd.isna(seconds):
        return None

    minutes = seconds / 60
    if minutes < short_max_minutes:
        return "Short"
    if minutes <= medium_max_minutes:
        return "Medium"
    return "Long"

# Label each video by length, then drop the raw title and timestamp columns
# now that their derived features exist.
# NOTE: re-running this cell without reloading videos.csv raises a KeyError
# because the dropped columns are gone.
videos_df = (
    videos_df
    .assign(
        Duration_Category=lambda df: df["duration_seconds"].map(classify_duration)
    )
    .drop(columns=["video_title", "published_at"])
)

In [13]:
# Preview the engineered video features.
videos_df.head(n=5)

Unnamed: 0,video_id,view_count,like_count,comment_count,duration_seconds,Year,Month,Day,Weekday,Time,Cleaned_Title,Duration_Category
0,kk5zEOQzTmQ,1860,61,5,1264.0,2025,8,19,Tuesday,12:01:22,Data Visualization and Presentation in R R for...,Long
1,yhlqKsYpzgE,0,8,0,0.0,2025,8,18,Monday,01:01:35,Alex The Analyst QA Livestream Come Ask Me Any...,Short
2,TP2OJuZhbIQ,5823,203,12,38.0,2025,8,15,Friday,11:46:04,Things I Learned as a Data Analyst p1,Short
3,Mi8st3hyMH8,2777,109,8,3967.0,2025,8,14,Thursday,14:18:24,Alex The Analyst QA Livestream Come Ask Me Any...,Long
4,vAKs1-EEJ38,1752,59,2,488.0,2025,8,12,Tuesday,12:01:49,How to Remove Duplicates in an R Dataframe R f...,Medium


# Comments Data

In [24]:
# Load the scraped comments (one row per comment, keyed by video_id).
comments = pd.read_csv(filepath_or_buffer="video_comments.csv")

In [25]:
# Preview the first five raw comments.
comments.head(n=5)

Unnamed: 0,video_id,comment
0,kk5zEOQzTmQ,I thought R was dying out
1,kk5zEOQzTmQ,Like your R content.\n\nCurious about your rea...
2,kk5zEOQzTmQ,Great to see some love for R on the channel. H...
3,kk5zEOQzTmQ,Can you do this using python
4,kk5zEOQzTmQ,This is for me. Thank you ❤


In [26]:
from textblob import TextBlob
import pandas as pd
import re

def clean_text(text):
    """Strip URLs, @mentions and non-alphanumeric symbols from a comment.

    The input is coerced with ``str()``. Patterns are removed in order:
    anything starting with "http" up to the next whitespace, "@" followed by
    word characters, and finally every character that is not an ASCII letter,
    a digit or whitespace. Interior whitespace (including newlines) is kept
    as is; only the ends are trimmed.

    Returns:
        The cleaned comment as a ``str``.
    """
    cleaned = str(text)
    # Order matters: URLs and mentions must go before the symbol pass removes
    # the ":" "/" and "@" characters they are recognised by.
    for pattern in (r"http\S+", r"@\w+", r"[^A-Za-z0-9\s]"):
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned.strip()

# Overwrite the raw comment text with its cleaned form.
comments = comments.assign(comment=comments['comment'].map(clean_text))

def get_sentiment_label(text):
    """Map a comment to a coarse sentiment label using TextBlob polarity.

    The input is coerced with ``str()`` and scored with
    ``TextBlob(...).sentiment.polarity``.

    Returns:
        1 when polarity is above 0.1, -1 when it is below -0.1, otherwise 0.
    """
    polarity = TextBlob(str(text)).sentiment.polarity
    if polarity > 0.1:
        return 1
    if polarity < -0.1:
        return -1
    # Everything inside the [-0.1, 0.1] band is treated as neutral.
    return 0

# Score every cleaned comment and attach the -1 / 0 / 1 label.
comments = comments.assign(
    sentiment_label=comments['comment'].map(get_sentiment_label)
)

comments

Unnamed: 0,video_id,comment,sentiment_label
0,kk5zEOQzTmQ,I thought R was dying out,0
1,kk5zEOQzTmQ,Like your R content\n\nCurious about your reas...,0
2,kk5zEOQzTmQ,Great to see some love for R on the channel Hu...,1
3,kk5zEOQzTmQ,Can you do this using python,0
4,kk5zEOQzTmQ,This is for me Thank you,0
...,...,...,...
20777,6lQzbk6_OTw,Hey Alex Thanks for reviewing my resume and gr...,1
20778,6lQzbk6_OTw,Hey Alex what do you think about COGNOS,0
20779,6lQzbk6_OTw,Hi Alex\nfound your channel on Reddit and am g...,0
20780,6lQzbk6_OTw,Great video Alex I definitely agree that Excel...,1


In [27]:
# Persist the three processed tables without writing the index column.
for frame, out_path in (
    (channel_df, "meta.csv"),
    (videos_df, "vid.csv"),
    (comments, "cmt.csv"),
):
    frame.to_csv(out_path, index=False)