In [14]:
import warnings
warnings.filterwarnings("ignore")

In [15]:
import pandas as pd
from dateutil import parser
from datetime import datetime

# Channel Metadata

In [16]:
channel_df = pd.read_csv("channel_metadata.csv")

In [17]:
channel_df['PublishedAt'] = channel_df['PublishedAt'].apply(lambda x: parser.parse(x).replace(tzinfo=None) if pd.notnull(x) else None)

In [19]:
channel_df

Unnamed: 0,Channel_name,Subscribers,Views,PublishedAt,Country,Total_videos,playlist_id
0,Alex The Analyst,1080000,51090763,2020-01-08 05:04:24.970712,US,366,UU7cs8q-gJRlGwj4A8OmCmXg


In [20]:
now = datetime.now()
channel_df['channel_age_years'] = (now - channel_df['PublishedAt']).dt.days // 365

In [21]:
channel_df['PublishedAt'] = channel_df['PublishedAt'].dt.strftime('%Y-%m-%d %H:%M:%S')

In [22]:
channel_df

Unnamed: 0,Channel_name,Subscribers,Views,PublishedAt,Country,Total_videos,playlist_id,channel_age_years
0,Alex The Analyst,1080000,51090763,2020-01-08 05:04:24,US,366,UU7cs8q-gJRlGwj4A8OmCmXg,5


In [23]:
channel_df.to_csv("channel.csv", index=False)

# Video Data

In [24]:
videos_df = pd.read_csv("videos_data.csv")

In [25]:
videos_df.head()

Unnamed: 0,video_id,video_title,published_at,view_count,like_count,comment_count,duration_seconds
0,TsnGd6p9oTk,Installing R and R Studio | R for Data Analyti...,2025-06-17T12:00:48Z,2787,114,9,359.0
1,DsI1vG-kXR8,Best Resources to Learn Data Analytics in 2025,2025-06-10T12:00:19Z,16190,758,33,407.0
2,4lhPHhPkVLM,Best Resources to Learn Data Visualization in ...,2025-06-03T12:00:19Z,10229,306,28,268.0
3,dkU5n0G7FRQ,Best Resources to Learn Excel in 2025,2025-05-27T12:01:27Z,11340,569,15,289.0
4,kagUDLvrcZ8,Best Resources to Learn Python in 2025,2025-05-20T12:01:32Z,18442,707,54,309.0


In [26]:
videos_df['published_at'] = videos_df['published_at'].apply(lambda x: parser.parse(x).replace(tzinfo=None) if pd.notnull(x) else None)

In [27]:
videos_df.head()

Unnamed: 0,video_id,video_title,published_at,view_count,like_count,comment_count,duration_seconds
0,TsnGd6p9oTk,Installing R and R Studio | R for Data Analyti...,2025-06-17 12:00:48,2787,114,9,359.0
1,DsI1vG-kXR8,Best Resources to Learn Data Analytics in 2025,2025-06-10 12:00:19,16190,758,33,407.0
2,4lhPHhPkVLM,Best Resources to Learn Data Visualization in ...,2025-06-03 12:00:19,10229,306,28,268.0
3,dkU5n0G7FRQ,Best Resources to Learn Excel in 2025,2025-05-27 12:01:27,11340,569,15,289.0
4,kagUDLvrcZ8,Best Resources to Learn Python in 2025,2025-05-20 12:01:32,18442,707,54,309.0


In [28]:
def clean_title(title):
    cleaned = re.sub(r'[^a-zA-Z0-9\s.,!?-]', '', str(title))
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    return cleaned.lower()

videos_df['video_title'] = videos_df['video_title'].astype(str).apply(clean_title)

In [29]:
videos_df.head(2)

Unnamed: 0,video_id,video_title,published_at,view_count,like_count,comment_count,duration_seconds
0,TsnGd6p9oTk,installing r and r studio r for data analytics...,2025-06-17 12:00:48,2787,114,9,359.0
1,DsI1vG-kXR8,best resources to learn data analytics in 2025,2025-06-10 12:00:19,16190,758,33,407.0


In [30]:
videos_df['published_year'] = videos_df['published_at'].dt.year
videos_df['published_month'] = videos_df['published_at'].dt.month
videos_df['published_day'] = videos_df['published_at'].dt.day
videos_df['published_weekday'] = videos_df['published_at'].dt.day_name()
videos_df['is_weekend'] = (videos_df['published_at'].dt.dayofweek >= 5).astype(int)

In [31]:
videos_df.head(2)

Unnamed: 0,video_id,video_title,published_at,view_count,like_count,comment_count,duration_seconds,published_year,published_month,published_day,published_weekday,is_weekend
0,TsnGd6p9oTk,installing r and r studio r for data analytics...,2025-06-17 12:00:48,2787,114,9,359.0,2025,6,17,Tuesday,0
1,DsI1vG-kXR8,best resources to learn data analytics in 2025,2025-06-10 12:00:19,16190,758,33,407.0,2025,6,10,Tuesday,0


In [32]:
cols_to_numeric = ['view_count', 'like_count', 'comment_count', 'duration_seconds']
videos_df[cols_to_numeric] = videos_df[cols_to_numeric].apply(pd.to_numeric, errors='coerce')
videos_df.dropna(subset=cols_to_numeric, inplace=True)

In [33]:
videos_df['like_ratio'] = videos_df['like_count'] / videos_df['view_count'].replace(0, 1)
videos_df['comment_ratio'] = videos_df['comment_count'] / videos_df['view_count'].replace(0, 1)

In [34]:
videos_df.head(2)

Unnamed: 0,video_id,video_title,published_at,view_count,like_count,comment_count,duration_seconds,published_year,published_month,published_day,published_weekday,is_weekend,like_ratio,comment_ratio
0,TsnGd6p9oTk,installing r and r studio r for data analytics...,2025-06-17 12:00:48,2787,114,9,359.0,2025,6,17,Tuesday,0,0.040904,0.003229
1,DsI1vG-kXR8,best resources to learn data analytics in 2025,2025-06-10 12:00:19,16190,758,33,407.0,2025,6,10,Tuesday,0,0.046819,0.002038


In [35]:
videos_df['title_length'] = videos_df['video_title'].apply(len)
videos_df['days_since_published'] = (datetime.now() - videos_df['published_at']).dt.days

In [36]:
videos_df.head(2)

Unnamed: 0,video_id,video_title,published_at,view_count,like_count,comment_count,duration_seconds,published_year,published_month,published_day,published_weekday,is_weekend,like_ratio,comment_ratio,title_length,days_since_published
0,TsnGd6p9oTk,installing r and r studio r for data analytics...,2025-06-17 12:00:48,2787,114,9,359.0,2025,6,17,Tuesday,0,0.040904,0.003229,53,10
1,DsI1vG-kXR8,best resources to learn data analytics in 2025,2025-06-10 12:00:19,16190,758,33,407.0,2025,6,10,Tuesday,0,0.046819,0.002038,46,17


In [37]:
videos_df['published_at'] = videos_df['published_at'].dt.strftime('%Y-%m-%d %H:%M:%S')

In [38]:
videos_df.head(2)

Unnamed: 0,video_id,video_title,published_at,view_count,like_count,comment_count,duration_seconds,published_year,published_month,published_day,published_weekday,is_weekend,like_ratio,comment_ratio,title_length,days_since_published
0,TsnGd6p9oTk,installing r and r studio r for data analytics...,2025-06-17 12:00:48,2787,114,9,359.0,2025,6,17,Tuesday,0,0.040904,0.003229,53,10
1,DsI1vG-kXR8,best resources to learn data analytics in 2025,2025-06-10 12:00:19,16190,758,33,407.0,2025,6,10,Tuesday,0,0.046819,0.002038,46,17


In [39]:
videos_df.to_csv("videos.csv", index=False)