In [94]:
import csv
import json
import re
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb

In [95]:
# Input locations, relative to the notebook's working directory.
youtube_video = "../datasets/original/US_youtube_trending_data_15_4_24.csv"
channel = "../datasets/original/Top_Youtubers_Dataset.csv"
category_json = "../US_category_id.json"
api_dataset = "../datasets/cleaned/youtube_video_with_api.csv"

# Every CSV is read with double-quote quoting and backslash as the escape character.
youtube_video_data = pd.read_csv(youtube_video, quotechar='"', escapechar='\\')
# The top-channels file is decoded as ISO-8859-1 instead of pandas' default UTF-8.
channel_data = pd.read_csv(channel, quotechar='"', escapechar='\\', encoding='ISO-8859-1')
api_data = pd.read_csv(api_dataset, quotechar='"', escapechar='\\')

# Pin the encoding: without it open() falls back to the platform default,
# which is not UTF-8 everywhere and would make JSON decoding machine-dependent.
with open(category_json, "r", encoding="utf-8") as file:
    category_data = json.load(file)

In [96]:
# cleaning video data

# restricting date (1 year)
def restrict_date(df, start='2023-04-15', end='2024-04-15'):
    """Return the rows of ``df`` whose ``trending_date`` lies in ``[start, end]``.

    The ``trending_date`` column is parsed to datetimes on a new frame, so the
    caller's frame is left untouched and the result can be assigned to freely
    without triggering ``SettingWithCopyWarning``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``trending_date`` column (date strings or datetimes).
    start, end : str or datetime-like, optional
        Inclusive bounds of the window; the defaults select 2023-04-15 through
        2024-04-15. A naive bound is localized to the column's timezone when
        the parsed column is timezone-aware; an aware bound is converted to it
        (or stripped of its timezone when the column is naive).

    Returns
    -------
    pandas.DataFrame
        A new frame that keeps the original index labels, with
        ``trending_date`` converted to datetime. Rows whose date could not be
        parsed (NaT) never satisfy the range test and are therefore excluded.
    """
    raw_dates = df['trending_date']
    try:
        # Let pandas infer the format first.
        parsed = pd.to_datetime(raw_dates)
    except (ValueError, TypeError):
        # Inference failed (e.g. malformed or mixed values). Try each known
        # format against the *raw* values and keep the first one that parses
        # anything; entries that do not match become NaT. If no format matches,
        # the column ends up all-NaT and the result is empty.
        for fmt in ('%Y-%m-%dT%H:%M:%SZ', '%Y-%m-%d', '%Y/%m/%d'):
            parsed = pd.to_datetime(raw_dates, format=fmt, errors='coerce')
            if parsed.notna().any():
                break

    # Comparing naive and aware timestamps raises, so the bounds must carry
    # the same timezone-awareness as the parsed column.
    tz = parsed.dt.tz

    def _as_bound(value):
        bound = pd.Timestamp(value)
        if tz is None:
            return bound.tz_localize(None) if bound.tzinfo is not None else bound
        return bound.tz_localize(tz) if bound.tzinfo is None else bound.tz_convert(tz)

    in_window = (parsed >= _as_bound(start)) & (parsed <= _as_bound(end))

    # assign() builds a new frame; the final copy() detaches the filtered
    # result so later column assignments do not warn.
    return df.assign(trending_date=parsed).loc[in_window].copy()

# counting tags
def count_tags(tags_str):
    """Count the tags in a ``|``-delimited tag string.

    Parameters
    ----------
    tags_str : str or missing value
        Tag string such as ``'a|b|c'``. Missing values, empty or
        whitespace-only strings and the ``'[None]'`` placeholder count as
        having no tags.

    Returns
    -------
    int
        Number of non-empty tags. Empty segments produced by doubled or
        trailing delimiters (``'a||b'``, ``'a|b|'``) are not counted.
    """
    if pd.isna(tags_str):
        return 0

    text = str(tags_str).strip()
    if not text or text == '[None]':
        return 0

    # A string without '|' yields a single segment, i.e. one tag.
    return sum(1 for tag in text.split('|') if tag.strip())


# removing columns, then restricting to the date window
# (.copy() detaches the filtered frame so the column assignments below do not
# trigger SettingWithCopyWarning)
youtube_video_data = youtube_video_data.drop(columns=['dislikes', 'thumbnail_link', 'ratings_disabled'])
youtube_video_data = restrict_date(youtube_video_data).copy()

# combining api dataset
# NOTE(review): the API columns are attached by position, which assumes api_data
# holds one row per date-filtered video in the same order - confirm against how
# youtube_video_with_api.csv was produced.
if len(api_data) != len(youtube_video_data):
    raise ValueError(
        f"api_data has {len(api_data)} rows but the date-filtered video data has "
        f"{len(youtube_video_data)}; cannot combine them by position"
    )
youtube_video_data['video_length'] = api_data['video_length'].values
youtube_video_data['video_language'] = api_data['video_language'].values

# mapping category id -> category title
category_mapping = {
    item['id']: item["snippet"]["title"]
    for item in category_data.get("items", [])
}

# categoryId is cast to str before the lookup; ids missing from the mapping become NaN
youtube_video_data["categoryId"] = youtube_video_data["categoryId"].astype(str).map(category_mapping)
youtube_video_data = youtube_video_data.rename(columns={'categoryId': 'category'})

# mapping top channel (True/False): does the channel title appear in the top-channels list?
top_channels = channel_data["Youtuber"].tolist()
youtube_video_data["Top channel"] = youtube_video_data["channelTitle"].isin(top_channels)

# turning description into a 'Y'/'N' flag: 'Y' when non-blank text is present
youtube_video_data['description'] = youtube_video_data['description'].apply(
    lambda x: 'Y' if pd.notna(x) and str(x).strip() != '' else 'N'
)
youtube_video_data['count_tags'] = youtube_video_data['tags'].apply(count_tags)

# drop_duplicates() is intentionally not applied: the author noted the data has no duplicates
youtube_video_data.head()

Unnamed: 0,video_id,title,publishedAt,channelId,channelTitle,category,trending_date,tags,view_count,likes,comment_count,comments_disabled,description,video_length,video_language,Top channel,count_tags
195990,gMq-I0dejjE,Ice Spice & Nicki Minaj - Princess Diana (Offi...,2023-04-14T16:00:09Z,UCCdbL-yl7Qa_vurbYWdn5SA,IceSpiceVEVO,Music,2023-04-15 00:00:00+00:00,Ice Spice|Nicki Minaj|Princess Diana,4474729,387627,32406,False,Y,182,unknown,False,3
195991,STPzH0Uob54,My Brother Almost Died (18 Times...),2023-04-14T17:00:02Z,UC2hm5rD_IrfYRMfq5YQudgA,Haminations,Comedy,2023-04-15 00:00:00+00:00,Haminations,1584808,88621,15881,False,Y,943,en,False,1
195992,uHGShqcAHlQ,The Legend of Zelda: Tears of the Kingdom – Of...,2023-04-13T14:06:39Z,UCGIY_O-8vW4rfX98KlMkvRg,Nintendo of America,Gaming,2023-04-15 00:00:00+00:00,nintendo|game|gameplay|fun|video game|action|a...,6121261,352321,29624,False,Y,235,en,False,21
195993,SAheAwnUbcE,YoungBoy Never Broke Again Ft Mariah the Scien...,2023-04-14T04:09:20Z,UClW4jraMKz6Qj69lJf-tODA,YoungBoy Never Broke Again,Music,2023-04-15 00:00:00+00:00,YoungBoy Never Broke Again|NBA YoungBoy|YoungB...,1208574,98889,7752,False,N,161,unknown,False,23
195994,T3R8onC1yM0,Man United vs. Sevilla: Extended Highlights | ...,2023-04-13T21:29:22Z,UCf8YPuOWXlpTS7RibaJlP4g,CBS Sports Golazo - Europe,Sports,2023-04-15 00:00:00+00:00,[None],650463,5945,1374,False,Y,744,en,False,0


In [97]:
output_file = "../datasets/final/cleaned_youtube_trending_with_api.csv"
# csv.QUOTE_ALL (the named form of quoting=1) wraps every field in quotes, not
# only those containing commas; the DataFrame index is not written.
youtube_video_data.to_csv(output_file, index=False, quoting=csv.QUOTE_ALL)

In [98]:
# Clean channel data:
#   * drop the 'Rank' column
#   * keep only rows whose 'Video Views' and 'Video Count' are both positive
#     (removes zeros, and also negatives / missing values, since NaN > 0 is False)
#   * drop rows with a missing 'Category'
channel_data = channel_data.drop(columns=['Rank'])
has_activity = (channel_data['Video Views'] > 0) & (channel_data['Video Count'] > 0)
channel_data = channel_data[has_activity].dropna(subset=['Category'])

# csv.QUOTE_ALL (the named form of quoting=1) quotes every field; the index is not written.
channel_output_file = "../datasets/final/cleaned_top_channel_.csv"
channel_data.to_csv(channel_output_file, index=False, quoting=csv.QUOTE_ALL)

# Preview the cleaned data; only the last expression of a cell is rendered,
# so this must come after the write.
channel_data.head()