# In [None]:
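# Install required packages (uncomment if not already installed).
# The extra is quoted so shells that expand square brackets (e.g. zsh) pass it through unchanged.
# pip install pandas matplotlib seaborn wordcloud textblob "kagglehub[pandas-datasets]"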

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from textblob import TextBlob
import kagglehub
from kagglehub import KaggleDatasetAdapter

# --- Load Dataset from KaggleHub ---
# NOTE(review): "USvideos.csv" and the columns this script relies on
# (publish_time, views, channel_title, trending_date as yy.dd.mm) match the
# "datasnaek/youtube-new" dataset. The previously used handle
# "rsrishav/youtube-trending-video-dataset" does not appear to ship a file
# with this name or these column names, so the handle is corrected here.
dataset_handle = "datasnaek/youtube-new"
file_path = "USvideos.csv"  # Specify the file you want from the dataset

# Newer kagglehub releases expose the loader as `dataset_load` and deprecate
# `load_dataset`; prefer the new name but keep working on older installs.
_load_kaggle_dataset = (
    getattr(kagglehub, "dataset_load", None) or kagglehub.load_dataset
)

df = _load_kaggle_dataset(
    KaggleDatasetAdapter.PANDAS,
    dataset_handle,
    file_path,
)

# --- Data Overview ---
# Show the first rows as a quick sanity check of the loaded frame.
print(df.head())
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()

# --- Data Cleaning ---
# One pass over the frame:
#   * publish_time  -> datetime; values that fail to parse are coerced to NaT
#   * trending_date -> datetime, parsed with the '%y.%d.%m' layout
#                      (two-digit year, day, month); failures become NaT
#   * tags          -> missing values replaced by an empty string
# and finally fully duplicated rows are removed.
df = df.assign(
    publish_time=pd.to_datetime(df['publish_time'], errors='coerce'),
    trending_date=pd.to_datetime(
        df['trending_date'], format='%y.%d.%m', errors='coerce'
    ),
    tags=df['tags'].fillna(''),
).drop_duplicates()

# --- Most Trending Channels ---
# Count how often each channel title occurs and keep the ten most frequent.
top_channels = df['channel_title'].value_counts().head(10)

# Draw the counts as horizontal bars and label the chart through the Axes
# object that the pandas plotting call returns.
channels_ax = top_channels.plot(kind='barh', color='purple')
channels_ax.set_title("Top 10 Trending Channels")
channels_ax.set_xlabel("Number of Trending Videos")
channels_ax.set_ylabel("Channel")
plt.tight_layout()
plt.show()

# --- Average Views, Likes, Comments ---
# Compute the mean of each engagement column, one column at a time.
avg_views, avg_likes, avg_comments = (
    df[column].mean() for column in ('views', 'likes', 'comment_count')
)

# Report the three means, one per line, rounded to two decimals.
print(
    f"Average Views: {avg_views:.2f}",
    f"Average Likes: {avg_likes:.2f}",
    f"Average Comments: {avg_comments:.2f}",
    sep="\n",
)

# --- Sentiment Analysis on Titles ---
def get_sentiment(text):
    """Return the TextBlob sentiment polarity of *text*.

    The argument is passed through ``str()`` first, so non-string values
    (for example missing entries) are scored on their string form instead
    of raising.
    """
    return TextBlob(str(text)).sentiment.polarity

# Score every title and keep the result in a new 'title_sentiment' column.
df['title_sentiment'] = df['title'].map(get_sentiment)

# Histogram of the scores (50 bins) with a KDE curve overlaid; labels are
# applied through the Axes that seaborn returns.
sentiment_ax = sns.histplot(
    df['title_sentiment'], bins=50, kde=True, color='skyblue'
)
sentiment_ax.set_title("Sentiment Distribution of Video Titles")
sentiment_ax.set_xlabel("Sentiment Polarity")
sentiment_ax.set_ylabel("Frequency")
plt.tight_layout()
plt.show()

# --- Word Cloud of Titles ---
# Build one space-separated corpus from the titles; entries that are not
# strings are skipped.
title_strings = [title for title in df['title'] if isinstance(title, str)]
text = " ".join(title_strings)

# Configure the cloud (800x400, white background) and fit it to the corpus.
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(text)

# Render the cloud as an image on its own figure, without axis decorations.
cloud_fig, cloud_ax = plt.subplots(figsize=(15, 7))
cloud_ax.imshow(wordcloud, interpolation='bilinear')
cloud_ax.axis('off')
cloud_ax.set_title("Word Cloud of Video Titles")
cloud_fig.tight_layout()
plt.show()
