# Final Project

CS 5830 - Data Science in Practice

Ann Marie Humble & Emma Lynn May

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import seaborn as sns
import isodate

In [None]:
# Per-genre advertising rate: the amount an advertiser pays per 1000 views.
# Figures were collected from Yahoo Finance and UberConnect.
AD_REVENUE_CPM = {
    "Slam Poetry": 0.36,
    "Minecraft": 0.44,
    "Movie Scenes": 1.24,
    "Woodworking": 3.02,
    "Space Songs": 0.01,
    "Ancient Memes": 0.00,
    "Health": 14.5,
    "Travel": 17.5,
    "Personal Finance": 18,
    "USU": 1.5,
}

In [None]:
# Load the video dataset into the `youtube` frame used by every later cell.
# Alternate input files that were previously swapped in on this line:
# 'tiny-data.csv' and 'new-youtube-data.csv'.
youtube = pd.read_csv(filepath_or_buffer="youtube-data.csv")

In [None]:
# Make sure the raw count columns are numeric before doing arithmetic on them.
for count_column in ("viewCount", "likeCount"):
    youtube[count_column] = pd.to_numeric(youtube[count_column])

# Estimated ad revenue: look up each video's genre rate (per 1000 views) and
# scale it by the view count. Genres absent from AD_REVENUE_CPM map to NaN.
youtube["genreCPM"] = youtube["genre"].map(AD_REVENUE_CPM)
youtube["adRevenue"] = (youtube["viewCount"] / 1000) * youtube["genreCPM"]

# Min-max normalize each metric into a matching "norm_<metric>" column.
for metric in ("adRevenue", "viewCount", "likeCount"):
    lowest = youtube[metric].min()
    highest = youtube[metric].max()
    youtube[f"norm_{metric}"] = (youtube[metric] - lowest) / (highest - lowest)

# "success" is the plain average of the three normalized metrics, kept to
# 7 decimal places. A NaN in any one metric makes the whole score NaN.
metric_total = (
    youtube["norm_adRevenue"]
    + youtube["norm_viewCount"]
    + youtube["norm_likeCount"]
)
youtube["success"] = (metric_total / 3).round(7)

# Convert the ISO 8601 duration strings into a number of seconds.
youtube["durationSeconds"] = youtube["duration"].apply(
    lambda iso_text: isodate.parse_duration(iso_text).total_seconds()
)

# Text-length features; rows lacking a description or a title are discarded.
# NOTE(review): the normalization above ran before these rows were removed,
# so its min/max include rows that are no longer in the frame.
youtube.dropna(subset=["description", "title"], inplace=True)
youtube["descriptionLen"] = youtube["description"].str.len()
youtube["titleLen"] = youtube["title"].str.len()

youtube

In [None]:
# Scatter of every video's success score against its row index, one color
# per genre.
plt.figure(figsize=(10, 6))
ax = plt.gca()

genres = youtube['genre'].unique()  # genres in order of first appearance
colors = plt.cm.tab10(range(len(genres)))  # one tab10 entry per genre

for genre_name, rgba in zip(genres, colors):
    # Rows belonging to this genre only
    genre_data = youtube.loc[youtube['genre'] == genre_name]
    ax.scatter(
        genre_data.index,
        genre_data['success'],
        s=100,
        color=rgba,
        label=genre_name,
    )

# Titles, axis labels, legend and grid
ax.set_title("Success by Genre", fontsize=16)
ax.set_xlabel("Index", fontsize=12)
ax.set_ylabel("Success", fontsize=12)
ax.legend(title="Genre")
ax.grid(True)
plt.show()

In [None]:
# Mean success score for each genre
genre_success_normalized = youtube.groupby('genre')['success'].mean()

# Bar chart of the per-genre means, shortest bar first
plt.figure(figsize=(10, 6))
bar_palette = plt.cm.tab10(range(len(genre_success_normalized)))
ax = genre_success_normalized.sort_values().plot.bar(
    color=bar_palette,
    edgecolor='black',
)

# Titles, axis labels, tick styling and horizontal grid lines
ax.set_title("Normalized Success by Genre", fontsize=16)
ax.set_xlabel("Genre", fontsize=12)
ax.set_ylabel("Average Success (Normalized)", fontsize=12)
plt.xticks(rotation=45, fontsize=10)
ax.grid(axis='y', linestyle='--', alpha=0.7)

plt.show()

In [None]:
# IQR rule on the success score: keep rows lying within 1.5 * IQR of the
# first and third quartiles (bounds inclusive). Rows with NaN success fail
# the range check and are therefore left out as well.
Q1, Q3 = youtube['success'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
df_no_outliers = youtube[youtube['success'].between(lower_fence, upper_fence)]

# Scatter of the remaining videos' success scores against row index
plt.figure(figsize=(10, 6))
ax = plt.gca()

genres = df_no_outliers['genre'].unique()  # genres in order of first appearance
colors = plt.cm.tab10(range(len(genres)))  # one tab10 entry per genre

for genre_name, rgba in zip(genres, colors):
    # Rows belonging to this genre only
    genre_data = df_no_outliers.loc[df_no_outliers['genre'] == genre_name]
    ax.scatter(
        genre_data.index,
        genre_data['success'],
        s=100,
        color=rgba,
        label=genre_name,
    )

# Titles, axis labels, legend and grid
ax.set_title("Success by Genre (No Outliers)", fontsize=16)
ax.set_xlabel("Index", fontsize=12)
ax.set_ylabel("Success", fontsize=12)
ax.legend(title="Genre")
ax.grid(True)
plt.show()

In [None]:
# Mean success score for each genre, computed on the outlier-filtered frame
genre_success_normalized = df_no_outliers.groupby('genre')['success'].mean()

# Bar chart of the per-genre means, shortest bar first
plt.figure(figsize=(10, 6))
bar_palette = plt.cm.tab10(range(len(genre_success_normalized)))
ax = genre_success_normalized.sort_values().plot.bar(
    color=bar_palette,
    edgecolor='black',
)

# Titles, axis labels, tick styling and horizontal grid lines
ax.set_title("Normalized Success by Genre (No Outliers)", fontsize=16)
ax.set_xlabel("Genre", fontsize=12)
ax.set_ylabel("Average Success (Normalized)", fontsize=12)
plt.xticks(rotation=45, fontsize=10)
ax.grid(axis='y', linestyle='--', alpha=0.7)

plt.show()

In [None]:
# Mean estimated ad revenue for each genre. This averages the raw 'adRevenue'
# column (not 'norm_adRevenue'); "normalized" in the labels below refers to
# taking the per-genre mean.
# NOTE(review): when cells run top to bottom, df_no_outliers here is still the
# frame filtered on 'success'; the adRevenue-based filter is assigned later.
genre_success_normalized = df_no_outliers.groupby('genre')['adRevenue'].mean()

# Bar chart of the per-genre means, shortest bar first
plt.figure(figsize=(10, 6))
bar_palette = plt.cm.tab10(range(len(genre_success_normalized)))
ax = genre_success_normalized.sort_values().plot.bar(
    color=bar_palette,
    edgecolor='black',
)

# Titles, axis labels, tick styling and horizontal grid lines
ax.set_title("Normalized Estimated Ad Revenue by Genre (No Outliers)", fontsize=16)
ax.set_xlabel("Genre", fontsize=12)
ax.set_ylabel("Average Estimated Ad Revenue (Normalized)", fontsize=12)
plt.xticks(rotation=45, fontsize=10)
ax.grid(axis='y', linestyle='--', alpha=0.7)

plt.show()

In [None]:
# IQR rule on estimated ad revenue: keep rows lying within 1.5 * IQR of the
# first and third quartiles (bounds inclusive). Rows with NaN adRevenue fail
# the range check and are therefore left out as well.
# This rebinds df_no_outliers, replacing the success-based filter.
Q1, Q3 = youtube['adRevenue'].quantile([0.25, 0.75])
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
df_no_outliers = youtube[youtube['adRevenue'].between(lower_fence, upper_fence)]

# Scatter of the remaining videos' estimated ad revenue against row index
plt.figure(figsize=(10, 6))
ax = plt.gca()

genres = df_no_outliers['genre'].unique()  # genres in order of first appearance
colors = plt.cm.tab10(range(len(genres)))  # one tab10 entry per genre

for genre_name, rgba in zip(genres, colors):
    # Rows belonging to this genre only
    genre_data = df_no_outliers.loc[df_no_outliers['genre'] == genre_name]
    ax.scatter(
        genre_data.index,
        genre_data['adRevenue'],
        s=100,
        color=rgba,
        label=genre_name,
    )

# Titles, axis labels, legend and grid
ax.set_title("Estimated Ad Revenue by Genre (No Outliers)", fontsize=16)
ax.set_xlabel("Index", fontsize=12)
ax.set_ylabel("Estimated Ad Revenue", fontsize=12)
ax.legend(title="Genre")
ax.grid(True)
plt.show()

In [None]:
# Discard videos that have no success score (any NaN among the normalized
# metrics makes the score NaN).
youtube.dropna(subset=['success'], inplace=True)

# df_no_outliers was sliced from youtube in an earlier cell and is a separate
# frame, so the in-place drop above does not reach it. The correlation cell
# below reads df_no_outliers, and a NaN in either input makes
# scipy.stats.pearsonr report NaN, so remove the same rows from it too.
df_no_outliers = df_no_outliers.dropna(subset=['success'])

In [None]:
print("What attributes are correlated with successful videos?")

# Each entry pairs the label shown in the report with the df_no_outliers
# column that is correlated (Pearson) against the "success" score.
# df_no_outliers is whichever filtered frame was assigned most recently;
# when cells run top to bottom that is the adRevenue-based IQR filter.
attribute_columns = [
    ("Length", "durationSeconds"),
    ("Caption Status", "caption"),
    ("Description Length", "descriptionLen"),
    ("Title Length", "titleLen"),
    ("Paid Product Placement", "hasPaidProductPlacement"),
]

for label, column in attribute_columns:
    corr, pval = stats.pearsonr(df_no_outliers[column], df_no_outliers["success"])
    print(f"{label}: {round(corr, 3)}, p-value: {round(pval, 3)}")

In [None]:
# Regression plot of the success score against caption status
ax = sns.regplot(x='caption', y='success', data=df_no_outliers)
ax.set_xlabel('Video Caption Status')
ax.set_ylabel('Video "Success"')
plt.show()

In [None]:
# Regression plot of the success score against description length
ax = sns.regplot(x='descriptionLen', y='success', data=df_no_outliers)
ax.set_xlabel('Description Length')
ax.set_ylabel('Video "Success"')
plt.show()

In [None]:
# The six videos with the highest success score, taken from the full
# `youtube` frame rather than the outlier-filtered one.
top_6 = youtube.nlargest(n=6, columns="success")
print(top_6)