# DSCI 320: Project Milestone 2

## Global YouTube Statistics 2023


As business, we want to market/ target the right audience. (can be from learning about categories, channel types, subscriber number…)... - to edit later

## Data Tidy/Wrangling

In [40]:
import altair as alt
import pandas as pd
import os

# Load df, select and rename columns, drop NA
df = pd.read_csv("Data/Global YouTube Statistics.csv", encoding='latin-1', parse_dates=['created_year'])
selected_columns = ['subscribers', 'Youtuber','video views', 'category', 'uploads', 'Country',
                    'lowest_yearly_earnings', 'highest_yearly_earnings','created_year', 
                    'Gross tertiary education enrollment (%)','Population','Unemployment rate']

clean_df = df[selected_columns].rename(
    columns={'video views':'video_views', 'Youtuber':'youtuber', 'category':'channel_category', 'uploads':'channel_uploads', 
             'Country': 'region', 'Gross tertiary education enrollment (%)' :'tertiary_education_enrollment',
             'Population':'population','Unemployment rate':'unemployment_rate'}
).dropna()

In [41]:
# Grouping Categories
def categorize_channel(category):
    if category in ['Film & Animation', 'Entertainment', 'Shows', 'Gaming', 'Comedy', 'Trailers', 'Movies','Music']:
        return 'Entertainment'
    elif category in ['Education', 'Howto & Style', 'Science & Technology','News & Politics']:
        return 'Education'
    elif category in ['People & Blogs', 'Pets & Animals', 'Travel & Events']:
        return 'Lifestyle'
    elif category in ['Nonprofits & Activism', 'Autos & Vehicles', 'Sports']:
        return 'Other'
    else:
        return 'Other' 

clean_df['channel_category'] = clean_df['channel_category'].apply(categorize_channel)

# Grouping Regions
def categorize_region(country):
    region_mapping = {
        'N.America': ['United States', 'Canada', 'Mexico', 'El Salvador', 'Barbados'],
        'S.America': ['Brazil', 'Argentina', 'Chile', 'Cuba', 'Colombia', 'Venezuela', 'Ecuador', 'Peru'],
        'Europe': ['United Kingdom', 'Netherlands', 'Spain', 'Italy', 'Germany', 'France', 'Sweden', 'Ukraine', 'Russia', 'Latvia', 'Switzerland', 'Finland'],
        'Asia': ['India', 'Japan', 'South Korea', 'Pakistan', 'Philippines', 'Thailand', 'United Arab Emirates', 'Saudi Arabia', 'Indonesia', 
                 'Kuwait', 'Jordan', 'Turkey', 'China', 'Singapore', 'Vietnam', 'Malaysia', 'Iraq', 'Bangladesh', 'Afghanistan'],
        'Africa': ['Morocco', 'Egypt'],
        'Australia': ['Australia','Samoa']
    }
    return next((region for region, countries in region_mapping.items() if country in countries), 'Other')

clean_df['region'] = clean_df['region'].apply(categorize_region)

In [42]:
# Drop Africa, Australia - not enough data.
region_counts = clean_df['region'].value_counts()
print(region_counts)
clean_df = clean_df[(clean_df['region'] != 'Australia') & (clean_df['region'] != 'Africa')]

#Drop year 1970, show only Year.
clean_df = clean_df[clean_df['created_year'].dt.year != 1970]
clean_df['created_year'] = pd.to_datetime(clean_df['created_year']).dt.year
clean_df.head()

region
N.America    340
Asia         275
Europe       110
S.America     93
Australia      9
Africa         3
Name: count, dtype: int64


Unnamed: 0,subscribers,youtuber,video_views,channel_category,channel_uploads,region,lowest_yearly_earnings,highest_yearly_earnings,created_year,tertiary_education_enrollment,population,unemployment_rate
0,245000000,T-Series,228000000000.0,Entertainment,20082,Asia,6800000.0,108400000.0,2006,28.1,1366418000.0,5.36
1,170000000,YouTube Movies,0.0,Entertainment,1,N.America,0.04,0.58,2006,88.2,328239500.0,14.7
2,166000000,MrBeast,28368840000.0,Entertainment,741,N.America,4000000.0,64700000.0,2012,88.2,328239500.0,14.7
3,162000000,Cocomelon - Nursery Rhymes,164000000000.0,Education,966,N.America,5900000.0,94800000.0,2006,88.2,328239500.0,14.7
4,159000000,SET India,148000000000.0,Entertainment,116536,Asia,5500000.0,87500000.0,2006,28.1,1366418000.0,5.36


## Data Viz for Tasks

### TASK 1: 
#### “What is the distribution of subscribers for each Youtube channel categories based on region?”

In [43]:
chart1 = alt.Chart(clean_df).mark_bar().encode(
    alt.Y('subscribers:Q', title = 'Subscribers').stack('normalize'),
    alt.X('region:N', title = 'Region'),
    alt.Color('channel_category', title = 'Channel Category')
).properties(title = 'Subscriber Counts by Channel Category in Different Regions', width = 400, height=350)
chart1

### TASK 2: 
#### "What are the top earners among YT channel types on a yearly basis?"

In [34]:
chart2 = alt.Chart(clean_df).mark_rect().encode(
    alt.X('created_year:O', title = 'Created Year'),
    alt.Y('channel_category', title = 'Channel Category'),
   # alt.Color('channel_type'), 
    alt.Color('average(highest_yearly_earnings)', title = 'Highest Average Yearly Earnings')
).properties(title = 'Highest Earnings for each Channel Category by Year',
            height=200, width=350)
chart2 

### TASK 3: 
#### "In which countries does the channel have the highest and lowest rankings based on subscribers, and how can localization strategies be employed to enhance engagement in specific regions?" 

### TASK 4 
#### "How does the number of subscribers and video views correlate with the channel's earnings, and what are the top 10 YouTube channels within each category based on subscribers?

### TASK 5: 
#### "Is there a correlation between the percentage of the population enrolled in tertiary education in each region and the channel's success in terms of subscribers and video views?" 

In [38]:
chart5 = alt.Chart(clean_df).mark_point().encode(
    alt.X('tertiary_education_enrollment', title = 'tertiary education enrollment (%)'),
    alt.Y('subscribers', title = 'subscribers'),
   alt.Size('video_views', title = 'video views'), 
    alt.Color('region'),
    alt.Tooltip(['youtuber','region','subscribers'])
).properties(title = "Correlation between Tertiary Education Enrollment and Channel's Success",
            height=400, width=550)
chart5 

#### Objective:
Explore the potential link between educational demographics and the channel's popularity, providing insights into content preferences and engagement patterns.

## .