In [1]:
import altair as alt
import pandas as pd
import os

# Load the raw statistics file. created_year is parsed as a date at read time.
df = pd.read_csv(
    "Data/Global YouTube Statistics.csv",
    encoding='latin-1',
    parse_dates=['created_year'],
)

# Columns kept for the analysis, listed under their names in the raw file.
selected_columns = [
    'subscribers', 'Youtuber', 'video views', 'category', 'uploads', 'Country',
    'lowest_yearly_earnings', 'highest_yearly_earnings', 'created_year',
    'Gross tertiary education enrollment (%)', 'Population', 'Unemployment rate',
    'channel_type', 'Latitude', 'Longitude',
]

# Raw column name -> name used in the rest of the notebook.
# Columns that are not listed here keep their raw names.
column_renames = {
    'video views': 'video_views',
    'Youtuber': 'youtuber',
    'category': 'channel_category',
    'uploads': 'channel_uploads',
    'Country': 'country',
    'Gross tertiary education enrollment (%)': 'tertiary_education_enrollment',
    'Population': 'population',
    'Unemployment rate': 'unemployment_rate',
}

# Keep the selected columns, rename them, then drop every row that has a
# missing value in any of the kept columns.
clean_df = (
    df[selected_columns]
    .rename(columns=column_renames)
    .dropna()
)

In [2]:
# Grouping Categories
def categorize_channel(category):
    """Map a YouTube category onto one of the notebook's broader channel groups.

    Parameters
    ----------
    category : str
        Either a raw category name (e.g. ``'Comedy'``) or a group label that
        this function already produced (e.g. ``'Lifestyle'``).

    Returns
    -------
    str
        One of ``'Entertainment'``, ``'Music'``, ``'Gaming'``, ``'Education'``,
        ``'Lifestyle'``, ``'Media and Sports'`` or ``'Other'``. Any value that
        is not listed below (including a missing value) maps to ``'Other'``.
    """
    # Each group lists its own label as a member so the mapping is idempotent:
    # the result is written back over the input column, and without this a
    # second run of the cell would turn 'Lifestyle' and 'Media and Sports'
    # into 'Other'.
    channel_groups = {
        'Entertainment': ('Entertainment',),
        'Music': ('Music',),
        'Gaming': ('Gaming',),
        'Education': ('Education', 'Howto & Style', 'Science & Technology',
                      'News & Politics', 'Nonprofits & Activism'),
        'Lifestyle': ('Lifestyle', 'People & Blogs', 'Pets & Animals',
                      'Travel & Events', 'Comedy', 'Autos & Vehicles'),
        'Media and Sports': ('Media and Sports', 'Film & Animation', 'Shows',
                             'Trailers', 'Movies', 'Sports'),
    }
    for group, members in channel_groups.items():
        if category in members:
            return group
    return 'Other'

# Overwrite the raw category column with the broader group for each channel.
raw_categories = clean_df['channel_category']
clean_df['channel_category'] = raw_categories.apply(categorize_channel)

# Grouping Regions
def categorize_region(country):
    """Return the region label for ``country``, or ``'Other'`` if it is not listed.

    Regions are checked in the order they are written below; the first region
    whose country list contains ``country`` is returned.
    """
    # NOTE(review): Cuba is listed under S.America while Barbados and
    # El Salvador are listed under N.America - confirm this split is intended.
    countries_by_region = {
        'N.America': ['United States', 'Canada', 'Mexico', 'El Salvador', 'Barbados'],
        'S.America': ['Brazil', 'Argentina', 'Chile', 'Cuba', 'Colombia', 'Venezuela',
                      'Ecuador', 'Peru'],
        'Europe': ['United Kingdom', 'Netherlands', 'Spain', 'Italy', 'Germany', 'France',
                   'Sweden', 'Ukraine', 'Russia', 'Latvia', 'Switzerland', 'Finland'],
        'Asia': ['India', 'Japan', 'South Korea', 'Pakistan', 'Philippines', 'Thailand',
                 'United Arab Emirates', 'Saudi Arabia', 'Indonesia', 'Kuwait', 'Jordan',
                 'Turkey', 'China', 'Singapore', 'Vietnam', 'Malaysia', 'Iraq',
                 'Bangladesh', 'Afghanistan'],
        'Africa': ['Morocco', 'Egypt'],
        'Australia': ['Australia', 'Samoa'],
    }
    for label, members in countries_by_region.items():
        if country in members:
            return label
    return 'Other'

# Add a region column derived from each channel's country.
countries = clean_df['country']
clean_df['region'] = countries.apply(categorize_region)

In [3]:
# Show how many channels fall in each region before filtering.
region_counts = clean_df['region'].value_counts()
print(region_counts)

# Drop Africa, Australia - not enough data.
is_sparse_region = (clean_df['region'] == 'Australia') | (clean_df['region'] == 'Africa')
clean_df = clean_df[~is_sparse_region]

# Drop year 1970, show only Year.
# The column is datetime after loading, but it is already an integer year if
# this cell has been run before; only use the .dt accessor in the first case
# so that re-running the cell does not raise.
created_year = clean_df['created_year']
if pd.api.types.is_datetime64_any_dtype(created_year):
    created_year = created_year.dt.year
keep_year = created_year != 1970

# Take an explicit copy of the filtered rows before assigning the column;
# assigning on a filtered slice is what triggers pandas' SettingWithCopyWarning.
clean_df = clean_df[keep_year].copy()
clean_df['created_year'] = created_year[keep_year]
clean_df.head()

region
N.America    340
Asia         271
Europe       108
S.America     93
Australia      9
Africa         3
Name: count, dtype: int64


Unnamed: 0,subscribers,youtuber,video_views,channel_category,channel_uploads,country,lowest_yearly_earnings,highest_yearly_earnings,created_year,tertiary_education_enrollment,population,unemployment_rate,channel_type,Latitude,Longitude,region
0,245000000,T-Series,228000000000.0,Music,20082,India,6800000.0,108400000.0,2006,28.1,1366418000.0,5.36,Music,20.593684,78.96288,Asia
1,170000000,YouTube Movies,0.0,Media and Sports,1,United States,0.04,0.58,2006,88.2,328239500.0,14.7,Games,37.09024,-95.712891,N.America
2,166000000,MrBeast,28368840000.0,Entertainment,741,United States,4000000.0,64700000.0,2012,88.2,328239500.0,14.7,Entertainment,37.09024,-95.712891,N.America
3,162000000,Cocomelon - Nursery Rhymes,164000000000.0,Education,966,United States,5900000.0,94800000.0,2006,88.2,328239500.0,14.7,Education,37.09024,-95.712891,N.America
4,159000000,SET India,148000000000.0,Media and Sports,116536,India,5500000.0,87500000.0,2006,28.1,1366418000.0,5.36,Entertainment,20.593684,78.96288,Asia


In [4]:
# Write the cleaned frame to CSV. The destination defaults to the path this
# notebook originally used; set the CLEAN_DF_CSV environment variable to write
# somewhere else (for example when running on another machine).
clean_df_csv_path = os.environ.get(
    "CLEAN_DF_CSV",
    "/Users/serra/Documents/school/DSCI320Project/clean_df.csv",
)
clean_df.to_csv(clean_df_csv_path)