# Instalações necessarias

In [81]:
# Pydrive para acesso ao drive
!pip install -U -q PyDrive
# Pycaret para o modelo
!pip install -U -q pycaret[full]

# Importações necessarias

In [82]:
# Todos os imports
import pandas as pd
import numpy as np

# Baixar o dataset (Global YouTube Statistics 2023)

In [83]:
# Download do dataset do Drive
!gdown 1tXfxpKWjtvtmf9J-XYhFIOinn68c5TWg

Downloading...
From: https://drive.google.com/uc?id=1tXfxpKWjtvtmf9J-XYhFIOinn68c5TWg
To: /content/dataset.csv
  0% 0.00/200k [00:00<?, ?B/s]100% 200k/200k [00:00<00:00, 57.0MB/s]


# Carregar o dataframe


In [84]:
df = pd.read_csv('./dataset.csv',encoding="latin-1")

# Ajeitando os nomes das colunas

In [85]:
df.columns = df.columns.str.replace(' ', '_')
df.columns = df.columns.str.lower()
df.rename(columns={'video_views_for_the_last_30_days': 'video_views_last_30_days'}, inplace=True)
df.rename(columns={'gross_tertiary_education_enrollment_(%)': 'percent_gross_tertiary_education'}, inplace=True)
df.rename(columns={'subscribers_for_last_30_days': 'subscribers_last_30_days'}, inplace=True)

# Verificando e explorando o dataset

In [86]:
df.head()

Unnamed: 0,rank,youtuber,subscribers,video_views,category,title,uploads,country,abbreviation,channel_type,...,subscribers_last_30_days,created_year,created_month,created_date,percent_gross_tertiary_education,population,unemployment_rate,urban_population,latitude,longitude
0,1,T-Series,245000000,228000000000.0,Music,T-Series,20082,India,IN,Music,...,2000000.0,2006.0,Mar,13.0,28.1,1366418000.0,5.36,471031528.0,20.593684,78.96288
1,2,YouTube Movies,170000000,0.0,Film & Animation,youtubemovies,1,United States,US,Games,...,,2006.0,Mar,5.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891
2,3,MrBeast,166000000,28368840000.0,Entertainment,MrBeast,741,United States,US,Entertainment,...,8000000.0,2012.0,Feb,20.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891
3,4,Cocomelon - Nursery Rhymes,162000000,164000000000.0,Education,Cocomelon - Nursery Rhymes,966,United States,US,Education,...,1000000.0,2006.0,Sep,1.0,88.2,328239500.0,14.7,270663028.0,37.09024,-95.712891
4,5,SET India,159000000,148000000000.0,Shows,SET India,116536,India,IN,Entertainment,...,1000000.0,2006.0,Sep,20.0,28.1,1366418000.0,5.36,471031528.0,20.593684,78.96288


In [87]:
df.shape

(995, 28)

In [88]:
df.columns

Index(['rank', 'youtuber', 'subscribers', 'video_views', 'category', 'title',
       'uploads', 'country', 'abbreviation', 'channel_type',
       'video_views_rank', 'country_rank', 'channel_type_rank',
       'video_views_last_30_days', 'lowest_monthly_earnings',
       'highest_monthly_earnings', 'lowest_yearly_earnings',
       'highest_yearly_earnings', 'subscribers_last_30_days', 'created_year',
       'created_month', 'created_date', 'percent_gross_tertiary_education',
       'population', 'unemployment_rate', 'urban_population', 'latitude',
       'longitude'],
      dtype='object')

In [89]:
df.dtypes

rank                                  int64
youtuber                             object
subscribers                           int64
video_views                         float64
category                             object
title                                object
uploads                               int64
country                              object
abbreviation                         object
channel_type                         object
video_views_rank                    float64
country_rank                        float64
channel_type_rank                   float64
video_views_last_30_days            float64
lowest_monthly_earnings             float64
highest_monthly_earnings            float64
lowest_yearly_earnings              float64
highest_yearly_earnings             float64
subscribers_last_30_days            float64
created_year                        float64
created_month                        object
created_date                        float64
percent_gross_tertiary_education

In [90]:
df.isnull().sum()

rank                                  0
youtuber                              0
subscribers                           0
video_views                           0
category                             46
title                                 0
uploads                               0
country                             122
abbreviation                        122
channel_type                         30
video_views_rank                      1
country_rank                        116
channel_type_rank                    33
video_views_last_30_days             56
lowest_monthly_earnings               0
highest_monthly_earnings              0
lowest_yearly_earnings                0
highest_yearly_earnings               0
subscribers_last_30_days            337
created_year                          5
created_month                         5
created_date                          5
percent_gross_tertiary_education    123
population                          123
unemployment_rate                   123


# Pré-processamento dos dados
Na etapa de processamento de dados foi utilizado e escolhido o label enconder por ele já criar uma metrica para ser substituida já que possui alguns campos com metricas em string, assim evitando replaces enormes realizando metricas na mão

In [91]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder() # Armazenando o processo em uma variável

In [92]:
# Apply label encoding to the categorical columns
df['country'] = label_encoder.fit_transform(df['country'])

In [93]:
df['category'] = label_encoder.fit_transform(df['category'])

In [94]:
df['channel_type'] = label_encoder.fit_transform(df['channel_type'])

In [95]:
columns_drop = ['youtuber', 'unemployment_rate','lowest_monthly_earnings', 'highest_yearly_earnings', 'lowest_yearly_earnings','highest_monthly_earnings','title', 'abbreviation', 'latitude', 'longitude', 'created_year', 'created_month', 'created_date', 'urban_population', 'percent_gross_tertiary_education', 'population']
df.drop(columns=columns_drop, inplace=True)

In [96]:
df.fillna(0,inplace=True) # Substitui os nulos por strings vazias
df.dropna(inplace=True) # Remove os registros com dados nulos
df.drop_duplicates(inplace=True) # Remove os registros duplicados

# Checar se sobrou ainda nulos

In [97]:
df.isnull().sum()

rank                        0
subscribers                 0
video_views                 0
category                    0
uploads                     0
country                     0
channel_type                0
video_views_rank            0
country_rank                0
channel_type_rank           0
video_views_last_30_days    0
subscribers_last_30_days    0
dtype: int64

# Checar se ainda tem colunas do tipo string

In [98]:
df.dtypes

rank                          int64
subscribers                   int64
video_views                 float64
category                      int64
uploads                       int64
country                       int64
channel_type                  int64
video_views_rank            float64
country_rank                float64
channel_type_rank           float64
video_views_last_30_days    float64
subscribers_last_30_days    float64
dtype: object

# Matriz de correlação


In [99]:
matriz_corr = df.corr().style.background_gradient(cmap='coolwarm')
matriz_corr

Unnamed: 0,rank,subscribers,video_views,category,uploads,country,channel_type,video_views_rank,country_rank,channel_type_rank,video_views_last_30_days,subscribers_last_30_days
rank,1.0,-0.640608,-0.453363,0.002896,-0.051036,-0.045467,0.01102,-0.059768,0.022122,-0.026728,-0.178444,-0.191666
subscribers,-0.640608,1.0,0.750958,0.04518,0.077136,0.006151,-0.015517,0.057388,0.029136,0.025103,0.268556,0.27345
video_views,-0.453363,0.750958,1.0,0.017769,0.165928,0.00352,-0.030372,-0.061541,-0.062024,-0.046935,0.364076,0.210308
category,0.002896,0.04518,0.017769,1.0,0.03961,0.120881,0.417583,0.024115,-0.040511,-0.026442,0.066245,0.025586
uploads,-0.051036,0.077136,0.165928,0.03961,1.0,-0.090093,0.031722,-0.108865,-0.067727,-0.09462,0.107344,0.051745
country,-0.045467,0.006151,0.00352,0.120881,-0.090093,1.0,0.088489,0.190371,0.014888,0.141107,-0.059569,-0.046033
channel_type,0.01102,-0.015517,-0.030372,0.417583,0.031722,0.088489,1.0,0.304161,0.073246,0.122452,-0.075181,-0.075512
video_views_rank,-0.059768,0.057388,-0.061541,0.024115,-0.108865,0.190371,0.304161,1.0,0.535202,0.812766,-0.113343,-0.175407
country_rank,0.022122,0.029136,-0.062024,-0.040511,-0.067727,0.014888,0.073246,0.535202,1.0,0.537084,-0.103833,-0.112267
channel_type_rank,-0.026728,0.025103,-0.046935,-0.026442,-0.09462,0.141107,0.122452,0.812766,0.537084,1.0,-0.141811,-0.154111


# Utilizando o pycaret para gerar o melhor modelo

## Importações necessario

In [100]:
from pycaret.regression import *

## Definir o nosso setu do dataset e a coluna target

In [101]:
s = setup(
    data=df,
    target='rank',
    fold=5
)

Unnamed: 0,Description,Value
0,Session id,4969
1,Target,rank
2,Target type,Regression
3,Original data shape,"(995, 12)"
4,Transformed data shape,"(995, 12)"
5,Transformed train set shape,"(696, 12)"
6,Transformed test set shape,"(299, 12)"
7,Numeric features,11
8,Preprocess,True
9,Imputation type,simple


## Comparando os modelos gerados

In [102]:
best_model = compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,2.6993,11.654,3.4027,0.9999,0.0494,0.0223,0.236
rf,Random Forest Regressor,2.706,12.527,3.5358,0.9998,0.0559,0.0271,0.66
lightgbm,Light Gradient Boosting Machine,3.5267,26.3005,5.0826,0.9997,0.13,0.0788,0.15
dt,Decision Tree Regressor,3.7829,23.3493,4.8258,0.9997,0.0493,0.0225,0.06
xgboost,Extreme Gradient Boosting,4.3398,32.2744,5.6663,0.9996,0.0469,0.0243,0.162
et,Extra Trees Regressor,4.3583,63.1336,7.7474,0.9992,0.1196,0.0621,0.448
catboost,CatBoost Regressor,7.158,123.8546,11.013,0.9985,0.1711,0.1067,3.794
ada,AdaBoost Regressor,12.0782,226.2267,15.007,0.9972,0.245,0.2156,0.278
knn,K Neighbors Regressor,198.494,59945.7845,244.6694,0.2614,0.7493,1.1369,0.056
br,Bayesian Ridge,193.8739,66079.8436,250.83,0.2015,0.8653,7.6562,0.046


Processing:   0%|          | 0/85 [00:00<?, ?it/s]

## Resultado do melhor modelo

In [103]:
best_model

## Detalhamento do modelo

In [104]:
evaluate_model(best_model)

interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

## Exportando o modelo

In [105]:
# Salvando o modelo
save_model(best_model, 'model')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['subscribers', 'video_views',
                                              'category', 'uploads', 'country',
                                              'channel_type', 'video_views_rank',
                                              'country_rank',
                                              'channel_type_rank',
                                              'video_views_last_30_days',
                                              'subscribers_last_30_days'],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=[],
                                     transformer=SimpleImputer(strategy='most_frequent'))),
                 ('trained_model',
                  GradientBoostingRegressor(random_state=4969))]),
 'model.pkl')

## Criando a API do modelo

In [106]:
create_api(best_model, 'main')

API successfully created. This function only creates a POST API, it doesn't run it automatically. To run your API, please run this command --> !python main.py
