# **Dataset GRÃ-BRETANHA**

In [0]:
#Importando as bibliotecas
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import datetime
import statsmodels.discrete.discrete_model as sm



from collections import Counter
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

import warnings
# Silence RuntimeWarning messages for the rest of the session
warnings.simplefilter(action = "ignore", category = RuntimeWarning)

In [0]:
# Load the GB videos CSV and print its column / dtype / non-null summary
df_GB = pd.read_csv('GBvideos.csv', sep=',')
df_GB.info()

## Tratamento do DataFrame


In [0]:
# Show the first rows transposed, so each column of the dataset is displayed as a row
df_GB.head().T

In [0]:
# Rebind df_GB to a deep copy of itself.
# NOTE(review): the copy is assigned back to the same name, so no separate
# backup of the raw data is kept — confirm whether a distinct name was intended.
df_GB = df_GB.copy(deep=True)

In [0]:
# Summary statistics of the DataFrame
df_GB.describe()

In [0]:
# Load the category JSON and list every category id next to its title
import json

with open('GB_category_id.json', 'r') as fh:
    categ_gb = json.load(fh)

for item in categ_gb['items']:
    print(item['id'], item['snippet']['title'])

In [0]:
# Attach a human-readable category name to every row

import json

# Read the JSON file that holds the category names
with open('GB_category_id.json', 'r') as fh:
    categ_us = json.load(fh)

# Lookup table: category id (as found in the JSON) -> category title
categ_dic = {entry['id']: entry['snippet']['title'] for entry in categ_us['items']}

# category_id is converted to str before the lookup so it can match the JSON ids;
# ids without an entry in the lookup become NaN in the new column
temp = df_GB.category_id.astype('str')
df_GB['category_name'] = temp.map(categ_dic)

In [0]:
# Keep a single row per video_id, then plot how many videos fall in each category,
# ordered from the most to the least frequent category
df_GB2 = df_GB.drop_duplicates('video_id').copy()
category_order = df_GB2['category_name'].value_counts().index
sns.countplot(y='category_name', data=df_GB2, order=category_order)
plt.title('Category popularity among top 200')
plt.ylabel('Category')

### Outliers

In [0]:
# Function that detects outlier rows of the de-duplicated DataFrame

def detect_outliers(df_GB2, n, features):
    """Return the index labels of rows that are outliers in more than ``n`` features.

    For each column in ``features`` a value is an outlier when it lies below
    ``Q1 - 1.5 * IQR`` or above ``Q3 + 1.5 * IQR``, where Q1/Q3 are the 25th
    and 75th percentiles of that column.

    Args:
        df_GB2: DataFrame holding the columns named in ``features``.
        n: Threshold; a row is returned only if it is flagged in strictly
            more than ``n`` of the features.
        features: Iterable of column names to inspect.

    Returns:
        List of index labels, in order of first detection.
    """
    hits_per_row = Counter()

    for col in features:
        # nanpercentile ignores missing values; with plain percentile a single
        # NaN turns both quartiles into NaN and no row of the column is flagged.
        q1, q3 = np.nanpercentile(df_GB2[col], [25, 75])
        outlier_step = 1.5 * (q3 - q1)

        is_outlier = (df_GB2[col] < q1 - outlier_step) | (df_GB2[col] > q3 + outlier_step)
        hits_per_row.update(df_GB2.index[is_outlier.to_numpy()])

    return [label for label, hits in hits_per_row.items() if hits > n]

In [0]:
# Rows flagged as outliers in more than one of the four engagement columns
Outliers_to_drop = detect_outliers(df_GB2,1,['views', 'likes', 'dislikes', 'comment_count'])

In [0]:
# Display the rows that were flagged as outliers
df_GB2.loc[Outliers_to_drop]

In [0]:
# Remove the flagged outlier rows and renumber the index from zero
df_GB2 = df_GB2.drop(index=Outliers_to_drop).reset_index(drop=True)

In [0]:
# Percentage of null values per column, listing only columns that have any,
# sorted from the highest to the lowest percentage
percent_missing = (df_GB2.isnull().sum() * 100 / len(df_GB2)).round(2)
missing_value_df = pd.DataFrame({'column_name': df_GB2.columns, 'percent_missing': percent_missing})
missing_value_df = (
    missing_value_df
    .loc[missing_value_df['percent_missing'] > 0]
    .sort_values('percent_missing', ascending=False)
)
missing_value_df

In [0]:
# Replace missing values in the 'description' and 'category_name' columns with 'Missing'.
# The result is assigned back to the column: calling fillna(inplace=True) on
# `df.column` acts on an intermediate Series, which under pandas Copy-on-Write
# leaves the DataFrame itself unchanged.
df_GB2['description'] = df_GB2['description'].fillna('Missing')
df_GB2['category_name'] = df_GB2['category_name'].fillna('Missing')

In [0]:
# Percentage of null values remaining in each column
df_GB2.isnull().sum()/ len(df_GB2) * 100

In [0]:
# Boxplot of the 'category_id' feature
plt.boxplot(df_GB2['category_id'])
plt.title('Boxplot: category_id')
plt.ylabel('category_id')

### Análise e Visualização

In [0]:
# Correlation heatmap of the numeric variables

# df_GB2 still holds text columns at this point. Since pandas 2.0,
# DataFrame.corr() no longer drops them silently and raises instead, so the
# numeric and boolean columns are selected explicitly before correlating.
numeric_cols = df_GB2.select_dtypes(include=['number', 'bool'])

plt.figure(figsize=[20,10])
plt.title('Correlation between features total dataset')
sns.heatmap(numeric_cols.corr(), annot=True, fmt='.2f', vmin=-1, vmax=1, cmap=sns.diverging_palette(999, 999, n=20))
plt.show()

### Modelando

In [0]:
# Keep a backup of the full frame, then drop the identifier, text, date and
# flag columns listed below before modelling
train_bkp = df_GB2.copy()
columns_to_drop = [
    'video_id', 'trending_date', 'title', 'channel_title', 'publish_time',
    'tags', 'thumbnail_link', 'description', 'category_name',
    'comments_disabled', 'ratings_disabled', 'video_error_or_removed',
]
df_GB2.drop(columns=columns_to_drop, inplace=True)
df_GB2.head()

In [0]:
# Target: the 'views' column; features: every remaining column
y = df_GB2['views']
x = df_GB2.drop('views', axis=1).copy()

In [0]:
# Split into train and test sets (30% held out for testing, fixed seed)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=1
)

# Print the shapes of each features/target pair
for features_part, target_part in ((x_train, y_train), (x_test, y_test)):
    print(features_part.shape, target_part.shape)

## Feature Importance

In [0]:
# Build the Random Forest classification model.
# NOTE(review): the target is the raw 'views' column, so every distinct view
# count is treated as a separate class; a regressor or a binned target is
# probably what was intended — confirm before relying on these results.
rfmodel = RandomForestClassifier(
    random_state=0,
    n_estimators=450,
    criterion='gini',
    n_jobs=-1,
    max_depth=8,
    min_samples_leaf=1,
    min_samples_split=11,
)

# Train the model on x_train, y_train
rfmodel.fit(x_train, y_train)

# Predict on x_test (class labels and class probabilities)
y_pred = rfmodel.predict(x_test)
proba = rfmodel.predict_proba(x_test)

# Feature importances. The labels are taken from the training matrix:
# df_GB2.columns still contains the target 'views', which would shift the
# labels against the importances and silently drop the last feature in zip().
imp = rfmodel.feature_importances_
names = x_train.columns

# Sort (importance, name) pairs in ascending order of importance for the bar chart
imp, names = zip(*sorted(zip(imp, names)))

plt.barh(range(len(names)), imp, align="center")
plt.yticks(range(len(names)), names)
plt.xlabel("Importance of Features")
plt.ylabel("Features")
plt.title("Importance of each feature")
fig = plt.gcf()
fig.set_size_inches(18, 5)
plt.show()

In [0]:
# Tabulate the sorted feature names next to their importances
a = dict(names=names, imp=imp)
tabela = pd.DataFrame(a)
tabela

## Cross Validation


In [0]:
# Apply cross-validation with 2 folds on the training split
accuracies = cross_val_score(rfmodel, x_train, y_train, cv=2)
print(accuracies)

# Average of the 2 fold scores
acc_train = accuracies.mean()
acc_train