In [1]:
## IMPORTS
import pandas as pd
import numpy as np
from factor_analyzer import FactorAnalyzer
from factor_analyzer.factor_analyzer import calculate_bartlett_sphericity
import pingouin as pg
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.io as pio
pio.renderers.default = 'browser'
import plotly.graph_objects as go
import plotly.express as px

In [2]:
## VARIABLES
# Directory and file name that readSearchResult() concatenates to locate the search-result JSON.
SEARCH_PATH = '../youtube_channels/output/'
JSON_FILE = 'trendings-brasileirao3.json'
# Path of the trending-videos JSON loaded by readTrendings().
# NOTE(review): the name looks like a typo of TRENDINGS_PATH; a rename must also update readTrendings().
TRENDINS_PATH = '../datasets/trendings.json'

In [None]:
## LOAD THE SEARCH-RESULT DATAFRAME FROM ITS JSON FILE
def readSearchResult():
    """Read the channel search results into a DataFrame.

    The file location is SEARCH_PATH + JSON_FILE and the file is read as UTF-8.

    Returns:
        The parsed DataFrame, or None when pandas raises ValueError while
        reading (the error message is printed in that case).
    """
    json_path = SEARCH_PATH + JSON_FILE
    try:
        search_frame = pd.read_json(json_path, encoding='utf-8')
    except ValueError as read_error:
        print(read_error)
        search_frame = None
    return search_frame
    
# readSearchResult() returns None when reading fails with ValueError; in that case
# the .head() call below raises AttributeError.
df_search = readSearchResult()
df_search.head(5)

In [None]:
## REMOVE VALUES FROM ARRAY (BRACKETS)
# `.str.get(0)` keeps the element at position 0 of every cell; per the header above
# the cells are presumably one-element lists — TODO confirm against the JSON schema.
# NOTE: re-running this cell on already unwrapped strings would keep only their first character.
wrapped_columns = [
    'channel_name',
    'channel_account',
    'channel_url',
    'subscribers',
    'num_views',
    'num_videos',
    'last_avg_likes',
    'last_avg_views',
    'last_avg_comments',
]
for column_name in wrapped_columns:
    df_search[column_name] = df_search[column_name].str.get(0)
df_search.head(5)

In [None]:
## CONVERT DATA TYPES
# Let pandas infer a better dtype for every column; the PCA cell further down casts the
# numeric columns back to NumPy dtypes before running Bartlett's test.
df_search = df_search.convert_dtypes()
df_search.dtypes

In [None]:
## ADDING NEW COLUMNS
# Placeholders for the per-channel trending statistics filled in further down.
# The video count is an integer, but the three means are fractional, so they start
# as floats: writing a fractional mean into an integer column relies on silent
# upcasting, which recent pandas versions deprecate.
# `assign` is used instead of `.loc[:, [...]] = 0` because enlarging a frame through
# `.loc` with a list of missing columns is not supported by every pandas version.
df_search = df_search.assign(
    num_trend_videos=0,
    mean_trend_likes=0.0,
    mean_trend_views=0.0,
    mean_trend_comments=0.0,
)
df_search.describe()

In [None]:
## LOAD THE TRENDING-VIDEOS DATAFRAME FROM ITS JSON FILE
def readTrendings():
    """Read the trending videos stored at TRENDINS_PATH into a DataFrame.

    The file is read as UTF-8.

    Returns:
        The parsed DataFrame, or None when pandas raises ValueError while
        reading (the error message is printed in that case).
    """
    try:
        trending_frame = pd.read_json(TRENDINS_PATH, encoding='utf-8')
    except ValueError as read_error:
        print(read_error)
        trending_frame = None
    return trending_frame

# readTrendings() returns None when reading fails with ValueError; in that case
# the .head() call below raises AttributeError.
df_trendings = readTrendings()
df_trendings.head(10)

In [None]:
## FUNCTIONS TO GET TRENDING DATA
## NUMBER OF TRENDING VIDEOS OF A CHANNEL (MATCHED BY CHANNEL ACCOUNT)
def getNumTrendingVideos(channel_account):
    """Count the trending videos published by one channel.

    Reads the global df_trendings.

    Args:
        channel_account: value compared against the 'video_channel_account' column.

    Returns:
        Number of non-null 'video_url' values among the matching rows.
    """
    is_channel = df_trendings['video_channel_account'] == channel_account
    return df_trendings.loc[is_channel, 'video_url'].count()

## MEAN OF A TRENDING METRIC FOR A CHANNEL
def getMeansTrendingsVideos(channel_account, column):
    """Average one column of df_trendings over the videos of a channel.

    Reads the global df_trendings.

    Args:
        channel_account: value compared against the 'video_channel_account' column.
        column: name of the df_trendings column to average (e.g. 'likes').

    Returns:
        The mean of the matching rows rounded to 2 decimals.
    """
    is_channel = df_trendings['video_channel_account'] == channel_account
    channel_mean = df_trendings.loc[is_channel, column].mean()
    return round(channel_mean, 2)

## TESTING
# Manual smoke check of both helpers using one hard-coded channel account.
print(getNumTrendingVideos('@espnbrasil'))
print(getMeansTrendingsVideos('@espnbrasil', 'likes'))

In [None]:
## ADDING NEW VALUES IN SEARCH RESULT
# Aggregate the trending videos once per channel instead of re-filtering df_trendings
# several times for every row of df_search.
trend_stats = (
    df_trendings
    .groupby('video_channel_account')
    .agg(
        num_trend_videos=('video_url', 'count'),      # non-null video URLs
        mean_trend_likes=('likes', 'mean'),
        mean_trend_views=('views', 'mean'),
        mean_trend_comments=('comments', 'mean'),
    )
    .round(2)
)

# The means are fractional: make sure the target columns are floats before writing,
# so the assignment does not depend on pandas silently upcasting integer columns.
mean_columns = ['mean_trend_likes', 'mean_trend_views', 'mean_trend_comments']
df_search[mean_columns] = df_search[mean_columns].astype('float64')

# Only channels that appear in df_trendings are updated; the others keep their current values.
has_trend = df_search['channel_account'].isin(trend_stats.index)
for column in trend_stats.columns:
    matched_values = df_search.loc[has_trend, 'channel_account'].map(trend_stats[column])
    # to_numpy() makes the assignment positional, so it does not rely on index alignment.
    df_search.loc[has_trend, column] = matched_values.to_numpy()

df_search

In [None]:
## PEARSON CORRELATION MATRIX
# Pairwise Pearson correlations of df_search, rounded to 4 decimals, with significance
# stars at the 1%, 5% and 10% thresholds.
pg.rcorr(
    df_search,
    method='pearson',
    upper='pval',
    decimals=4,
    pval_stars={0.01: '***', 0.05: '**', 0.10: '*'},
)

# RUNNING FACTOR ANALYSIS AND PCA CLASSIFICATION (UNSUPERVISED MACHINE LEARNING)

In [None]:
## GET DATAFRAME INFORMATION
# Structure of df_search after the trending columns were added and filled.
df_search.info()

In [None]:
## DESCRIBING DATA
# Summary statistics of df_search, now including the trending columns.
df_search.describe()

In [None]:
## REMOVING QUALITATIVE VARIABLES
## AND CONVERTING THE REMAINING ONES TO NUMPY TYPES (NECESSARY FOR BARTLETT'S TEST)
pca_dtypes = {
    'last_avg_likes': np.float64,
    'last_avg_views': np.float64,
    'last_avg_comments': np.float64,
    'mean_trend_likes': np.float64,
    'mean_trend_views': np.float64,
    'mean_trend_comments': np.float64,
    'subscribers': np.int64,
    'num_videos': np.int64,
    'num_views': np.int64,
    'num_trend_videos': np.int64,
}

# Drop the text columns first, then cast every remaining column in a single pass.
df_pca = (
    df_search
    .drop(columns=['channel_name', 'channel_account', 'channel_url', 'keywords'])
    .astype(pca_dtypes)
)

df_pca.head(5)

In [None]:
## GET DF_PCA INFO
# Check the dtypes produced by the casts applied when df_pca was built.
df_pca.info()

In [17]:
## ANALYZING PEARSON CORRELATION

# Correlation matrix of the PCA variables
corr = df_pca.corr()

# Heatmap trace: each cell is annotated with its coefficient formatted to 3 decimals
correlation_heatmap = go.Heatmap(
    x=corr.columns,
    y=corr.index,
    z=np.array(corr),
    text=corr.values,
    texttemplate='%{text:.3f}',
    colorscale='viridis',
)

fig = go.Figure()
fig.add_trace(correlation_heatmap)

# Reversed y axis so the first variable is drawn at the top
fig.update_layout(height=750, width=750, yaxis=dict(autorange="reversed"))

fig.show()

In [None]:
## Bartlett's Test of Sphericity and Get P value
# The call is unpacked into the test statistic and its p-value, which are printed
# rounded to 2 and 4 decimals respectively.
bartlett, p_value = calculate_bartlett_sphericity(df_pca)

print(f'Qui² Bartlett: {round(bartlett, 2)}')
print(f'p-valor: {round(p_value, 4)}')


In [None]:
### PCA DEFINING (PRINCIPAL METHOD) --> TEST
# Ask for one factor per variable instead of a hard-coded 10, so this cell keeps working
# if columns are added to or removed from df_pca (same expression the later cell uses).
fa = FactorAnalyzer(n_factors=len(df_pca.columns), method='principal', rotation=None).fit(df_pca)
eigenvalues = fa.get_eigenvalues()[0]

# Kaiser criterion: number of eigenvalues greater than 1
np.count_nonzero(eigenvalues > 1)

In [13]:
## PCA DEFINING NUM FACTORS (DERIVATIVE) FUNCTION
def getNumFactors(num_factors):
    """Count the factors retained by the Kaiser criterion (eigenvalue > 1).

    Fits an unrotated, principal-method FactorAnalyzer on the global df_pca.

    Args:
        num_factors: value passed as n_factors to FactorAnalyzer.

    Returns:
        How many entries of the first array returned by get_eigenvalues()
        are greater than 1.
    """
    analyzer = FactorAnalyzer(n_factors=num_factors, method='principal', rotation=None)
    analyzer.fit(df_pca)
    fitted_eigenvalues = analyzer.get_eigenvalues()[0]

    # Kaiser criterion: keep only the factors whose eigenvalue exceeds 1
    retained = np.count_nonzero(fitted_eigenvalues > 1)
    return retained

In [None]:
## APPLYING KAISER CRITERIA (EIGENVALUES > 1)
## NOTE(review): the original note recorded FOUR FACTORS for this dataset; the count is computed below.
### PCA DEFINING (PRINCIPAL METHOD)
# Number of factors = count of eigenvalues > 1 when one factor per variable is requested.
num_factors = getNumFactors(len(df_pca.columns))
# Final model: refit keeping only the retained factors; `fa` is reused by the cells below.
fa = FactorAnalyzer(n_factors=num_factors, method='principal', rotation=None).fit(df_pca)
eigenvalues = fa.get_eigenvalues()[0]

print(eigenvalues)

In [None]:
## GETTING FACTOR VARIANCE AND PLOT TABLE

eigenvalues_factors = fa.get_factor_variance()

# One row per measure first (eigenvalue, variance share, cumulative share) ...
tabela_eigen = pd.DataFrame(
    eigenvalues_factors,
    index=['Autovalor','Variância', 'Variância Acumulada'],
)
tabela_eigen.columns = [f"fator_{position}" for position in range(1, tabela_eigen.shape[1] + 1)]
# ... then transposed so that every factor becomes a row.
tabela_eigen = tabela_eigen.T

print(tabela_eigen)

In [None]:
## EXPLAINED VARIANCE PER FACTOR (BAR CHART)
# Plots the 'Variância' column of tabela_eigen (one bar per factor), not the cumulative column.

plt.figure(figsize=(12,8))
ax = sns.barplot(x=tabela_eigen.index, y=tabela_eigen['Variância'], data=tabela_eigen, palette='magma')
# Write the value on top of every bar
for bars in ax.containers:
    ax.bar_label(bars, fontsize=12)
plt.title("Fatores Extraídos", fontsize=16)
# The x label reports the number of factors and the total variance share they explain (in %)
plt.xlabel(f"{tabela_eigen.shape[0]} fatores que explicam {round(tabela_eigen['Variância'].sum()*100,2)}% da variância", fontsize=12)
plt.ylabel("Porcentagem de variância explicada", fontsize=12)
plt.show()

In [None]:
## FACTOR LOADINGS
l_factors = fa.loadings_

# Rows are the variables of df_pca, columns are the extracted factors
load_table = pd.DataFrame(l_factors, index=df_pca.columns)
load_table.columns = [f"fator_{position}" for position in range(1, load_table.shape[1] + 1)]

print(load_table)

In [None]:
## LOADING PLOTS
plt.figure(figsize=(12,8))
# reset_index() exposes the variable names as an 'index' column, used below to label the points
load_chart = load_table.reset_index()
# One point per variable: loading on factor 1 (x) against loading on factor 2 (y)
plt.scatter(load_chart['fator_1'], load_chart['fator_2'], s=50, color='blue')

def label_point(x, y, val, ax):
    """Write a text label next to every (x, y) point on the given axes.

    Args:
        x: Series with the horizontal coordinates.
        y: Series with the vertical coordinates.
        val: Series with the label of each point.
        ax: object exposing a matplotlib-like `text(x, y, s)` method.

    The three Series are combined column-wise (aligned on their index) and each
    label is drawn 0.05 to the right of its point.
    """
    points = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
    for point in points.itertuples(index=False):
        ax.text(point.x + 0.05, point.y, point.val)

# Label every point with its variable name, drawing on the current axes
label_point(x = load_chart['fator_1'],
            y = load_chart['fator_2'],
            val = load_chart['index'],
            ax = plt.gca()) 

# Dashed reference lines through the origin
plt.axhline(y=0, color='grey', ls='--')
plt.axvline(x=0, color='grey', ls='--')
# Fixed symmetric limits on both axes
plt.ylim([-1.1,1.1])
plt.xlim([-1.1,1.1])
plt.title("Loading Plot", fontsize=16)
# Axis labels report the variance share of the first two rows of tabela_eigen (in %)
plt.xlabel(f"Fator 1: {round(tabela_eigen.iloc[0]['Variância']*100,2)}% de variância explicada", fontsize=12)
plt.ylabel(f"Fator 2: {round(tabela_eigen.iloc[1]['Variância']*100,2)}% de variância explicada", fontsize=12)
plt.show()

In [46]:
## LOADING PLOTS (3D)
# Requires a 'fator_3' column, i.e. at least three extracted factors in load_table.

load_chart = load_table.reset_index()
fig = px.scatter_3d(
    load_chart,
    x='fator_1',
    y='fator_2',
    z='fator_3',
    color='index',
    color_discrete_sequence=px.colors.sequential.Viridis,
)
# Figure size and legend title set in a single layout update
fig.update_layout(
    height=750,
    width=1200,
    legend_title_text="Loading Plot (3D)",
)

fig.show()

In [None]:
# Long format: one row per (variable, factor) pair with the loading in 'value'
load_table_graph = load_table.reset_index().melt(id_vars='index')

# Grouped bars: factors on the x axis, one bar per variable
ax = sns.barplot(data=load_table_graph, x='variable', y='value', hue='index', palette='bright')
ax.legend(title='Variáveis', bbox_to_anchor=(1,1), fontsize = '6')
ax.set_title('Cargas Fatoriais', fontsize='12')
ax.set_xlabel(None)
ax.set_ylabel(None)
plt.show()

In [None]:
## COMMUNALITIES
communalities = fa.get_communalities()

# Single-column table indexed by the variables of df_pca
commun_tables = pd.DataFrame(
    communalities,
    index=df_pca.columns,
    columns=['Comunalidades'],
)

print(commun_tables)

In [None]:
## EXTRACT FACTORS TO DATAFRAME
# Factor scores of every observation of df_pca, one column per extracted factor
factors = pd.DataFrame(fa.transform(df_pca))
factors.columns = [f"fator_{position}" for position in range(1, factors.shape[1] + 1)]

# ADDING FACTORS INTO DATAFRAME SEARCH
# Factor columns left by a previous run of this cell are dropped first; otherwise a
# re-run would append duplicate fator_* columns and break the ranking computation,
# which selects those columns by name.
df_search = pd.concat(
    [
        df_search.drop(columns=factors.columns, errors='ignore').reset_index(drop=True),
        factors,
    ],
    axis=1,
)
df_search.info()

In [None]:
## FACTOR SCORES (WEIGHTS)
scores = fa.weights_

# Rows are the variables of df_pca, columns are the extracted factors
scores_table = pd.DataFrame(scores, index=df_pca.columns)
scores_table.columns = [f"fator_{position}" for position in range(1, scores_table.shape[1] + 1)]

print(scores_table)

In [None]:
# Long format: one row per (variable, factor) pair with the weight in 'value'
scores_graph = scores_table.reset_index().melt(id_vars='index')

# Grouped bars: factors on the x axis, one bar per variable
ax = sns.barplot(data=scores_graph, x='variable', y='value', hue='index', palette='viridis')
ax.legend(title='Variáveis', bbox_to_anchor=(1,1), fontsize = '6')
ax.set_title('Scores Fatoriais', fontsize='12')
ax.set_xlabel(None)
ax.set_ylabel(None)
plt.show()

In [36]:
## WEIGHTED SUM: EACH FACTOR SCORE TIMES THE VARIANCE SHARE OF THAT FACTOR
df_search['ranking'] = 0

# tabela_eigen is indexed by factor name, which is also the name of the matching
# score column in df_search.
for factor_name, variancia in tabela_eigen['Variância'].items():
    weighted_factor = df_search[factor_name] * variancia
    df_search['ranking'] = df_search['ranking'] + weighted_factor

In [None]:
## SEE CORRELATION FROM VARIABLES (PEARSON)
# Same matrix as before, now computed after the factor and ranking columns were added.
pg.rcorr(
    df_search[df_search.columns],
    method='pearson',
    upper='pval',
    decimals=4,
    pval_stars={0.01: '***', 0.05: '**', 0.10: '*'},
)

In [None]:
## FINAL RANKING
# Display only: sort_values returns a sorted copy, df_search itself keeps its row order.
df_search.sort_values(by=['ranking'], ascending=False)

In [None]:
## PLOTTING FINAL DATAFRAME IN A TABLE

# Columns that are not shown in the final table
hidden_columns = [
    'channel_url',
    'subscribers',
    'last_avg_likes',
    'num_views',
    'num_videos',
    'keywords',
    'num_trend_videos',
    'mean_trend_likes',
    'mean_trend_views',
    'mean_trend_comments',
    'last_avg_views',
    'last_avg_comments',
]

# Drop them, order by ranking (best first) and keep the previous row index as column 'n'
df_final = (
    df_search
    .drop(columns=hidden_columns)
    .sort_values(by=['ranking'], ascending=False)
    .reset_index(level=0)
    .rename(columns={'index': 'n'})
)

# The new index is the 1-based position in the ranking
df_final.index = np.arange(1, len(df_final)+1)

from tabulate import tabulate
tabela = tabulate(df_final, headers='keys', tablefmt='grid', numalign='center')

# Render the text table inside an axis-less matplotlib figure
plt.figure(figsize=(8, 3))
plt.text(0.1, 0.1, tabela, {'family': 'monospace', 'size': 30})
plt.axis('off')
plt.show()