In [9]:
import pandas as pd
from io import StringIO
import requests as r
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
import numpy as np

Criei uma função para criar DataFrames a partir de uma URL.

In [10]:
def faz_df(url, timeout=30):
    """Download a JSON document and load it into a DataFrame.

    Parameters
    ----------
    url : str
        Address of the JSON resource to fetch with an HTTP GET.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).
        Without a timeout the request could block the notebook forever.

    Returns
    -------
    pandas.DataFrame
        The frame parsed by ``pd.read_json`` from the response body.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.Timeout
        If the server does not respond within ``timeout`` seconds.
    """
    req = r.request('GET', url, headers={'user-agent': 'Mozilla/5.0'}, timeout=timeout)
    # Fail loudly on an HTTP error instead of handing an error page to the JSON parser.
    req.raise_for_status()
    df = pd.read_json(StringIO(req.text), encoding='UTF-8')
    return df

Criando os DataFrames

In [11]:
information_df = faz_df('https://treinamento-ndados.s3.sa-east-1.amazonaws.com/boardgames_information_raw.json')
categoricals_df = faz_df('https://treinamento-ndados.s3.sa-east-1.amazonaws.com/boardgames_categoricals_raw.json')
stat_df = faz_df('https://treinamento-ndados.s3.sa-east-1.amazonaws.com/boardgames_stat_raw.json')

Juntando os DataFrames em um só, com o nome de 'merged'

In [12]:
# Allow up to 500 columns to be displayed so the wide merged frame is not truncated.
pd.set_option('display.max_columns', 500)
# Place the three frames side by side (axis=1).
# NOTE(review): concat matches rows by index label, not by game id, so this
# assumes the three files list the games in the same row order — confirm.
# NOTE(review): categoricals_df and stat_df both appear to contribute an
# `objectid` column (see the header printed below), so the result carries
# repeated column labels.
merged = pd.concat(objs=[information_df, categoricals_df, stat_df], axis=1)
merged.head(3)

Unnamed: 0,object_id,name,yearpublished,sortindex,minplayers,maxplayers,minplaytime,maxplaytime,minage,objectid,label,boardgamedesigner_cnt,boardgameartist_cnt,boardgamepublisher_cnt,boardgamehonor_cnt,boardgamecategory_cnt,boardgamemechanic_cnt,boardgameexpansion_cnt,boardgameversion_cnt,boardgamefamily_cnt,boardgamedesigner,boardgameartist,boardgamepublisher,boardgamehonor,boardgamecategory,boardgameversion,boardgamemechanic,boardgameexpansion,boardgamefamily,gamelink,objectid.1,min_community,max_community,totalvotes,playerage,languagedependence,usersrated,average,baverage,stddev,avgweight,numweights,numgeeklists,numtrading,numwanting,numcomments,siteviews,numplays,numplays_month,news,blogs,weblink,podcast
0,174430,Gloomhaven,2017,1,1,4,60,120,12,174430,Board Game,1,3,9,23,5,12,4,19,7,"['Isaac Childres""']","['Alexandr Elichev', 'Josh T. McDowell', 'Alva...","['Cephalofair Games', 'Albi', 'Asmodee', 'Feue...",['2017 Best Science Fiction or Fantasy Board G...,"['Adventure', 'Exploration', 'Fantasy', 'Fight...","['Chinese edition', 'Czech edition', 'English ...","['Campaign / Battle Card Driven', 'Cooperative...","['Gloomhaven: Forgotten Circles', 'Gloomhaven:...","['Campaign Games', 'Components: Miniatures', '...",/boardgame/174430/gloomhaven,174430,3.0,4.0,827.0,14,4,31254.0,8.85292,8.58424,1.59819,3.8078,1311,3657,313,1365,5972,8933078,230213,3478,7,471,31,139
1,161936,Pandemic Legacy Season 1,2015,2,2,4,60,60,13,161936,Board Game,2,1,11,20,2,8,0,33,3,"['Rob Daviau""', 'Matt Leacock""']",['Chris Quilliams'],"['Z-Man Games', 'Asterion Press', 'Devir', 'Fi...",['2015 Cardboard Republic Immersionist Laurel ...,"['Environmental', 'Medical']","['Chinese blue edition', 'Chinese red edition'...","['Action Points', 'Cooperative Game', 'Hand Ma...",['None'],"['Campaign Games', 'Legacy', 'Pandemic']",/boardgame/161936/pandemic-legacy-season-1,161936,4.0,4.0,549.0,12,4,34729.0,8.62499,8.47159,1.59463,2.8301,971,3612,272,771,5477,2971746,196621,1090,9,597,69,164
2,167791,Terraforming Mars,2016,3,1,5,120,120,12,167791,Board Game,1,1,20,20,6,9,15,29,6,"['Jacob Fryxelius""']",['Isaac Fryxelius'],"['FryxGames', 'Arclight', 'Fantasmagoria', 'Gh...",['2016 Cardboard Republic Architect Laurel Nom...,"['Economic', 'Environmental', 'Industry / Manu...","['Bulgarian edition', 'Chinese edition', 'Czec...","['Card Drafting', 'End Game Bonuses', 'Hand Ma...",['French Championship Promo Cards (fan expansi...,"['Fryxgames Future Timeline', 'Planets: Mars',...",/boardgame/167791/terraforming-mars,167791,3.0,4.0,1252.0,12,3,48339.0,8.42299,8.26781,1.36938,3.2313,1863,5941,277,2068,7274,4724387,245997,4380,14,1158,60,148


In [13]:
print(stat_df.shape)
print(information_df.shape)
print(categoricals_df.shape)
print(merged.shape)

(20016, 23)
(20016, 9)
(20016, 21)
(20016, 53)


Vemos que o merged contém a soma das colunas dos outros 3 DataFrames. Agora, vou remover algumas colunas consideradas inúteis.

In [14]:
# Drop the columns labelled 'objectid' and 'label', rebinding `merged` to the
# result instead of mutating the frame in place.
colunas_descartadas = ['objectid', 'label']
merged = merged.drop(columns=colunas_descartadas)
merged.shape

(20016, 50)

Fazendo um heatmap das variáveis estatísticas:


In [15]:
# corr = stat_df.corr(method='pearson')

# plt.figure(figsize=(15,15))
# sns.heatmap(corr, annot=True, fmt='.2f', cmap='coolwarm')
# plt.show()

## Medindo a popularidade de um jogo:

A tabela estatística contém diversos dados que, teoricamente, medem a popularidade de um jogo. Entretanto, por enquanto, considerei apenas as médias de avaliação do jogo (BGG e dos jogadores) e a quantidade de avaliações. Posteriormente, há espaço para aplicar algum modelo de clusterização para analisarmos todas essas variáveis juntas, mas por enquanto prevalece o modelo simplificado.

In [16]:
colunas_selecionadas = ['name', 'object_id', 'usersrated', 'average', 'baverage', 'numgeeklists',
                        'numwanting', 'numcomments', 'siteviews', 'news', 'blogs', 'weblink',
                        'podcast']
boardgame_stat = merged[colunas_selecionadas].copy()
stat_df_filtrado = boardgame_stat.copy()
stat_df_filtrado

Unnamed: 0,name,object_id,usersrated,average,baverage,numgeeklists,numwanting,numcomments,siteviews,news,blogs,weblink,podcast
0,Gloomhaven,174430,31254.0,8.85292,8.58424,3657,1365,5972,8933078,7,471,31,139
1,Pandemic Legacy Season 1,161936,34729.0,8.62499,8.47159,3612,771,5477,2971746,9,597,69,164
2,Terraforming Mars,167791,48339.0,8.42299,8.26781,5941,2068,7274,4724387,14,1158,60,148
3,Through the Ages A New Story of Civilization,182028,18269.0,8.49419,8.23513,2083,1049,2660,2448074,5,185,30,42
4,Brass Birmingham,224517,10070.0,8.62031,8.20459,1039,1077,1702,894621,3,124,13,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20011,Tuppi,14765,33.0,7.18788,5.53873,15,2,15,8992,0,0,0,0
20012,History of War,8134,96.0,5.99240,5.51421,56,4,50,43202,0,0,2,0
20013,Hesketh's Legacy,2673,1.0,6.00000,0.00000,2,0,1,2559,0,2,0,0
20014,Snug as a Bug in a Rug,132322,42.0,6.36905,5.52454,13,1,17,9524,0,0,4,0


Aqui encontrei um problema: há diversos dados nulos nas colunas baverage e average, e não sei o que essa nulidade representa diretamente. Portanto, vou prever esses valores com base na razão entre average e baverage. A seguir está o passo a passo de como calculei essa razão.

## Análise Prévia

### Analisando usersrated == 0


In [17]:
stat_df_filtrado.query('usersrated == 0').sum()

name            Looney LeoRolazoneContangoDon't Give Up Your D...
object_id                                                  360634
usersrated                                                    0.0
average                                                       0.0
baverage                                                      0.0
numgeeklists                                                  550
numwanting                                                     62
numcomments                                                    35
siteviews                                                  438442
news                                                            0
blogs                                                           1
weblink                                                        12
podcast                                                         0
dtype: object

Vou comparar a soma de cada variável nas linhas com usersrated == 0 com o total da variável, para saber sua representatividade.

In [18]:
def verifica_porcentagem(df: pd.DataFrame, coluna):
    """Return the share (in percent) of ``coluna`` held by rows without ratings.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``usersrated`` column and the column ``coluna``.
    coluna : str
        Name of the numeric column whose total is being split.

    Returns
    -------
    float
        ``100 * sum(coluna where usersrated == 0) / sum(coluna)``.
    """
    # Sum only the requested column: summing the whole filtered frame also
    # concatenates every string column and fails on non-summable ones.
    sem_votos = df['usersrated'] == 0
    valor = df.loc[sem_votos, coluna].sum()
    total = df[coluna].sum()
    return valor / total * 100

In [19]:
colunas_analisadas = ['numgeeklists', 'numwanting', 'numcomments', 'siteviews', 'blogs', 'weblink']
for coluna in colunas_analisadas:
    print(f"A porcentagem de {coluna} é {verifica_porcentagem(stat_df_filtrado, coluna)}")

A porcentagem de numgeeklists é 0.011253797901023462
A porcentagem de numwanting é 0.007868988949147294
A porcentagem de numcomments é 0.0009131586156515387
A porcentagem de siteviews é 0.02870729860782448
A porcentagem de blogs é 0.0004923682914820286
A porcentagem de weblink é 0.009585812996764788


Como nenhum desses valores chega a 1% do total, essas linhas serão desconsideradas.

In [20]:
# Removi todas as linhas nas quais usersrated == 0
stat_df_filtrado = stat_df_filtrado.query('usersrated != 0').copy()

In [21]:
stat_df_filtrado.head(3)

Unnamed: 0,name,object_id,usersrated,average,baverage,numgeeklists,numwanting,numcomments,siteviews,news,blogs,weblink,podcast
0,Gloomhaven,174430,31254.0,8.85292,8.58424,3657,1365,5972,8933078,7,471,31,139
1,Pandemic Legacy Season 1,161936,34729.0,8.62499,8.47159,3612,771,5477,2971746,9,597,69,164
2,Terraforming Mars,167791,48339.0,8.42299,8.26781,5941,2068,7274,4724387,14,1158,60,148


### Analisando average == 0

In [22]:
stat_df_filtrado.query('average == 0').sum()

name              0
object_id         0
usersrated      0.0
average         0.0
baverage        0.0
numgeeklists      0
numwanting        0
numcomments       0
siteviews         0
news              0
blogs             0
weblink           0
podcast           0
dtype: object

In [23]:
stat_df_filtrado.query('average == 0').shape

(0, 13)

Não há linhas nas quais o average é 0, então não há nada a se fazer

### Analisando baverage == 0

In [24]:
stat_df_filtrado.query('baverage == 0').sum()

name            The Garden GamePeaceful ResistanceGreen Thumb ...
object_id                                                 4223416
usersrated                                                15670.0
average                                                6768.32561
baverage                                                      0.0
numgeeklists                                                22960
numwanting                                                   3063
numcomments                                                  9586
siteviews                                                10745815
news                                                            4
blogs                                                          88
weblink                                                       972
podcast                                                        18
dtype: object

Aparentemente, temos algumas linhas com baverage == 0

In [25]:
stat_df_filtrado.query('baverage == 0').shape

(1296, 13)

São 1296 linhas nas quais baverage == 0. Vamos ver o quanto isso é em relação aos votos totais.

### Observando a ocorrência de baverage == 0 em relação aos votos totais

In [26]:
print(f"A porcentagem de usersrated com baverage == 0 por total é {stat_df_filtrado.query('baverage == 0')['usersrated'].sum() / stat_df_filtrado['usersrated'].sum() * 100}")

A porcentagem de usersrated com baverage == 0 por total é 0.10620128769908506


Como as ocorrências de baverage == 0 representam cerca de 0,1% dos votos totais, optei por desconsiderá-las para efeito de cálculo.

In [27]:
stat_df_filtrado = stat_df_filtrado.query('baverage != 0').copy()

### Lidando com dados nulos:

In [28]:
stat_df_filtrado.isnull().sum()

name             0
object_id        0
usersrated      89
average         94
baverage        96
numgeeklists     0
numwanting       0
numcomments      0
siteviews        0
news             0
blogs            0
weblink          0
podcast          0
dtype: int64

Como os dados nulos ocasionariam erros ou novos dados nulos após o cálculo, vamos desconsiderá-los.

In [29]:
stat_df_filtrado = stat_df_filtrado.dropna().copy()

In [30]:
stat_df_filtrado.isnull().sum()

name            0
object_id       0
usersrated      0
average         0
baverage        0
numgeeklists    0
numwanting      0
numcomments     0
siteviews       0
news            0
blogs           0
weblink         0
podcast         0
dtype: int64

## Cálculo da razão

#### Efetuando cálculo da razão geral

In [31]:
stat_df_filtrado['average/baverage'] = stat_df_filtrado['average'] / stat_df_filtrado['baverage']

#### Ponderando o efeito da razão pela quantidade total de votos

In [32]:
stat_df_filtrado['average/baverage_ponderado'] =stat_df_filtrado['average/baverage'] * stat_df_filtrado['usersrated']

#### Efetuando o cálculo da razão média ponderada de average / baverage

In [33]:
razao_media = stat_df_filtrado['average/baverage_ponderado'].sum() / stat_df_filtrado['usersrated'].sum()

In [34]:
razao_media

1.0617524552384836

## Remapeando na base original

#### Removendo registros sem votos:

In [35]:
boardgame_stat = boardgame_stat.query('usersrated != 0').copy()

#### Tratando dados nulos e zerados como o mesmo problema

In [36]:
# fillna(0) is applied to the whole frame, so missing values in every column
# (not only average/baverage) become 0.
# NOTE(review): this runs after the `usersrated != 0` filter; NaN != 0 is
# True, so rows whose usersrated was null presumably survive that filter and
# end up here with usersrated == 0 — confirm whether that is intended.
boardgame_stat = boardgame_stat.fillna(0)
boardgame_stat

Unnamed: 0,name,object_id,usersrated,average,baverage,numgeeklists,numwanting,numcomments,siteviews,news,blogs,weblink,podcast
0,Gloomhaven,174430,31254.0,8.85292,8.58424,3657,1365,5972,8933078,7,471,31,139
1,Pandemic Legacy Season 1,161936,34729.0,8.62499,8.47159,3612,771,5477,2971746,9,597,69,164
2,Terraforming Mars,167791,48339.0,8.42299,8.26781,5941,2068,7274,4724387,14,1158,60,148
3,Through the Ages A New Story of Civilization,182028,18269.0,8.49419,8.23513,2083,1049,2660,2448074,5,185,30,42
4,Brass Birmingham,224517,10070.0,8.62031,8.20459,1039,1077,1702,894621,3,124,13,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...
20011,Tuppi,14765,33.0,7.18788,5.53873,15,2,15,8992,0,0,0,0
20012,History of War,8134,96.0,5.99240,5.51421,56,4,50,43202,0,0,2,0
20013,Hesketh's Legacy,2673,1.0,6.00000,0.00000,2,0,1,2559,0,2,0,0
20014,Snug as a Bug in a Rug,132322,42.0,6.36905,5.52454,13,1,17,9524,0,0,4,0


In [37]:
boardgame_stat.query('average == 0 | baverage == 0')

Unnamed: 0,name,object_id,usersrated,average,baverage,numgeeklists,numwanting,numcomments,siteviews,news,blogs,weblink,podcast
41,Le Havre,35677,24266.0,7.87980,0.00000,5739,1554,4868,1876534,3,253,67,30
64,Battlestar Galactica The Board Game,37111,31058.0,0.00000,7.58289,10878,756,6426,2220403,0,307,100,94
170,Deception Murder in Hong Kong,156129,13172.0,7.51194,0.00000,1484,710,2337,748461,2,132,19,44
209,\u05e7\u05d9\u05e0\u05d2\u05d3\u05d5\u05de\u05...,204583,24282.0,0.00000,7.22535,2295,346,3644,824994,7,322,66,59
218,Sid Meier's Civilization The Board Game,77130,13748.0,7.43552,0.00000,3559,405,2501,1475436,1,92,66,15
...,...,...,...,...,...,...,...,...,...,...,...,...,...
19999,Obsession,5163,5.0,4.60000,0.00000,18,1,4,3806,0,0,0,0
20006,Cham\u00e4leon,3104,22.0,6.56818,0.00000,7,1,11,5627,0,0,0,0
20009,Welcome to the Dungeon,150312,10331.0,0.00000,6.49553,1340,150,1782,433329,7,118,42,16
20010,Supremacy,5073,6.0,6.66667,0.00000,2,1,1,8206,0,0,0,0


#### Removendo duplo 0 (average == 0 e baverage == 0)

In [38]:
boardgame_stat['average+baverage'] = boardgame_stat['average'] + boardgame_stat['baverage']

In [39]:
# Keep only the rows whose average and baverage do not sum to 0.
# NOTE(review): 'average+baverage' is not a valid identifier, so query() most
# likely evaluates it as the arithmetic expression `average + baverage` rather
# than reading the helper column created in the previous cell. The two are
# equal by construction, so the selected rows should be the same — confirm.
boardgame_stat = boardgame_stat.query('average+baverage != 0').copy()

#### Efetuando o mapeamento

In [40]:
boardgame_stat

Unnamed: 0,name,object_id,usersrated,average,baverage,numgeeklists,numwanting,numcomments,siteviews,news,blogs,weblink,podcast,average+baverage
0,Gloomhaven,174430,31254.0,8.85292,8.58424,3657,1365,5972,8933078,7,471,31,139,17.43716
1,Pandemic Legacy Season 1,161936,34729.0,8.62499,8.47159,3612,771,5477,2971746,9,597,69,164,17.09658
2,Terraforming Mars,167791,48339.0,8.42299,8.26781,5941,2068,7274,4724387,14,1158,60,148,16.69080
3,Through the Ages A New Story of Civilization,182028,18269.0,8.49419,8.23513,2083,1049,2660,2448074,5,185,30,42,16.72932
4,Brass Birmingham,224517,10070.0,8.62031,8.20459,1039,1077,1702,894621,3,124,13,21,16.82490
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20011,Tuppi,14765,33.0,7.18788,5.53873,15,2,15,8992,0,0,0,0,12.72661
20012,History of War,8134,96.0,5.99240,5.51421,56,4,50,43202,0,0,2,0,11.50661
20013,Hesketh's Legacy,2673,1.0,6.00000,0.00000,2,0,1,2559,0,2,0,0,6.00000
20014,Snug as a Bug in a Rug,132322,42.0,6.36905,5.52454,13,1,17,9524,0,0,4,0,11.89359


In [41]:
# Defini duas funções para tratar os dados zerados(que antes eram nulos) das colunas
# average e baverage

def tratamento_average(linha):
    """Return the row's ``average``, estimating it when it is at most 0.1.

    Values of at most 0.1 are replaced by ``baverage * razao_media``, where
    ``razao_media`` is read from the notebook's global namespace; any other
    value is returned unchanged.
    """
    media = linha['average']
    if media <= 0.1:
        return linha['baverage'] * razao_media
    return media
    
def tratamento_baverage(linha):
    """Return the row's ``baverage``, estimating it when it is at most 0.1.

    Values of at most 0.1 are replaced by ``average / razao_media``, where
    ``razao_media`` is read from the notebook's global namespace; any other
    value is returned unchanged.
    """
    media_bayesiana = linha['baverage']
    if media_bayesiana <= 0.1:
        return linha['average'] / razao_media
    return media_bayesiana

In [42]:
# Criando uma coluna nova para os novos(e antigos) valores average
boardgame_stat['average_tratado'] = boardgame_stat.apply(tratamento_average, axis=1)

In [43]:
# Criando uma coluna nova para os novos(e antigos) valores baverage
boardgame_stat['baverage_tratado'] = boardgame_stat.apply(tratamento_baverage, axis=1)

## Validação

#### Teste 1

In [44]:
# objectid == 35677 era um dos valores de baverage que eu sabia que estava nulo
boardgame_stat.query('object_id == 35677')['baverage_tratado']

41    7.421504
Name: baverage_tratado, dtype: float64

In [45]:
boardgame_stat.query('object_id == 35677')['average']

41    7.8798
Name: average, dtype: float64

In [46]:
boardgame_stat.query('object_id == 35677')['average'] / razao_media

41    7.421504
Name: average, dtype: float64

#### Teste 2

In [47]:
# Esse objectid é como o outro, já sabia que estava zerado, só que agora na coluna average
boardgame_stat.query('object_id == 37111')['average_tratado']

64    8.051152
Name: average_tratado, dtype: float64

In [48]:
boardgame_stat.query('object_id == 37111')['baverage']

64    7.58289
Name: baverage, dtype: float64

In [49]:
boardgame_stat.query('object_id == 37111')['baverage'] * razao_media

64    8.051152
Name: baverage, dtype: float64

In [50]:
boardgame_stat.head()

Unnamed: 0,name,object_id,usersrated,average,baverage,numgeeklists,numwanting,numcomments,siteviews,news,blogs,weblink,podcast,average+baverage,average_tratado,baverage_tratado
0,Gloomhaven,174430,31254.0,8.85292,8.58424,3657,1365,5972,8933078,7,471,31,139,17.43716,8.85292,8.58424
1,Pandemic Legacy Season 1,161936,34729.0,8.62499,8.47159,3612,771,5477,2971746,9,597,69,164,17.09658,8.62499,8.47159
2,Terraforming Mars,167791,48339.0,8.42299,8.26781,5941,2068,7274,4724387,14,1158,60,148,16.6908,8.42299,8.26781
3,Through the Ages A New Story of Civilization,182028,18269.0,8.49419,8.23513,2083,1049,2660,2448074,5,185,30,42,16.72932,8.49419,8.23513
4,Brass Birmingham,224517,10070.0,8.62031,8.20459,1039,1077,1702,894621,3,124,13,21,16.8249,8.62031,8.20459


# Machine Learning

#### Usarei o algoritmo de clusterização Kmeans

# Machine Learning

In [51]:
from sklearn.cluster import KMeans
from sklearn import metrics

In [52]:
colunas_clusterizadas = ['usersrated', 'numgeeklists', 'numwanting', 'numcomments', 
                         'siteviews', 'news', 'blogs', 'weblink', 'podcast', 'average_tratado',
                          'baverage_tratado']
X = boardgame_stat[colunas_clusterizadas].copy()

In [53]:
# Fit K-Means for k = 1..9 and record the inertia of each fitted model
valores_k = list(range(1, 10))
inercias = []

for k in valores_k:
    kmeans = KMeans(n_clusters=k, random_state=0, n_init='auto')
    kmeans.fit(X)
    inercias.append(kmeans.inertia_)

In [54]:
# Visualizando a relação entre inércia e K
fig_cotovelo = px.line(x=valores_k, y=inercias
).update_layout(xaxis_title='Quantidade de clusters', yaxis_title='Inércias')
                                                                       
fig_cotovelo.show()

In [55]:
# Compute the silhouette coefficient for k = 2..9
valores_ks = list(range(2, 10))
s = []

for k in valores_ks:
    kmeans = KMeans(n_clusters=k, random_state=0, n_init='auto')
    rotulos = kmeans.fit(X).labels_
    s.append(metrics.silhouette_score(X, rotulos))

In [56]:
fig_silhueta = px.line(x=valores_ks, y=s).update_layout(xaxis_title='Quantidade de clusters',
                                                        yaxis_title='Coeficiente de silhueta')
fig_silhueta.show()

# Modelo ajustado: Clusterização dos dados estatísticos

In [57]:
# Fit the final K-Means model with 3 clusters on the selected statistic
# columns and attach each row's cluster label as a new 'Cluster' column.
# NOTE(review): the features are passed to K-Means unscaled; the table printed
# below shows siteviews in the millions next to averages below 10, so the
# large-magnitude columns probably dominate the distance — consider
# standardising the features (and re-running the k selection) and confirm.
model_stat = KMeans(n_clusters=3, random_state=0, n_init='auto')
df_clusterizado = boardgame_stat[colunas_clusterizadas].copy()
labels_stat = model_stat.fit_predict(df_clusterizado)
df_clusterizado['Cluster'] = labels_stat
df_clusterizado.head()

Unnamed: 0,usersrated,numgeeklists,numwanting,numcomments,siteviews,news,blogs,weblink,podcast,average_tratado,baverage_tratado,Cluster
0,31254.0,3657,1365,5972,8933078,7,471,31,139,8.85292,8.58424,2
1,34729.0,3612,771,5477,2971746,9,597,69,164,8.62499,8.47159,2
2,48339.0,5941,2068,7274,4724387,14,1158,60,148,8.42299,8.26781,2
3,18269.0,2083,1049,2660,2448074,5,185,30,42,8.49419,8.23513,2
4,10070.0,1039,1077,1702,894621,3,124,13,21,8.62031,8.20459,1


Descrição dos clusters:


In [58]:
# Função que agrupa o dataframe clusterizado e cria a coluna com a quantidade total de itens em cada cluster
def descricao(df: pd.DataFrame, grupamento: str) -> pd.DataFrame:
    """Summarise ``df`` per group: mean of each numeric column plus group size.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    grupamento : str
        Name of the column to group by.

    Returns
    -------
    pandas.DataFrame
        One row per group with the mean of every numeric column and a
        'Quantidade Total' column holding the number of rows in the group.
    """
    grupos = df.groupby(grupamento)
    # numeric_only=True keeps the summary working when the frame also holds
    # text columns (e.g. after a name column is added), which mean() cannot average.
    resumo = grupos.mean(numeric_only=True)
    resumo['Quantidade Total'] = grupos.size()

    return resumo

In [59]:
descricao(df_clusterizado, 'Cluster')

Unnamed: 0_level_0,usersrated,numgeeklists,numwanting,numcomments,siteviews,news,blogs,weblink,podcast,average_tratado,baverage_tratado,Quantidade Total
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,375.441594,165.98751,25.441283,117.658167,47546.61,0.417133,5.869455,5.11194,1.135676,6.269729,5.603026,19296
1,10297.480969,2409.205882,446.382353,2211.038062,805761.3,3.423875,121.754325,39.467128,22.813149,7.333213,6.928978,578
2,37068.880952,6922.952381,924.690476,6758.02381,3416950.0,8.238095,463.428571,87.904762,90.214286,7.846775,7.644558,42


In [60]:
df_clusterizado['name'] = boardgame_stat['name']
df_clusterizado['objectid'] = boardgame_stat['object_id']

In [61]:
categoricals_df.drop_duplicates(inplace=True, subset=['objectid'])

In [62]:
df_final = df_clusterizado.merge(categoricals_df, how='left', on='objectid')
print(df_final.shape)
print(X.shape)

(19916, 34)
(19916, 11)


In [63]:
df_final.query('Cluster == 2').shape

(42, 34)

In [64]:
df_final.query('Cluster == 2').head()

Unnamed: 0,usersrated,numgeeklists,numwanting,numcomments,siteviews,news,blogs,weblink,podcast,average_tratado,baverage_tratado,Cluster,name,objectid,label,boardgamedesigner_cnt,boardgameartist_cnt,boardgamepublisher_cnt,boardgamehonor_cnt,boardgamecategory_cnt,boardgamemechanic_cnt,boardgameexpansion_cnt,boardgameversion_cnt,boardgamefamily_cnt,boardgamedesigner,boardgameartist,boardgamepublisher,boardgamehonor,boardgamecategory,boardgameversion,boardgamemechanic,boardgameexpansion,boardgamefamily,gamelink
0,31254.0,3657,1365,5972,8933078,7,471,31,139,8.85292,8.58424,2,Gloomhaven,174430,Board Game,1,3,9,23,5,12,4,19,7,"['Isaac Childres""']","['Alexandr Elichev', 'Josh T. McDowell', 'Alva...","['Cephalofair Games', 'Albi', 'Asmodee', 'Feue...",['2017 Best Science Fiction or Fantasy Board G...,"['Adventure', 'Exploration', 'Fantasy', 'Fight...","['Chinese edition', 'Czech edition', 'English ...","['Campaign / Battle Card Driven', 'Cooperative...","['Gloomhaven: Forgotten Circles', 'Gloomhaven:...","['Campaign Games', 'Components: Miniatures', '...",/boardgame/174430/gloomhaven
1,34729.0,3612,771,5477,2971746,9,597,69,164,8.62499,8.47159,2,Pandemic Legacy Season 1,161936,Board Game,2,1,11,20,2,8,0,33,3,"['Rob Daviau""', 'Matt Leacock""']",['Chris Quilliams'],"['Z-Man Games', 'Asterion Press', 'Devir', 'Fi...",['2015 Cardboard Republic Immersionist Laurel ...,"['Environmental', 'Medical']","['Chinese blue edition', 'Chinese red edition'...","['Action Points', 'Cooperative Game', 'Hand Ma...",['None'],"['Campaign Games', 'Legacy', 'Pandemic']",/boardgame/161936/pandemic-legacy-season-1
2,48339.0,5941,2068,7274,4724387,14,1158,60,148,8.42299,8.26781,2,Terraforming Mars,167791,Board Game,1,1,20,20,6,9,15,29,6,"['Jacob Fryxelius""']",['Isaac Fryxelius'],"['FryxGames', 'Arclight', 'Fantasmagoria', 'Gh...",['2016 Cardboard Republic Architect Laurel Nom...,"['Economic', 'Environmental', 'Industry / Manu...","['Bulgarian edition', 'Chinese edition', 'Czec...","['Card Drafting', 'End Game Bonuses', 'Hand Ma...",['French Championship Promo Cards (fan expansi...,"['Fryxgames Future Timeline', 'Planets: Mars',...",/boardgame/167791/terraforming-mars
3,18269.0,2083,1049,2660,2448074,5,185,30,42,8.49419,8.23513,2,Through the Ages A New Story of Civilization,182028,Board Game,1,4,13,3,3,5,1,14,2,"['Vlaada Chv\\u00e1til""']","['Filip Murmak', 'Radim Pech', 'Jakub Politzer...","['Czech Games Edition', 'Cranio Creations', 'D...",['2015 Golden Geek Best Strategy Board Game No...,"['Card Game', 'Civilization', 'Economic']","['Chinese edition', 'Czech edition', 'English ...","['Action Points', 'Auction/Bidding', 'Auction:...",['Through the Ages: New Leaders and Wonders'],"['Tableau Building', 'Through the Ages']",/boardgame/182028/through-ages-new-story-civil...
6,36644.0,9658,1341,8095,4506683,2,323,84,82,8.31307,8.16138,2,Twilight Struggle,12333,Board Game,2,5,12,13,3,9,9,32,5,"['Ananda Gupta""', 'Jason Matthews""']","['Viktor Csete', 'Rodger B. MacGowan', 'Chechu...","['GMT Games', '(Self-Published)', 'Asterion Pr...",['2005 Charles S. Roberts Best Modern Era Boar...,"['Modern Warfare', 'Political', 'Wargame']","['Bard Centrum Gier Polish deluxe edition', 'B...","['Action/Event', 'Advantage Token', 'Area Majo...",['Twilight Struggle: Anni di Piombo Promo Card...,"['Cold War', 'Country: Soviet Union', 'Country...",/boardgame/12333/twilight-struggle


df_final é a junção do DataFrame estatístico clusterizado com o DataFrame de categorias. Por isso, vou remover algumas colunas do DataFrame de categorias que considero inúteis.

In [65]:
df_final.drop(labels=['boardgamedesigner_cnt', 'boardgameartist_cnt', 'boardgamepublisher_cnt', 'boardgamehonor_cnt', 'boardgamecategory_cnt', 'boardgamemechanic_cnt', 'boardgameexpansion_cnt'
                      , 'boardgameversion_cnt', 'boardgamefamily_cnt', 'label', 'boardgamedesigner', 'boardgameartist', 'boardgamepublisher', 'boardgamehonor', 'boardgameversion'
                      , 'boardgameexpansion', 'gamelink'], inplace=True, axis=1)

In [66]:
df_final.head()

Unnamed: 0,usersrated,numgeeklists,numwanting,numcomments,siteviews,news,blogs,weblink,podcast,average_tratado,baverage_tratado,Cluster,name,objectid,boardgamecategory,boardgamemechanic,boardgamefamily
0,31254.0,3657,1365,5972,8933078,7,471,31,139,8.85292,8.58424,2,Gloomhaven,174430,"['Adventure', 'Exploration', 'Fantasy', 'Fight...","['Campaign / Battle Card Driven', 'Cooperative...","['Campaign Games', 'Components: Miniatures', '..."
1,34729.0,3612,771,5477,2971746,9,597,69,164,8.62499,8.47159,2,Pandemic Legacy Season 1,161936,"['Environmental', 'Medical']","['Action Points', 'Cooperative Game', 'Hand Ma...","['Campaign Games', 'Legacy', 'Pandemic']"
2,48339.0,5941,2068,7274,4724387,14,1158,60,148,8.42299,8.26781,2,Terraforming Mars,167791,"['Economic', 'Environmental', 'Industry / Manu...","['Card Drafting', 'End Game Bonuses', 'Hand Ma...","['Fryxgames Future Timeline', 'Planets: Mars',..."
3,18269.0,2083,1049,2660,2448074,5,185,30,42,8.49419,8.23513,2,Through the Ages A New Story of Civilization,182028,"['Card Game', 'Civilization', 'Economic']","['Action Points', 'Auction/Bidding', 'Auction:...","['Tableau Building', 'Through the Ages']"
4,10070.0,1039,1077,1702,894621,3,124,13,21,8.62031,8.20459,1,Brass Birmingham,224517,"['Economic', 'Industry / Manufacturing', 'Tran...","['Hand Management', 'Income', 'Loans', 'Market...","['Beer', 'Brass', 'Cities: Birmingham (England..."


In [67]:
# Turn the list-like strings of these columns into Python lists: drop the
# first and last character (the surrounding brackets in the rows shown above)
# and split what remains on commas.
# NOTE(review): an item that itself contains a comma would be split into
# several pieces — confirm none of these columns has such items.
for coluna in ['boardgamecategory', 'boardgamemechanic', 'boardgamefamily']:
    df_final[coluna] = df_final[coluna].apply(lambda x: x[1:-1].split(','))


In [68]:
# Defini uma função para formatar a lista que foi transformada na célula anterior
def formata_lista(lista_teste):
    """Return a new list with every element cleaned up.

    Each element has its surrounding whitespace stripped and every
    single-quote character removed; the input list is left untouched.
    """
    return [item.strip().replace("'", "") for item in lista_teste]

In [69]:
# Clean every element of the lists stored in each of these columns
for coluna in ['boardgamecategory', 'boardgamemechanic', 'boardgamefamily']:
    df_final[coluna] = df_final[coluna].apply(formata_lista)

In [74]:
# Criei um novo DataFrame com as linhas que estão no cluster 2
df_final_cluster2 = df_final.query('Cluster == 2').copy()
df_final_cluster2.shape

(42, 17)

In [79]:
# Count how many times each category appears among the games of cluster 2,
# then store the counts in a DataFrame indexed by category.
contagem_categorias = {}
for categorias in df_final_cluster2['boardgamecategory']:
    for categoria in categorias:
        if categoria not in contagem_categorias:
            contagem_categorias[categoria] = 0
        contagem_categorias[categoria] += 1
category_count = pd.DataFrame.from_dict(contagem_categorias, orient='index')

{'Adventure': 13,
 'Exploration': 7,
 'Fantasy': 12,
 'Fighting': 13,
 'Miniatures': 9,
 'Environmental': 2,
 'Medical': 2,
 'Economic': 10,
 'Industry / Manufacturing': 2,
 'Science Fiction': 11,
 'Space Exploration': 5,
 'Territory Building': 6,
 'Card Game': 9,
 'Civilization': 7,
 'Modern Warfare': 1,
 'Political': 3,
 'Wargame': 5,
 'Movies / TV / Radio theme': 4,
 'Dice': 1,
 'Medieval': 3,
 'Collectible Components': 3,
 'Horror': 7,
 'Novel-based': 5,
 'City Building': 3,
 'Farming': 2,
 'Animals': 1,
 'Mythology': 2,
 'Mature / Adult': 1,
 'Bluffing': 4,
 'Ancient': 1,
 'Negotiation': 2,
 'Travel': 1,
 'Deduction': 2,
 'Aviation / Flight': 1,
 'Zombies': 1,
 'Comic Book / Strip': 1,
 'Trains': 1,
 'Childrens Game': 1}