# Criar grafo bipartido

In [1]:
import numpy as np
import pandas as pd
import ast
import networkx as nx

In [2]:
# Parameters cell: inputs, outputs and tunables for this notebook.
# declare a list of tasks whose products you want to use as inputs
upstream = None
# `product` is indexed with 'graph' and 'data' by the cells that write the
# outputs, so this None must be replaced before they run
# (NOTE(review): presumably injected by the pipeline runner -- confirm).
product = None
# Pruning threshold: an edge is removed when its weight is below
# edge_percent * min(review_count of its two endpoints).
edge_percent = 0.1
df_input_path = '../../dados/X_trainToronto.csv'
df_review_input_path = '../../dados_produzidos/reviewsTrainTorontoSentiment.csv'

In [None]:
# Business table reduced to the two columns used below: the business
# identifier and its review count.
df_reg = pd.read_csv(df_input_path)[['business_id', 'review_count']]
df_reg

## Fazer grafo bipartido de locais
Se uma pessoa avaliou dois estabelecimentos, uma aresta é criada entre eles; o peso da aresta é o número de pessoas que avaliaram ambos.

In [20]:
# Only the (business, user) pairs are needed to link businesses through
# shared reviewers.
review_columns = ['business_id', 'user_id']
df_review = pd.read_csv(df_review_input_path)[review_columns]
df_review

Unnamed: 0,label,score,business_id,user_id,text_hash,useful,cool,funny
0,5,0.549039,--DaPTJW3-tB1vP-PfdTEg,Y2TBSd3ExydbgEnVxAt_QA,1e890783a9704450433a1dc106edc21061a03995,0,0,0
1,5,0.495692,--DaPTJW3-tB1vP-PfdTEg,Rp-cSUHm-EKBxgBS73PNNg,78eeb17f9841de9e3ec59fdd58f5d6517837e9c5,3,2,0
2,4,0.315593,--DaPTJW3-tB1vP-PfdTEg,2fXZuNKP7Bo-yRrINVbVVA,d9e84e7024b8a8c0f4d5a18696478b662205b028,0,0,0
3,3,0.381734,--DaPTJW3-tB1vP-PfdTEg,fn3Wim-2j042IEDe6VhJFQ,eeb46e802e2b72c5724465f258a9840f28753377,5,5,1
4,2,0.646709,--DaPTJW3-tB1vP-PfdTEg,CD9dSllum_L_OvpRW0YH0w,dabd5db824a3ed91ed90de59e80a4700452d561e,0,0,0
...,...,...,...,...,...,...,...,...
490958,4,0.383298,zzvlwkcNR1CCqOPXwuvz2A,YyV_UBEAoTMgwImiKnWYTQ,0ca783cf1fc66ae112af79d839e9f21261875392,0,0,0
490959,2,0.490604,zzvlwkcNR1CCqOPXwuvz2A,pkRISP8QgKMDPQlr8eNrdA,a1c9cf5c6e831191f2b64a0f1ba404c06e7f0631,0,0,0
490960,4,0.408329,zzvlwkcNR1CCqOPXwuvz2A,yDRw0UY7FhjDAs-q1bvRhg,82b3baa4cd777a6852545c08846e309140eba6ba,0,0,0
490961,5,0.967402,zzvlwkcNR1CCqOPXwuvz2A,U708fyOqNaBU0IQoE6E7WQ,1ddfcabd901a21a227ca0ff8ada688b2160332ea,0,0,0


In [None]:
# Lookup structures between df_reg's row index and business_id.
business_ids = df_reg['business_id']

# index -> business_id, as a one-column frame
df_index_to_business = business_ids.to_frame()
# business_id -> index: the former index becomes the only data column
df_business_to_index = business_ids.reset_index().set_index('business_id')
# review_count as a plain array, addressed by position
df_index_to_review_count = df_reg['review_count'].values

In [74]:
import itertools
from tqdm import tqdm

# Kept so that `progress_apply` stays registered on pandas objects.
tqdm.pandas()

# One node per business, labelled by its index in df_reg.
G = nx.Graph()
G.add_nodes_from(df_index_to_business.index)

# business_id -> node label, built once so that add_edges does a dict lookup
# instead of a DataFrame `.loc` per business. The first column of
# df_business_to_index holds the original df_reg index.
assert df_business_to_index.index.is_unique, "duplicate business_id in df_reg"
business_to_node = df_business_to_index.iloc[:, 0].to_dict()


def add_edges(rows):
    """Link every pair of distinct businesses found in ``rows``.

    Each pair gets an edge in the global graph ``G``: a new edge starts with
    ``weight=1`` and an existing edge has its ``weight`` incremented by one.

    Parameters
    ----------
    rows : pandas.DataFrame
        Reviews with a ``business_id`` column; below it is called with the
        reviews of a single user.

    Returns
    -------
    None
        The function only mutates ``G``.

    Raises
    ------
    KeyError
        If a ``business_id`` in ``rows`` is not present in ``df_reg``.
    """
    if len(rows) < 2:
        # Fewer than two reviews cannot produce a pair.
        return
    nodes = [business_to_node[x] for x in rows['business_id'].unique()]
    for u, v in itertools.combinations(nodes, 2):
        if G.has_edge(u, v):
            G.edges[u, v]['weight'] += 1
        else:
            G.add_edge(u, v, weight=1)


# Iterate over the groups explicitly instead of `groupby.apply`: add_edges
# works purely by side effect, and `apply` does not guarantee exactly one
# call per group in every pandas version.
reviews_by_user = df_review.groupby('user_id')
for _, user_reviews in tqdm(reviews_by_user, total=reviews_by_user.ngroups):
    add_edges(user_reviews)
G.number_of_edges()

100%|█████████████████████████████████| 109250/109250 [01:32<00:00, 1177.38it/s]


8442907

In [77]:
# Collect the edges whose weight is below `edge_percent` of the smaller
# review_count among their two endpoints. They are only listed here; the
# graph is not modified while it is being iterated.
edges_to_remove = []
for u, v, attrs in tqdm(G.edges(data=True), total=G.number_of_edges()):
    smaller_review_count = min(df_index_to_review_count[u], df_index_to_review_count[v])
    threshold = edge_percent * float(smaller_review_count)
    if float(attrs['weight']) < threshold:
        edges_to_remove.append((u, v))

100%|█████████████████████████████| 8442906/8442906 [00:31<00:00, 267732.59it/s]


In [78]:
# Drop the weak edges collected above. `remove_edges_from` silently skips
# edges that are no longer in the graph, so re-running this cell is safe
# (`remove_edge` would raise on the second run). Wrapping the list in tqdm
# keeps the progress bar.
G.remove_edges_from(tqdm(edges_to_remove))

100%|█████████████████████████████| 5233403/5233403 [00:10<00:00, 482257.33it/s]


In [None]:
# Weighted PageRank of every business node; the Series is aligned to df_reg
# through the node labels, which are df_reg index values.
page_rank = nx.pagerank(G, weight='weight')
df_reg['graph_page_rank'] = pd.Series(page_rank, name='graph_page_rank')

In [None]:
# Closeness centrality of every business node (no edge attribute is passed,
# so path lengths are counted in hops); aligned to df_reg by node label.
closeness = nx.closeness_centrality(G)
df_reg['graph_closeness_centrality'] = pd.Series(closeness, name='graph_closeness_centrality')

In [None]:
# Approximate betweenness centrality from a sample of pivot nodes.
# - `k` is capped at the number of nodes, because sampling more pivots than
#   there are nodes raises.
# - `seed` is fixed so the sampled pivots, and therefore the feature values,
#   are the same on every run.
# NOTE(review): networkx treats `weight` as a distance when computing shortest
# paths, while the weights here count shared reviewers -- confirm that this
# is the intended interpretation.
betweenness_pivots = min(1000, G.number_of_nodes())
betweeness = nx.betweenness_centrality(G, weight='weight', k=betweenness_pivots, seed=42)
df_reg['graph_betweenness_centrality'] = pd.Series(betweeness)

In [92]:
# Persist the pruned graph in GML format at the path given by product['graph'].
nx.write_gml(G, product['graph'])

In [None]:
# Persist df_reg, including the graph_* columns added above, as parquet at
# the path given by product['data'].
df_reg.to_parquet(product['data'])