In [2]:
import pickle

import pandas as pd
import numpy as np
import networkx as nx

# Load dataset and keyword indexes and counts

In [3]:
# NOTE: pickle.load can execute arbitrary code; only load pickle files that
# were produced by this project.

# --- Datasets ---
# The dataset export is a ';'-separated CSV.
dataset_info = pd.read_csv('datasets-2017-06-27-15-14.csv', sep=';')
datasets = list(dataset_info.id)
datasets_set = set(datasets)
# Position of each dataset id inside `datasets`.
datasets_index = {dataset: i for i, dataset in enumerate(datasets)}
n_datasets = len(datasets)
# itertuples() yields the row index at position 0, so r[1] and r[3] are the
# first and third CSV columns (presumably the id and the slug -- confirm
# against the CSV header).
id2slug = {r[1]: r[3] for r in dataset_info.itertuples()}
slug2id = {r[3]: r[1] for r in dataset_info.itertuples()}
with open('dataset_count.pickle', 'rb') as f:
    dataset_count = pickle.load(f)

# --- Keywords ---
# Use a context manager so the file handle is closed deterministically,
# like the other pickle loads in this cell.
with open('keywords_list.pickle', 'rb') as f:
    keywords = pickle.load(f)
keywords_set = set(keywords)
# Position of each keyword inside `keywords`.
keywords_index = {keyword: i for i, keyword in enumerate(keywords)}
n_keywords = len(keywords)
with open('keyword_count.pickle', 'rb') as f:
    keyword_count = pickle.load(f)

# Filter datasets

In [4]:
# Keep only the datasets whose count is at least 150, then translate the
# surviving positions into slugs.
is_selected = dataset_count >= 150
selected_datasets_i = np.nonzero(is_selected)[0]
selected_datasets_slugs = [id2slug[datasets[pos]] for pos in selected_datasets_i]

# Load dataset–dataset adjacency list

In [5]:
# Read the pre-computed dataset/dataset adjacency structure from disk.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('dataset_dataset_adjacency_list.pickle', 'rb') as fh:
    dataset_dataset_adjacency_list = pickle.load(fh)

In [6]:
# Inspect the adjacency entry at index 0: a dict mapping neighbour index -> weight.
dataset_dataset_adjacency_list[0]

{0: 50,
 1: 6,
 3: 1,
 11: 1,
 28: 2,
 43: 2,
 84: 1,
 113: 1,
 115: 1,
 120: 1,
 157: 1,
 211: 1,
 212: 1,
 239: 1,
 245: 4,
 305: 1,
 310: 1,
 325: 1,
 353: 1,
 407: 3,
 410: 1,
 411: 1,
 416: 1,
 422: 2,
 423: 1,
 426: 1,
 440: 1,
 459: 1,
 504: 2,
 511: 2,
 526: 1,
 548: 1,
 603: 1,
 605: 1,
 621: 1,
 624: 2,
 639: 1,
 640: 1,
 657: 2,
 698: 1,
 699: 2,
 781: 1,
 790: 1,
 820: 1,
 821: 1,
 849: 1,
 865: 1,
 885: 1,
 890: 1,
 930: 1,
 941: 1,
 994: 1,
 1009: 1,
 1018: 1,
 1023: 1,
 1077: 1,
 1103: 1,
 1110: 2,
 1113: 2,
 1137: 1,
 1139: 1,
 1175: 1,
 1180: 1,
 1201: 1,
 1207: 4,
 1211: 1,
 1216: 1,
 1245: 1,
 1251: 1,
 1267: 9,
 1291: 2,
 1293: 2,
 1296: 1,
 1305: 1,
 1314: 1,
 1318: 1,
 1357: 1,
 1387: 2,
 1388: 2,
 1391: 1,
 1395: 1,
 1400: 1,
 1405: 1,
 1413: 2,
 1417: 1,
 1427: 1,
 1440: 1,
 1460: 1,
 1461: 3,
 1462: 1,
 1463: 1,
 1478: 1,
 1539: 2,
 1582: 4,
 1585: 1,
 1603: 1,
 1604: 3,
 1611: 1,
 1613: 1,
 1631: 1,
 1639: 1,
 1660: 1,
 1673: 1,
 1691: 1,
 1777: 1,
 1812: 2,
 

# Create graph

In [32]:
# Empty undirected graph that will hold the selected datasets.
G = nx.Graph()

In [33]:
# One node per selected dataset, keyed by its slug and carrying the dataset
# count as a float 'weight' attribute.
G.add_nodes_from(
    (id2slug[datasets[idx]], {'weight': float(dataset_count[idx])})
    for idx in selected_datasets_i
)

In [34]:
# Minimum adjacency weight for a dataset pair to become an edge.
MIN_DATASET_EDGE_WEIGHT = 50

# Membership is tested once per neighbour; a set gives O(1) lookups instead of
# scanning the whole NumPy index array each time.
selected_datasets_set = set(selected_datasets_i.tolist())

edges = []
for d1 in selected_datasets_i:
    adj_list = dataset_dataset_adjacency_list[d1]
    for d2, weight in adj_list.items():
        # d1 < d2 keeps each undirected pair once and drops self-loops.
        if (
            d2 in selected_datasets_set
            and weight >= MIN_DATASET_EDGE_WEIGHT
            and d1 < d2
        ):
            slug1 = id2slug[datasets[d1]]
            slug2 = id2slug[datasets[d2]]
            edges.append((slug1, slug2, float(weight)))
len(edges)

9580

In [35]:
# Each (slug1, slug2, weight) triple built above becomes one weighted edge.
G.add_weighted_edges_from(edges)

In [36]:
# Export the dataset graph as GML; `str` converts values GML cannot store natively.
nx.write_gml(G, 'graph_datasets.gml', stringizer=str)

# Keyword graph

In [37]:
# Read the pre-computed keyword/keyword adjacency structure from disk.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('keyword_keyword_adjacency_list.pickle', 'rb') as fh:
    keyword_keyword_adjacency_list = pickle.load(fh)

In [46]:
# Positions of the keywords whose count is at least 30; the cell displays how
# many were kept.
selected_keywords_i = np.nonzero(keyword_count >= 30)[0]
len(selected_keywords_i)

In [50]:
# Minimum adjacency weight for a keyword pair to become an edge.
MIN_KEYWORD_EDGE_WEIGHT = 5

# Start a fresh undirected graph for keywords (this rebinds G).
G = nx.Graph()

# One node per selected keyword, carrying its count as a float 'weight' attribute.
for i in selected_keywords_i:
    keyword = keywords[i]
    weight = keyword_count[i]
    G.add_node(keyword, weight=float(weight))

# Membership is tested once per neighbour; a set gives O(1) lookups instead of
# scanning the whole NumPy index array each time.
selected_keywords_set = set(selected_keywords_i.tolist())

edges = []
for k1 in selected_keywords_i:
    adj_list = keyword_keyword_adjacency_list[k1]
    for k2, weight in adj_list.items():
        # k1 < k2 keeps each undirected pair once and drops self-loops.
        if (
            k2 in selected_keywords_set
            and weight >= MIN_KEYWORD_EDGE_WEIGHT
            and k1 < k2
        ):
            keyword1 = keywords[k1]
            keyword2 = keywords[k2]
            edges.append((keyword1, keyword2, float(weight)))
print(len(edges))

G.add_weighted_edges_from(edges)

# Export the keyword graph as GML; `str` converts values GML cannot store natively.
nx.write_gml(G, 'graph_keywords.gml', str)

4672
