# Blogcatalog friendship network

In [1]:
import networkx as nx
import numpy as np
import pickle as p
from scipy.sparse import csr_matrix, lil_matrix
from matplotlib import pyplot as plt
%matplotlib inline

# Directory (relative to this notebook) holding the raw BlogCatalog CSV files
# read below: 'edges.csv' and 'group-edges.csv'.
data_loc = './../data/raw/BlogCatalog-dataset/data/'

BlogCatalog is a social blog directory that manages bloggers and their blogs. This dataset contains 10,312 bloggers, with unique ids ranging from 1 to 10,312, and 333,983 friendship pairs. Each blogger can belong to multiple groups; there are 39 groups, with indices ranging from 1 to 39.

## Load data from edge list

In [2]:
# Lookup from the original node id (kept as the string read from the CSV)
# to a new contiguous 0-based index, assigned in order of first appearance.
iid = {}
idx = 0
edgelist = []

# Read edge pairs, one "i,j" pair per line
with open(data_loc+'edges.csv', 'r') as f:
    for line in f:  # stream the file rather than materialising it via readlines()
        line = line.strip()
        if not line:
            continue  # tolerate blank / trailing empty lines instead of failing on unpack
        i, j = line.split(',')  # csv
        # Register the source endpoint first, then the target, so index
        # assignment order matches the order of appearance in the file.
        for node in (i, j):
            if node not in iid:
                iid[node] = idx
                idx += 1
        edgelist.append((iid[i], iid[j]))

# Create an nx undirected network from the re-indexed edge pairs
bc = nx.Graph(edgelist)

print("Number of nodes: ", len(bc))
print("Number of edges: ", bc.size())

Number of nodes:  10312
Number of edges:  333983


## Load labels as a sparse matrix

In the original dataset, the group indices range from 1 to 39. For computational convenience, I will convert the group ids to the range 0-38.

In [3]:
# Number of groups in the dataset; the raw file indexes them 1..39.
n_groups = 39

# Binary node-by-group membership matrix: one row per node of `bc`,
# one column per group. LIL format is used because entries are set one at a time.
lil_labels = lil_matrix((len(bc), n_groups), dtype=int)

# Read (node_id, label) file, one "node,group" pair per line
with open(data_loc+'group-edges.csv', 'r') as f:
    for line in f:  # stream the file rather than materialising it via readlines()
        line = line.strip()
        if not line:
            continue  # tolerate blank / trailing empty lines instead of failing on unpack
        node, group = line.split(',')
        # Row: node id translated through `iid` to its new 0-based index.
        # Column: group id shifted from 1-based to 0-based, i.e. range(0, 39).
        lil_labels[iid[node], int(group) - 1] = 1

In [4]:
# Show the first five label rows as dense 1 x n_groups arrays for a quick sanity check.
print("First 5 rows of labels matrix:")
for row_idx in range(5):
    dense_row = lil_labels[row_idx].toarray()
    print(dense_row)

First 5 rows of labels matrix:
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0]]
[[0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0]]
[[0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0]]
[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
  0 0]]
[[0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0]]


In [5]:
# Bundle the graph and its label matrix into one dict and pickle it to disk.
# NOTE: pickle files should only ever be loaded back from trusted sources.
bc_dataset = dict(NXGraph=bc, LILLabels=lil_labels)
with open('./../data/blogcatalog.data', 'wb') as out_file:
    p.dump(bc_dataset, out_file)

## Dump as edgelist

The graph with the new (re-indexed) node ids is needed as input for other algorithms.

In [7]:
# Write one "u v" pair per line (no edge attributes), separated by a single space.
nx.write_edgelist(
    bc,
    './../data/blogcatalog.edges',
    delimiter=' ',
    data=False,
)

In [8]:
# Display the sparse matrix summary (shape, dtype, number of stored elements).
lil_labels

<10312x39 sparse matrix of type '<class 'numpy.int64'>'
	with 14476 stored elements in LInked List format>

In [None]:
# Dense view of the label matrix. The original cell referenced the
# non-existent attribute `toarr`, which raises AttributeError when run;
# `toarray()` is the sparse-matrix method that returns a dense ndarray.
lil_labels.toarray()