In [17]:
import re
import numpy as np
import pandas as pd
import networkx as nx
import community
import markov_clustering as mc
import matplotlib.pyplot as plt
from scipy import sparse

%matplotlib inline

In [18]:
# Load words dataset table
words = pd.read_csv('data/database/words.csv', dtype={
    'tweet': np.unicode_,
    'index': np.int,
    'text': np.unicode_,
    'pos': np.unicode_,
    'conf': np.float
})

In [19]:
# Define function for creating edges dataset
def get_edges(words):
    # Make join to obtain words in the same tweet
    edges = pd.merge(words, words, on='tweet')
    edges = edges[edges.index_x != edges.index_y]  # Remove self join

    # Count how many times the same word matches have been found
    edges = edges.groupby(['text_x', 'pos_x', 'text_y', 'pos_y']).size()
    edges = edges.reset_index(name='counts')

    # Get unique word (text, POS) set
    unique_words = words.groupby(by=['text', 'pos']).size().reset_index(name='counts')
    # Map each unique concept to a number and vice versa 
    w2i, i2w = dict(), dict()
    for index, word in unique_words.iterrows():
        w2i[(word.text, word.pos)] = index
        i2w[index] = (word.text, word.pos)
    # Map each word to a numeric index
    edges['number_x'] = edges.apply(lambda e: w2i[(e.text_x, e.pos_x)], axis=1)
    edges['number_y'] = edges.apply(lambda e: w2i[(e.text_y, e.pos_y)], axis=1)

    # Return dataset
    return edges, w2i, i2w

In [20]:
w2i, i2w, _ = get_edges(words)

In [21]:
# Load adjacency matrices
X_2017 = np.load('data/database/adj_mat2017.npy') 
X_2018 = np.load('data/database/adj_mat2018.npy') 

X = [X_2017, X_2018]

In [22]:
# Undirected multigraph objects defininition from adjacency matrices
net_2017 = nx.from_numpy_matrix(X_2017, parallel_edges=True, create_using=nx.MultiGraph)
net_2018 = nx.from_numpy_matrix(X_2018, parallel_edges=True, create_using=nx.MultiGraph)

nets = [net_2017, net_2018]

# MCL communities extraction

In [23]:
# MCL parameters

# Inflaction: float > 1
imin = 15       # minimum inflaction * 10
imax = 40       # maximum inflaction * 10
istep = 5       # step for inflaction exploration * 10

# Expansion: integer > 1
emin = 2        # minimum expansion 
emax = 8        # maximum expansion 
estep = 1       # step for expansion exploration

max_iter = 500  # max number of iterations for the algorithm

In [12]:
# Evaluate modularity for each set of parameters

for i, mat in enumerate(X):
    # convert the matrix to sparse
    mat = sparse.csr_matrix(mat)
    if not i: print('2017\'s network')
    if  i: print('\n\n2018\'s network')
    # grid search for best parameters
    for inf in [i / 10 for i in range(imin, imax, istep)]:
        for exp in [e for e in range(emin, emax, estep)]:
            # compute clusters
            result = mc.run_mcl(mat, pruning_threshold = 0, iterations = max_iter, inflation = inf, expansion = exp ) 
            clusters = mc.get_clusters(mat) 
            print("Num. clusters:",len(clusters))
            # compute corresponding modularity
            Q = mc.modularity(matrix=result, clusters=clusters)
            print("inflation:", inf, "expansion:", exp, "modularity:", Q) 

2017's network
Num. clusters: 48
inflation: 1.5 expansion: 2 modularity: 0.0011699245792519844
Num. clusters: 48
inflation: 1.5 expansion: 3 modularity: 0.0011699245792519844
Num. clusters: 48
inflation: 1.5 expansion: 4 modularity: 0.0011712948963333333
Num. clusters: 48
inflation: 1.5 expansion: 5 modularity: 0.0011712948963333333
Num. clusters: 48
inflation: 1.5 expansion: 6 modularity: 0.0011712948963333333
Num. clusters: 48
inflation: 1.5 expansion: 7 modularity: 0.0005858157925569253
Num. clusters: 48
inflation: 2.0 expansion: 2 modularity: 0.0011665127731609725
Num. clusters: 48
inflation: 2.0 expansion: 3 modularity: 0.0011699245792519844
Num. clusters: 48
inflation: 2.0 expansion: 4 modularity: 0.0011699245792519844
Num. clusters: 48
inflation: 2.0 expansion: 5 modularity: 0.0011699245792519844
Num. clusters: 48
inflation: 2.0 expansion: 6 modularity: 0.0011712948963333333
Num. clusters: 48
inflation: 2.0 expansion: 7 modularity: 0.0005851302440617211
Num. clusters: 48
inflati

KeyboardInterrupt: 

In [24]:
# Best parameters
param_2017 = {'inf': 1.5, 'exp': 4}   # mod = 0.0011712948963333333, n. clusters 48
param_2018 = {'inf': 3.5, 'exp': 2}   # mod =  , n. clusters 112

# Other parameters can be tried: 
# - 2017: higher exp
# - 2018: higher inf

param = [param_2017, param_2018]

In [25]:
# Compute and save clusters

# List of both classes of clusters (list of list)
wclusters = []

for i, mat in enumerate(X):
    # convert the matrix to sparse
    mat = sparse.csr_matrix(mat)
    # MCL using best parameters
    result = mc.run_mcl(mat, verbose = 1, pruning_threshold = 0, iterations = max_iter, 
                        inflation = param[i]['inf'], expansion = param[i]['exp'])           
    clusters = mc.get_clusters(mat) 
    # list of clusters for the current year
    yc = []
    for cluster in clusters:
        c = [i2w[node] for node in cluster]
        print(c, end='\n\n')
        yc.append(c)
        
np.save('data/database/clusters_MCL',np.array(wclusters))

--------------------------------------------------
MCL Parameters
Expansion: 4
Inflation: 1.5
No pruning
Convergence check: 1 iteration
Maximum iterations: 500
Sparse matrix mode
--------------------------------------------------
Iteration 1
Checking for convergence
Iteration 2
Checking for convergence
Iteration 3
Checking for convergence
Iteration 4
Checking for convergence
Iteration 5
Checking for convergence
Iteration 6
Checking for convergence
Iteration 7
Checking for convergence
Iteration 8
Checking for convergence
Iteration 9
Checking for convergence
Iteration 10
Checking for convergence
Iteration 11
Checking for convergence
Iteration 12
Checking for convergence
Iteration 13
Checking for convergence
Iteration 14
Checking for convergence
Iteration 15
Checking for convergence
Iteration 16
Checking for convergence
Iteration 17
Checking for convergence
Iteration 18
Checking for convergence
Iteration 19
Checking for convergence
Iteration 20
Checking for convergence
Iteration 21
Checki

KeyError: 151

# Louvain clustering 

In [26]:
# Compute clusters for both nets

partition_2017 = community.best_partition(net_2017)
partition_2018 = community.best_partition(net_2018)

np.save('data/database/clusters_LC',np.array([partition_2017,partition_2018]))