In [None]:
# Relative default for the output root. The next cell reassigns `output_dir`,
# so this value only takes effect if that cell is skipped.
output_dir = './polar/'

In [None]:
# Root folder under which the cells below read and write their artefacts.
# NOTE(review): absolute, machine-specific path that overrides the relative
# default assigned in the previous cell.
output_dir = '/v3/infodemic/resources/global/'

### Calculate Sentiment Attitude

For each entity pair and each entity-NP pair, calculate the sentiment attitude of the one towards the other. This is done by scoring the words on the dependency path between the two. To calculate the sentiment score, we use the MPQA subjectivity lexicon.

In [None]:
import pandas as pd

# Parse the MPQA subjectivity-clue file. Every line is a space-separated list
# of `key=value` fields; the ones used later are `word1`, `pos1` and
# `priorpolarity`.
mpqa_df = []

with open('./resources/mpqa/subjclueslen1-HLTEMNLP05.tff', 'r') as f:
    # Iterate the handle lazily instead of materialising every line first.
    for l in f:
        obj = {}

        for d in l.strip().split(' '):
            # A token without `=` (blank line, stray fragment) carries no
            # field; indexing its second part would abort the whole load.
            if '=' not in d: continue

            d = d.split('=')
            obj[d[0]] = d[1] if d[0] != 'len' else int(d[1])

        # Lines that produced no field at all (e.g. empty lines) are skipped.
        if obj: mpqa_df.append(obj)

mpqa_df = pd.DataFrame.from_dict(mpqa_df).set_index('word1')

# word -> {field: value}. Dict keys are unique, so when a word occurs on
# several lines only one of its entries survives here.
mpqa_dict = mpqa_df.T.to_dict()

In [None]:
from nltk.corpus import stopwords

stop_words = stopwords.words('english')

def calculate_mpqa(tokens):
    """Count the MPQA-positive and MPQA-negative words among `tokens`.

    Each token is a dict. Tokens that carry an `entity_id` key are skipped.
    For the others the lower-cased `originalText` is looked up in `mpqa_dict`,
    falling back to the lower-cased `lemma`; stop words and words whose
    lexicon part of speech (`pos1`) neither is 'anypos' nor matches the
    token's converted `pos` are ignored. A word with prior polarity 'both'
    counts on both sides.

    Returns:
        A 2-tuple `(scores, words)`. `scores` maps 'POSITIVE' / 'NEGATIVE' to
        the number of matching words, `words` maps the same keys to the
        matched words themselves.
    """
    positive_list, negative_list = [], []
    positive_words, negative_words = [], []

    for token in tokens:
        if 'entity_id' in token: continue

        t = token['originalText'].lower().strip()
        if t in stop_words: continue
        # Membership is tested on the dict itself (hash lookup) instead of on
        # a freshly built list of all its keys.
        if t not in mpqa_dict: t = token['lemma'].lower().strip()
        if t not in mpqa_dict: continue
        # The lemma may itself be a stop word even when the surface form is not.
        if t in stop_words: continue

        mpqa_obj = mpqa_dict[t]

        t_pos = convert_to_mpqa_pos(token['pos'])

        if not (mpqa_obj['pos1'] == 'anypos' or t_pos == mpqa_obj['pos1']): continue

        mpqa_polarity = mpqa_obj['priorpolarity']

        if mpqa_polarity == 'positive' or mpqa_polarity == 'both':
            positive_words.append(t)
            positive_list.append(1.0)

        if mpqa_polarity == 'negative' or mpqa_polarity == 'both':
            negative_words.append(t)
            negative_list.append(1.0)

    return {'POSITIVE': sum(positive_list), 'NEGATIVE': sum(negative_list)}, \
           {'POSITIVE': positive_words, 'NEGATIVE': negative_words}

In [None]:
# Remove one entry from the stop-word list in place, by position.
# NOTE(review): index-based and not idempotent -- re-running this cell deletes
# a different word each time, and which word sits at position 37 depends on
# the installed NLTK data. Presumably one specific word is meant to remain
# scorable; confirm which and consider removing it by value.
del stop_words[37] 

In [None]:
def convert_to_mpqa_pos(pos):
    """Translate a token POS tag into the tag names compared against `pos1`.

    'VERB' -> 'verb', 'NOUN' / 'PROPN' -> 'noun', 'ADJ' -> 'adj',
    'ADV' -> 'adverb'; every other tag maps to 'other'.
    """
    if pos in ('NOUN', 'PROPN'):
        return 'noun'
    if pos == 'VERB':
        return 'verb'
    if pos == 'ADJ':
        return 'adj'
    if pos == 'ADV':
        return 'adverb'
    return 'other'
    
def calculate_sentiment_attitude(path):
    """Score every pair of one dependency-feature file and persist the results.

    `path` points at a JSON file holding a `uid` and a jsonpickle-encoded
    `dependency_features` mapping. Every object stored under a pair is passed
    to `calculate_mpqa`; the resulting counts are collected per pair as lists,
    the matched words as flat lists.

    Two files named `<uid>.json` are written below `output_dir`, into
    `pair_sentiment_attitudes/<daily_folder>/` and
    `pair_sentiment_words/<daily_folder>/`, where `<daily_folder>` is the name
    of the directory that contains `path`.

    Returns:
        True once both files have been written.
    """
    daily_folder = path.split('/')[-2]

    with open(path, 'r') as f:
        article = json.load(f)

    features_by_pair = jsonpickle.decode(article['dependency_features'])
    uid = article['uid']

    attitudes = defaultdict(lambda: {'POSITIVE': [], 'NEGATIVE': []})
    words = defaultdict(lambda: {'POSITIVE': [], 'NEGATIVE': []})

    for pair in features_by_pair:
        for att_obj in features_by_pair[pair]:
            scores, hits = calculate_mpqa(att_obj)

            for polarity in ('POSITIVE', 'NEGATIVE'):
                attitudes[pair][polarity].append(scores[polarity])
                words[pair][polarity] += hits[polarity]

    # (sub-folder, JSON field, payload) of the two files to write.
    outputs = (
        ('pair_sentiment_attitudes/', 'sentiment_attitudes', dict(attitudes)),
        ('pair_sentiment_words/', 'sentiment_words', dict(words)),
    )

    for subdir, field, payload in outputs:
        output_folder = output_dir + subdir + daily_folder + '/'
        if not os.path.exists(output_folder):
            os.makedirs(output_folder, exist_ok=True)

        with open(output_folder + uid + '.json', 'w') as f:
            f.write(json.dumps({'uid': uid, field: jsonpickle.encode(payload)}))

    return True

In [None]:
import os, itertools

# Every file below `pair_dependency_features/<day>/`, with the day folders
# visited in sorted order.
pair_paths = [
    output_dir + 'pair_dependency_features/' + day + '/' + name
    for day in sorted(os.listdir(output_dir + 'pair_dependency_features/'))
    for name in os.listdir(output_dir + 'pair_dependency_features/' + day + '/')
]

In [None]:
%%time

from tqdm import tqdm
from multiprocessing import Pool
from collections import defaultdict
import multiprocessing, json, pickle, jsonpickle

# Fan the per-file scoring out over one worker process per CPU core.
pool = Pool(multiprocessing.cpu_count())

# The results are only consumed to drive the progress bar; every call writes
# its own output files.
for i in tqdm(
    pool.imap_unordered(calculate_sentiment_attitude, pair_paths),
    desc='Calculating entity-pair sentiment attitudes',
    total=len(pair_paths)
): pass

# Stop accepting work and wait for the workers to exit.
pool.close()
pool.join()

In [None]:
import gzip, json

#######################################
# Define a loading function for the   #
# .gzip files we generate. Examples:  #
# =================================== #
# for .json.gzip use the defaults     #
# for .pckl.gzip use                  #
#   func=pickle.loads, encoding=None  #
#######################################

def load_gzip(path, func=json.loads, encoding='utf-8'):
    """Read a gzip-compressed file and parse its content with `func`.

    Args:
        path: location of the gzip file.
        func: callable that receives the whole decompressed content.
        encoding: text encoding used to decode the content before it is
            handed to `func`; pass None to hand over the raw bytes, which is
            what byte-oriented parsers such as `pickle.loads` need.

    Returns:
        Whatever `func` returns.

    NOTE: unpickling can execute arbitrary code -- only use `pickle.loads`
    on files this pipeline produced itself.
    """
    with gzip.open(path, 'rb') as f: raw = f.read()
    if encoding is not None: raw = raw.decode(encoding)
    return func(raw)

### Export `Entity-NP Sentiment Attitudes`

Export the attitude objects with entities as sources and NPs as targets. This will later help identify the polarizing topics.

In [None]:
from tqdm import tqdm
import sentiment_features, jsonpickle
from utilities import DepDirection, find_dep_path

def calculate_entity_np_attitudes(path):
    """Score the dependency paths between every entity and NP of one article.

    `path` is the location of the article's file below `spacy/`; the matching
    `entity_np_annotations` file is derived from it and loaded with
    `load_gzip`. For every (entity, NP) pair, the dependency paths between
    their token positions are collected in both directions, scored with
    `calculate_mpqa`, and the results are written below `output_dir` into
    `entity_np_sentiment_attitudes/<daily_folder>/` and
    `entity_np_sentiment_words/<daily_folder>/` as `<file name>.jsonpckl`.

    Returns:
        None in every case (also when the annotation file is missing or
        cannot be loaded, in which case nothing is written).
    """
    parts = path.split('/')
    uid, daily_folder = parts[-1], parts[-2]

    # Template of the article path: the first `{}` stands for the stage
    # folder, the second one for the file extension.
    template = path.replace('/spacy/', '/{}/').replace('.pckl', '{}')
    annotations_path = template.format('entity_np_annotations', '.json')

    if not os.path.exists(annotations_path): return None

    try:
        annotations = load_gzip(annotations_path)
    except Exception as ex:
        print(daily_folder, uid, ex)
        return None

    annotations = annotations['entity_np_annotations']

    def flipped_path_features(tokens, start, end):
        # Features of the dependency path start -> end after rewriting every
        # step's direction: GOV becomes DEP, anything else becomes GOV.
        dep_path = [
            ((DepDirection.DEP if dep_dir == DepDirection.GOV else DepDirection.GOV, dep_type), dep_idx)
            for (dep_dir, dep_type), dep_idx in find_dep_path(tokens, start, end)
        ]
        return sentiment_features.dep_path_features([], tokens, dep_path)

    attitudes = defaultdict(lambda: defaultdict(lambda: {'POSITIVE': [], 'NEGATIVE': []}))
    words = defaultdict(lambda: defaultdict(lambda: {'POSITIVE': [], 'NEGATIVE': []}))

    for entity, np_annotations in annotations.items():
        for noun_phrase, mentions in np_annotations.items():
            np_dep_path_feature = []

            for annotation in mentions:
                tokens = annotation['tokens']

                source_indices, destination_indices = [], []
                for i, t in enumerate(tokens):
                    if 'entity_id' not in t: continue
                    if t['entity_id'][1] == entity: source_indices.append(i)
                    if t['entity_id'][1] == noun_phrase: destination_indices.append(i)

                for src, dst in itertools.product(source_indices, destination_indices):
                    # Entity -> NP first, then NP -> entity.
                    for start, end in ((src, dst), (dst, src)):
                        features = flipped_path_features(tokens, start, end)
                        if len(features) > 0:
                            np_dep_path_feature += [dpf[1] for dpf in features]

            for att_obj in np_dep_path_feature:
                scores, hits = calculate_mpqa(att_obj)

                for polarity in ('POSITIVE', 'NEGATIVE'):
                    attitudes[entity][noun_phrase][polarity].append(scores[polarity])
                    words[entity][noun_phrase][polarity] += hits[polarity]

    # (sub-folder and JSON field share one name, payload) of the two files.
    outputs = (
        ('entity_np_sentiment_attitudes', attitudes),
        ('entity_np_sentiment_words', words),
    )

    for field, payload in outputs:
        output_folder = output_dir + field + '/' + daily_folder + '/'
        if not os.path.exists(output_folder):
            os.makedirs(output_folder, exist_ok=True)

        with open(output_folder + uid + '.jsonpckl', 'w') as f:
            f.write(json.dumps({'uid': uid, field: jsonpickle.encode(payload)}))

In [None]:
import os, itertools, json

# Every file below `spacy/<day>/`, with the day folders visited in sorted order.
spacy_paths = [
    output_dir + 'spacy/' + day + '/' + name
    for day in sorted(os.listdir(output_dir + 'spacy/'))
    for name in os.listdir(output_dir + 'spacy/' + day + '/')
]

In [None]:
%%time

# NOTE(review): the worker count is hard-coded to 32 here, unlike the
# CPU-count-based pools used elsewhere in this notebook.
pool = Pool(32)

# Every call writes its own output files; the returned value is discarded
# immediately.
for d in tqdm(
    pool.imap_unordered(calculate_entity_np_attitudes, spacy_paths),
    desc='Calculating entity-np attitudes',
    total=len(spacy_paths)
): del d

# Stop accepting work and wait for the workers to exit.
pool.close()
pool.join()

In [None]:
import json, jsonpickle, numpy
from collections import defaultdict

from ast import literal_eval as make_tuple

def calculate_pair_frequencies(path):
    """Load one `sentiment_attitudes` file and split it into single-pair dicts.

    Returns:
        A list with one `{pair: attitudes}` dict per pair stored in the file.
    """
    with open(path, 'r') as f:
        stored = json.load(f)

    attitudes_by_pair = jsonpickle.decode(stored['sentiment_attitudes'])

    singles = []
    for pair, attitudes in attitudes_by_pair.items():
        singles.append({pair: attitudes})
    return singles

def calculate_pair_words(path):
    """Load one `sentiment_words` file and split it into single-pair dicts.

    Pairs without any positive or negative word are left out.

    Returns:
        A list with one `{pair: words}` dict per remaining pair.
    """
    with open(path, 'r') as f:
        stored = json.load(f)

    words_by_pair = jsonpickle.decode(stored['sentiment_words'])

    singles = []
    for pair, words in words_by_pair.items():
        if len(words['POSITIVE']) == 0 and len(words['NEGATIVE']) == 0:
            continue
        singles.append({pair: words})
    return singles

def sentiment_threshold_difference(swn_pos, swn_neg):
    """Signed gap between the magnitudes of a positive and a negative score.

    The result is positive when `|swn_pos|` is the larger one, negative when
    `|swn_neg|` is, and zero when both magnitudes are equal.
    """
    gap = abs(swn_pos) - abs(swn_neg)
    return numpy.sign(gap) * abs(gap)

In [None]:
# Map every date folder below `pair_sentiment_attitudes` to the file names it
# contains.
diaily_article_dict = {}

for str_date in os.listdir(output_dir + 'pair_sentiment_attitudes'):
    diaily_article_dict[str_date] = os.listdir(output_dir + 'pair_sentiment_attitudes/' + str_date)

In [None]:
import datetime

# Number of files per day; the folder names are parsed as `YYYYMMDD` dates.
diaily_article_freq = {datetime.datetime.strptime(k, '%Y%m%d'): len(v) for k,v in diaily_article_dict.items()}

In [None]:
# Dates in chronological order (sorting a dict sorts its keys).
date_list = sorted(diaily_article_freq)

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the number of files per day.
plt.figure(figsize=(16, 5))

plt.bar(
    x=date_list,
    height=[diaily_article_freq[d] for d in date_list]
)

In [None]:
import networkx as nx, itertools

window_articles = list(itertools.chain.from_iterable(diaily_article_dict.values()))

# Build the paths from the (date folder, file names) items so that every file
# keeps its own date folder; a flat list of file names has lost that link.
attitude_paths = [
    output_dir + f'/pair_sentiment_attitudes/{str_date}/{p}'
    for str_date, articles in diaily_article_dict.items()
    for p in articles
]
attitude_paths = [p for p in attitude_paths if os.path.exists(p)]

# Leave a few cores free, but never ask for fewer than one worker: `Pool`
# rejects a process count below 1.
pool = Pool(max(1, multiprocessing.cpu_count() - 4))

# Flat list of single-pair dicts collected from every attitude file.
_pair_sentiment_attitude_dict = []

for result in tqdm(
    pool.imap_unordered(calculate_pair_frequencies, attitude_paths),
    desc='Fetching Pairs',
    total=len(attitude_paths)
): _pair_sentiment_attitude_dict += result

pool.close()
pool.join()

In [None]:
# Merge the single-pair dicts into one mapping keyed by the unordered pair.
pair_sentiment_attitude_dict = {}

for att_obj in tqdm(_pair_sentiment_attitude_dict):

    pair, atts = list(att_obj.items())[0]

    # The key is a stringified tuple; parse it and rewrite both URIs from the
    # WAT namespace to the DBpedia one.
    pair = list(make_tuple(pair))
    for side in (0, 1):
        pair[side] = pair[side].replace('http://wat.org/resource/', 'http://dbpedia.org/resource/')

    # Self-pairs carry no attitude between two distinct entities.
    if pair[0] == pair[1]: continue

    # Sorting makes (a, b) and (b, a) share one key.
    pair.sort()
    pair = (pair[0], pair[1])

    merged = pair_sentiment_attitude_dict.setdefault(pair, {'POSITIVE': [], 'NEGATIVE': []})
    merged['POSITIVE'] += atts['POSITIVE']
    merged['NEGATIVE'] += atts['NEGATIVE']

In [None]:
# For every pair, count the observations whose normalised positive/negative
# difference is non-zero.
entity_pair_frequency_dict = {}

for p in tqdm(pair_sentiment_attitude_dict):

    positives = pair_sentiment_attitude_dict[p]['POSITIVE']
    negatives = pair_sentiment_attitude_dict[p]['NEGATIVE']

    sentiments = []

    for i, pos_score in enumerate(positives):
        neg_score = negatives[i]
        total_p_n = pos_score + neg_score

        # No sentiment word at all on this path: neutral observation.
        if total_p_n == 0.0:
            sentiments.append(0.0)
            continue

        sentiments.append(
            sentiment_threshold_difference(pos_score / total_p_n, neg_score / total_p_n)
        )

    entity_pair_frequency_dict[p] = sum(1 for s in sentiments if s != 0.0)

In [None]:
import numpy

# 25th percentile of the distinct per-pair frequencies (shown as the cell output).
numpy.percentile(sorted(list(set(entity_pair_frequency_dict.values()))), 25)

In [None]:
import networkx as nx

# Signed attitude graph: nodes are integer ids of entities, edge weights are
# the sign of the median non-zero sentiment of the pair.
G = nx.Graph()

node_id, node_to_int, int_to_node = 0, {}, {}

pairs_by_frequency = sorted(entity_pair_frequency_dict, key=entity_pair_frequency_dict.get, reverse=True)

for p in tqdm(pairs_by_frequency):

    # Pairs with fewer than two non-neutral observations are ignored.
    p_freq = entity_pair_frequency_dict[p]
    if p_freq < 2: continue

    n_v = numpy.asarray(list(pair_sentiment_attitude_dict[p]['NEGATIVE']))
    p_v = numpy.asarray(list(pair_sentiment_attitude_dict[p]['POSITIVE']))

    # Keep only the non-zero normalised differences.
    sentiments = []

    for j in range(p_v.shape[0]):
        p_n_total = p_v[j] + n_v[j]
        if p_n_total == 0: continue

        score = sentiment_threshold_difference(p_v[j] / p_n_total, n_v[j] / p_n_total)
        if score != 0: sentiments.append(score)

    if len(sentiments) == 0: continue

    sentiment = numpy.median(sentiments)

    # A median this close to zero is treated as "no attitude".
    if -0.01 < sentiment < 0.01: continue

    # Assign the next free integer id to every endpoint seen for the first time.
    for endpoint in (p[0], p[1]):
        if endpoint in node_to_int: continue
        node_to_int[endpoint] = node_id
        int_to_node[node_id] = endpoint
        node_id += 1

    p_1, p_2 = node_to_int[p[0]], node_to_int[p[1]]

    G.add_edge(p_1, p_2, weight=numpy.sign(sentiment))

In [None]:
# Size of the signed attitude graph `G`.
print('Nodes:', G.number_of_nodes())
print('Edges:', G.number_of_edges())

In [None]:
# For every entity, sum the pair frequencies over all of its graph neighbours.
node_freq_dict = {}

for n in tqdm(G.nodes()):
    n1 = int_to_node[n]

    f1 = 0.0

    for neighbour in G.neighbors(n):
        # Pair keys hold the two entity names in sorted order.
        p = tuple(sorted((n1, int_to_node[neighbour])))
        f1 += entity_pair_frequency_dict[p]

    node_freq_dict[n1] = f1

In [None]:
# Print the entities ranked by their summed pair frequency, highest first.
index = 1

for n, f in sorted(node_freq_dict.items(), key=lambda kv: kv[1], reverse=True):
    print('{0:5} {1:85} {2}'.format(index, n.replace('http://dbpedia.org/resource/', ''), f))
    index += 1

In [None]:
import os, pickle

# Make sure the `pkb/` output folder exists.
if os.path.exists(output_dir + 'pkb/'): print('File already exists.')
else: os.makedirs(output_dir + 'pkb')

In [None]:
# Persist the graph and both node-id lookup tables.
with open(output_dir + 'pkb/' + 'sag.pckl', 'wb') as f:         pickle.dump(G, f)
with open(output_dir + 'pkb/' + 'int_to_node.pckl', 'wb') as f: pickle.dump(int_to_node, f)
with open(output_dir + 'pkb/' + 'node_to_int.pckl', 'wb') as f: pickle.dump(node_to_int, f)

### Use `SiMap` to Extract Fellowships

In [None]:
import subprocess, os, pandas as pd
from signed_network_balance import *

def si_map_cpm(G, resolution=0.00):
    """Partition the signed graph `G` by running the external SiMap jar.

    The edges are dumped to `/tmp/simap.wrapper.tsv` as headerless
    tab-separated rows (larger endpoint, smaller endpoint, integer sign),
    sorted by the first column. The jar is then invoked with
    `mdl -r <resolution>` and the file it is asked to write,
    `/tmp/simap.wrapper.partition.out`, is read back.

    Returns:
        dict mapping the first column of the partition file to its second
        column (presumably node id -> partition id; confirm against the
        SiMap output format).
    """
    graph_file = '/tmp/simap.wrapper.tsv'
    partition_file = '/tmp/simap.wrapper.partition.out'

    # Clear the leftovers of a previous run. `os.remove` returns None, which
    # is printed as part of the message.
    if os.path.isfile(graph_file): print('Removing simap previous data...', os.remove(graph_file))
    if os.path.isfile(partition_file): print('Removing simap previous partitions...', os.remove(partition_file))

    rows = []
    for u, v, attrs in G.edges(data=True):
        rows.append({'p_1': max(u, v), 'p_2': min(v, u), 'sign': int(attrs['weight'])})

    _df = pd.DataFrame.from_dict(rows).sort_values(by=['p_1'])

    print('> Dumping graph in /tmp/simap.wrapper.tsv...')
    _df.to_csv(graph_file, sep='\t', index=False, header=False)

    command = [
        'java', '-jar', './simap-1.0.0-final.jar', 'mdl',
        '-r', str(resolution),
        '-g', graph_file,
        '-o', partition_file,
    ]
    subprocess_results = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)

    print('> Errors: ', str(subprocess_results.stderr))
    print('> Outputs: ', str(subprocess_results.stdout))

    _df_partitions = pd.read_csv(partition_file, sep='\t', index_col=0, header=None)

    print()

    # With `header=None` and `index_col=0`, the second file column is labelled 1.
    partition_of = {}
    for k, v in _df_partitions.T.to_dict().items():
        partition_of[k] = v[1]
    return partition_of

In [None]:
# Run SiMap 10 times on a copy of the graph and record, per run, the
# partitions it produced (each one as a list of entity names).
simap_iteration_dict = {i: [] for i in range(10)}

for iteration in range(10):
    
    si_map_0 = si_map_cpm(G.copy(), resolution=0.075)
    
    # Invert the returned mapping into value -> [keys].
    si_map_partitions = defaultdict(lambda: [])

    for k,v in si_map_0.items(): si_map_partitions[v].append(k)
    for k in list(si_map_partitions.keys()): si_map_partitions[k] = list(si_map_partitions[k])

    si_map_partitions = dict(si_map_partitions)
    
    # NOTE(review): indexing with 0..len-1 assumes the partition ids are
    # exactly these consecutive integers; any other id set raises KeyError --
    # confirm against the SiMap output.
    for i in range(len(si_map_partitions)):
        f_list = []
        
        for n in si_map_partitions[i]: f_list.append(int_to_node[n])

        simap_iteration_dict[iteration].append(f_list.copy())

In [None]:
def jaccard_index(s1, s2):
    """Jaccard similarity of two iterables, compared as sets.

    Returns the size of the intersection divided by the size of the union.
    Two empty inputs are treated as identical and yield 1.0, because the
    plain formula would divide by zero.
    """
    a, b = set(s1), set(s2)
    union = a | b
    if not union: return 1.0
    return len(a & b) / len(union)

In [None]:
# One "<iteration>_<position>" label per fellowship of every SiMap run.
fellowship_indices = [
    '{}_{}'.format(i, j)
    for i, f_list in simap_iteration_dict.items()
    for j in range(len(f_list))
]

In [None]:
# Resolve every label to its member list once, up front.
fellowship_members = []

for label in fellowship_indices:
    label_parts = label.split('_')
    fellowship_members.append(simap_iteration_dict[int(label_parts[0])][int(label_parts[1])])

# Square matrix of Jaccard distances (1 - similarity) between all fellowships.
jaccard_indices = [
    [1.0 - jaccard_index(members_1, members_2) for members_2 in fellowship_members]
    for members_1 in tqdm(fellowship_members)
]

In [None]:
from scipy.spatial.distance import squareform
from scipy.cluster.hierarchy import ward, fcluster

# Ward linkage over the condensed form of the pairwise distance matrix.
Z = ward(squareform(jaccard_indices))

In [None]:
# Cut the dendrogram at distance 0.5 to obtain one flat cluster label per fellowship.
clusters = fcluster(Z, t=0.5, criterion='distance')

In [None]:
# Group the fellowship labels by the cluster they were assigned to.
cluster_dict = {}

for label, cluster_id in zip(fellowship_indices, clusters):
    cluster_dict.setdefault(cluster_id, []).append(label)

In [None]:
def decode_fellowship_list(f_list):
    """Resolve "<iteration>_<position>" labels to the fellowships they name.

    Returns:
        The matching entries of `simap_iteration_dict`, in the order of `f_list`.
    """
    decoded = []
    for index in f_list:
        label_parts = index.split('_')
        decoded.append(simap_iteration_dict[int(label_parts[0])][int(label_parts[1])])
    return decoded

In [None]:
from collections import Counter

# For every entity, the highest number of times it occurs within one cluster.
max_freq_dict = {}

for _cluster_id, labels in sorted(cluster_dict.items(), key=lambda kv: len(kv[1]), reverse=True):

    members = itertools.chain.from_iterable(decode_fellowship_list(labels))

    for e, f in Counter(members).most_common():
        max_freq_dict[e] = max(max_freq_dict.get(e, f), f)

In [None]:
from termcolor import colored

# Walk the clusters from largest to smallest and decide, per entity, which
# cluster claims it.
visited, counter = [], 0

merged_fellowships = []
no_merge_fellowships = []

for entry in sorted(cluster_dict.items(), key=lambda kv: len(kv[1]), reverse=True): 
        
    # All members of all fellowships in this cluster, repetitions included.
    f_list = decode_fellowship_list(entry[1])
    f_list = list(itertools.chain.from_iterable(f_list))
    
    print('Entry:', entry[0])
    
    merge = []
    no_merge = []
    
    for e,f in Counter(f_list).most_common():
                
        if e not in visited:
            # An unclaimed entity is claimed (printed blue) by the first
            # cluster in which it reaches its maximum count.
            if f >= max_freq_dict[e]: 
                print('- {0:55} {1}'.format(colored(e.replace('http://dbpedia.org/resource/', ''), 'blue'), f))
                visited.append(e)
                no_merge.append(e)
                counter += 1
                
            else: print('- {0:55} {1}'.format(colored(e.replace('http://dbpedia.org/resource/', ''), 'red'), f))
              
        else: print('- {0:55} {1}'.format(colored(e.replace('http://dbpedia.org/resource/', ''), 'red'), f))
    
        # Independently of the claim above, entities counted at least 5 times
        # in this cluster join its merge candidate.
        # NOTE(review): 5 looks like half of the 10 SiMap runs -- confirm.
        if f >= 5: merge.append(e)
            
    # A cluster contributes either its merge candidate or, when that is
    # empty, the entities it claimed.
    if len(merge) > 0: merged_fellowships.append(merge)
    else: no_merge_fellowships.append(no_merge)
        
    print()    

In [None]:
# Square matrix of Jaccard distances (1 - similarity) between the merged fellowships.
jaccard_indices = [
    [1.0 - jaccard_index(members_1, members_2) for members_2 in merged_fellowships]
    for members_1 in tqdm(merged_fellowships)
]

In [None]:
from scipy.cluster.hierarchy import ward, fcluster

# Second pass: cluster the merged fellowships themselves, with a tighter cut (0.25).
Z = ward(squareform(jaccard_indices))

clusters = fcluster(Z, t=0.25, criterion='distance')

# NOTE: this rebinds `cluster_dict`; its values are now lists of fellowships
# (lists of entities) rather than lists of index labels.
cluster_dict = {}

for i,c in enumerate(clusters):
    if c not in cluster_dict: cluster_dict[c] = []
    cluster_dict[c].append(merged_fellowships[i])

In [None]:
from collections import Counter

# For every entity, the highest number of times it occurs within one cluster
# of merged fellowships.
max_freq_dict = {}

for _cluster_id, cluster_fellowships in sorted(cluster_dict.items(), key=lambda kv: len(kv[1]), reverse=True):

    members = itertools.chain.from_iterable(cluster_fellowships)

    for e, f in Counter(members).most_common():
        max_freq_dict[e] = max(max_freq_dict.get(e, f), f)

In [None]:
# Walk the clusters from the one with the most members (repetitions counted)
# to the one with the fewest; every entity ends up in at most one re-merged
# fellowship.
re_merged_fellowships, visited = [], []

for entry in sorted(cluster_dict.items(), key=lambda kv: len(list(itertools.chain.from_iterable(kv[1]))), reverse=True): 

    f_list = list(itertools.chain.from_iterable(entry[1]))
        
    print('Entry:', entry[0])
    print()
    
    remerged = []
    for e,f in Counter(f_list).most_common():
        
        if e not in visited:
            # An unclaimed entity is claimed (printed blue) by the first
            # cluster in which it reaches its maximum count.
            if f >= max_freq_dict[e]: 
                print('- {0:55} {1}'.format(colored(e.replace('http://dbpedia.org/resource/', ''), 'blue'), f))
                visited.append(e)
                remerged.append(e)
                
            else: print('- {0:55} {1}'.format(colored(e.replace('http://dbpedia.org/resource/', ''), 'red'), f))
              
        else: print('- {0:55} {1}'.format(colored(e.replace('http://dbpedia.org/resource/', ''), 'red'), f))
        
    # Appended even when empty; empty lists are expected to be filtered later.
    re_merged_fellowships.append(remerged)
    print()    

In [None]:
# Closeness centrality of every graph node, keyed by entity name.
centrality_dict = {int_to_node[k]:v for k,v in dict(nx.closeness_centrality(G)).items()}

# Final fellowships: the re-merged ones plus those that were never merged,
# without the empty lists.
produced_fellowships = re_merged_fellowships + no_merge_fellowships
produced_fellowships = [f for f in produced_fellowships if len(f) > 0]

# Print every fellowship (largest first) with its members ordered by
# decreasing centrality.
for f in sorted(produced_fellowships, key=len, reverse=True):
    # Cannot trigger: empty lists were filtered out just above.
    if len(f) == 0: continue
    for e in sorted(f, key=lambda e: centrality_dict[e], reverse=True): 
        print('-', e)
    print()

In [None]:
# Persist the final fellowships.
with open(output_dir + 'pkb/' + 'fellowships.pckl', 'wb') as f: pickle.dump(produced_fellowships, f)

In [None]:
# Alias used by the cells below (same list object, not a copy).
fellowships = produced_fellowships

In [None]:
# Size of the signed attitude graph `G`.
print('Number of Nodes:', G.number_of_nodes())
print('Number of Edges:', G.number_of_edges())

In [None]:
import itertools

# Number of distinct entities that belong to some fellowship.
print('Fellowship Entities:', len(set(itertools.chain.from_iterable(fellowships))))

In [None]:
import networkx as nx

# One graph per fellowship: its entities as labelled nodes plus the signed
# edges that `G` holds between them, with entity names instead of integer ids.
fellowship_graphs = []

for f in fellowships:

    f_i = nx.Graph()

    for n in f:
        f_i.add_node(n, label = n)

    induced = G.subgraph([node_to_int[n] for n in f])

    for u, v, attrs in induced.edges(data=True):
        f_i.add_edge(int_to_node[u], int_to_node[v], weight=attrs['weight'])

    fellowship_graphs.append(f_i.copy())

In [None]:
import matplotlib.pyplot as plt

# Distribution of the number of nodes per fellowship graph.
plt.figure(figsize=(16, 3))
plt.hist([g.number_of_nodes() for g in fellowship_graphs], rwidth=0.95)
plt.title('Fellowship Sizes')
plt.show()

# Distribution of the number of edges per fellowship graph.
plt.figure(figsize=(16, 3))
plt.hist([g.number_of_edges() for g in fellowship_graphs], rwidth=0.95)
plt.title('Fellowship Connections')
plt.show()

### Identify Fellowship Dipoles

In [None]:
def extract_dipole(sag, f_i_j):
    """Build the candidate dipole formed by two fellowships of a signed graph.

    Args:
        sag: signed graph whose nodes are the integer ids of `node_to_int`
            and whose edges carry a `weight` attribute.
        f_i_j: pair of indices into `fellowship_graphs`.

    Returns:
        None when none of the edges that run between the two fellowships is
        negative (which includes the case of no such edge at all). Otherwise
        a two-item list `[(smaller index, larger index), info]`, where `info`
        holds the frustration value `f_g`, the dipole graph `d_ij` with entity
        names as nodes, the counts `pos` / `neg` of cross edges, their shares
        `positive_ratio` / `negative_ratio`, and the members of both sides by
        name (`simap_1`, `simap_2`) and by id (`int_simap_1`, `int_simap_2`).
    """
    f_i, f_j = f_i_j

    int_nodes_1 = [node_to_int[n] for n in fellowship_graphs[f_i].nodes()]
    int_nodes_2 = [node_to_int[n] for n in fellowship_graphs[f_j].nodes()]

    # Sets make the per-edge membership tests below constant-time.
    members_1, members_2 = set(int_nodes_1), set(int_nodes_2)

    # Work on the graph that was passed in, not on the notebook-level `G`.
    d_ij = sag.subgraph(members_1 | members_2).copy()

    positive_edges, negative_edges = [], []

    for e in d_ij.edges(data=True):
        # Edges inside one fellowship say nothing about the relation between
        # the two; only cross edges are counted.
        if e[0] in members_1 and e[1] in members_1: continue
        if e[0] in members_2 and e[1] in members_2: continue

        if e[2]['weight'] > 0.0: positive_edges.append(e)
        elif e[2]['weight'] < 0.0: negative_edges.append(e)

    # A dipole needs at least one negative cross edge; this also rules out an
    # empty cross-edge set, so the divisions below are safe.
    if len(negative_edges) == 0: return None

    cross_total = len(positive_edges) + len(negative_edges)
    p_positive = len(positive_edges) / cross_total
    p_negative = len(negative_edges) / cross_total

    si_sign_G, si_adj_sign_G, si_sign_edgelist, si_int_to_node = G_to_fi(d_ij)
    si_f_g, si_f_e, si_t, si_solution_dict = calculate_frustration_index(si_sign_G, si_adj_sign_G, si_sign_edgelist)

    # Same sub-graph, with entity names instead of integer ids.
    dipole_g = nx.Graph()

    for n in d_ij.nodes():
        dipole_g.add_node(int_to_node[n], label=int_to_node[n])

    for e in d_ij.edges(data=True):
        dipole_g.add_edge(int_to_node[e[0]], int_to_node[e[1]], weight=e[2]['weight'])

    return [(min(f_i, f_j), max(f_i, f_j)), {
        'f_g': si_f_g,
        'd_ij': dipole_g.copy(),
        'pos': len(positive_edges),
        'neg': len(negative_edges),
        'simap_1': [int_to_node[n] for n in int_nodes_1],
        'simap_2': [int_to_node[n] for n in int_nodes_2],
        'int_simap_1': int_nodes_1,
        'int_simap_2': int_nodes_2,
        'negative_ratio': p_negative,
        'positive_ratio': p_positive
    }]

In [None]:
# Every unordered pair of fellowship indices.
f_i_j_list = list(itertools.combinations(list(range(len(fellowship_graphs))), 2))

In [None]:
%%time

from tqdm import tqdm
from functools import partial
import subprocess, os, pandas as pd
from signed_network_balance import *

# Evaluate every fellowship pair; an entry is None where extract_dipole
# rejected the pair.
fellowship_dipoles = [extract_dipole(G, f_i_j) for f_i_j in tqdm(f_i_j_list)]

In [None]:
# Persist the per-fellowship graphs.
with open(output_dir + 'pkb/fellowship_graphs.pckl', 'wb') as f: pickle.dump(fellowship_graphs, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# Persist the dipole list. NOTE: at this point it still contains the None
# placeholders of rejected pairs and has not been filtered by any threshold.
with open(output_dir + 'pkb/dipoles.pckl', 'wb') as f: pickle.dump(fellowship_dipoles, f, pickle.HIGHEST_PROTOCOL)

In [None]:
# Counts every evaluated pair, including the None placeholders of rejected pairs.
print('Total number of dipoles:', len(fellowship_dipoles))

In [None]:
# Keep only the real dipoles: drop the None placeholders, then require an
# `f_g` value of at least 0.7 and at least half of the cross edges negative.
# The filter is applied in one step and without the scratch name `_`, which
# IPython uses for the last cell output.
fellowship_dipoles = [
    d for d in fellowship_dipoles
    if d and d[1]['f_g'] >= 0.7 and d[1]['negative_ratio'] >= 0.5
]

print('Final number of dipoles:', len(fellowship_dipoles))

In [None]:
import os, itertools, json, jsonpickle

# Every file below `spacy/<day>/`, with the day folders visited in sorted order.
spacy_paths = [output_dir + 'spacy/' + p + '/' for p in sorted(os.listdir(output_dir + 'spacy/'))]
spacy_paths = list(itertools.chain.from_iterable([[p + _ for _ in os.listdir(p)] for p in spacy_paths]))

# Derive the matching entity-NP attitude path of every spaCy file.
# NOTE(review): `str.replace` rewrites every occurrence of 'spacy' and
# '.pckl' in the path, not only the folder name and the extension.
e_np_paths = [p.replace('spacy', 'entity_np_sentiment_attitudes').replace('.pckl', '.pckl.jsonpckl') for p in spacy_paths]

In [None]:
def load_entity_np_attitudes(path):
    """Load one file holding jsonpickle-encoded `entity_np_sentiment_attitudes`.

    Returns:
        `(date_folder, attitudes)` on success, where `date_folder` is the name
        of the directory that contains `path` and `attitudes` is a plain dict
        of plain dicts. An empty dict when the file cannot be opened or is
        not valid JSON.
    """
    try:
        with open(path, 'r') as f:
            stored = json.load(f)
    except Exception:
        # Missing or unreadable files are reported as an empty dict.
        return {}

    dt_str = path.split('/')[-2]

    decoded = jsonpickle.decode(stored['entity_np_sentiment_attitudes'])

    # Turn both nesting levels into plain dicts.
    plain = dict(decoded)
    for e in plain:
        plain[e] = dict(plain[e])

    return (dt_str, plain)

In [None]:
import json, jsonpickle

from tqdm import tqdm
import multiprocessing
from multiprocessing import Pool

# One entry per attitude file: a (date, attitudes) tuple, or an empty dict
# for files that could not be loaded.
e_np_attitudes = []

# Leave some cores free, but never ask for fewer than one worker: `Pool`
# raises ValueError for a process count below 1, which `cpu_count() - 8`
# yields on machines with eight cores or fewer.
pool = Pool(max(1, multiprocessing.cpu_count() - 8))

for attitudes in tqdm(
    pool.imap_unordered(load_entity_np_attitudes, e_np_paths),
    desc='Load E-NP Attitudes',
    total=len(e_np_paths)
): e_np_attitudes.append(attitudes)

pool.close()
pool.join()

In [None]:
# Keep only the successful loads: those are (date, attitudes) 2-tuples,
# whereas failed loads are empty dicts.
e_np_attitudes = [e_np for e_np in e_np_attitudes if len(e_np) == 2]

print('E-NP Attitudes:', len(e_np_attitudes))

In [None]:
# Merge the per-file attitudes into
# {date: {entity: {np: {'POSITIVE': [...], 'NEGATIVE': [...]}}}}.
# NOTE: `domain` holds the date-folder string that load_entity_np_attitudes
# returns as the first tuple item.
e_np_attitude_dict = {}

for domain, att_dict in tqdm(e_np_attitudes):
    
    if domain not in e_np_attitude_dict: e_np_attitude_dict[domain] = {}
    
    for e, np_dict in att_dict.items():
        
        # NOTE(review): fix_entity_uri is not defined in the visible cells --
        # confirm where it comes from.
        e = fix_entity_uri(e)
        
        if e not in e_np_attitude_dict[domain]: e_np_attitude_dict[domain][e] = {}

        # NOTE: this loop runs at notebook level, so it rebinds the name `np`.
        for np, atts in np_dict.items():

            # Skip NPs whose scores are all zero.
            if all(v==0 for v in atts['POSITIVE'] + atts['NEGATIVE']): continue

            if np not in e_np_attitude_dict[domain][e]: e_np_attitude_dict[domain][e][np] = {'POSITIVE': [], 'NEGATIVE': []}

            e_np_attitude_dict[domain][e][np]['POSITIVE'] += atts['POSITIVE']
            e_np_attitude_dict[domain][e][np]['NEGATIVE'] += atts['NEGATIVE']

In [None]:
import itertools

# Every entity that belongs to either side of a retained dipole, deduplicated.
dipole_entitites = list(set(itertools.chain.from_iterable([d[1]['simap_1'] + d[1]['simap_2'] for d in fellowship_dipoles])))

print('Total dipole entities:', len(dipole_entitites))

In [None]:
# Pass every entity through fix_entity_uri and drop those for which it
# returns a falsy value.
dipole_entitites = [fix_entity_uri(e) for e in dipole_entitites]
dipole_entitites = [e for e in dipole_entitites if e]

print('Number of entities:', len(dipole_entitites))

In [None]:
# Per date: the NPs (repetitions included) towards which any dipole entity
# has a recorded attitude.
dipole_domain_nps = {k: [] for k in e_np_attitude_dict}

for k in dipole_domain_nps:
    
    dipole_nps = [list(e_np_attitude_dict[k][e].keys()) for e in dipole_entitites if e in e_np_attitude_dict[k]]
    dipole_domain_nps[k] += list(itertools.chain.from_iterable(dipole_nps))

In [None]:
# Distinct NPs over all dates. NOTE: this rebinds `dipole_nps`, which the
# previous cell used as a per-date scratch list.
dipole_nps = list(set(itertools.chain.from_iterable(list(dipole_domain_nps.values()))))

print('Total NPs:', len(dipole_nps))

In [None]:
import string, nltk, re

from nltk.corpus import stopwords
from textblob import TextBlob, Word

# Used by remove_punctuation to recognise tokens that contain a hyphen.
hyphen_regex = r'(?=\S+[-])([a-zA-Z-]+)'

english_stopwords = stopwords.words('english')

# Remove one entry from the stop-word list in place, by position.
# NOTE(review): index-based and not idempotent per list -- which word sits at
# position 37 depends on the installed NLTK data. Confirm the intended word
# and consider removing it by value.
del english_stopwords[37]

def tokenize(text):
    """Split `text` into tokens with `nltk.word_tokenize`."""
    tokens = nltk.word_tokenize(text)
    return tokens

def remove_punctuation(text): 
    """Replace punctuation in `text` by spaces while keeping word-internal hyphens.

    The text is tokenized first. A token in which `hyphen_regex` finds no
    match has each of its punctuation characters replaced by a space. A token
    with a match is split at '-', every part is cleaned recursively, and the
    parts are re-joined with '-'. Tokens that are empty after stripping are
    dropped; the remaining ones are joined with single spaces.
    """
    _text = []
    for t in tokenize(text):
        if not len(re.findall(hyphen_regex, t)) > 0: t = ''.join(c if c not in string.punctuation else ' ' for c in t)
        else:
            # Hyphenated token: clean the parts on their own so the hyphens
            # between them survive.
            hyphen_parts = t.split('-')
            hyphen_parts = [remove_punctuation(_) for _ in hyphen_parts]
            t = '-'.join(hyphen_parts)
        
        t = t.strip()
        if len(t) > 0: _text.append(t)

    return ' '.join(_text)

def remove_trailing(text):
    """Strip leading and trailing whitespace from `text`."""
    return text.strip()


def reduce_white_space(text):
    """Collapse every run of spaces in `text` into a single space."""
    return re.sub(' +', ' ', text)


def to_lower_case(text):
    """Return `text` in lower case."""
    return text.lower()


def remove_stopwords(tokens):
    """Drop the tokens that are contained in `english_stopwords`."""
    kept = []
    for t in tokens:
        if t not in english_stopwords:
            kept.append(t)
    return kept


def remove_digit_tokens(tokens):
    """Drop the tokens that consist of digits only (empty tokens are dropped too)."""
    return [t for t in tokens if any(not c.isdigit() for c in t)]

def lemmatize(np):
    """Lemmatize every word of the noun phrase `np` and re-join with spaces.

    The first letter of each tag reported by TextBlob selects the
    part-of-speech code passed to `Word.lemmatize`; tags that do not start
    with J, N, V or R fall back to 'n'.
    """
    pos_by_tag_initial = {"J": 'a', "N": 'n', "V": 'v', "R": 'r'}

    lemmas = []
    for word, tag in TextBlob(np).tags:
        lemmas.append(word.lemmatize(pos_by_tag_initial.get(tag[0], 'n')))

    return " ".join(lemmas)

def pipeline_func(text, func_list):
    """Feed `text` through each callable of `func_list`, in order.

    Every step receives the previous step's return value; the value produced
    by the last step is returned (or `text` itself when the list is empty).
    """
    current = text
    for step in func_list:
        current = step(current)
    return current

In [None]:
# Cleaning steps applied, in this order, to every raw noun phrase. The first
# five steps work on a string, `tokenize` turns it into a token list, and the
# last step joins the surviving tokens back into a string.
np_cleaning_steps = [
    lemmatize,
    to_lower_case,
    remove_punctuation,
    remove_trailing,
    reduce_white_space,
    tokenize,
    remove_digit_tokens,
    remove_stopwords,
    lambda t: ' '.join(t)
]

# Maps each raw noun phrase to its cleaned form.
# The loop variable is deliberately not named `np`: this notebook imports numpy
# under that alias and `community_detection` calls `np.array`, so rebinding
# `np` to a string at module level breaks it whenever cells are re-run.
clean_np_dict = {}

for noun_phrase in tqdm(set(noun_phrase_set)):
    clean_np_dict[noun_phrase] = pipeline_func(noun_phrase, np_cleaning_steps)

In [None]:
# Distinct cleaned noun phrases longer than one character.
# The list is sorted because later cells identify noun phrases (and, through
# them, clusters) by their position in this list. Iterating a set of strings
# directly gives an order that changes between Python processes (string hashes
# are randomised), which would make those positions irreproducible.
clean_noun_phrase_list = sorted(
    phrase for phrase in set(clean_np_dict.values()) if len(phrase) > 1
)

print('Total Cleaned NPs:', len(clean_noun_phrase_list))

In [None]:
from sentence_transformers import SentenceTransformer, util

# Pretrained sentence-embedding model; used below to encode the cleaned noun phrases.
model = SentenceTransformer('all-mpnet-base-v2')

In [None]:
# Encode every cleaned noun phrase into an embedding vector (one row per entry
# of `clean_noun_phrase_list`, in the same order).
# No `device` is forced: `encode` then runs on the device the model was loaded
# on, so the cell also works on machines without a CUDA GPU instead of failing.
encoded_noun_phrase_list = model.encode(
    clean_noun_phrase_list,
    show_progress_bar=True,
    batch_size=128
)

In [None]:
# Pair every cleaned noun phrase with its embedding row (same positions in both sequences).
clean_np_embeddings_dict = {
    phrase: encoded_noun_phrase_list[position]
    for position, phrase in enumerate(clean_noun_phrase_list)
}

In [None]:
import numpy as np
from annoy import AnnoyIndex
from tqdm.notebook import trange, tqdm

def community_detection(
    embeddings, threshold=0.75, min_community_size=10, init_max_size=1000
):
    """Group embeddings into non-overlapping communities of similar rows.

    An Annoy index with the "angular" metric is built over the rows of
    `embeddings`. A row seeds a community when its `min_community_size`-th
    nearest neighbour still has cosine similarity >= `threshold`; the
    community is then every neighbour of the seed with similarity >=
    `threshold`. Communities are sorted by size (largest first) and kept
    greedily: a community is discarded as soon as it shares a row with one
    that was already kept.

    Args:
        embeddings: 2-D array; rows are items, columns are embedding dimensions.
        threshold: minimum cosine similarity for two rows to be grouped.
        min_community_size: number of neighbours (as returned by Annoy for the
            seed) that must reach the threshold.
        init_max_size: number of neighbours fetched first for each seed.

    Returns:
        A list of communities, each a list of row indices into `embeddings`.

    Side effects:
        Writes the Annoy index to "test.ann" in the current working directory.
    """

    def _to_cosine_similarity(angular_distances):
        # Annoy's "angular" distance is sqrt(2 * (1 - cos)), so the cosine
        # similarity is 1 - d^2 / 2. Using 1 - d would apply the threshold on
        # a different, much stricter scale.
        distances = np.asarray(angular_distances, dtype=float)
        return 1 - np.square(distances) / 2

    total_entries = embeddings.shape[0]

    index = AnnoyIndex(embeddings.shape[1], "angular")
    for i, v in tqdm(
        enumerate(embeddings),
        total=total_entries,
        desc="Building ANN",
    ):
        index.add_item(i, v)

    index.build(30)
    index.save("test.ann")

    # Similarity of each row to its `min_community_size`-th neighbour. Only that
    # value is kept, so a short neighbour list cannot produce a ragged array;
    # rows with too few neighbours can never seed a community.
    kth_similarity = []
    for emb_i in tqdm(
        range(total_entries), desc="Finding Minimum Community Size"
    ):
        _, distance = index.get_nns_by_item(
            emb_i, min_community_size, include_distances=True
        )
        if len(distance) == 0 or len(distance) < min_community_size:
            kth_similarity.append(float("-inf"))
        else:
            kth_similarity.append(float(_to_cosine_similarity(distance[-1])))

    # Size of the wider search used when the first `init_max_size` neighbours
    # are all above the threshold. It is never smaller than `init_max_size`,
    # otherwise the wider search would return fewer rows than already found.
    wide_search_size = max(init_max_size, min(int(total_entries * 0.5), 10000))

    extracted_communities = []

    for i in tqdm(range(total_entries), desc="Extracting Communities"):
        if kth_similarity[i] >= threshold:
            new_cluster = []

            top_idx_large, top_dist_large = index.get_nns_by_item(
                i, init_max_size, include_distances=True
            )
            top_val_large = _to_cosine_similarity(top_dist_large).tolist()

            if top_val_large[-1] < threshold:
                # Neighbours come back closest first: stop at the first one
                # that falls below the threshold.
                for idx, val in zip(top_idx_large, top_val_large):
                    if val < threshold:
                        break

                    new_cluster.append(idx)
            else:
                # Even the farthest fetched neighbour qualifies: widen the search.
                idx_large, dist_large = index.get_nns_by_item(
                    i, wide_search_size, include_distances=True
                )
                val_large = _to_cosine_similarity(dist_large).tolist()
                for idx, val in zip(idx_large, val_large):
                    if val >= threshold:
                        new_cluster.append(idx)

            extracted_communities.append(new_cluster)

    # Largest communities first, so they win when communities overlap.
    extracted_communities = sorted(extracted_communities, key=len, reverse=True)

    unique_communities = []
    extracted_ids = set()

    for community in extracted_communities:
        if any(idx in extracted_ids for idx in community):
            continue

        unique_communities.append(community)
        extracted_ids.update(community)

    return unique_communities

In [None]:
import numpy

# Stack the embeddings in the order of `clean_noun_phrase_list`, so that row
# indices returned by `community_detection` are positions in that list.
np_embedding_matrix = numpy.asarray(
    [clean_np_embeddings_dict[phrase] for phrase in clean_noun_phrase_list]
)

clusters = community_detection(
    np_embedding_matrix,
    min_community_size=2,
    threshold=0.65,
    init_max_size=min(1000, len(clean_noun_phrase_list))
)

# Cluster id (position in `clusters`) -> noun phrases belonging to that cluster.
cluster_np_dict = {
    cluster_id: [clean_noun_phrase_list[member] for member in members]
    for cluster_id, members in enumerate(clusters)
}

In [None]:
# Summary of the clustering: number of clusters and total number of noun phrases they hold.
clustered_np_count = sum(len(members) for members in cluster_np_dict.values())

print('Number of Clusters:     ', len(cluster_np_dict))
print('Number of Clustered NPs:', clustered_np_count)

In [None]:
# Invert `cluster_np_dict`: map every clustered noun phrase to its cluster id.
# The iteration variable is deliberately not named `np`: at module level that
# name is the numpy alias used inside `community_detection`, and rebinding it
# to a string makes any later re-run of the clustering fail.
np_topics_dict = {
    noun_phrase: cluster_id
    for cluster_id, noun_phrases in cluster_np_dict.items()
    for noun_phrase in noun_phrases
}

# Possible Approaches:
1. Identify the `attitudes` and `polarization` for `PaCTE` topics based on the existence of the exact keywords.
2. Expand the keyword matching using the clustering approach: cluster the `Dipole NPs` and map them to `PaCTE` topics based on the distance from the cluster centroids.

In [None]:
def calculate_polarization_index(atts):
    """Compute a polarization score from a collection of signed attitudes.

    Values equal to zero are ignored. With `n_pos` positive and `n_neg`
    negative values, the score is `(1 - balance_gap) * half_distance`, where
    `balance_gap` is the absolute difference between the positive and the
    negative share, and `half_distance` is half the absolute distance between
    the mean of the positive values and the mean of the negative values (an
    empty side contributes a mean of 0.0).

    Returns 0.0 when there is no non-zero value at all.
    """
    negatives, positives = [], []
    for value in atts:
        if value < 0.0:
            negatives.append(value)
        elif value > 0.0:
            positives.append(value)

    n_neg, n_pos = len(negatives), len(positives)

    if (n_neg + n_pos) == 0.0:
        return 0.0

    balance_gap = abs(
        (n_pos / (n_pos + n_neg)) - (n_neg / (n_pos + n_neg))
    )

    center_neg = numpy.mean(negatives) if n_neg > 0 else 0.0
    center_pos = numpy.mean(positives) if n_pos > 0 else 0.0

    half_distance = (abs(center_pos - center_neg)) / 2

    return (1 - balance_gap) * half_distance

In [None]:
def _summary_attitudes_per_np(entities, entity_np_sentiment_attitudes):
    """Map each noun phrase to one summary attitude per entity that has it.

    For an (entity, noun phrase) pair every positive/negative score pair is
    normalised by its sum and turned into a signed value with
    `sentiment_threshold_difference`; pairs summing to zero and zero results
    are ignored. The summary is the median of what remains, or 0.0.
    """
    np_attitudes = {}

    for entity in entities:
        if entity not in entity_np_sentiment_attitudes: continue

        for noun_phrase, atts in entity_np_sentiment_attitudes[entity].items():
            if noun_phrase not in np_attitudes: np_attitudes[noun_phrase] = []

            sentiments = []
            for i, positive in enumerate(atts['POSITIVE']):
                negative = atts['NEGATIVE'][i]
                total_p_n = positive + negative

                # A pair with no signal contributes nothing.
                if total_p_n == 0: continue

                score = sentiment_threshold_difference(
                    positive / total_p_n,
                    negative / total_p_n
                )
                if score != 0: sentiments.append(score)

            summary = numpy.median(sentiments) if len(sentiments) > 0 else 0.0
            np_attitudes[noun_phrase].append(summary)

    return np_attitudes


def _attitudes_per_cluster(np_attitudes, np_clusters):
    """Pool the per-noun-phrase attitudes by the clusters that list the phrase."""
    cluster_attitudes = {}

    for noun_phrase, attitudes in np_attitudes.items():
        for cluster_id, members in np_clusters.items():
            if noun_phrase not in members: continue
            if cluster_id not in cluster_attitudes: cluster_attitudes[cluster_id] = []
            cluster_attitudes[cluster_id] += attitudes

    return cluster_attitudes


def undersample_dipole_attitudes(
    dipole_tuple,
    entity_np_sentiment_attitudes,
    dipole_topic_dict
):
    """Collect, per topic cluster, the attitudes of both sides of a dipole.

    Despite its name this function performs no sampling; it only gathers the
    attitude lists.

    Args:
        dipole_tuple: `((fi, fj), dipole_dict)` where `dipole_dict` holds the
            entity lists of the two sides under 'simap_1' and 'simap_2'.
        entity_np_sentiment_attitudes: entity -> noun phrase ->
            {'POSITIVE': [...], 'NEGATIVE': [...]} with index-aligned lists.
        dipole_topic_dict: (fi, fj) -> {'np_clusters': {cluster id: noun phrases}}.

    Returns:
        A list with one dict per cluster for which both sides have attitudes,
        with keys 'dipole', 'atts_fi', 'atts_fj' and 'topic' ('id', 'nps').
        An empty list when the dipole has no entry in `dipole_topic_dict`.
    """
    fi, fj = dipole_tuple[0]
    dipole_dict = dipole_tuple[1]

    if (fi, fj) not in dipole_topic_dict: return []

    np_clusters = dipole_topic_dict[(fi, fj)]['np_clusters']

    # Both sides go through exactly the same two steps.
    fi_frame_attitudes = _attitudes_per_cluster(
        _summary_attitudes_per_np(dipole_dict['simap_1'], entity_np_sentiment_attitudes),
        np_clusters
    )
    fj_frame_attitudes = _attitudes_per_cluster(
        _summary_attitudes_per_np(dipole_dict['simap_2'], entity_np_sentiment_attitudes),
        np_clusters
    )

    polarization_list = []

    for ci in np_clusters:
        if ci not in fi_frame_attitudes: continue
        if ci not in fj_frame_attitudes: continue

        polarization_list.append({
            'dipole': (fi, fj),
            'atts_fi': fi_frame_attitudes[ci],
            'atts_fj': fj_frame_attitudes[ci],
            'topic': {
                'id': ci,
                'nps': np_clusters[ci]
            }
        })

    return polarization_list

In [None]:
def extract_dipole_topics(dipole_tuple, entity_np_sentiment_attitudes, np_topics_dict):
    """Gather the noun-phrase attitudes of a dipole and group them by topic.

    Args:
        dipole_tuple: `(dipole_id, dipole_obj)`; `dipole_id` is the pair of
            fellowship ids and `dipole_obj['d_ij'].nodes()` yields the
            dipole's entities.
        entity_np_sentiment_attitudes: entity -> noun phrase ->
            {'POSITIVE': [...], 'NEGATIVE': [...]}.
        np_topics_dict: cleaned noun phrase -> topic id, or a
            list/tuple/set of topic ids.

    Raw noun phrases are translated to their cleaned form through the
    module-level `clean_np_dict`; phrases without a cleaned form or without a
    topic are dropped.

    Returns:
        None when no noun phrase of the dipole has a topic, otherwise a dict
        with 'fellowship_1', 'fellowship_2' and 'dipole_topics', the latter
        holding 'np_attitudes' (noun phrase -> merged score lists) and
        'np_clusters' (topic id -> noun phrases, in sorted phrase order).
    """
    dipole_id, dipole_obj = dipole_tuple[0], dipole_tuple[1]

    # Merge the score lists of every entity of the dipole, per noun phrase.
    # `extend` copies the items, so the input lists are never modified.
    np_attitudes_dict = {}
    for entity in dipole_obj['d_ij'].nodes():
        if entity not in entity_np_sentiment_attitudes: continue

        for noun_phrase, att_obj in entity_np_sentiment_attitudes[entity].items():
            merged = np_attitudes_dict.setdefault(noun_phrase, {'POSITIVE': [], 'NEGATIVE': []})
            merged['POSITIVE'].extend(att_obj['POSITIVE'])
            merged['NEGATIVE'].extend(att_obj['NEGATIVE'])

    dipole_np_labels = {}
    for noun_phrase in sorted(np_attitudes_dict.keys()):
        if noun_phrase not in clean_np_dict: continue

        clean_np = clean_np_dict[noun_phrase]
        if clean_np not in np_topics_dict: continue

        topic = np_topics_dict[clean_np]
        if isinstance(topic, (set, frozenset, list, tuple)):
            dipole_np_labels[noun_phrase] = set(topic)
        else:
            # A single topic id (e.g. an int cluster index): calling `set()` on
            # it would raise TypeError, or split a string id into characters.
            dipole_np_labels[noun_phrase] = {topic}

    if len(dipole_np_labels) == 0: return None

    labelled_np_attitudes = {
        noun_phrase: scores
        for noun_phrase, scores in np_attitudes_dict.items()
        if noun_phrase in dipole_np_labels
    }

    cluster_dict = {}
    for noun_phrase, topic_ids in dipole_np_labels.items():
        for topic_id in topic_ids:
            cluster_dict.setdefault(topic_id, []).append(noun_phrase)

    return {
        'fellowship_1': dipole_id[0],
        'fellowship_2': dipole_id[1],
        'dipole_topics': {
            'np_attitudes': labelled_np_attitudes,
            'np_clusters': cluster_dict
        }
    }

In [None]:
def sentiment_threshold_difference(swn_pos, swn_neg):
    """Return the signed gap between the magnitudes of the two scores.

    The result is positive when |swn_pos| exceeds |swn_neg|, negative in the
    opposite case, and zero when both magnitudes are equal.
    """
    gap = abs(swn_pos) - abs(swn_neg)
    return numpy.sign(gap) * (abs(gap))

def resample_attitudes(atts, n):
    """Build a list of about `n` values with the same value proportions as `atts`.

    Each distinct value of `atts` is repeated `floor(n * count / len(atts))`
    times, most frequent value first. Because every share is rounded down, the
    result can be shorter than `n`. An empty `atts` gives an empty list.

    The share is computed with integer arithmetic: multiplying `n` by a
    floating-point ratio can land just below the exact integer (for example
    100 * 0.57) and lose one repetition when floored.
    """
    total_v = len(atts)

    r_atts = []
    for value, count in Counter(atts).most_common():
        # `int(...)` keeps a non-integer `n` working as before.
        repetitions = int((n * count) // total_v)
        r_atts.extend([value] * repetitions)

    return r_atts

import scipy.stats

def mean_confidence_interval(data, confidence=0.95):
    """Return `(center, [low, high])` for `data`.

    `center` is the median of the data. The interval is symmetric around it,
    with a half-width equal to the standard error of the mean multiplied by
    the Student-t quantile for `confidence` with `len(data) - 1` degrees of
    freedom.

    NOTE(review): the centre is a median while the half-width comes from the
    standard error of the mean — confirm this mix is intended.
    """
    values = numpy.array(data) * 1.0
    degrees_of_freedom = len(values) - 1

    center = numpy.median(values)
    standard_error = scipy.stats.sem(values)
    half_width = standard_error * scipy.stats.t.ppf((1 + confidence) / 2., degrees_of_freedom)

    return (center, [center - half_width, center + half_width])

In [None]:
# Merge the per-domain attitudes (`e_np_attitude_dict[domain][entity][np]`)
# into one entity -> noun phrase -> {'POSITIVE': [...], 'NEGATIVE': [...]}
# mapping, concatenating the score lists across domains. Falsy entity keys are
# skipped. `extend` copies the items, so the source lists are left untouched.
# The loop variable is deliberately not named `np`: at module level that name
# is the numpy alias used inside `community_detection`.
overall_e_np_attitude_dict = {}

for domain_attitudes in e_np_attitude_dict.values():

    for entity, np_attitudes in domain_attitudes.items():
        if not entity: continue

        merged_entity = overall_e_np_attitude_dict.setdefault(entity, {})

        for noun_phrase, atts in np_attitudes.items():
            merged = merged_entity.setdefault(noun_phrase, {'POSITIVE': [], 'NEGATIVE': []})
            merged['POSITIVE'].extend(atts['POSITIVE'])
            merged['NEGATIVE'].extend(atts['NEGATIVE'])

In [None]:
# Extract the topic clusters of every dipole.
# `extract_dipole_topics` looks entities up at the top level of the mapping it
# receives, so it needs the entity-keyed `overall_e_np_attitude_dict`. The
# domain-keyed `e_np_attitude_dict` has domains at that level, so entities would
# not be found there and the dipoles would yield no topics.
dipole_topics = []

for dipole in tqdm(fellowship_dipoles):
    if not dipole: continue

    d_topics = extract_dipole_topics(dipole, overall_e_np_attitude_dict, np_topics_dict)
    if not d_topics: continue

    dipole_topics.append(d_topics)

In [None]:
# Index the extracted topics by their (fellowship_1, fellowship_2) pair.
dipole_topics_dict = {}

for topic_entry in dipole_topics:
    if not topic_entry: continue

    fellowship_pair = (topic_entry['fellowship_1'], topic_entry['fellowship_2'])
    dipole_topics_dict[fellowship_pair] = topic_entry['dipole_topics']

In [None]:
# `ts`: number of topic clusters summed over all dipoles.
ts = sum(
    len(topics['np_clusters']) for topics in dipole_topics_dict.values()
)

# `ts_ann`: number of distinct cluster ids that occur in at least one dipole.
ts_ann = len({
    cluster_id
    for topics in dipole_topics_dict.values()
    for cluster_id in topics['np_clusters'].keys()
})

In [None]:
# `ts` is the cluster count summed over all dipoles; `ts_ann` is the number of distinct cluster ids.
print('Topics:', ts)
print('Annotated Topics:', ts_ann)

In [None]:
import numpy 

# One list of per-topic attitude records for every non-falsy dipole
# (still nested here: one inner list per dipole).
dipole_topic_attitudes = [
    undersample_dipole_attitudes(
        dipole,
        overall_e_np_attitude_dict,
        dipole_topics_dict
    )
    for dipole in tqdm(fellowship_dipoles)
    if dipole
]

In [None]:
# Flatten the per-dipole lists into a single list of topic records.
# NOTE: the name is rebound to its own flattened form, so this cell must run
# exactly once after the nested list is built.
dipole_topic_attitudes = [
    record
    for dipole_records in dipole_topic_attitudes
    for record in dipole_records
]

In [None]:
# Keep (as shallow copies) only the records where neither side consists
# exclusively of 0.0 attitudes.
filtered_topic_attitudes = [
    dipole_t.copy()
    for dipole_t in dipole_topic_attitudes
    if not any(
        len(set(dipole_t[side])) == 1 and dipole_t[side][0] == 0.0
        for side in ('atts_fi', 'atts_fj')
    )
]

In [None]:
from collections import Counter

for i, dipole_t in tqdm(list(enumerate(filtered_topic_attitudes))):

    # Zero attitudes are excluded before scoring and resampling. `dipole_t` is
    # the very dict stored at `filtered_topic_attitudes[i]`, so the filtered
    # lists replace the stored ones.
    atts_fi = [v for v in dipole_t['atts_fi'] if v != 0.0]
    atts_fj = [v for v in dipole_t['atts_fj'] if v != 0.0]
    dipole_t['atts_fi'], dipole_t['atts_fj'] = atts_fi, atts_fj

    if not atts_fi or not atts_fj: continue

    record = filtered_topic_attitudes[i]

    # 'X' is always the plain concatenation Fi + Fj.
    record['X'] = atts_fi + atts_fj

    # Sides of equal size need no resampling: only 'X' and 'pi' are stored.
    if len(atts_fi) == len(atts_fj):
        record['pi'] = calculate_polarization_index(record['X'])
        continue

    # Otherwise the longer side is kept as is and comes first in 'X_res'; the
    # shorter side is resampled towards the longer side's length.
    if len(atts_fi) > len(atts_fj):
        longer_side, shorter_side = atts_fi, atts_fj
    else:
        longer_side, shorter_side = atts_fj, atts_fi

    record['X_res'] = longer_side + resample_attitudes(shorter_side, len(longer_side))

    record['pi'] = calculate_polarization_index(record['X'])
    record['pi_res'] = calculate_polarization_index(record['X_res'])

In [None]:
# Topic id -> polarization indices of all dipoles discussing that topic.
# The resampled index ('pi_res') is preferred when a record has one.
topic_to_polarization_dict = {}

for dipole_t in filtered_topic_attitudes:
    topic_id = dipole_t['topic']['id']
    topic_to_polarization_dict.setdefault(topic_id, []).append(
        dipole_t['pi_res'] if 'pi_res' in dipole_t else dipole_t['pi']
    )

In [None]:
# Header row of the report table.
print('{0:^5} {1:^80} {2:^5} {3:^5} {4:^5} {5:^5} {6:^5}'.format(
    'No.', 'Topic', '#D', 'Median', 'From', 'To', 'Score'
))

print('=' * 119)

# Topics ranked by score = (number of dipoles) * (median polarization index).
ranked_topics = sorted(
    topic_to_polarization_dict.items(),
    key=lambda kv: len(kv[1]) * numpy.median(kv[1]),
    reverse=True
)

for rank, (topic_id, pis) in enumerate(ranked_topics, start=1):

    topic_label = pacte_top_10[topic_id]

    m, (h_m, h_p) = mean_confidence_interval(pis)

    # Label followed by the cluster's noun phrases, cut to the column width.
    topic_text = str(str(topic_label) + ' = ' + ', '.join(cluster_np_dict[topic_id]))[:80]

    print(colored('{0:5}. {1:80} {2:<5} {3:<5} {4:<5} {5:<5} {6:<5}'.format(
        rank,
        topic_text,
        len(pis),
        round(m, 2),
        round(h_m, 2),
        round(h_p, 2),
        round(len(pis) * numpy.median(pis), 2)
    ), 'blue'))

    print('-' * 119)