## Reddit EC detection

In [1]:
import os
import glob
import json
import pickle

import operator
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import seaborn as sns

from collections import defaultdict
from datetime import datetime
from cdlib import AttrNodeClustering, evaluation, TemporalClustering
from Eva import eva_best_partition, modularity, purity
from tqdm import tqdm
from tqdm.notebook import tqdm
import matplotlib.lines as mlines
sns.set_palette("Set2")

sns.set_style(style='white')

In [3]:
def load_json(file):
    """Load a JSON file and return its decoded content.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The Python object decoded from the file (dict, list, ...).
    """
    # JSON is UTF-8 by specification, so do not depend on the platform's
    # default text encoding.
    with open(file, encoding="utf-8") as f:
        return json.load(f)

def read_net(filename):
    """Build an undirected weighted graph from a CSV edge list.

    The first line of the file is skipped as a header. On every following
    line the first two comma-separated fields are the edge endpoints and the
    third one is converted to ``int`` and stored as the ``weight`` attribute.

    Args:
        filename: Path of the CSV edge list.

    Returns:
        The ``networkx.Graph`` holding the edges read from the file.
    """
    print("\nEdgelist: ", filename)
    graph = nx.Graph()
    with open(filename) as handle:
        next(handle, None)  # discard the header row
        for row in handle:
            fields = row.split(",")
            source, target, raw_weight = fields[0], fields[1], fields[2]
            graph.add_edge(source, target, weight=int(raw_weight))
    return graph

def read_labels(filename):
    """Read a node -> label mapping from a CSV file.

    The first line is skipped as a header. For every other line the first
    comma-separated field is used as key and the third field as value; when
    a key appears more than once the last occurrence wins.

    Args:
        filename: Path of the CSV file.

    Returns:
        dict mapping the first column to the third column.
    """
    with open(filename) as handle:
        handle.readline()  # discard the header row
        rows = (line.rstrip().split(",") for line in handle)
        return {fields[0]: fields[2] for fields in rows}


In [7]:
# Root folder where every result file of this notebook is written.
src_results = "../results/reddit_results/"
text_data = "../text_data/"

# Topics to analyse and the half-year windows (start, end), written as
# dd/mm/YYYY strings, that split the observation period into snapshots.
categories = ["guncontrol", 'minority', "politics"]
semesters = [('01/01/2017','01/07/2017'), ('01/07/2017','01/01/2018'),
             ('01/01/2018','01/07/2018'), ('01/07/2018','01/01/2019'), ('01/01/2019','01/07/2019')]

mods = load_json("../data/moderators.json")
bots = load_json("../data/bots_reddit.json")

# Keys of both JSON documents: accounts that are removed from every network.
mods_bots_list = list(mods.keys())
mods_bots_list += list(bots.keys())

In [27]:
def map_labels(g):
    """Extract the largest connected component of ``g`` with integer node ids.

    Args:
        g: Undirected networkx graph.

    Returns:
        A ``(graph, inv_map)`` tuple: ``graph`` is a relabelled copy of the
        largest connected component whose nodes are ``0 .. n-1``, and
        ``inv_map`` maps every integer id back to the original node label.

    Raises:
        ValueError: If ``g`` has no nodes, hence no connected component.
    """
    components = list(nx.connected_components(g))
    if not components:
        raise ValueError("map_labels() requires a graph with at least one node")
    # max() returns the first component of maximal size; this avoids sorting
    # every component and comparing node sets to break ties.
    largest = max(components, key=len)
    comp_0 = nx.subgraph(g, largest)
    mapping = {node: idx for idx, node in enumerate(comp_0)}
    relabel_comp_0 = nx.relabel_nodes(comp_0, mapping)
    inv_map = {idx: node for node, idx in mapping.items()}
    return relabel_comp_0, inv_map


def purity_func(coms, labels):
    """Compute the purity and the dominant label of every community.

    Args:
        coms: dict mapping each node to its community id.
        labels: dict mapping each community id to a dict whose ``"leaning"``
            entry maps a label to the number of members carrying it.

    Returns:
        Two lists following the iteration order of ``labels``: the purity of
        each community (count of its most frequent label divided by the
        number of nodes assigned to it in ``coms``) and that most frequent
        label.
    """
    community_sizes = defaultdict(int)
    for community in coms.values():
        community_sizes[community] += 1

    purities, top_label = [], []
    for community, info in labels.items():
        dominant, dominant_count = max(info["leaning"].items(),
                                       key=operator.itemgetter(1))
        purities.append(dominant_count / community_sizes[community])
        top_label.append(dominant)
    return purities, top_label


def cd_eva(node_label):
    """Run EVA community detection with ``alpha=0.5``.

    Args:
        node_label: Forwarded unchanged to ``eva_best_partition``; the caller
            in this file passes a relabelled networkx graph.

    Returns:
        The two values produced by ``eva_best_partition``: the partition and
        the per-community labels.
    """
    partition, community_labels = eva_best_partition(node_label, alpha=0.5)
    return partition, community_labels


def _jaccard(first, second):
    """Return the Jaccard similarity of two node collections (0.0 if both are empty)."""
    left, right = set(first), set(second)
    union = left | right
    return len(left & right) / len(union) if union else 0.0


def _reformat_date(date_str, out_format):
    """Convert a ``dd/mm/YYYY`` date string to ``out_format``."""
    # Imported locally under a private alias so the helper works whether the
    # notebook bound the name ``datetime`` to the module or to the class.
    from datetime import datetime as _datetime

    return _datetime.strptime(date_str, "%d/%m/%Y").strftime(out_format)


def extract_EC(categories, purity_threshold=0.7, conductance_threshold=0.5):
    """Detect EVA communities per topic and semester and split them into EC / non-EC.

    "EC" presumably stands for "echo chamber" -- TODO confirm.

    For every topic and every entry of the module-level ``semesters`` the
    function reads the edge list and the label file, drops nodes without a
    ``leaning`` attribute as well as the accounts in ``mods_bots_list``, runs
    EVA on the largest connected component and writes under ``src_results``:

    * ``snapshots/users_community_{topic}_t_{sem}.csv``: user -> community;
    * ``snapshots/eva_snapshot_{topic}_{sem}_com_stats.csv``: community stats;
    * ``{topic}/eva_users_merged_{sem}_com_stats.csv``: stats joined to users;
    * ``{topic}/EC/EC_{topic}_{sem}.csv``: communities with purity >=
      ``purity_threshold`` and conductance <= ``conductance_threshold``;
    * ``{topic}/non_EC/non_EC_{topic}_{sem}.csv``: communities with purity
      below, or conductance above, the thresholds;
    * ``matches_dict.pickle``: the returned dictionary.

    Args:
        categories: Iterable of topic names.
        purity_threshold: Minimum purity of an EC community.
        conductance_threshold: Maximum conductance of an EC community.

    Returns:
        ``defaultdict`` mapping each topic to the result of
        ``TemporalClustering.community_matching`` over its snapshots.
    """
    # Set membership is O(1); the original list was scanned for every node.
    excluded_users = set(mods_bots_list)
    os.makedirs(src_results, exist_ok=True)

    matches_dict = defaultdict(list)
    for topic in tqdm(categories):
        tc = TemporalClustering()
        data_path = f"../data/topic_networks/{topic}/"
        # to_csv does not create missing folders.
        for out_dir in ("snapshots", f"{topic}/EC", f"{topic}/non_EC"):
            os.makedirs(os.path.join(src_results, out_dir), exist_ok=True)

        for sem, semester in enumerate(semesters):
            # Edge lists use dd-mm-YYYY in their file name, label files YYYY-mm-dd.
            period0_nodelist = _reformat_date(semester[0], "%d-%m-%Y")
            period1_nodelist = _reformat_date(semester[1], "%d-%m-%Y")
            period0_labels = _reformat_date(semester[0], "%Y-%m-%d")
            period1_labels = _reformat_date(semester[1], "%Y-%m-%d")
            semester_edgelist = os.path.join(data_path, f'{topic}_{period0_nodelist}_{period1_nodelist}_complete.csv')
            semester_labels = os.path.join(data_path, f'{topic}_{period0_labels}_{period1_labels}_labels.csv')

            # read the network and add the attributes
            g = read_net(semester_edgelist)
            nth = read_labels(semester_labels)
            nx.set_node_attributes(g, nth, "leaning")
            nx.set_node_attributes(g, sem, "snapshot_id")

            # remove nodes without attributes, mods and bots from the network
            nodes_to_remove = [
                node for node, attrs in g.nodes(data=True)
                if attrs.get('leaning') is None or node in excluded_users
            ]
            g.remove_nodes_from(nodes_to_remove)

            relabel, mapping = map_labels(g)

            # community detection with EVA
            coms, labels = cd_eva(relabel)
            coms_to_node = defaultdict(list)
            for n, c in coms.items():
                coms_to_node[c].append(n)
            coms_eva = [list(c) for c in coms_to_node.values()]
            eva_attr_node_clustering = AttrNodeClustering(
                coms_eva, relabel, "Eva", labels,
                method_parameters={"weight": 'weight', "resolution": 1,
                                   "randomize": False, "alpha": 0.5})
            tc.add_clustering(eva_attr_node_clustering, sem)

            # communities expressed with the original node labels
            eva_comm = [[mapping[node] for node in com]
                        for com in eva_attr_node_clustering.communities]

            # build a list of labels for the coms for the EVA evaluation df and the user ids df
            df_label_coms = []
            user_coms = {}
            for el, members in enumerate(eva_comm):
                com_id = f"{sem}_{el}"
                df_label_coms.append(com_id)
                for user in members:
                    user_coms[str(user)] = com_id

            df_user_coms = pd.DataFrame(user_coms, index=["community",]).T.rename_axis('user_id').reset_index()
            df_user_coms.to_csv(os.path.join(src_results, f"snapshots/users_community_{topic}_t_{sem}.csv"),
                                index=False)

            # build the df for the EVA evaluation (by semester)
            # NOTE(review): purity/max_label follow the iteration order of
            # `labels`, the other metrics follow the community order of the
            # clustering object -- confirm both orders refer to the same
            # communities row by row.
            purity, max_label = purity_func(coms, labels)
            eva_results = pd.DataFrame({
                'purity': purity,
                "max_label": max_label,
                'internal_edge_density': evaluation.internal_edge_density(relabel, eva_attr_node_clustering, summary=False),
                'average_internal_degree': evaluation.average_internal_degree(relabel, eva_attr_node_clustering, summary=False),
                'conductance': evaluation.conductance(relabel, eva_attr_node_clustering, summary=False),
                'cut_ratio': evaluation.cut_ratio(relabel, eva_attr_node_clustering, summary=False),
                'edge_inside': evaluation.edges_inside(relabel, eva_attr_node_clustering, summary=False),
                'size': evaluation.size(relabel, eva_attr_node_clustering, summary=False),
                "timestamp": sem,
                "community": df_label_coms,
            })

            # save the df for the evaluation (by semester)
            eva_results.to_csv(os.path.join(src_results, f"snapshots/eva_snapshot_{topic}_{sem}_com_stats.csv"), index=False)

            eva_users_stats = pd.merge(left=eva_results, right=df_user_coms, on="community", how="outer")
            eva_users_stats.to_csv(os.path.join(src_results, f"{topic}/eva_users_merged_{sem}_com_stats.csv"), index=False)

            is_ec = ((eva_results['purity'] >= purity_threshold)
                     & (eva_results['conductance'] <= conductance_threshold))
            ec = eva_results.loc[is_ec].copy()
            ec.to_csv(os.path.join(src_results, f"{topic}/EC/EC_{topic}_{sem}.csv"), index=False)

            # Spelled out rather than negating `is_ec` so rows with missing
            # values are treated exactly as before (they match neither filter).
            is_not_ec = ((eva_results['purity'] < purity_threshold)
                         | (eva_results['conductance'] > conductance_threshold))
            not_ec = eva_results.loc[is_not_ec].copy()
            not_ec.to_csv(os.path.join(src_results, f"{topic}/non_EC/non_EC_{topic}_{sem}.csv"), index=False)

        # match communities of consecutive snapshots by Jaccard similarity
        matches = tc.community_matching(_jaccard, two_sided=False)
        matches_dict[topic] = matches

    with open(os.path.join(src_results, "matches_dict.pickle"), 'wb') as r:
        pickle.dump(matches_dict, r, protocol=pickle.HIGHEST_PROTOCOL)

    return matches_dict
        


In [None]:
def preprocess_df_text_pipeline(cat, ec_val):
    """Collect the comments written by the users of the selected communities.

    For every topic and every entry of the module-level ``semesters`` the
    function reads the community list ``{ec_val}_{topic}_{i}.csv`` and the
    merged user/community statistics, keeps the users of listed communities
    whose ``size`` is at least 20, stores them in
    ``{topic}/{ec_val}_user_TM_{topic}_{i}.csv`` and joins them with the
    comments found in the semester's JSON files. The result is written to
    ``../text_data/comments/{topic}/{ec_val}_comments_{topic}_{i}.csv``.

    Author's note on the intended output layout: username | subreddit |
    timestamp | topic | text | post (boolean) | comment (boolean) | stats.

    Args:
        cat: Topic name or iterable of topic names. ``None`` falls back to
            the module-level ``categories``.
        ec_val: Community group to process; it is used in file names and
            stored in the ``EC`` column. This file calls it with ``"EC"`` and
            ``"non_EC"``.
    """
    # Local alias: works whether the notebook bound ``datetime`` to the
    # module or to the class.
    from datetime import datetime as _datetime

    if cat is None:
        topics = categories
    elif isinstance(cat, str):
        topics = [cat]
    else:
        topics = cat

    for topic in tqdm(topics):
        os.makedirs(f"../text_data/comments/{topic}/", exist_ok=True)
        for i, sem in enumerate(semesters):
            print("Current iteration:", i)
            # load the dataframe listing the communities of the requested group
            df_EC = pd.read_csv(os.path.join(src_results, f"{topic}/{ec_val}/{ec_val}_{topic}_{i}.csv"))
            # global community stats + users
            df_users_original = pd.read_csv(os.path.join(src_results, f"{topic}/eva_users_merged_{i}_com_stats.csv"))
            df_users = df_users_original[(df_users_original["size"] >= 20)].copy()
            list_ECs = df_EC.community.tolist()
            # from here on we only keep the users of the listed communities
            df_EC_users = df_users[df_users['community'].isin(list_ECs)].copy()

            print(df_EC_users.community.unique())

            df_EC_users["EC"] = ec_val
            ec_users = df_EC_users[["user_id", "EC", "max_label"]].copy()
            ec_users.to_csv(os.path.join(src_results, f"{topic}/{ec_val}_user_TM_{topic}_{i}.csv"), index=False)

            # Comments are stored in JSON files and have NOT been cleaned
            # (posts, handled elsewhere, went through a light text cleaning).
            period0 = _datetime.strptime(sem[0], "%d/%m/%Y").strftime("%d-%m-%Y")
            period1 = _datetime.strptime(sem[1], "%d/%m/%Y").strftime("%d-%m-%Y")
            semester_user_comments = f'../topic_comments/{topic}/{topic}_{period0}_{period1}/'

            comment_frames = []
            for file_name in os.listdir(semester_user_comments):
                if not file_name.endswith('.json'):
                    continue
                payload = load_json(os.path.join(semester_user_comments, file_name))
                comment_frames.append(pd.json_normalize(payload["comments"], max_level=1))

            # A fresh index keeps the row-wise fillna below unambiguous.
            df_text_users = pd.concat(comment_frames, ignore_index=True)
            print(topic, i)

            # work around comments stored in the wrong column in some datasets
            if topic == "minority" and i == 4:
                # Assign the result back: an in-place fillna on a column
                # accessed from the frame is not guaranteed to update it.
                df_text_users["clean_text"] = df_text_users["clean_text"].fillna(df_text_users["body"])
                del df_text_users["body"]
            elif "body" in df_text_users.columns:
                df_text_users["clean_text"] = df_text_users["body"]
                del df_text_users["body"]

            df_final_users = pd.merge(left=df_EC_users[["community", "user_id"]], right=df_text_users,
                                      left_on="user_id", right_on="author", how="inner")
            df_final_users.to_csv(f"../text_data/comments/{topic}/{ec_val}_comments_{topic}_{i}.csv", index=False)

In [None]:
# Build the comment datasets for the communities listed in the "EC" files.
preprocess_df_text_pipeline(categories, "EC")


  0%|          | 0/3 [00:00<?, ?it/s]

Current iteration: 0
['0_1' '0_3' '0_4']


KeyboardInterrupt: 

In [None]:
# Build the comment datasets for the communities listed in the "non_EC" files.
preprocess_df_text_pipeline(categories, "non_EC")

NameError: name 'categories' is not defined

In [None]:
def preprocess_post_text_pipeline(cat, ec_val):
    """Collect the posts written by the users of the selected communities.

    For every topic and every entry of the module-level ``semesters`` the
    function reads the community list ``{ec_val}_{topic}_{i}.csv`` and the
    merged user/community statistics, keeps the users of the listed
    communities, stores them in ``{topic}/{ec_val}_user_TM_{topic}_{i}.csv``
    and joins them with the posts of the semester. The result, tagged with
    an ``EC_val`` column, is written to
    ``../text_data/posts/{topic}/{ec_val}_post_{topic}_{i}.csv``.

    Author's note on the intended output layout: username | subreddit |
    timestamp | topic | text | post (boolean) | comment (boolean) | stats.

    Args:
        cat: Topic name or iterable of topic names. ``None`` falls back to
            the module-level ``categories``.
        ec_val: Community group to process; it is used in file names and
            stored in the ``EC_val`` column. This file calls it with ``"EC"``
            and ``"non_EC"``.
    """
    # Local alias: works whether the notebook bound ``datetime`` to the
    # module or to the class.
    from datetime import datetime as _datetime

    if cat is None:
        topics = categories
    elif isinstance(cat, str):
        topics = [cat]
    else:
        topics = cat

    data_path = "../topic_posts/"
    for topic in tqdm(topics):
        os.makedirs(f"../text_data/posts/{topic}/", exist_ok=True)
        for i, sem in enumerate(semesters):
            print("Current iteration:", i)
            # load the community list to get the ids of the requested group
            df_community = pd.read_csv(os.path.join(src_results, f"{topic}/{ec_val}/{ec_val}_{topic}_{i}.csv"))
            # load the dataframe of the users in the communities
            df_users = pd.read_csv(os.path.join(src_results, f"{topic}/eva_users_merged_{i}_com_stats.csv"))
            coms_list = df_community.community.tolist()

            df_EC_users = df_users[df_users['community'].isin(coms_list)].copy()
            ec_users = df_EC_users[["user_id", "max_label"]].copy()
            # TM users' list
            # NOTE(review): preprocess_df_text_pipeline writes a file with the
            # same name but different columns and a size filter -- confirm
            # the overwrite is intended.
            ec_users.to_csv(os.path.join(src_results, f"{topic}/{ec_val}_user_TM_{topic}_{i}.csv"), index=False)

            # posts already went through a light text cleaning
            period0 = _datetime.strptime(sem[0], "%d/%m/%Y").strftime("%d-%m-%Y")
            period1 = _datetime.strptime(sem[1], "%d/%m/%Y").strftime("%d-%m-%Y")
            filename = os.path.join(data_path, f'{topic}/{topic}_{period0}_{period1}.csv')
            df_post = pd.read_csv(filename)
            df_filtered_post = df_post[df_post["author"].isin(df_EC_users["user_id"].tolist())].copy()

            # attach the community of each author to their posts
            df_final = pd.merge(left=df_EC_users[["community", "user_id"]], right=df_filtered_post,
                                left_on="user_id", right_on="author", how="inner")
            df_final["EC_val"] = ec_val
            df_final.to_csv(f"../text_data/posts/{topic}/{ec_val}_post_{topic}_{i}.csv", index=False)

In [None]:
# Build the post datasets for the communities listed in the "EC" files.
preprocess_post_text_pipeline(categories, "EC")

In [None]:
# Build the post datasets for the communities listed in the "non_EC" files.
preprocess_post_text_pipeline(categories, "non_EC")

  0%|          | 0/3 [00:00<?, ?it/s]

Current iteration: 0
              user_id  max_label
4575        gatsby137  antitrump
4576  wcarterchambers  antitrump
4577     FutureAvenir  antitrump
4578      rowenkariya  antitrump
4579        hjvteffer  antitrump
...               ...        ...
7848       VirjhinBoy  antitrump
7849           hmwith  antitrump
7850       lithobolos  antitrump
7858   fuckin_bubbles  antitrump
7859  VinylAndOctavia  antitrump

[1948 rows x 2 columns]
Current iteration: 1
               user_id  max_label
0     Sherlock--Holmes   protrump
1             jblack94   protrump
2                UDT22   protrump
3           mikesteane   protrump
4          Taxus_Calyx   protrump
...                ...        ...
8811          Juyil900   protrump
8812      mairedemerde  antitrump
8813     Ignatiusloyal    neutral
8814      finiksrising    neutral
8815    nguyenkhuong92  antitrump

[8273 rows x 2 columns]
Current iteration: 2
                   user_id max_label
0               TCDWarrior  protrump
1     bre

KeyboardInterrupt: 