# Derive edgelist for dynamic graph from hyphe graph

This script derives the edgelist used for the dynamic graph visualization that is imported into Gephi. It takes each gexf file downloaded from hyphe, keeps only the media entities that participated in either the immigration or the climate change debate (similarly to what has been done in Script 1 of the .../4_Datasets/1_SMPNA folder), and then appends it to the main edgelist, adding the time period column.

The output is saved in the '*/.../{topic}/*' folder as *{file_name}_edgelist.csv*. The MAIN file is saved as *00_{topic}_dynamic_edgelist_MAIN.csv*.

## Step 1 - import libraries and the media dataset

In [2]:
import networkx as nx
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt

In [3]:
# Reference table of media outlets; its 'domain' column is what the hyphe
# node labels are matched against in the later steps.
media_csv_path = 'MEDIA_entities_unique_imm&clim_COMBINED.csv'
meta_media_df = pd.read_csv(media_csv_path)
meta_media_df.head(n=3)

Unnamed: 0,domain,count_climate,count_immigration,total_count
0,lemonde.fr,1037,515,1552
1,francetvinfo.fr,295,179,474
2,liberation.fr,245,161,406


## Step 2 - create the file name lists

This includes all the file names of the individual gephi networks downloaded from hyphe.

In [3]:
# How many files exist for each month, listed January onwards per year.
# File names have the form '{PREFIX}_{index}_{year}_{month}', where the index
# keeps counting from 1 across the whole list (68 files per topic).
_FILES_PER_MONTH = (
    (2017, (3, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 3)),
    (2018, (2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 3)),
    (2019, (2, 2, 2, 2, 2, 3, 2)),
)


def _build_file_names(prefix):
    """Return the ordered list of file names (without extension) for `prefix`."""
    names = []
    for year, monthly_counts in _FILES_PER_MONTH:
        for month, n_files in enumerate(monthly_counts, start=1):
            for _ in range(n_files):
                # len(names) + 1 is the running, 1-based file index.
                names.append(f'{prefix}_{len(names) + 1}_{year}_{month:02d}')
    return names


imm_file_names = _build_file_names('IMM')

clim_file_names = _build_file_names('CLIM')

## Step 3 - iterate over each gexf file and append to edgelist

### 3.1 – Immigration

In [4]:
# Build the immigration dynamic edgelist: one pass per hyphe GEXF file,
# keeping only edges whose two endpoints are both listed in
# meta_media_df['domain'].
#
# Per-file frames are collected in lists and concatenated once after the
# loop; calling pd.concat inside the loop re-copies everything accumulated
# so far on every iteration.
imm_edge_frames = []
imm_node_frames = []
media_domains = meta_media_df['domain']  # loop-invariant, looked up once

# `period` is the 1-based position of the file in imm_file_names.
for period, file_name in enumerate(imm_file_names, start=1):
    # step 1 – read in gexf file
    G = nx.read_gexf(f'/.../1_immigration/{file_name}.gexf')

    # step 2 - create pandas edgelist
    edgelist = nx.to_pandas_edgelist(G)

    # step 3 - create nodes info dataset to derive nodes' labels
    # (raises KeyError if a node has no 'name' attribute)
    node_data = {node: {'name': attrs['name']} for node, attrs in G.nodes(data=True)}
    nodes = pd.DataFrame.from_dict(node_data, orient='index')
    nodes.index.name = 'node_id'
    nodes.reset_index(inplace=True)

    nodes['Id'] = nodes['node_id']
    nodes['Label'] = nodes['name'].str.lower()

    # step 4 - add the nodes labels to the edgelist
    name_by_id = nodes.set_index('node_id')['name']
    edgelist['source_label'] = edgelist['source'].map(name_by_id).str.lower()
    edgelist['target_label'] = edgelist['target'].map(name_by_id).str.lower()

    # step 5 - subset only media organisations (both endpoints must match)
    both_media = (
        edgelist['target_label'].isin(media_domains)
        & edgelist['source_label'].isin(media_domains)
    )
    edgelist = edgelist[both_media].reset_index(drop=True)

    nodes = nodes[nodes['Label'].isin(media_domains)].reset_index(drop=True)

    # step 6 - add time period column
    # 'timeset' is a quoted "[t,t]" interval string; 'time_period' is the
    # same index as a plain integer.
    edgelist['timeset'] = f'"[{period},{period}]"'
    edgelist['time_period'] = period

    # step 7 - queue for the dynamic_edgelist
    imm_edge_frames.append(edgelist)
    imm_node_frames.append(nodes)

    # step 8 - save singular edgelist
    edgelist.to_csv(f'/.../1_immigration/{file_name}_edgelist.csv', index=False)
    nodes.to_csv(f'/.../1_immigration/{file_name}_nodes.csv', index=False)

# Single concatenation; fall back to an empty frame if no file was processed.
imm_dynamic_edgelist = (
    pd.concat(imm_edge_frames, ignore_index=True) if imm_edge_frames else pd.DataFrame()
)
imm_nodes = (
    pd.concat(imm_node_frames, ignore_index=True) if imm_node_frames else pd.DataFrame()
)

### 3.2 – Climate

In [5]:
# Build the climate dynamic edgelist: one pass per hyphe GEXF file,
# keeping only edges whose two endpoints are both listed in
# meta_media_df['domain'].
#
# Per-file frames are collected in lists and concatenated once after the
# loop; calling pd.concat inside the loop re-copies everything accumulated
# so far on every iteration.
clim_edge_frames = []
clim_node_frames = []
media_domains = meta_media_df['domain']  # loop-invariant, looked up once

# `period` is the 1-based position of the file in clim_file_names.
for period, file_name in enumerate(clim_file_names, start=1):
    # step 1 – read in gexf file
    G = nx.read_gexf(f'/.../2_climate/{file_name}.gexf')

    # step 2 - create pandas edgelist
    edgelist = nx.to_pandas_edgelist(G)

    # step 3 - create nodes info dataset to derive nodes' labels
    # (raises KeyError if a node has no 'name' attribute)
    node_data = {node: {'name': attrs['name']} for node, attrs in G.nodes(data=True)}
    nodes = pd.DataFrame.from_dict(node_data, orient='index')
    nodes.index.name = 'node_id'
    nodes.reset_index(inplace=True)

    nodes['Id'] = nodes['node_id']
    nodes['Label'] = nodes['name'].str.lower()

    # step 4 - add the nodes labels to the edgelist
    name_by_id = nodes.set_index('node_id')['name']
    edgelist['source_label'] = edgelist['source'].map(name_by_id).str.lower()
    edgelist['target_label'] = edgelist['target'].map(name_by_id).str.lower()

    # step 5 - subset only media organisations (both endpoints must match)
    both_media = (
        edgelist['target_label'].isin(media_domains)
        & edgelist['source_label'].isin(media_domains)
    )
    edgelist = edgelist[both_media].reset_index(drop=True)

    nodes = nodes[nodes['Label'].isin(media_domains)].reset_index(drop=True)

    # step 6 - add time period column
    # 'timeset' is a quoted "[t,t]" interval string; 'time_period' is the
    # same index as a plain integer.
    edgelist['timeset'] = f'"[{period},{period}]"'
    edgelist['time_period'] = period

    # step 7 - queue for the dynamic_edgelist
    clim_edge_frames.append(edgelist)
    clim_node_frames.append(nodes)

    # step 8 - save singular edgelist
    edgelist.to_csv(f'/.../2_climate/{file_name}_edgelist.csv', index=False)
    nodes.to_csv(f'/.../2_climate/{file_name}_nodes.csv', index=False)

# Single concatenation; fall back to an empty frame if no file was processed.
clim_dynamic_edgelist = (
    pd.concat(clim_edge_frames, ignore_index=True) if clim_edge_frames else pd.DataFrame()
)
clim_nodes = (
    pd.concat(clim_node_frames, ignore_index=True) if clim_node_frames else pd.DataFrame()
)

## Step 5 - adjust edgelist and nodes table

This is needed to assign each outlet its own ID. Otherwise, each time period assigns different IDs to the same outlet.

### 5.1 - Immigration

In [7]:
# Reduce the stacked per-file node rows to one row per outlet label, then
# number the outlets 0..n-1 so each has a single Id across all time periods.
imm_nodes = (
    imm_nodes
    .drop(columns=['node_id', 'name'])
    .drop_duplicates(subset=['Label'])
    .reset_index(drop=True)
)
imm_nodes['Id'] = np.arange(len(imm_nodes))
imm_nodes.head(3)

Unnamed: 0,Id,Label
0,0,francetvinfo.fr
1,1,slate.fr
2,2,lavoixdunord.fr


In [8]:
# Overwrite the per-file node ids in 'source'/'target' with the outlet-level
# Ids from imm_nodes, looked up through the lowercase labels.
imm_id_by_label = imm_nodes.set_index('Label')['Id']
for id_col, label_col in (('source', 'source_label'), ('target', 'target_label')):
    imm_dynamic_edgelist[id_col] = (
        imm_dynamic_edgelist[label_col].map(imm_id_by_label).astype(int)
    )
# Drop the edge 'id' column carried over from the individual files.
imm_dynamic_edgelist.drop(columns='id', inplace=True)
imm_dynamic_edgelist.head(3)

Unnamed: 0,source,target,count,source_label,target_label,timeset,time_period
0,0,16,1,francetvinfo.fr,lemonde.fr,"""[1,1]""",1
1,0,19,1,francetvinfo.fr,franceculture.fr,"""[1,1]""",1
2,0,12,2,francetvinfo.fr,francebleu.fr,"""[1,1]""",1


### 5.2 – Climate change

In [9]:
# Reduce the stacked per-file climate node rows to one row per outlet label,
# then number the outlets 0..n-1 so each has a single Id across all periods.
clim_nodes.drop('node_id', axis=1, inplace=True)
clim_nodes.drop('name', axis = 1, inplace=True)
clim_nodes = clim_nodes.drop_duplicates(subset=['Label']).reset_index(drop=True)
# Fix: size the Id range from clim_nodes itself (was len(imm_nodes)); the
# assignment fails with a length mismatch whenever the two tables differ in size.
clim_nodes['Id'] = np.arange(len(clim_nodes))
clim_nodes.head(3)

Unnamed: 0,Id,Label
0,0,francetvinfo.fr
1,1,slate.fr
2,2,lavoixdunord.fr


In [11]:
# Overwrite the per-file node ids in 'source'/'target' with the outlet-level
# Ids from clim_nodes, looked up through the lowercase labels.
clim_id_by_label = clim_nodes.set_index('Label')['Id']
for id_col, label_col in (('source', 'source_label'), ('target', 'target_label')):
    clim_dynamic_edgelist[id_col] = (
        clim_dynamic_edgelist[label_col].map(clim_id_by_label).astype(int)
    )
# Drop the edge 'id' column carried over from the individual files.
clim_dynamic_edgelist.drop(columns='id', inplace=True)
clim_dynamic_edgelist.head(3)

Unnamed: 0,source,target,count,source_label,target_label,timeset,time_period
0,36,70,2,futura-sciences.com,euronews.com,"""[1,1]""",1
1,0,15,1,francetvinfo.fr,franceinter.fr,"""[1,1]""",1
2,0,12,6,francetvinfo.fr,francebleu.fr,"""[1,1]""",1


## Step 5 - save to hard drive

In [12]:
# Write the combined immigration edgelist and its node table, without the
# pandas index column.
for frame, out_path in (
    (imm_dynamic_edgelist, '/.../1_immigration/00_IMM_dynamic_edgelist_MAIN.csv'),
    (imm_nodes, '/.../1_immigration/00_IMM_nodes_MAIN.csv'),
):
    frame.to_csv(out_path, index=False)

In [13]:
# Write the combined climate edgelist and its node table, without the
# pandas index column.
for frame, out_path in (
    (clim_dynamic_edgelist, '/.../2_climate/00_CLIM_dynamic_edgelist_MAIN.csv'),
    (clim_nodes, '/.../2_climate/00_CLIM_nodes_MAIN.csv'),
):
    frame.to_csv(out_path, index=False)

## Step 6 - networks descriptive stats

In [19]:
# Distinct ordered (source_label, target_label) pairs seen in any time period.
imm_unique_pairs = imm_dynamic_edgelist[['source_label', 'target_label']].drop_duplicates()
clim_unique_pairs = clim_dynamic_edgelist[['source_label', 'target_label']].drop_duplicates()

# One summary row per issue: number of outlets and number of distinct pairs.
df = pd.DataFrame({
    'issue': ['immigration', 'climate'],
    'nodes': [len(imm_nodes), len(clim_nodes)],
    'edges': [len(imm_unique_pairs), len(clim_unique_pairs)],
})
df

Unnamed: 0,issue,nodes,edges
0,immigration,103,831
1,climate,103,1025
