In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import networkx as nx

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split

In [2]:
# Load the two Gephi exports: the edge table (used below via its "Source" and
# "Target" columns) and the node table (one row per node, keyed by "Id").
# Both files are comma separated, which is pandas' default delimiter.
df_edges = pd.read_csv("/Social Network Analysis/gephi_export.csv")
df_nodes = pd.read_csv("/Social Network Analysis/gephi_export_nodes.csv")

In [None]:
# Keep the column names of both tables as plain Python lists and show the
# node columns as this cell's output.
nodes_cols = df_nodes.columns.to_list()
edges_cols = df_edges.columns.to_list()
nodes_cols

In [None]:
# Show the column names of the edge table.
edges_cols

In [5]:
def make_clusters(nodelist, edges=None):
    """Split ``nodelist`` into groups of nodes that are connected to each other.

    Two nodes end up in the same cluster when they are linked by a chain of
    edges whose endpoints all belong to ``nodelist``. Edge direction is
    ignored, and edges leading to nodes outside ``nodelist`` are ignored too.

    Parameters
    ----------
    nodelist : list
        Node ids to partition. The list is only read, never modified.
    edges : pandas.DataFrame, optional
        Edge table with ``Source`` and ``Target`` columns. Defaults to the
        notebook-level ``df_edges``.

    Returns
    -------
    list of list
        One list of node ids per cluster. Clusters are ordered by the position
        of their first node in ``nodelist``; that first node is always the
        first element of its cluster.
    """
    if edges is None:
        edges = df_edges

    # Build the undirected adjacency once instead of re-scanning the whole
    # edge table for every candidate node.
    members = set(nodelist)
    neighbours = {node: [] for node in members}
    for source, target in zip(edges["Source"], edges["Target"]):
        if source in members and target in members:
            neighbours[source].append(target)
            neighbours[target].append(source)

    cluster_list = []
    visited = set()
    for seed in nodelist:
        if seed in visited:
            continue
        visited.add(seed)
        new_cluster = [seed]
        # Breadth-first walk: the cluster list doubles as the work queue.
        cursor = 0
        while cursor < len(new_cluster):
            for other in neighbours[new_cluster[cursor]]:
                if other not in visited:
                    visited.add(other)
                    new_cluster.append(other)
            cursor += 1
        cluster_list.append(new_cluster)
    return cluster_list

In [6]:
# For every community (in order of first appearance in the node table) collect
# the ids of its nodes and split them into connected clusters.
communities = df_nodes["community"].drop_duplicates().to_list()
graph = {}

for c in communities:
    in_community = df_nodes["community"] == c
    community_node_ids = df_nodes.loc[in_community, 'Id'].to_list()
    graph[c] = make_clusters(community_node_ids)
    

In [31]:
# Gender mixing matrix of the edge list, in percent.
# Row i is the gender of an edge's source node, column j the gender of its
# target node. Every column is normalised to 100, so a cell answers
# "which share of the edges arriving at gender j comes from gender i?".
# NOTE(review): the denominator is the number of edges between the three
# listed gender groups; confirm that every node carries one of these labels.
gender = ['Male', 'Female', 'Organization']
ids_by_gender = [df_nodes.loc[df_nodes["gender"] == g, "Id"] for g in gender]

edge_counts = pd.DataFrame([
    [
        int((df_edges["Source"].isin(source_ids) & df_edges["Target"].isin(target_ids)).sum())
        for target_ids in ids_by_gender
    ]
    for source_ids in ids_by_gender
])

# A gender without any incoming edge would divide by zero; report 0 % there.
incoming_totals = edge_counts.sum(axis=0).replace(0, np.nan)
df_corr = (edge_counts.div(incoming_totals, axis="columns") * 100).round(1).fillna(0)

In [36]:
# Show the gender-by-gender edge matrix (row = source gender, column = target
# gender, both in the order of the `gender` list).
df_corr

Unnamed: 0,0,1,2
0,56.1,32.9,31.6
1,30.1,53.1,28.9
2,13.8,14.1,39.5


In [None]:
# Mean betweenness centrality of every gender group.
for group in gender:
    print(group)
    print(df_nodes.loc[df_nodes["gender"] == group, "Betweenness Centrality"].mean())

# Scatter plot of the individual values, one colour per group. The x position
# is only the running index of a node inside its own group.
fig, ax = plt.subplots(1, 1, figsize=(5, 5))

ax.set_title("Verteilung der Betweenness Centrality", fontsize=13)
ax.set_ylabel("Betweenness Centrality", fontsize=13)
ax.set_xlabel("index")

group_colors = {"Male": "blue", "Female": "pink", "Organization": "orange"}
for group, color in group_colors.items():
    values = df_nodes.loc[df_nodes["gender"] == group, "Betweenness Centrality"]
    ax.scatter(range(len(values)), values, color=color, label=group)

ax.legend()
plt.show()

In [None]:


# Heat map of `df_corr` on a fixed 0-100 colour scale, with every cell's value
# printed on top of it (one decimal place).
fig, ax = plt.subplots(figsize=(7, 5))

heatmap = ax.imshow(df_corr, cmap="Blues", interpolation="nearest", vmin=0, vmax=100)
plt.colorbar(heatmap, shrink=0.7)

for row, col in np.ndindex(*df_corr.shape):
    ax.text(col, row, f'{df_corr.iloc[row, col]:.1f}', ha="center", va="center", color="black", fontsize=20)

# Rows are the outgoing side of an edge, columns the incoming side.
tick_positions = np.arange(len(df_corr.columns))
tick_labels = ['Male','Female','Org']
ax.set_ylabel("Ausgehend", fontsize = 13)
ax.set_xlabel("Eingehend", fontsize = 13)
ax.set_xticks(tick_positions, tick_labels, rotation=90, fontsize = 16)
ax.set_yticks(tick_positions, tick_labels, fontsize = 16)
ax.set_title("Eingehende Kanten prozentual und auf eine Stelle gerundet")

plt.show()

In [11]:
# One DataFrame of node rows per cluster. A cluster made of a single node is
# keyed by that node's id, every other cluster by "<community><cluster index>".
# NOTE(review): the concatenated key has no separator, so e.g. community 1 /
# cluster 10 and community 11 / cluster 0 would both map to "110" - confirm
# that this cannot happen with the real data.
new_node_dfs = {}
for c in communities:
    for i, cluster in enumerate(graph[c]):
        s = cluster[0] if len(cluster) == 1 else f"{c}{i}"
        new_node_dfs[s] = df_nodes.loc[df_nodes["Id"].isin(cluster)]

In [None]:
# Empty result tables for the aggregated graph in which every cluster becomes
# a single node.
df_new_edges = pd.DataFrame()
df_new_nodes = pd.DataFrame()

for edge_column in ('Source', 'Target', 'Mutuality', 'Weight'):
    df_new_edges[edge_column] = None

# Per-node measures that are summarised per cluster as an AVG_ and a SIGMA_
# column each.
single_node_atts = [
    'in-Degree',
    'Excentricity',
    'Closness Centrality',
    'Harmonic Closness Centrality',
    'Betweenness Centrality',
    'Eigenvector Centrality',
    'Clustering Coefficient',
]

# Cluster-level columns first, then AVG_/SIGMA_ pairs in attribute order.
node_attributes = [
    'Node_Id',
    'community',
    'Anz_Knoten',
    'Anteil_Mutual_edges',
    'In/Out_verhältnis',
    'Anteil Männlich',
    'AVG_inner_Closness_Centrality',
] + [
    f"{prefix}_{attribute}"
    for attribute in single_node_atts
    for prefix in ("AVG", "SIGMA")
]

for node_column in node_attributes:
    df_new_nodes[node_column] = None

node_attributes

In [None]:
# Inspect the mapping from cluster key to the DataFrame holding that
# cluster's node rows.
new_node_dfs

In [None]:
# Collapse every cluster in `new_node_dfs` into one row of `df_new_nodes`.
# The rows are collected in a list and turned into a DataFrame once, so the
# cell gives the same result on every run (`DataFrame.append` no longer
# exists in pandas 2.x and grew the table quadratically).
cluster_records = []

for key, cluster_nodes in new_node_dfs.items():
    cluster_ids = cluster_nodes["Id"].to_list()
    from_cluster = df_edges["Source"].isin(cluster_ids)
    to_cluster = df_edges["Target"].isin(cluster_ids)

    # Share of "Male" among the "Male"/"Female" members; NaN when the cluster
    # has neither (e.g. it consists of organisations only).
    n_male = int((cluster_nodes["gender"] == "Male").sum())
    n_persons = int(cluster_nodes["gender"].isin(["Male", "Female"]).sum())
    male_ratio = n_male / n_persons if n_persons else np.nan

    # Every edge that touches the cluster. `.copy()` keeps the helper column
    # below from being written into a slice of `df_edges`.
    df_temp = df_edges.loc[from_cluster | to_cluster].copy()
    # Direction-free key of an edge: an edge counts as mutual when its key
    # occurs more than once.
    df_temp["pair"] = pd.Series(
        [tuple(sorted(endpoints)) for endpoints in zip(df_temp["Source"], df_temp["Target"])],
        index=df_temp.index,
        dtype=object,
    )
    mutual_edges = df_temp[df_temp.duplicated("pair", keep=False)].drop(columns=["pair"])
    Anteil_Mutual_edges = len(mutual_edges) / len(df_temp) if len(df_temp) else np.nan

    # Share of the cluster's outgoing edges that leave the cluster; NaN when
    # the cluster has no outgoing edge at all.
    n_outgoing = int(from_cluster.sum())
    n_leaving = int((from_cluster & ~to_cluster).sum())
    Out_verhältnis = n_leaving / n_outgoing if n_outgoing else np.nan

    # NOTE(review): G also contains the outside endpoints of edges that touch
    # the cluster, so a shortest path between two members may run through a
    # node outside the cluster - confirm that this is intended.
    G = nx.from_pandas_edgelist(df_temp, source="Source", target="Target", create_using=nx.DiGraph())

    inner_closness = []
    for start in cluster_ids:
        # A node without any edge is not part of G and reaches nothing.
        reachable = nx.single_source_shortest_path_length(G, start) if start in G else {}
        hop_counts = [reachable[goal] for goal in cluster_ids if goal in reachable]
        # As before, a path is measured by its number of nodes (hops + 1) and
        # the trivial path from a node to itself is included.
        path_length_sum = sum(hops + 1 for hops in hop_counts)
        inner_closness.append(len(hop_counts) / path_length_sum if path_length_sum > 0 else 0)
    AVG_inner_Closness_Centrality = sum(inner_closness) / len(inner_closness)

    new_entry = {
        'Node_Id': key,
        'community': cluster_nodes.iloc[0]["community"],
        'Anz_Knoten': len(cluster_nodes),
        'Anteil_Mutual_edges': Anteil_Mutual_edges,
        'In/Out_verhältnis': Out_verhältnis,
        'Anteil Männlich': male_ratio,
        'AVG_inner_Closness_Centrality': AVG_inner_Closness_Centrality,
    }
    # Mean and standard deviation of every per-node measure. `.std()` is NaN
    # for clusters with a single node.
    for attribute in single_node_atts:
        new_entry[f"AVG_{attribute}"] = cluster_nodes[attribute].mean()
        new_entry[f"SIGMA_{attribute}"] = cluster_nodes[attribute].std()

    cluster_records.append(new_entry)

df_new_nodes = pd.DataFrame(cluster_records, columns=node_attributes)

In [None]:
df_new_nodes

In [None]:
# Build the edges of the aggregated graph: one row per ordered pair of
# different clusters that is linked by at least one original edge.
#   Weight    - number of original edges from the source to the target cluster
#   Mutuality - share of those edges whose node pair occurs more than once in
#               the edge list
# The mutual flag is derived from the complete edge list once, up front; it
# must not come from a `df_temp` left behind by another cell, which would only
# cover the edges of a single cluster.
pair_keys = pd.Series(
    [tuple(sorted(endpoints)) for endpoints in zip(df_edges["Source"], df_edges["Target"])],
    index=df_edges.index,
    dtype=object,
)
is_mutual = pair_keys.duplicated(keep=False)

edge_records = []
for source_key, source_nodes in new_node_dfs.items():
    leaving_source = df_edges["Source"].isin(source_nodes["Id"])
    outgoing_targets = df_edges.loc[leaving_source, "Target"]
    outgoing_mutual = is_mutual[leaving_source]
    if outgoing_targets.empty:
        # No outgoing edge at all, so every weight for this source is 0.
        continue

    for target_key, target_nodes in new_node_dfs.items():
        if source_key == target_key:
            continue

        reaches_target = outgoing_targets.isin(target_nodes["Id"])
        weight = int(reaches_target.sum())
        if weight == 0:
            # Pairs without a connecting edge are not exported.
            continue

        edge_records.append({
            'Source': source_key,
            'Target': target_key,
            'Mutuality': int((reaches_target & outgoing_mutual).sum()) / weight,
            'Weight': weight,
        })

# Build the table in one step; this also makes the cell safe to rerun.
df_new_edges = pd.DataFrame(edge_records, columns=['Source', 'Target', 'Mutuality', 'Weight'])


In [17]:
# Write both aggregated tables as tab-separated files without the index column.
for frame, file_name in ((df_new_nodes, 'new_nodes.csv'), (df_new_edges, 'new_edges.csv')):
    frame.to_csv(file_name, sep='\t', index=False)