In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import networkx as nx


%config Completer.use_jedi = False

In [None]:
# definitions
# Relative path of the directory the dataset CSV files are read from
# (presumably the South Korea coronavirus dataset, judging by the name — TODO confirm).

inpath_skorea = "../data/coronavirusdataset/"

In [None]:
# read data
#path = 'kaggle/input/coronavirusdataset/'
path = inpath_skorea

# One DataFrame per CSV file of the dataset.
# `p_info` is intentionally NOT bound to the Case table here: the patient table is
# loaded from PatientInfo.csv in its own cell further down.
case = pd.read_csv(path+'Case.csv')
#p_route = pd.read_csv(path+'PatientRoute.csv')
time = pd.read_csv(path+'Time.csv')
t_age = pd.read_csv(path+'TimeAge.csv')
t_gender = pd.read_csv(path+'TimeGender.csv')
t_provin = pd.read_csv(path+'TimeProvince.csv')
region = pd.read_csv(path+'Region.csv')
weather = pd.read_csv(path+'Weather.csv')
search = pd.read_csv(path+'SearchTrend.csv')
floating = pd.read_csv(path+'SeoulFloating.csv')
policy = pd.read_csv(path+'Policy.csv')

# Further plots
- waterfall:
    - patients
    - ... with tracking

In [None]:
# Inspect the table read from Case.csv.
case

In [None]:
# Load the patient-level table and keep only the rows that have a confirmation date.
p_info = pd.read_csv(path+'PatientInfo.csv')
p_info['confirmed_date'] = pd.to_datetime(p_info['confirmed_date'])
print(f'dropping {p_info.confirmed_date.isna().sum()} rows (missing confirmed date)')
# .copy() detaches the filtered frame from the raw one, so adding columns to it in later
# cells does not trigger pandas' SettingWithCopyWarning.
p_info = p_info[p_info.confirmed_date.notna()].copy()
p_info


In [None]:
# Whole days elapsed between each patient's confirmation date and the earliest
# confirmation date in the table.
p_info['days_since_start'] = (p_info.confirmed_date - p_info.confirmed_date.min()).dt.days
# Rows without a confirmed date were dropped when the table was loaded, so every
# remaining patient must get a value; fail loudly if that invariant is broken.
assert p_info.days_since_start.notna().all(), 'days_since_start contains missing values'
p_info.days_since_start

In [None]:
# Distribution of `infection_case` among the patients whose infector is recorded.
has_infector = p_info.infected_by.notna()
p_info.loc[has_infector, 'infection_case'].value_counts()

In [None]:
# NOTE(review): this cell reads `pos` and `edges`, which are only created in the
# graph-construction cell further down. On a fresh kernel it raises NameError unless
# that cell has been run first.
labels = list(pos.keys())

# For every laid-out node, look up the `days_since_start` of the edge-list row whose
# target is that node.
days_dim = []
for l in labels:
    days = edges.loc[edges.target == l, 'days_since_start'].values
    if len(days) == 1:
        days_dim.append(days[0])
    elif len(days) > 1:
        # more than one row for the same target: flag it instead of guessing
        days_dim.append('Error: len > 1')
    else:
        # node never appears as a target (no recorded infector)
        days_dim.append(np.nan)


    



In [None]:
# Spot check: how many edge-list rows have patient 1000000006 as their target?
is_target = edges.target == 1000000006
tmp = edges.loc[is_target, 'days_since_start'].values
len(tmp)

In [None]:
# Inspect the per-node day values collected in `days_dim`.
days_dim

In [None]:
# Rows whose `infected_by` equals the id '2000000205' (compared as a string).
p_info[p_info.infected_by == '2000000205']

In [None]:
# Number of patients per value of the `state` column.
p_info.state.value_counts()

In [None]:
def get_edges_from_p_info(p_info):
    """Build the infection edge list (infector -> infected patient) from the patient table.

    Parameters
    ----------
    p_info : pandas.DataFrame
        Must provide the columns `infected_by` (infector id stored as a string, missing
        when unknown), `patient_id` and `days_since_start`.

    Returns
    -------
    pandas.DataFrame
        Columns `source` (infector id, int64), `target` (patient id) and
        `days_since_start`, one row per patient with a valid infector id. The index is
        the position of the row among all rows with a non-missing `infected_by`, so it
        may contain gaps where invalid ids were removed.

    Raises
    ------
    TypeError
        If a non-missing `infected_by` value is not a string.
    """
    edges = pd.DataFrame({
        'source': p_info.infected_by,
        'target': p_info.patient_id,
        'days_since_start': p_info.days_since_start,
    })

    # keep only patients with a recorded infector
    edges = edges[edges.source.notna()].reset_index(drop=True)

    # A valid id looks like '2000000205': exactly 10 decimal digits. Everything else
    # (other lengths, non-numeric text) is dropped here so that the integer cast below
    # cannot fail on it.
    is_valid_id = np.array(
        [len(s) == 10 and s.isdecimal() for s in edges.source],
        dtype=bool,
    )
    # .copy() so that the cast below writes to an independent frame, not to a slice
    edges = edges.loc[is_valid_id, :].copy()

    # Explicit 64-bit cast: 10-digit ids can exceed the 32-bit integer range, and plain
    # `int` maps to a 32-bit type on some platforms.
    edges['source'] = edges['source'].astype('int64')
    return edges


def plot_network(G, pos, edge_list=None, title=None, width=800, height=800):
    """Draw a graph as an interactive plotly figure.

    Every node is coloured by the `days_since_start` value of the edge-list row whose
    `target` is that node (NaN when the node is never a target) and sized by its number
    of neighbours in `G`, offset by 2 so that a node without neighbours still gets a
    non-zero marker size. Hovering a node shows its label.

    Parameters
    ----------
    G : networkx graph
        Graph to draw; only `G.edges` and `G[node]` are used.
    pos : dict
        Mapping node -> (x, y) coordinates, e.g. the result of `nx.spring_layout(G)`.
        Every key must be a node of `G`.
    edge_list : pandas.DataFrame, optional
        Frame with `target` and `days_since_start` columns. When omitted, the
        notebook-level `edges` frame is used (the behaviour before this parameter
        existed).
    title : str, optional
        Figure title; a generic infection-network title is used when omitted.
    width, height : int, optional
        Figure size in pixels.

    Returns
    -------
    plotly.graph_objects.Figure
    """
    if edge_list is None:
        # backward-compatible default: read the notebook-level frame
        edge_list = edges
    if title is None:
        title = ("Patient infection network"
                 "<br>node colour: days since start, node size: number of connections")

    # https://plotly.com/python/v3/igraph-networkx-comparison/
    labels = list(pos.keys())
    Xv = [pos[label][0] for label in labels]
    Yv = [pos[label][1] for label in labels]

    # Each edge is drawn as its own two-point segment; the trailing None breaks the
    # line so that consecutive edges are not joined.
    Xed = []
    Yed = []
    for edge in G.edges:
        Xed += [pos[edge[0]][0], pos[edge[1]][0], None]
        Yed += [pos[edge[0]][1], pos[edge[1]][1], None]

    edge_trace = go.Scatter(
        x=Xed,
        y=Yed,
        mode='lines',
        line=dict(color='rgb(210,210,210)', width=1),
        hoverinfo='none',
    )

    # One lookup table instead of one DataFrame scan per node. If a target occurs in
    # several rows the earliest day is used, which keeps the colour array numeric.
    days_by_target = edge_list.groupby('target')['days_since_start'].min().to_dict()
    days_dim = [days_by_target.get(label, np.nan) for label in labels]

    # Sizes are computed per label so that they line up with x/y/colour even when the
    # key order of `pos` differs from the node order of `G`.
    node_sizes = [len(G[label]) + 2 for label in labels]

    node_trace = go.Scatter(
        x=Xv,
        y=Yv,
        mode='markers',
        name='net',
        marker=dict(
            showscale=True,
            symbol='circle-dot',
            size=node_sizes,
            color=days_dim,
            colorbar=dict(title=dict(text='days since start')),
            line=dict(color='rgb(50,50,50)', width=0.5),
        ),
        text=labels,
        hoverinfo='text',
    )

    # hide axis line, grid, tick labels and title
    axis = dict(
        showline=False,
        zeroline=False,
        showgrid=False,
        showticklabels=False,
        title='',
    )

    annot = ("Plotting code adapted from "
             "<a href='http://nbviewer.ipython.org/gist/empet/07ea33b2e4e0b84193bd'>[1]</a>")

    layout = go.Layout(
        title=title,
        font=dict(size=12),
        showlegend=False,
        autosize=False,
        width=width,
        height=height,
        xaxis=go.layout.XAxis(axis),
        yaxis=go.layout.YAxis(axis),
        margin=go.layout.Margin(l=40, r=40, b=85, t=100),
        hovermode='closest',
        annotations=[
            dict(
                showarrow=False,
                text=annot,
                xref='paper',
                yref='paper',
                x=0,
                y=-0.1,
                xanchor='left',
                yanchor='bottom',
                font=dict(size=14),
            )
        ],
    )

    return go.Figure(data=[edge_trace, node_trace], layout=layout)



In [None]:
# Reference for building a graph from an edge-list DataFrame:
# https://networkx.org/documentation/stable/reference/generated/networkx.convert_matrix.from_pandas_edgelist.html

# Set to True to try the graph construction on a tiny hand-made edge list instead of
# the patient data.
USE_TOY_EDGES = False

if USE_TOY_EDGES:
    edges = pd.DataFrame(
        {
            "source": [0, 1, 2],
            "target": [2, 2, 3],
            "weight": [3, 4, 5],
            "color": ["red", "blue", "blue"],
        }
    )
else:
    edges = get_edges_from_p_info(p_info)

# `source` / `target` are the default column names; edge_attr=True stores every other
# column as an edge attribute.
G = nx.from_pandas_edgelist(edges, edge_attr=True)

print(f'Size: {G.size()} | No nodes: {G.number_of_nodes()} | No edges: {G.number_of_edges()}')

#https://stackoverflow.com/questions/52400380/assign-edge-weights-to-a-networkx-graph-using-pandas-dataframe
#pos = nx.kamada_kawai_layout(G)
# fixed seed so the layout is the same on every run
pos = nx.spring_layout(G, seed=444)



In [None]:

# Build the plotly figure for graph `G` using the node positions in `pos`, then display it.
fig = plot_network(G, pos)
fig.show()
#py.iplot(fig1, filename='Coautorship-network-nx')


In [None]:
# Number of edge-list rows per `source` id.
edges.source.value_counts()

In [None]:
# Column dtypes of the edge list.
edges.dtypes

In [None]:

#nx.from_pandas_adjacency()

def build_network(df): 
    """Placeholder, not implemented: the body is empty and the function returns None.

    Presumably intended to build a network from `df` — TODO confirm and implement.
    """
    pass



In [None]:
# Materialise the adjacency iterator of `G` as a dict keyed by node.
dict(G.adjacency())#[2000000205]

In [None]:
#G = nx.random_geometric_graph(200, 0.125)

# Print the first twelve edges, each followed by the attribute dicts of its two endpoints.
for i, edge in enumerate(list(G.edges())[:12]):
    print(i, edge)
    for endpoint in edge:
        print(i,  G.nodes[endpoint])


In [None]:
# `node_text` is only a local variable inside plot_network, so it does not exist at
# notebook level; rebuild it from the graph so this cell runs on a fresh kernel.
node_text = ['# of connections: ' + str(len(neighbors)) for _, neighbors in G.adjacency()]
node_text

In [None]:
# `node_adjacencies` is only a local variable inside plot_network, so it does not exist
# at notebook level; rebuild it (neighbour count + 2 per node) so this cell runs on a
# fresh kernel.
node_adjacencies = [len(neighbors) + 2 for _, neighbors in G.adjacency()]
node_adjacencies

# Clustering

In [None]:
# References:
# https://stackoverflow.com/questions/62902871/how-can-i-cluster-a-graph-g-created-in-networkx
# https://towardsdatascience.com/node2vec-embeddings-for-graph-data-32a866340fef
# https://github.com/eliorc/Medium/blob/master/Nod2Vec-FIFA17-Example.ipynb
from node2vec import Node2Vec
from sklearn.cluster import KMeans

# Step 1: get the embedding of each node in the graph.
# That means you need to get a continuous vector representation for each node.
# You can use graph embedding methods like node2vec, deepwalk, etc to obtain the embedding.
# Note that such methods preserve the structural similarity between the nodes of a graph
# in the vector representation (embedding space).

# NOTE(review): this replaces the notebook's `G` with the graph stored in "edges.txt".
# Nothing in the visible notebook writes that file — confirm it exists, or drop this
# line to cluster the graph that was built from the patient data.
G = nx.read_edgelist("edges.txt")  # edges.txt contains the edge list of your graph

# help to draw https://networkx.github.io/documentation/networkx-1.9/examples/drawing/labels_and_colors.html
nx.draw(G, with_labels=True, node_color='b', node_size=500)

# Generate walks
node2vec = Node2Vec(G, dimensions=2, walk_length=20, num_walks=10, workers=4)
# Learn embeddings
model = node2vec.fit(window=10, min_count=1)
#model.wv.most_similar('1')
model.wv.save_word2vec_format("embedding.emb")  # save the embedding in file embedding.emb

# Step 2: apply the clustering method.
# Once you get vector representation of the nodes, you can cluster the nodes based on
# those representations.

# Load the embedding of the nodes of the graph. The first column is read as a number,
# so this assumes the node ids are numeric — TODO confirm for the graph in edges.txt.
X = np.loadtxt("embedding.emb", skiprows=1)
# sort the embedding based on node index in the first column in X
X = X[X[:, 0].argsort()]
# remove the node index from X and save in Z
Z = X[:, 1:]

kmeans = KMeans(n_clusters=2, random_state=0).fit(Z)  # apply kmeans on Z
# NOTE(review): `labels` is also the name used for the node ids earlier in the
# notebook; after this cell it holds the cluster assignments instead.
labels = kmeans.labels_  # get the cluster labels of the nodes.
print(labels)
