In [None]:
import os

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go



%config Completer.use_jedi = False

In [None]:
# Paths used throughout the notebook (relative to the notebook's directory).

inpath_skorea = "../data/coronavirusdataset/"

plots_path = "../plots/"
# makedirs(exist_ok=True) creates missing parent directories and does not fail
# when the directory already exists, unlike a separate exists()/mkdir() pair.
os.makedirs(plots_path, exist_ok=True)

In [None]:
# Read every CSV table of the dataset into its own DataFrame.
# (Alternative location when running on Kaggle: 'kaggle/input/coronavirusdataset/')
path = inpath_skorea

# NOTE: `case` used to be aliased as `p_info` as well; the alias was dropped
# because `p_info` is defined from PatientInfo.csv further down and pointing it
# at the Case table in the meantime was misleading.
case = pd.read_csv(os.path.join(path, 'Case.csv'))
time = pd.read_csv(os.path.join(path, 'Time.csv'))
t_age = pd.read_csv(os.path.join(path, 'TimeAge.csv'))
t_gender = pd.read_csv(os.path.join(path, 'TimeGender.csv'))
t_provin = pd.read_csv(os.path.join(path, 'TimeProvince.csv'))
region = pd.read_csv(os.path.join(path, 'Region.csv'))
weather = pd.read_csv(os.path.join(path, 'Weather.csv'))
search = pd.read_csv(os.path.join(path, 'SearchTrend.csv'))
floating = pd.read_csv(os.path.join(path, 'SeoulFloating.csv'))
policy = pd.read_csv(os.path.join(path, 'Policy.csv'))

# Further plots (TODO)
- waterfall:
    - patients
    - ... with tracking

In [None]:
# Display the Case table loaded above (notebook output only).
case

In [None]:
# Load the per-patient table and keep only rows that can be used as edges of
# the infection network (known confirmation date and known infector).
p_info = pd.read_csv(os.path.join(path, 'PatientInfo.csv'))
p_info['confirmed_date'] = pd.to_datetime(p_info['confirmed_date'])

missing_date = p_info['confirmed_date'].isna()
print(f'dropping {missing_date.sum()} rows (missing confirmed date)')
p_info = p_info[~missing_date]

# The message previously repeated "missing confirmed date" although this step
# drops rows without an `infected_by` value.
missing_source = p_info['infected_by'].isna()
print(f'dropping {missing_source.sum()} rows (missing infected_by)')
# .copy() detaches the result from the unfiltered frame so that the column
# assignment in the following cell does not raise SettingWithCopyWarning.
p_info = p_info[~missing_source].copy()

p_info


In [None]:
# Summary (columns, dtypes, non-null counts) of the filtered patient table.
p_info.info()

In [None]:
# Whole days elapsed between each confirmation and the earliest confirmation in p_info.
p_info['days_since_start'] = (p_info.confirmed_date - p_info.confirmed_date.min()).dt.days
# A bare `...isna().sum()` in the middle of a cell is evaluated and thrown away
# (only the last expression is displayed), so the check is made explicit.
assert p_info.days_since_start.notna().all(), 'days_since_start contains missing values'
p_info.days_since_start

In [None]:
# Frequency of each infection_case among rows that have a known infector.
p_info.loc[p_info.infected_by.notna(), 'infection_case'].value_counts()

In [None]:
# Rows whose recorded infector is patient 2000000205 (the id is compared as a string here).
p_info[p_info.infected_by == '2000000205']

In [None]:
# How many patients are in each `state` value.
p_info['state'].value_counts()

In [None]:
def get_edges_from_p_info(p_info, source_id_len=10):
    """Turn the patient table into an edge list (infector -> infected patient).

    Parameters
    ----------
    p_info : pd.DataFrame
        Must provide the columns ``infected_by``, ``patient_id``,
        ``days_since_start`` and ``contact_number``.
    source_id_len : int, default 10
        Number of characters a valid infector id has; rows whose
        ``infected_by`` value has a different length are discarded.

    Returns
    -------
    pd.DataFrame
        Columns ``source`` (int64), ``target``, ``days_since_start`` and
        ``contact_number``. Rows without a source or with an invalid source id
        are removed.
    """

    def _id_text(value):
        # Ids that pandas parsed as floats (e.g. 2000000205.0) must not count
        # the trailing '.0' towards their length.
        if isinstance(value, float) and value.is_integer():
            return str(int(value))
        return str(value)

    edges = pd.DataFrame({
        'source': p_info['infected_by'],
        'target': p_info['patient_id'],
        'days_since_start': p_info['days_since_start'],
        'contact_number': p_info['contact_number'],
    })

    edges = edges[edges['source'].notna()].reset_index(drop=True)

    # Filter invalid source ids. Going through _id_text() keeps this working
    # when the ids are numeric rather than strings (len() on a number raises).
    keep = [len(_id_text(s)) == source_id_len for s in edges['source']]
    # .copy() so the cast below writes to an independent frame instead of a
    # slice (avoids SettingWithCopyWarning).
    edges = edges.loc[keep].copy()

    # Explicit 64-bit cast: a platform-dependent `int` may be 32 bit, which is
    # too narrow for some 10-digit ids.
    edges['source'] = edges['source'].astype('int64')
    return edges


def describe_graph(G):
    """Print a one-line summary of G: size, node count and edge count."""
    size = G.size()
    n_nodes = G.number_of_nodes()
    n_edges = G.number_of_edges()
    print(f'Size: {size} | No nodes: {n_nodes} | No edges: {n_edges}')
   


In [None]:
"""
https://networkx.org/documentation/stable/reference/generated/networkx.convert_matrix.from_pandas_edgelist.html
"""

# Switch between the real patient edge list and a tiny hand-written one.
# The toy frame has no `days_since_start` / `contact_number` columns, which
# later cells read, so it is only useful for trying out the graph construction.
use_patient_data = True

if use_patient_data:
    edges = get_edges_from_p_info(p_info)
else:
    edges = pd.DataFrame(
        {
            "source": [0, 1, 2],
            "target": [2, 2, 3],
            "weight": [3, 4, 5],
            "color": ["red", "blue", "blue"],
        }
    )

# overview graph types
# https://networkx.org/documentation/stable/reference/classes/index.html
# edge_attr=True attaches every remaining column of `edges` as an edge attribute.
G = nx.from_pandas_edgelist(edges, edge_attr=True, create_using=nx.Graph)

describe_graph(G)


In [None]:
# Number of edges without a recorded contact_number.
edges['contact_number'].isna().sum()


In [None]:


# Compute node positions for plotting the whole graph.
# Available layouts: https://networkx.org/documentation/stable/reference/drawing.html#module-networkx.drawing.layout
# Edge weights from a DataFrame: https://stackoverflow.com/questions/52400380/assign-edge-weights-to-a-networkx-graph-using-pandas-dataframe
# Alternative layout that was tried: pos = nx.kamada_kawai_layout(G)
# A fixed seed is passed to the layout algorithm.
pos = nx.spring_layout(G, seed=444)



In [None]:
def prepare_edge_trace(G, pos):
    """Flatten the edges of G into x / y coordinate lists for a line trace.

    Each edge contributes its two endpoint coordinates followed by ``None``;
    the ``None`` entries separate consecutive edges in the lists.

    Returns
    -------
    (list, list)
        x coordinates and y coordinates.
    """
    xs, ys = [], []
    for edge in G.edges:
        start, end = pos[edge[0]], pos[edge[1]]
        xs.extend([start[0], end[0], None])
        ys.extend([start[1], end[1], None])
    return xs, ys


def prepare_node_trace(G, pos):
    """Split a ``{node: position}`` mapping into x list, y list and node labels.

    ``G`` is accepted for symmetry with the other helpers but is not read.

    Returns
    -------
    (list, list, list)
        x coordinates, y coordinates and the node keys, all in the iteration
        order of ``pos``.
    """
    labels = []
    xs = []
    ys = []
    for node, point in pos.items():
        labels.append(node)
        xs.append(point[0])
        ys.append(point[1])
    return xs, ys, labels


def add_target_node_dimension(labels, edges, dim = 'days_since_start'):
    """Look up, for every node label, the value of column ``dim`` on its incoming edge.

    Parameters
    ----------
    labels : iterable
        Node identifiers, compared against ``edges['target']``.
    edges : pd.DataFrame
        Edge list with a ``target`` column and a ``dim`` column.
    dim : str, default 'days_since_start'
        Name of the column to read.

    Returns
    -------
    list
        One entry per label: the value of ``dim`` for the single edge whose
        target is that label, or ``np.nan`` when no edge targets the label.

    Raises
    ------
    Exception
        If a label is the target of more than one edge.
    """
    # Index the edge table once instead of scanning it for every label.
    values_by_target = {}
    for target, value in zip(edges['target'].to_numpy(), edges[dim].to_numpy()):
        values_by_target.setdefault(target, []).append(value)

    days_dim = []
    for label in labels:
        matches = values_by_target.get(label, [])
        if len(matches) > 1:
            raise Exception('Error: len > 1')
        days_dim.append(matches[0] if matches else np.nan)
    return days_dim
            
            
def add_node_adjacencies(G):
    """Derive a marker size and a hover text from each node's neighbour count.

    Returns
    -------
    (list, list)
        ``node_adjacencies``: neighbour count + 2 for every node, in the order
        of ``G.adjacency()``; ``node_text``: matching
        ``'# of connections: <count>'`` strings.
    """
    neighbour_counts = [len(neighbours) for _node, neighbours in G.adjacency()]
    # The +2 offset keeps nodes without neighbours at a non-zero size.
    node_adjacencies = [count + 2 for count in neighbour_counts]
    node_text = ['# of connections: ' + str(count) for count in neighbour_counts]
    return node_adjacencies, node_text


def build_layout_networkgraph(height, width):
    """Create the Plotly layout used for network plots.

    The layout has a fixed pixel size (``autosize`` off), a transparent
    background, no legend, both axes fully hidden, and one empty annotation
    anchored below the bottom-left corner of the plot area.

    Parameters
    ----------
    height, width : int
        Figure size in pixels.

    Returns
    -------
    go.Layout
    """
    # Axis settings shared by x and y: hide line, zero line, grid, tick labels and title.
    hidden_axis = {
        'showline': False,
        'zeroline': False,
        'showgrid': False,
        'showticklabels': False,
        'title': '',
    }

    # Placeholder annotation; callers may fill in its text afterwards.
    footer = dict(
        showarrow=False,
        text='',
        xref='paper',
        yref='paper',
        x=0,
        y=-0.1,
        xanchor='left',
        yanchor='bottom',
        font=dict(size=14),
    )

    return go.Layout(
        title=None,
        font=dict(size=12),
        showlegend=False,
        autosize=False,
        width=width,
        height=height,
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        xaxis=go.layout.XAxis(hidden_axis),
        yaxis=go.layout.YAxis(hidden_axis),
        margin=go.layout.Margin(l=40, r=40, b=85, t=100),
        hovermode='closest',
        annotations=[footer],
    )


def plot_network(G, pos, edges, height=800, width=800):
    """Build a Plotly figure of graph ``G`` drawn at the positions in ``pos``.

    Nodes are coloured by the ``days_since_start`` value of their incoming
    edge (nodes without an incoming edge get NaN) and sized by their number of
    neighbours. Hovering over a node shows its label.

    Based on
    https://plotly.com/python/v3/igraph-networkx-comparison/ and
    https://plotly.com/python/network-graphs/

    Parameters
    ----------
    G : networkx graph
    pos : dict
        Node -> coordinates, e.g. from ``nx.spring_layout``.
    edges : pd.DataFrame
        Edge list with ``target`` and ``days_since_start`` columns.
    height, width : int
        Figure size in pixels.

    Returns
    -------
    go.Figure
    """
    # ------------------------------------edge_trace------------------------------------
    edge_x, edge_y = prepare_edge_trace(G, pos)
    edge_trace = go.Scatter(
        x=edge_x,
        y=edge_y,
        mode='lines',
        line=dict(color='rgb(210,210,210)', width=1),
        hoverinfo='none',
    )

    # ------------------------------------node_trace------------------------------------
    node_x, node_y, labels = prepare_node_trace(G, pos)
    # The hover text produced by add_node_adjacencies is not displayed for now.
    node_sizes, _node_text = add_node_adjacencies(G)
    node_days = add_target_node_dimension(labels, edges, dim='days_since_start')

    node_trace = go.Scatter(
        x=node_x,
        y=node_y,
        mode='markers',
        name='net',
        marker=dict(
            showscale=True,
            colorscale='Oranges',
            reversescale=True,
            symbol='circle-dot',
            size=node_sizes,
            color=node_days,
            colorbar=dict(
                thickness=15,
                # `titleside` is deprecated in favour of the nested title.side
                # attribute and is rejected by recent Plotly releases.
                title=dict(text='Days since day0', side='right'),
                xanchor='left',
            ),
            line=dict(color='rgb(50,50,50)', width=0.5),
        ),
        text=labels,
        hoverinfo='text',
    )

    # ------------------------------------build figure------------------------------------
    return go.Figure(
        data=[edge_trace, node_trace],
        layout=build_layout_networkgraph(height, width),
    )


In [None]:

# Draw the full infection network, show it inline and export it to plots_path
# as SVG and as PNG (the PNG is written with scale=5).
fig = plot_network(G, pos, edges)
fig.show()
fig.write_image(os.path.join(plots_path, "infection_network.svg"))
fig.write_image(os.path.join(plots_path, "infection_network.png"), scale=5)


#py.iplot(fig1, filename='Coautorship-network-nx')


## Largest components

https://networkx.org/documentation/stable//reference/algorithms/generated/networkx.algorithms.components.connected_components.html


In [None]:
# Group the connected components by their size. A plain {len(c): c} dict keeps
# only one component per size because equal sizes overwrite each other, so
# each size maps to the list of all components of that size.
components_sorted = sorted(nx.connected_components(G), key=len, reverse=True)
comps = {}
for component in components_sorted:
    comps.setdefault(len(component), []).append(component)

print(f'{len(components_sorted)} connected components, {len(comps)} distinct sizes')
comps


In [None]:
# One independent subgraph per connected component, largest component first.
# (The unsorted variant that preceded this line was immediately overwritten
# and only copied every subgraph a second time.)
S = [G.subgraph(c).copy() for c in sorted(nx.connected_components(G), key=len, reverse=True)]



In [None]:
# Number of connected components in G.
len(S)

In [None]:
# First subgraph in S, i.e. the largest component (S is sorted by size, descending).
S1 = S[0]

In [None]:
# Summary of the selected component, followed by its node ids in ascending order.
describe_graph(S1)

sorted(S1.nodes())

In [None]:
# extract p_info per subgraph
tmp = p_info[
    (p_info.patient_id.isin(S1.nodes())) 
    | (p_info.infected_by.isin(S1.nodes()))
]


origin = tmp[tmp.confirmed_date == tmp.confirmed_date.min()].infected_by.mode()[0]
n_infected
days_since_start_mean = tmp.days_since_start.mean()
days_since_start_sd = tmp.days_since_start.std()

#tmp

In [None]:
#fig, ax = plt.subplots(figsize=(10,10))
#nx.draw_networkx(S[0], pos=pos, ax=ax)

def plot_subgraph(S, seed=444, height=500, width=500) -> go.Figure:
    """Lay out a single subgraph with a spring layout and plot it.

    Parameters
    ----------
    S : networkx graph
        The subgraph to draw.
    seed : int, default 444
        Seed passed to ``nx.spring_layout``.
    height, width : int, default 500
        Figure size in pixels.

    Returns
    -------
    go.Figure
        A Plotly figure (not a matplotlib one, as the former annotation claimed).

    Notes
    -----
    Node colours are looked up in the notebook-level ``edges`` table.
    """
    pos1 = nx.spring_layout(S, seed=seed)
    return plot_network(S, pos1, edges, height=height, width=width)

plot_subgraph(S[0]).show()

In [None]:
# Node set of the largest connected component, picked with max() instead of sorting,
# followed by its number of nodes.
Smax = max(nx.connected_components(G), key=len)
len(Smax)

## over time ...

In [None]:
# Histogram of confirmations by days since the earliest confirmation in p_info.
# (plotly.express is imported as `px` in the import cell at the top of the notebook.)
fig = px.histogram(p_info, x="days_since_start", template='simple_white')
print(f' data range: {p_info.confirmed_date.dt.date.min()}, {p_info.confirmed_date.dt.date.max()}')
fig.show()



In [None]:
# Inspect late cases: rows confirmed more than 100 days after the earliest confirmation.
p_info[p_info.days_since_start > 100]

In [None]:
# Number of confirmed cases per calendar week. Despite its name,
# `cases_per_day` holds one row per weekly bin (pd.Grouper(freq='W')).
cases_per_day = (
    p_info
    .set_index('confirmed_date')
    .groupby(pd.Grouper(freq='W'))
    .size()
    .reset_index(name='cases')
)

fig = px.line(cases_per_day, x='confirmed_date', y='cases', template='simple_white')
fig.show()
fig.write_image(os.path.join(plots_path, 'p_info_cases_weekly.png'), scale=5)


In [None]:
# Weekly case counts as a Series indexed by week (same grouping as the weekly line plot).
p_info.set_index('confirmed_date').groupby(pd.Grouper(freq='W')).size()


In [None]:
# Alternative toy graph for experiments:
#G = nx.random_geometric_graph(200, 0.125)

# Print the first edges of G together with the attribute dicts of both end nodes.
for i, edge in enumerate(G.edges()): 
    print(i, edge)
    print(i,  G.nodes[edge[0]])
    print(i,  G.nodes[edge[1]])

    # The check runs after printing, so indices 0..11 (12 edges) are shown.
    if i > 10: 
        break 


# Clustering

In [None]:
# Disabled reference example (node2vec embedding + k-means clustering).
# The guard has to be the Python constant `False`; the lowercase `false` is an
# undefined name and made this cell fail with a NameError.
# Note: if enabled, this code rebinds the notebook-level `G` and `labels`.
if False:
    # https://stackoverflow.com/questions/62902871/how-can-i-cluster-a-graph-g-created-in-networkx
    # https://towardsdatascience.com/node2vec-embeddings-for-graph-data-32a866340fef
    # https://github.com/eliorc/Medium/blob/master/Nod2Vec-FIFA17-Example.ipynb

    """
    Step 1: get the embedding of each node in the graph. 
    That means you need to get a continuous vector representation for each node. 
    You can use graph embedding methods like node2vec, deepwalk, etc to obtain the embedding. 
    Note that such methods preserve the structural similarity between the nodes of a graph in the vector representation 
    (embedding space). The following example shows how you can do that.
    """

    import networkx as nx
    G=nx.Graph();
    G=nx.read_edgelist("edges.txt") # edges.txt contains the edge list of your graph

    # help to draw https://networkx.github.io/documentation/networkx-1.9/examples/drawing/labels_and_colors.html
    nx.draw(G,with_labels = True,node_color='b',node_size=500);

    from node2vec import Node2Vec
    # Generate walks
    node2vec = Node2Vec(G, dimensions=2, walk_length=20, num_walks=10,workers=4)
    # Learn embeddings 
    model = node2vec.fit(window=10, min_count=1)
    #model.wv.most_similar('1')
    model.wv.save_word2vec_format("embedding.emb") #save the embedding in file embedding.emb


    """
    Step 2: apply the clustering method. 
    Once you get vector representation of the nodes, you can cluster the nodes based on those representations. 
    See the example below.
    """

    from sklearn.cluster import KMeans
    import numpy as np


    X = np.loadtxt("embedding.emb", skiprows=1) # load the embedding of the nodes of the graph
    #print(X)
    # sort the embedding based on node index in the first column in X
    X=X[X[:,0].argsort()]; 
    #print(X)
    Z=X[0:X.shape[0],1:X.shape[1]]; # remove the node index from X and save in Z

    kmeans = KMeans(n_clusters=2, random_state=0).fit(Z) # apply kmeans on Z
    labels=kmeans.labels_  # get the cluster labels of the nodes.
    print(labels)
