## Case Study: Korea Centers for Disease Control and Prevention (KCDC)
[[NeurIPS 2020] DS4C: Data Science for COVID-19 in South Korea](https://www.kaggle.com/datasets/kimjihoo/coronavirusdataset)

> COVID-19 has infected more than 10,000 people in South Korea. KCDC (Korea Centers for Disease Control & Prevention) announces the information of COVID-19 quickly and transparently. We make a structured dataset based on the report materials of KCDC and local governments. Also, we analyze and visualize the data using various data mining or visualization techniques.

**With this data, the KCDC would like to better understand two questions:**
1) How is the COVID-19 virus spreading throughout the different parts of the country? 

2) Which people and which areas should they treat as ‘high-transmission’ when tailoring the organization's next wave of COVID-19 healthcare measures?


In [1]:
import os
import pandas as pd 
import numpy as np 

# Gather all data into a dictionary, with each CSV file receiving an entry keyed
# by its base name (e.g. 'Case.csv' -> sk_data['Case']).
sk_data = dict()
for dirname, _, filenames in os.walk('/kaggle/input'):
    # Directories holding a single file are skipped; only multi-file folders are loaded.
    if len(filenames) > 1:
        print(", ".join(filenames))
        for file in filenames:
            stem, extension = os.path.splitext(file)
            # Skip anything that is not a CSV so an unrelated file cannot break
            # pd.read_csv or be stored under a truncated key.
            if extension.lower() != '.csv':
                continue
            sk_data[stem] = pd.read_csv(os.path.join(dirname, file))

TimeAge.csv, Case.csv, Policy.csv, Region.csv, PatientInfo.csv, SearchTrend.csv, Weather.csv, TimeProvince.csv, TimeGender.csv, SeoulFloating.csv, Time.csv


In [2]:
from kaggle_secrets import UserSecretsClient
import plotly.express as px  
import plotly.figure_factory as ff

# Read the Mapbox token from the Kaggle secrets store (so it is never written
# into the notebook) and register it with Plotly Express for the map figures.
secrets_client = UserSecretsClient()
my_access_token = secrets_client.get_secret("mapbox_access_token")
px.set_mapbox_access_token(my_access_token)

  shapely_geos_version, geos_capi_version_string


In [3]:
# Map every row of the Region table: markers are coloured by province and sized
# by nursing-home count, with the school / elderly indicators shown on hover.
region_hover_columns = [
    'elementary_school_count', 'kindergarten_count', 'university_count',
    'academy_ratio', 'elderly_population_ratio', 'elderly_alone_ratio',
    'nursing_home_count',
]
fig = px.scatter_mapbox(
    sk_data['Region'],
    lat="latitude",
    lon="longitude",
    size="nursing_home_count",
    color='province',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    hover_data=region_hover_columns,
    zoom=5,
    title="Inventory of Schools, Universities, and Nursing Homes",
)
fig.update_layout(width=1000)
fig.show()

In [4]:
# Plot the deceased / released / confirmed columns of the Time table against date.
time_status_columns = ["deceased", "released", "confirmed"]
time_status_labels = {
    'date': 'Date',
    'variable': "Infection Result",
    'sum of value': "Infections to Date",
}
fig = px.histogram(
    sk_data['Time'],
    x="date",
    y=time_status_columns,
    hover_data=['variable'],
    labels=time_status_labels,
    color_discrete_sequence=px.colors.qualitative.Pastel,
    title="Monthly COVID-19 Infections: February - June 2020",
)

# NOTE(review): Plotly documents date-axis bin sizes as milliseconds or "M<n>";
# confirm that "D1" produces the intended bins, given the "Monthly" title.
fig.update_traces(xbins_size="D1")
fig.update_layout(bargap=0.2, yaxis_title="Total Infections")
fig.update_xaxes(showgrid=True, ticklabelmode="period", tickformat="%b %d")
fig.show()

In [5]:
cases_df = sk_data['Case']

# Keep only group-transmission rows that recorded at least one confirmed infection.
is_group_case = cases_df['group'] == True
has_confirmed = cases_df['confirmed'] > 0

# Drop the flag / id columns, total what remains per infection case, and keep
# the cases with more than two confirmed infections overall.
# The ' case_id' key is spelled with a leading space; presumably this matches
# the CSV header - verify against the data before "fixing" it.
group_cases_sum_df = (
    cases_df[is_group_case & has_confirmed]
    .drop(columns=['group', ' case_id'])
    .groupby('infection_case')
    .sum()
    .loc[lambda totals: totals['confirmed'] > 2]
)

In [6]:
from nltk.corpus import stopwords 

# Split every group infection-case label (the index of group_cases_sum_df)
# into its individual whitespace-separated words.
infection_cases_list = group_cases_sum_df.index.values
infection_cases_words = [word for label in infection_cases_list for word in label.split()]

# Load the English stop-word list once, as a set, instead of re-reading it for
# every word; matching stays case-sensitive.
english_stopwords = set(stopwords.words('english'))
infection_cases_words = [word for word in infection_cases_words if word not in english_stopwords]

# Count how often each remaining word occurs (most frequent first).
infection_cases_words = pd.Series(infection_cases_words, dtype='object').value_counts()

# Name both columns explicitly so the result does not depend on the default
# index / series names, which differ between pandas versions.
infection_cases_words_df = infection_cases_words.rename_axis('Word').reset_index(name='Occurence')
# Only words that appear in more than one label are kept.
infection_cases_words_df = infection_cases_words_df[infection_cases_words_df.Occurence > 1]

In [7]:
# Render the word-frequency frame as a compact 300x300 Plotly table under a text heading.
print("Top Words From COVID-19 Group Transmission Cases\n")
fig = ff.create_table(infection_cases_words_df)
fig.update_layout(height=300, width=300)
fig.show()

Top Words From COVID-19 Group Transmission Cases



In [8]:
# Which group infections contributed most to COVID-19 transmission?
# Each slice is one infection case, sized by its total confirmed infections.
case_names = group_cases_sum_df.index
pie_labels = {'confirmed': '# Infections: ', 'infection_case': "Case:"}
fig = px.pie(
    group_cases_sum_df,
    names=case_names,
    values='confirmed',
    hover_data=[case_names],
    labels=pie_labels,
    title="COVID-19 Transmission By Group",
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.update_layout(width=800)
fig.show()

In [9]:
# Build the mappable subset of the case table: rows with no missing values and
# real coordinates ('-' is used as a placeholder in latitude / longitude),
# with both coordinate columns converted to floats.
# Written as one chained expression that returns a new frame, so no column is
# assigned on a filtered slice (which triggers SettingWithCopyWarning).
location_columns = ['latitude', 'longitude', 'infection_case', 'confirmed', 'province']
group_cases_location_df = (
    cases_df[location_columns]
    .dropna()
    .loc[lambda located: (located['latitude'] != '-') & (located['longitude'] != '-')]
    .astype({'latitude': 'float', 'longitude': 'float'})
)

In [10]:
# Plot every located infection case: marker size is the confirmed count,
# colour is the province, and the case name is the hover title.
transmission_map_labels = {
    'province': 'Province',
    'confirmed': '# Infections',
    'latitude': 'Lat',
    'longitude': 'Long',
}
fig = px.scatter_mapbox(
    group_cases_location_df,
    lat="latitude",
    lon="longitude",
    size='confirmed',
    color='province',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    hover_name='infection_case',
    labels=transmission_map_labels,
    zoom=6.5,
    title="COVID-19 Transmission By Group",
)
fig.update_layout(width=800)
fig.show()

In [11]:
# Join case records to patient records that share the same infection_case label.
# Overlapping column names get pandas' default suffixes: _x for the Case (left)
# side and _y for the PatientInfo (right) side; they are renamed to say so.
# NOTE(review): if 'infection_case' is not unique on both sides this join
# multiplies rows - confirm that is acceptable for downstream use.
merged_column_names = {
    'province_x': 'case_province',
    'province_y': 'patient_province',
    'city_x': 'case_city',
    'city_y': 'patient_city',
}
dropped_patient_columns = ['sex', 'age', 'contact_number', 'released_date', 'deceased_date']
patient_case_data_df = (
    sk_data['Case']
    .merge(sk_data['PatientInfo'], on='infection_case')
    .rename(columns=merged_column_names)
    .drop(columns=dropped_patient_columns)
)

In [12]:
# Area chart of confirmed and deceased counts over time, one facet per sex.
sex_chart_labels = {
    'value': 'Number Infections',
    'date': "Date",
    'variable': 'Status',
    'sex': 'Sex',
}
fig = px.area(
    sk_data['TimeGender'],
    x='date',
    y=['confirmed', 'deceased'],
    facet_col='sex',
    hover_data=['date'],
    labels=sex_chart_labels,
    title="COVID-19 Infections by Sex (March - June 2020)",
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
fig.show()

In [13]:
# Line chart of the 'deceased' column over time, one line per age group.
# The y-axis is labelled 'Deceased' because the plotted column (and the title)
# is deaths, not infections.
age_chart_labels = {'deceased': 'Deceased', 'age': 'Age Range', 'date': 'Date'}
fig = px.line(
    sk_data['TimeAge'],
    x='date',
    y='deceased',
    color='age',
    color_discrete_sequence=px.colors.qualitative.Pastel,
    title="COVID-19 Deaths by Age Group (March - June 2020)",
    labels=age_chart_labels,
)
fig.show()

In [14]:
# Line chart of the 'deceased' column over time, one line per province; the
# confirmed / released / deceased figures are all shown on hover.
province_chart_labels = {
    'deceased': 'Deceased',
    'date': 'Date',
    'province': "Province",
    'released': 'Recovered',
    'confirmed': 'Confirmed',
}
province_hover_columns = ['date', 'province', 'confirmed', 'released', 'deceased']
fig = px.line(
    sk_data['TimeProvince'],
    x='date',
    y='deceased',
    color='province',
    hover_data=province_hover_columns,
    labels=province_chart_labels,
    title="COVID-19 Deaths by Province (March - June 2020)",
    color_discrete_sequence=px.colors.qualitative.Pastel,
)
fig.show()

In [15]:
# Are there any possible correlations between search terms during this period?
# Select the 2020 rows by the first four characters of the date string. The mask
# is computed separately so no helper column is written into the shared
# sk_data['SearchTrend'] frame.
search_trend_raw = sk_data['SearchTrend']
is_2020 = search_trend_raw['date'].str[:4] == '2020'

# Drop the non-numeric date column; 'year' is also dropped (if present) in case
# an earlier run of this notebook already added it to the raw frame.
search_trend_df = (
    search_trend_raw[is_2020]
    .drop(columns=['date', 'year'], errors='ignore')
)

fig = px.scatter_matrix(search_trend_df, color_discrete_sequence=px.colors.qualitative.Pastel,
                        title="COVID-19 Search Term Occurrence (January - July 2020)")
fig.update_layout(height=750)
fig.show()

In [16]:
# What types of policies is South Korea focusing on?
# Count the policies per type and name both output columns explicitly, so the
# table headers do not depend on pandas' default index / series names (which
# changed in pandas 2.0 and would otherwise mislabel the columns).
policy_types_count = (
    sk_data['Policy']['type']
    .value_counts()
    .rename_axis('Type')
    .reset_index(name='# Policies')
)

fig = ff.create_table(policy_types_count)
fig.layout.height=250
fig.layout.width=250
fig.show()

In [17]:
# How long do these policies last, and when were they set in place?
# Only policies with both a start and an end date can be drawn as a bar.
timed_policies_df = sk_data['Policy'].dropna(subset=['start_date','end_date'])

policy_timeline_labels = {
    'start_date': 'Start Date',
    'end_date': 'End Date',
    'type': 'Policy Type',
    'gov_policy': 'Government Policy',
    'detail': "Description:",
}
fig = px.timeline(
    timed_policies_df,
    x_start="start_date",
    x_end="end_date",
    y="gov_policy",
    color="type",
    color_discrete_sequence=px.colors.qualitative.Pastel,
    hover_data=["gov_policy", "detail"],
    labels=policy_timeline_labels,
    title="COVID-19 Related Policies Completed in 2020",
)
# Reverse the y-axis so the first policy row appears at the top.
fig.update_yaxes(autorange="reversed")
fig.show()