### 한국에서 코로나 바이러스에 감염된 사람들 데이터셋 처리

 - 비교적 최신 데이터  

 - csv 파일 형식  
 
 - column 항목  
 
 > 0. patient_id: 확진자 id  
 > 1. global_num: n번째 확진자  
 > 2. sex: 성별  
 > 3. birth_year: 출생 연도  
 > 4. age: 나이  
 > 5. country: 국적  
 > 6. province: 시/도 (예: Seoul)  
 > 7. city: 시/군/구 (예: Gangseo-gu)  
 > 8. disease: 기저 질환  
 > 9. infection_case: 감염 종류  
 > 10. infection_order: 감염 차수 (n차 감염)  
 > 11. infected_by: 해당 확진자의 감염원 id  
 > 12. contact_number: 접촉자 수  
 > 13. symptom_onset_date: 증상 발현일  
 > 14. confirmed_date: 확진 일자  
 > 15. released_date: 퇴원 일자 (격리 해제 일자)
 > 16. deceased_date: 사망 일자
 > 17. state: 상태

In [2]:
# draw charts in jupyter
%matplotlib notebook

# import package
from matplotlib import pyplot as plt
import numpy as np
import csv
import pprint

 - 환자 데이터를 읽어들이고 앞부분 일부 행(row)을 확인

In [3]:
# Path of the patient CSV, relative to the notebook's working directory.
file_path = 'data/PatientInfo.csv'

# Read the whole CSV into memory as a list of rows (each row is a list of str).
# newline='' is what the csv module asks for so that newlines embedded in
# quoted fields are parsed correctly; the explicit encoding keeps the result
# independent of the platform default (assumes the file is UTF-8 -- adjust
# here if the data was exported with a different encoding).
with open(file_path, newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    raw = list(reader)

# Show the header row followed by the first two patient records.
pprint.pprint(raw[:3])

[['patient_id',
  'global_num',
  'sex',
  'birth_year',
  'age',
  'country',
  'province',
  'city',
  'disease',
  'infection_case',
  'infection_order',
  'infected_by',
  'contact_number',
  'symptom_onset_date',
  'confirmed_date',
  'released_date',
  'deceased_date',
  'state'],
 ['1000000001',
  '2',
  'male',
  '1964',
  '50s',
  'Korea',
  'Seoul',
  'Gangseo-gu',
  '',
  'overseas inflow',
  '1',
  '',
  '75',
  '2020-01-22',
  '2020-01-23',
  '2020-02-05',
  '',
  'released'],
 ['1000000002',
  '5',
  'male',
  '1987',
  '30s',
  'Korea',
  'Seoul',
  'Jungnang-gu',
  '',
  'overseas inflow',
  '1',
  '',
  '31',
  '',
  '2020-01-30',
  '2020-03-02',
  '',
  'released']]


 - 전체 확진자 수 확인

In [4]:
# Number of COVID-19 patient records in the file.
# raw[0] is the CSV header row, so it is sliced off before counting.
print("patient about COVID 19 in South Korea : " + str(len(raw[1:])))

patient about COVID 19 in South Korea : 4004


 - list 형식의 데이터 중 첫 번째 행은 column 이름(header)이므로 제거

In [5]:
# Drop the header row (raw[0]) so that raw_ holds only patient records.
raw_ = raw[1:]

#### COVID 19 환자의 status 

In [6]:
# Tally patients by state (column 17).
# Any state other than 'deceased' or 'released' is counted as infected.
states = [patient[17] for patient in raw_]

deceased = states.count('deceased')
released = states.count('released')
infected = len(states) - deceased - released

print("deceased : " + str(deceased))
print("released : " + str(released))
print("infected : " + str(infected))

deceased : 74
released : 2340
infected : 1590


In [7]:
# Pie chart showing the share of each patient status.
status_names = ['deceased', 'released', 'infected']

num_of_patient_status = [deceased, released, infected]
sum_of_patient = sum(num_of_patient_status)
ratio_of_patient_status = [count / sum_of_patient for count in num_of_patient_status]

# Legend entries have the form "<status> : <percentage with one decimal>".
type_of_patient_status = [
    name + " : {0:0.1f}".format(ratio * 100)
    for name, ratio in zip(status_names, ratio_of_patient_status)
]

colors = ['lightcoral', 'yellowgreen', 'gold']

patches, texts = plt.pie(ratio_of_patient_status, colors=colors, startangle=90)

plt.legend(patches, type_of_patient_status, loc="best")
plt.axis('equal')
plt.title("Num. of COVID 19 patients by status in South Korea")
plt.show()

<IPython.core.display.Javascript object>

#### Num. of COVID 19 patients by age group in South Korea

In [9]:
# One bucket per decade (0s .. 90s), then 100-and-over, then unknown.
type_of_age = ['<10s', '10s', '20s', '30s', '40s',
               '50s', '60s', '70s', '80s', '90s',
               '>100s', 'unknown']
age_list = [0] * 12

for patient in raw_:
    # birth_year: 3 (the pre-binned "age" column, 4, is not used here)
    birth_year = patient[3]

    if birth_year in ('', ' '):
        # No birth year recorded -> 'unknown' bucket.
        bucket = 11
    else:
        # Age is approximated as of 2020 and binned by decade.
        decade = (2020 - int(birth_year)) // 10
        # Anything outside 0..99 years falls into the last age bucket.
        bucket = decade if 0 <= decade <= 9 else 10

    age_list[bucket] += 1


for label, count in zip(type_of_age, age_list):
    print(label + " : " + str(count))

<10s : 59
10s : 166
20s : 794
30s : 434
40s : 447
50s : 599
60s : 381
70s : 196
80s : 145
90s : 43
>100s : 2
unknown : 738


In [10]:
# Horizontal bar chart of patient counts per age group.
patient_age = age_list

y_pos = list(range(len(type_of_age)))

plt.barh(y_pos, patient_age, color='teal')
plt.ylabel("ages")
plt.xlabel("number of patients")
plt.title("Num. of COVID 19 patients by age group in South Korea")

plt.yticks(y_pos, type_of_age)

plt.tight_layout()

# Write each count on its bar: inside the bar in white when the count is
# at least 100, otherwise just past the bar's end in black.
for row, count in enumerate(patient_age):
    if count < 100:
        x_shift, txt_color = -10, 'black'
    else:
        x_shift, txt_color = 60, 'white'

    plt.text(count - x_shift, row - .15, str(count), color=txt_color, fontweight='bold')

plt.show()

<IPython.core.display.Javascript object>

In [11]:
# Share of the 10s bucket (index 1) among all counted patients.
# The denominator sums every bucket in patient_age.
ratio_of_10s = patient_age[1] / sum(patient_age)
print("Ratio of 10s of COVID 19 patient : " + str(ratio_of_10s * 100) + "%")

Ratio of 10s out of all COVID 19 patient : 4.145854145854146%


### 개학이 10대의 코로나 감염에 미치는 영향

 - 개학일을 기준으로 확인  
 - 고3부터 개학을 시작했으며, 이에 따른 유동인구 증가(학생을 데려다주는 경우 등)의 영향  
 - 또 개학을 한다는 것은 상대적으로 안전해졌다는 것을 의미하여 활동 인구의 증가에 따른 영향이 있을 것으로 사료됨  
 - 고3 기준 개학일은 5월 20일  

In [12]:
# Split confirmed cases into before / after the school starting date,
# separately for patients in their 10s and for everyone else.
starting_school_date = "2020-05-20"

# before_starting_school = BSS // after_starting_school = ASS
data_of_10s_patient_in_BSS = []
data_of_10s_patient_in_ASS = []
num_of_ex10s_patient_BSS = 0
num_of_ex10s_patient_ASS = 0

for patient in raw_:
    birth_year = patient[3]      # birth_year: 3
    confirmed_on = patient[14]   # confirmed_date: 14

    # NOTE: a blank birth year is treated as age 0, so patients of unknown
    # age are counted in the "excluding 10s" group.
    age = 0 if birth_year in ('', ' ') else 2020 - int(birth_year)
    is_10s = 10 <= age < 20

    # Dates are compared as strings against "2020-05-20".
    # NOTE: a blank date compares lower and is counted as "before".
    after_start = confirmed_on >= starting_school_date

    if is_10s:
        if after_start:
            data_of_10s_patient_in_ASS.append(patient)
        else:
            data_of_10s_patient_in_BSS.append(patient)
    elif after_start:
        num_of_ex10s_patient_ASS += 1
    else:
        num_of_ex10s_patient_BSS += 1

num_10s_after = len(data_of_10s_patient_in_ASS)
num_10s_before = len(data_of_10s_patient_in_BSS)

all_patient_after_school = num_10s_after + num_of_ex10s_patient_ASS
all_patient_before_school = num_10s_before + num_of_ex10s_patient_BSS

print("Number of confirmed 10s after starting school : " + str(num_10s_after))
print("Number of confirmed 10s before starting school : " + str(num_10s_before))
print("Number of confirmed patient excluding 10s after starting school : " + str(num_of_ex10s_patient_ASS))
print("Number of confirmed patient excluding 10s before starting school : " + str(num_of_ex10s_patient_BSS))

# Percentage of 10s among all patients confirmed in each period.
print("Ratio of 10s of COVID 19 patient after starting school : "
      + str(num_10s_after / all_patient_after_school * 100))
print("Ratio of 10s of COVID 19 patient before starting school : "
      + str(num_10s_before / all_patient_before_school * 100))

Number of confirmed 10s after starting school : 18
Number of confirmed 10s before starting school : 148
Number of confirmed patient excluding 10s after starting school : 337
Number of confirmed patient excluding 10s before starting school : 3501
Ratio of 10s of COVID 19 patient after starting school : 5.070422535211268
Ratio of 10s of COVID 19 patient before starting school : 4.055905727596602


In [14]:
# Stacked bar chart: share of 10s vs. everyone else, before and after
# school started. Every list below is ordered [before, after].
x_axis_text = ['before', 'after']

totals = [all_patient_before_school, all_patient_after_school]
other_counts = [num_of_ex10s_patient_BSS, num_of_ex10s_patient_ASS]
teen_counts = [len(data_of_10s_patient_in_BSS), len(data_of_10s_patient_in_ASS)]

ex10s_patient_list = [count / total * 100 for count, total in zip(other_counts, totals)]
patient_10s_list = [count / total * 100 for count, total in zip(teen_counts, totals)]

plt.xticks([0, 1], x_axis_text)

bar_positions = range(len(x_axis_text))
plt.bar(bar_positions, ex10s_patient_list, color='bisque')
# The 10s share is stacked on top of the non-10s share.
plt.bar(bar_positions, patient_10s_list, bottom=ex10s_patient_list, color='coral')

plt.title("Num. of COVID 19 patients by starting school in South Korea")
plt.ylabel("ratio of num. patients [%]")
plt.xlabel("starting school")
plt.legend(['except 10s', '10s'])

plt.ylim([0, 100])

# Print the 10s percentage slightly below the top of the lower (non-10s) bar.
for pos, (lower_height, teen_ratio) in enumerate(zip(ex10s_patient_list, patient_10s_list)):
    plt.text(pos - 0.08, lower_height - 6, "{0:0.1f}%".format(teen_ratio), color='coral', fontweight='bold')

plt.show()

<IPython.core.display.Javascript object>

#### 개학 전 후 10대 확진자들의 감염 원인

In [22]:
def _first_seen_infection_cases(patients):
    """Return the distinct infection cases of *patients* in first-seen order."""
    cases = []
    for patient in patients:
        # infection case : 9
        case = patient[9]
        if case not in cases:
            cases.append(case)
    return cases


# Infection cases of 10s patients confirmed before school started
infection_case_of_10s_patient_in_BSS = _first_seen_infection_cases(data_of_10s_patient_in_BSS)

# Infection cases of 10s patients confirmed after school started
infection_case_of_10s_patient_in_ASS = _first_seen_infection_cases(data_of_10s_patient_in_ASS)

In [23]:
# List the distinct infection cases collected for the before-school group.
print("infection case before starting school ... ")
pprint.pprint(infection_case_of_10s_patient_in_BSS)

infection case before starting school ... 
['Seongdong-gu APT',
 'contact with patient',
 'etc',
 'Dongan Church',
 'Guro-gu Call Center',
 'overseas inflow',
 'Onchun Church',
 'Shincheonji Church',
 '',
 'Changnyeong Coin Karaoke']


In [24]:
# List the distinct infection cases collected for the after-school group.
print("infection case after starting school ... ")
pprint.pprint(infection_case_of_10s_patient_in_ASS)

infection case after starting school ... 
['etc', 'contact with patient', 'overseas inflow', 'Coupang Logistics Center']


In [30]:
def _count_infection_cases(patients, blank_label='etc'):
    """Count patients per infection case.

    A blank infection case ('') is counted under *blank_label*, so that
    unspecified cases and 'etc' share a single bar in the charts.

    Returns a ``(labels, counts)`` pair of equally long lists; labels are
    in first-seen order and ``counts[k]`` belongs to ``labels[k]``.
    """
    labels = []
    tally = {}
    for patient in patients:
        # infection case : 9
        case = patient[9] or blank_label
        if case not in tally:
            labels.append(case)
            tally[case] = 0
        tally[case] += 1
    return labels, [tally[label] for label in labels]


# Both label lists are rebuilt from the data instead of being edited in place
# at hard-coded positions, so the result does not depend on the order or
# number of distinct cases and re-running this cell yields the same lists.

# Number of 10s patients per infection case, before school started
(infection_case_of_10s_patient_in_BSS,
 num_infection_case_of_10s_patient_in_BSS) = _count_infection_cases(data_of_10s_patient_in_BSS)

# Number of 10s patients per infection case, after school started
(infection_case_of_10s_patient_in_ASS,
 num_infection_case_of_10s_patient_in_ASS) = _count_infection_cases(data_of_10s_patient_in_ASS)

In [31]:
# Horizontal bar chart: 10s patients per infection case, before school started
infection_case = infection_case_of_10s_patient_in_BSS
num_of_case = num_infection_case_of_10s_patient_in_BSS

y_pos = list(range(len(infection_case)))

plt.barh(y_pos, num_of_case, color='slateblue')
plt.ylabel("infection case")
plt.xlabel("number of patients")
plt.title("Num. of COVID 19 patients 10s by infection case" + "\n" + "before school in South Korea")

plt.yticks(y_pos, infection_case)
plt.xlim([0, 90])

plt.tight_layout()

# Write each count next to its bar: just past the bar's end in black when
# the count is below 100, otherwise inside the bar in white.
for row, count in enumerate(num_of_case):
    if count < 100:
        x_shift, txt_color = -2, 'black'
    else:
        x_shift, txt_color = 60, 'white'

    plt.text(count - x_shift, row - .15, str(count), color=txt_color, fontweight='bold')

plt.show()

<IPython.core.display.Javascript object>

In [32]:
# Horizontal bar chart: 10s patients per infection case, after school started
infection_case = infection_case_of_10s_patient_in_ASS
num_of_case = num_infection_case_of_10s_patient_in_ASS

y_pos = [i for i, _ in enumerate(infection_case)]

plt.barh(y_pos, num_of_case, color='slateblue')
plt.ylabel("infection case")
plt.xlabel("number of patients")
# This cell plots the after-school (ASS) data, so the title says "after";
# it previously repeated the "before school" title of the preceding chart.
plt.title("Num. of COVID 19 patients 10s by infection case" + "\n" + "after school in South Korea")

plt.yticks(y_pos, infection_case)
plt.xlim([0, 20])

plt.tight_layout()

# Write each count next to its bar: just past the bar's end in black when
# the count is below 100, otherwise inside the bar in white.
for i, v in enumerate(num_of_case):
    x_axis_txt_mover = 60
    y_axis_txt_mover = .15
    txt_color = 'white'

    if v < 100:
        x_axis_txt_mover = -2
        txt_color = 'black'

    plt.text(v - x_axis_txt_mover, i - y_axis_txt_mover, str(v), color=txt_color, fontweight='bold')

plt.show()

<IPython.core.display.Javascript object>