In [2]:
import pandas as pd

In [5]:
# How to use apply():
# it runs a function we define over every column or every row of a DataFrame.

df = pd.DataFrame(
    [[60, 100], [70, 70]],
    index=['Dave', 'David'],
    columns=['영어', '수학'],
)
df

Unnamed: 0,영어,수학
Dave,60,100
David,70,70


In [8]:
def func(df_data):
    """Print the type, index and values of ``df_data``, then return it unchanged.

    Passed to ``DataFrame.apply`` in the following cells to show what object
    the callback receives on each call.
    """
    for detail in (type(df_data), df_data.index, df_data.values):
        print(detail)
    return df_data

In [9]:
# axis='index' (same as axis=0): func receives one column at a time
# axis='columns' (same as axis=1): func receives one row at a time
df_func = df.apply(func, axis='index')

<class 'pandas.core.series.Series'>
Index(['Dave', 'David'], dtype='object')
[60 70]
<class 'pandas.core.series.Series'>
Index(['Dave', 'David'], dtype='object')
[100  70]


In [12]:
# NOTE: an earlier version of this comment said func is called three times for
# two rows because apply() evaluates the first row twice. The output recorded
# below shows one call per row (two in total); the extra call is presumably a
# quirk of older pandas releases -- confirm against the installed version.

df_func = df.apply(func, axis='columns')
df_func

<class 'pandas.core.series.Series'>
Index(['영어', '수학'], dtype='object')
[ 60 100]
<class 'pandas.core.series.Series'>
Index(['영어', '수학'], dtype='object')
[70 70]


Unnamed: 0,영어,수학
Dave,60,100
David,70,70


---
### apply() 함수 사용해서, 국가 컬럼값 변경하기

In [40]:
# Load a single daily report and keep only the columns used below.
doc = pd.read_csv('./file/COVID-19-master/csse_covid_19_data/csse_covid_19_daily_reports/01-22-2020.csv', encoding='utf-8-sig')

# The fallback handles report files whose headers are spelled with '/' instead
# of '_'. Only KeyError (a requested column is missing) is caught, so unrelated
# failures are no longer silently swallowed by a bare `except:`.
try:
    doc = doc[['Province_State', 'Country_Region', 'Confirmed']]
except KeyError:
    doc = doc[['Province/State', 'Country/Region', 'Confirmed']]
    doc.columns = ['Province_State', 'Country_Region', 'Confirmed']

# Rows without a confirmed count are dropped before casting to integers.
doc = doc.dropna(subset=['Confirmed'])
doc = doc.astype({'Confirmed': 'int64'})

doc.head()

Unnamed: 0,Province_State,Country_Region,Confirmed
0,Anhui,Mainland China,1
1,Beijing,Mainland China,14
2,Chongqing,Mainland China,6
3,Fujian,Mainland China,1
5,Guangdong,Mainland China,26


In [41]:
# 다르게 들어간 국가명을 바로 지정하기
import json

In [42]:
# Read the country-name conversion table and list the names it can convert.
with open('./file/COVID-19-master/csse_covid_19_data/country_convert.json', 'r', encoding='utf-8-sig') as json_file:
    json_data = json.loads(json_file.read())

print(json_data.keys())

dict_keys(['Mainland China', 'Macau', 'South Korea', 'Aruba', ' Azerbaijan', 'Bahamas, The', 'Cape Verde', 'Cayman Islands', 'Channel Islands', 'Curacao', 'Czech Republic', 'East Timor', 'Faroe Islands', 'French Guiana', 'Gambia, The', 'Gibraltar', 'Greenland', 'Guadeloupe', 'Guam', 'Guernsey', 'Hong Kong', 'Hong Kong SAR', 'Iran (Islamic Republic of)', 'Ivory Coast', 'Jersey', 'Macao SAR', 'Martinique', 'Mayotte', 'North Ireland', 'Palestine', 'Puerto Rico', 'Republic of Ireland', 'Republic of Korea', 'Republic of Moldova', 'Republic of the Congo', 'Reunion', 'Russian Federation', 'Saint Barthelemy', 'Saint Martin', 'St. Martin', 'Taipei and environs', 'The Bahamas', 'The Gambia', 'UK', 'Vatican City', 'Viet Nam', 'occupied Palestinian territory', 'Taiwan*', 'Malawi', 'South Sudan', 'Western Sahara', 'Namibia'])


In [43]:
# Display the mapping itself: each key is a country name as it appears in the
# reports, each value is the name it should be replaced with.
json_data

{'Mainland China': 'China',
 'Macau': 'China',
 'South Korea': 'Korea, South',
 'Aruba': 'Netherlands',
 ' Azerbaijan': 'Azerbaijan',
 'Bahamas, The': 'Bahamas',
 'Cape Verde': 'Cabo Verde',
 'Cayman Islands': 'United Kingdom',
 'Channel Islands': 'United Kingdom',
 'Curacao': 'Netherlands',
 'Czech Republic': 'Czechia',
 'East Timor': 'Timor-Leste',
 'Faroe Islands': 'Denmark',
 'French Guiana': 'France',
 'Gambia, The': 'Gambia',
 'Gibraltar': 'United Kingdom',
 'Greenland': 'Denmark',
 'Guadeloupe': 'France',
 'Guam': 'US',
 'Guernsey': 'US',
 'Hong Kong': 'China',
 'Hong Kong SAR': 'China',
 'Iran (Islamic Republic of)': 'Iran',
 'Ivory Coast': "Cote d'Ivoire",
 'Jersey': 'US',
 'Macao SAR': 'China',
 'Martinique': 'France',
 'Mayotte': 'France',
 'North Ireland': 'United Kingdom',
 'Palestine': 'West Bank and Gaza',
 'Puerto Rico': 'US',
 'Republic of Ireland': 'Ireland',
 'Republic of Korea': 'Korea, South',
 'Republic of Moldova': 'Moldova',
 'Republic of the Congo': 'Congo (Bra

In [44]:
# Replace a column value
def func(row):
    """Rewrite ``row['Country_Region']`` using ``json_data`` and return the row.

    Rows whose country name is not a key of ``json_data`` are returned as is.
    """
    country = row['Country_Region']
    if country not in json_data:
        return row
    row['Country_Region'] = json_data[country]
    return row

In [45]:
# axis='columns' (same as axis=1) hands func one row at a time.
doc = doc.apply(func, axis='columns')
doc.head()

Unnamed: 0,Province_State,Country_Region,Confirmed
0,Anhui,China,1
1,Beijing,China,14
2,Chongqing,China,6
3,Fujian,China,1
5,Guangdong,China,26


----

In [46]:
# Derive the date label from the report's file name:
# drop the extension, strip leading zeros, and use '/' between the parts.
date = '01-22-2020.csv'
date_column = '/'.join(date.partition('.')[0].lstrip('0').split('-'))
date_column

'1/22/2020'

In [47]:
# Check the current column labels before renaming the last one.
doc.columns

Index(['Province_State', 'Country_Region', 'Confirmed'], dtype='object')

In [48]:
# Relabel the columns in place so the count column carries the report date.
# The assignment is positional: it relies on `doc` having exactly these three
# columns in this order.
doc.columns = ['Province_State', 'Country_Region', date_column]
doc.columns

Index(['Province_State', 'Country_Region', '1/22/2020'], dtype='object')

In [49]:
# Preview the frame with the date-labelled column.
doc.head()

Unnamed: 0,Province_State,Country_Region,1/22/2020
0,Anhui,China,1
1,Beijing,China,14
2,Chongqing,China,6
3,Fujian,China,1
5,Guangdong,China,26


---

In [51]:
# Combining duplicated rows

df = pd.DataFrame(
    [
        ['남', 'David', 100, 80],
        ['남', 'Dave', 50, 70],
        ['남', 'Dave', 80, 50],
    ],
    columns=['성별', '이름', '수학', '국어'],
)
df

Unnamed: 0,성별,이름,수학,국어
0,남,David,100,80
1,남,Dave,50,70
2,남,Dave,80,50


In [52]:
# groupby() followed by an aggregate: the grouping column becomes the index of
# the result and only the numeric columns are aggregated.
# numeric_only=True is passed explicitly because pandas 2.0+ no longer drops
# string columns on its own; mean() would raise on the string column '성별'.

df.groupby('이름').mean(numeric_only=True)

Unnamed: 0_level_0,수학,국어
이름,Unnamed: 1_level_1,Unnamed: 2_level_1
Dave,65,60
David,100,80


In [53]:
# numeric_only=True keeps the string column '성별' out of the sum; without it
# pandas 2.0+ would concatenate the strings instead of dropping the column.
df.groupby('이름').sum(numeric_only=True)

Unnamed: 0_level_0,수학,국어
이름,Unnamed: 1_level_1,Unnamed: 2_level_1
Dave,130,120
David,100,80


In [54]:
# Sum the counts per country. numeric_only=True leaves the text column
# 'Province_State' out of the aggregation on every pandas version.
doc.groupby('Country_Region').sum(numeric_only=True)

Unnamed: 0_level_0,1/22/2020
Country_Region,Unnamed: 1_level_1
China,548
Japan,2
"Korea, South",1
Taiwan,1
Thailand,2
US,1


----

In [55]:
# 데이터 전처리 하기
# 1) csv 파일 읽기
# 2) 'Country_Region', 'Confirmed' 두개 컬럼 가져오기
# 3) 'Confirmed' 에 데이터가 없는 행 삭제하기
# 4) 'Country_Region' 의 국가명 일관되게 변경
# 5) 'Confirmed' 데이터 정수형으로 변경
# 6) 'Country_Region' 를 기준으로 중복된 데이터 합치기
# 7) 'Confirmed' 컬럼 날짜 컬럼으로 변경하기

In [59]:
import pandas as pd
import json

# Folder holding the daily report CSV files (note the trailing slash:
# create_dateframe builds file paths from this value).
PATH = './file/COVID-19-master/csse_covid_19_data/csse_covid_19_daily_reports/'

# Country-name conversion table used by country_name_convert.
with open('./file/COVID-19-master/csse_covid_19_data/country_convert.json', encoding='utf-8-sig') as json_file:
    json_data = json.loads(json_file.read())


def country_name_convert(row):
    """Return the converted country name for ``row``.

    Looks up ``row['Country_Region']`` in ``json_data``; names that are not a
    key of the table are returned unchanged.
    """
    country = row['Country_Region']
    return json_data[country] if country in json_data else country


def create_dateframe(filename, path=None):
    """Load one daily report and return the confirmed counts summed per country.

    Steps: read the CSV, keep the country and 'Confirmed' columns, drop rows
    without a confirmed count, convert country names through ``json_data``,
    cast the counts to int64, sum them per country, and label the single
    result column with a date derived from ``filename``.

    Args:
        filename: Report file name such as '01-22-2020.csv'.
        path: Directory that contains the file. When omitted, the module-level
            ``PATH`` is used, which is what the function always did before.

    Returns:
        A DataFrame indexed by 'Country_Region' with one column whose label is
        the file name without extension, leading zeros stripped and '-'
        replaced by '/' (e.g. '04-01-2020.csv' -> '4/01/2020').

    Raises:
        KeyError: if the file has neither spelling of the required columns.
    """
    import os  # local import: this cell can run before the notebook's `import os` cell

    base_dir = PATH if path is None else path
    report = pd.read_csv(os.path.join(base_dir, filename), encoding='utf-8-sig')

    # Some report files spell the header 'Country/Region'. Catch only the
    # KeyError raised for a missing column instead of swallowing every error.
    try:
        doc = report[['Country_Region', 'Confirmed']]
    except KeyError:
        doc = report[['Country/Region', 'Confirmed']].rename(
            columns={'Country/Region': 'Country_Region'}
        )

    # Work on an explicit copy so the column assignment below never writes
    # into a view of `report`.
    doc = doc.dropna(subset=['Confirmed']).copy()

    # Same lookup as country_name_convert, applied to the column directly;
    # unlike a row-wise apply this also works when no rows are left.
    doc['Country_Region'] = doc['Country_Region'].map(
        lambda name: json_data.get(name, name)
    )

    doc = doc.astype({'Confirmed': 'int64'})
    doc = doc.groupby('Country_Region').sum()

    date_column = filename.split(".")[0].lstrip('0').replace('-', '/')
    doc.columns = [date_column]

    return doc


In [64]:
# Build one per-country frame for each of two report dates.
doc1, doc2 = (
    create_dateframe(report_name)
    for report_name in ('01-22-2020.csv', '04-01-2020.csv')
)
doc2.head()

Unnamed: 0_level_0,4/01/2020
Country_Region,Unnamed: 1_level_1
Afghanistan,237
Albania,259
Algeria,847
Andorra,390
Angola,8


---

In [65]:
# Combine the two frames on their country index; the outer join keeps
# countries that appear in only one of them (missing counts become NaN).

doc = doc1.merge(doc2, how='outer', left_index=True, right_index=True)
doc.head(15)

Unnamed: 0_level_0,1/22/2020,4/01/2020
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,,237
Albania,,259
Algeria,,847
Andorra,,390
Angola,,8
Antigua and Barbuda,,7
Argentina,,1054
Armenia,,571
Australia,,4862
Austria,,10711


In [66]:
# A country missing from one report has no count for that date: treat it as 0.
doc = doc.fillna(value=0)
doc

Unnamed: 0_level_0,1/22/2020,4/01/2020
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1
Afghanistan,0.0,237
Albania,0.0,259
Algeria,0.0,847
Andorra,0.0,390
Angola,0.0,8
...,...,...
Venezuela,0.0,143
Vietnam,0.0,218
West Bank and Gaza,0.0,134
Zambia,0.0,36


---

In [70]:
# 특정 폴더 리스트 가져오기

import os

# Keep only the entries of PATH whose last dot-separated part is 'csv'.
file_list = os.listdir(PATH)
csv_list = [entry for entry in file_list if entry.split('.')[-1] == 'csv']

# csv_list

---


In [71]:
# 최종 코드

def generate_dateframe_by_path(PATH):
    """Build one country-by-date table from every CSV report in ``PATH``.

    Each CSV file is converted with ``create_dateframe`` and the resulting
    single-column frames are aligned on their country index. Countries that
    are absent from a report get 0 for that date.

    Note: ``create_dateframe`` is called with the file name only, so it reads
    from its own default directory (the module-level ``PATH``); the argument
    given here must point to that same directory.

    Args:
        PATH: Directory whose entries are listed to find the report files.

    Returns:
        DataFrame indexed by country (sorted), one column per report, columns
        in chronological order of the file names.

    Raises:
        ValueError: if ``PATH`` contains no CSV file.
    """
    from datetime import datetime  # local import keeps this cell self-contained

    def _chronological_key(filename):
        # File names are MM-DD-YYYY, so a plain string sort is only correct
        # within one year. Names that are not dates are placed after the dated
        # ones, ordered by name.
        stem = filename.rsplit('.', 1)[0]
        try:
            return (0, datetime.strptime(stem, '%m-%d-%Y'), filename)
        except ValueError:
            return (1, datetime.min, filename)

    csv_list = sorted(
        (name for name in os.listdir(PATH) if name.split('.')[-1] == 'csv'),
        key=_chronological_key,
    )
    if not csv_list:
        # Previously this case failed with an UnboundLocalError on final_doc.
        raise ValueError('no csv files found in ' + repr(PATH))

    frames = [create_dateframe(name) for name in csv_list]

    # One outer-aligned concat replaces the chain of pairwise outer merges;
    # sort_index() reproduces the sorted country index those merges produced.
    final_doc = pd.concat(frames, axis=1, join='outer').sort_index()

    return final_doc.fillna(0)

In [72]:
# Run the whole pipeline over the daily report folder.
# PATH is assigned again here with the same value used when the helpers were defined.
PATH = './file/COVID-19-master/csse_covid_19_data/csse_covid_19_daily_reports/'
doc = generate_dateframe_by_path(PATH)
doc

Unnamed: 0_level_0,1/22/2020,1/23/2020,1/24/2020,1/25/2020,1/26/2020,1/27/2020,1/28/2020,1/29/2020,1/30/2020,1/31/2020,...,6/08/2020,6/09/2020,6/10/2020,6/11/2020,6/12/2020,6/13/2020,6/14/2020,6/15/2020,6/16/2020,6/17/2020
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,20917.0,21459.0,22142.0,22890.0,23546.0,24102.0,24766.0,25527.0,26310.0,26874.0
Albania,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1263.0,1299.0,1341.0,1385.0,1416.0,1464.0,1521.0,1590.0,1672.0,1722.0
Algeria,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10265.0,10382.0,10484.0,10589.0,10698.0,10810.0,10919.0,11031.0,11147.0,11268.0
Andorra,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,852.0,852.0,852.0,852.0,853.0,853.0,853.0,853.0,854.0,854.0
Angola,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,92.0,96.0,113.0,118.0,130.0,138.0,140.0,142.0,148.0,155.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Vietnam,0.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,332.0,332.0,332.0,332.0,333.0,334.0,334.0,334.0,334.0,335.0
West Bank and Gaza,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,473.0,481.0,485.0,487.0,489.0,489.0,492.0,505.0,514.0,555.0
Yemen,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,496.0,524.0,560.0,591.0,632.0,705.0,728.0,844.0,885.0,902.0
Zambia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1200.0,1200.0,1200.0,1200.0,1321.0,1357.0,1358.0,1382.0,1405.0,1412.0


In [73]:
# Filling the gaps left float columns behind (see the 0.0 values above);
# convert every column back to integer counts.
doc = doc.astype(dtype='int64')
doc

Unnamed: 0_level_0,1/22/2020,1/23/2020,1/24/2020,1/25/2020,1/26/2020,1/27/2020,1/28/2020,1/29/2020,1/30/2020,1/31/2020,...,6/08/2020,6/09/2020,6/10/2020,6/11/2020,6/12/2020,6/13/2020,6/14/2020,6/15/2020,6/16/2020,6/17/2020
Country_Region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Afghanistan,0,0,0,0,0,0,0,0,0,0,...,20917,21459,22142,22890,23546,24102,24766,25527,26310,26874
Albania,0,0,0,0,0,0,0,0,0,0,...,1263,1299,1341,1385,1416,1464,1521,1590,1672,1722
Algeria,0,0,0,0,0,0,0,0,0,0,...,10265,10382,10484,10589,10698,10810,10919,11031,11147,11268
Andorra,0,0,0,0,0,0,0,0,0,0,...,852,852,852,852,853,853,853,853,854,854
Angola,0,0,0,0,0,0,0,0,0,0,...,92,96,113,118,130,138,140,142,148,155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Vietnam,0,2,2,2,2,2,2,2,2,2,...,332,332,332,332,333,334,334,334,334,335
West Bank and Gaza,0,0,0,0,0,0,0,0,0,0,...,473,481,485,487,489,489,492,505,514,555
Yemen,0,0,0,0,0,0,0,0,0,0,...,496,524,560,591,632,705,728,844,885,902
Zambia,0,0,0,0,0,0,0,0,0,0,...,1200,1200,1200,1200,1321,1357,1358,1382,1405,1412


In [75]:
# to_csv() does not create missing folders, so make sure 'result' exists first.
os.makedirs('result', exist_ok=True)
doc.to_csv('result/final_df.csv')