## data

full_dataset feature : Province/state, country/region, lat, long, date, confirmed, death, recovered 

In [0]:
import requests
import pandas as pd
import wget

In [0]:
# data file url (실시간 데이터)
urls = ['https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_confirmed_global.csv', 
        'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_deaths_global.csv',
        'https://raw.githubusercontent.com/CSSEGISandData/COVID-19/master/csse_covid_19_data/csse_covid_19_time_series/time_series_covid19_recovered_global.csv']

# 파일 다운
for url in urls:
    filename = wget.download(url)

In [0]:
# data
conf_df = pd.read_csv('time_series_covid19_confirmed_global.csv')
deaths_df = pd.read_csv('time_series_covid19_deaths_global.csv')
recv_df = pd.read_csv('time_series_covid19_recovered_global.csv')

In [0]:
# 데이터에서 Date만 따로 추출
dates = conf_df.columns[4:]

In [51]:
conf_df_long = conf_df.melt(id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'], 
                            value_vars=dates, var_name='Date', value_name='Confirmed')

deaths_df_long = deaths_df.melt(id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'], 
                            value_vars=dates, var_name='Date', value_name='Deaths')

recv_df_long = recv_df.melt(id_vars=['Province/State', 'Country/Region', 'Lat', 'Long'], 
                            value_vars=dates, var_name='Date', value_name='Recovered')

recv_df_long = recv_df_long[recv_df_long['Country/Region']!='Canada']

print(conf_df_long.shape)
print(deaths_df_long.shape)
print(recv_df_long.shape)

(35112, 6)
(35112, 6)
(33264, 6)


In [52]:
#merge
full_table = pd.merge(left=conf_df_long, right=deaths_df_long, how='left',
                      on=['Province/State', 'Country/Region', 'Date', 'Lat', 'Long'])
full_table = pd.merge(left=full_table, right=recv_df_long, how='left',
                      on=['Province/State', 'Country/Region', 'Date', 'Lat', 'Long'])

full_table.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered
0,,Afghanistan,33.0,65.0,1/22/20,0,0,0.0
1,,Albania,41.1533,20.1683,1/22/20,0,0,0.0
2,,Algeria,28.0339,1.6596,1/22/20,0,0,0.0
3,,Andorra,42.5063,1.5218,1/22/20,0,0,0.0
4,,Angola,-11.2027,17.8739,1/22/20,0,0,0.0


In [54]:
full_table.shape # matrix shape check

(35112, 8)

In [55]:
full_table.isna().sum() # 결측치 확인 

Province/State    24420
Country/Region        0
Lat                   0
Long                  0
Date                  0
Confirmed             0
Deaths                0
Recovered          3696
dtype: int64

In [59]:
full_table['Province/State'].value_counts()

Faroe Islands         132
Hubei                 132
Beijing               132
Gansu                 132
Fujian                132
                     ... 
Anhui                 132
Tibet                 132
Victoria              132
Northern Territory    132
Yunnan                132
Name: Province/State, Length: 80, dtype: int64

In [0]:
# 여러개 지역이 있는 경우 제거 
full_table = full_table[full_table['Province/State'].str.contains(',')!=True]

In [0]:
# Province/State에 Recovered string이 포함된 것 제거
full_table = full_table[full_table['Province/State'].str.contains('Recovered')!=True]

In [0]:
full_table.to_csv('full_data.csv', index=False)

In [62]:
full_table.head()

Unnamed: 0,Province/State,Country/Region,Lat,Long,Date,Confirmed,Deaths,Recovered
0,,Afghanistan,33.0,65.0,1/22/20,0,0,0.0
1,,Albania,41.1533,20.1683,1/22/20,0,0,0.0
2,,Algeria,28.0339,1.6596,1/22/20,0,0,0.0
3,,Andorra,42.5063,1.5218,1/22/20,0,0,0.0
4,,Angola,-11.2027,17.8739,1/22/20,0,0,0.0


## 새로운 feature생성 
- confirmed --> death or recoveries or active로 구분가능, active 변수 생성
- 지역감염이라는 새로운 잠재변수 생성

In [64]:
# 주어진 데이터 처음과 마지막 날짜 확인 
a = full_table.Date.value_counts().sort_index()
print(a.index[0])
print(a.index[-1])

1/22/20
6/1/20


In [65]:
full_table[full_table['Recovered'].isna()]['Country/Region'].value_counts()

Canada         1848
China           528
Cameroon        132
Grenada         132
Syria           132
South Sudan     132
Tajikistan      132
Czechia         132
Timor-Leste     132
Mozambique      132
Yemen           132
Laos            132
Name: Country/Region, dtype: int64

In [0]:
# 새로운 상태 추가 : active
full_table['active'] = full_table['Confirmed']-full_table['Deaths']-full_table['Recovered']

In [67]:
full_table.columns

Index(['Province/State', 'Country/Region', 'Lat', 'Long', 'Date', 'Confirmed',
       'Deaths', 'Recovered', 'active'],
      dtype='object')

In [0]:
# 지역 감염 feature 생성 


## full_datas바탕, 장소+시간에 따른 감염 시각화

In [0]:
import plotly.express as px

In [73]:
df = full_table
df['Date'] = pd.to_datetime(df['Date'])
df['Date'] = df['Date'].dt.strftime('%m/%d/%Y')
df = df.fillna('-')
fig = px.density_mapbox(df, lat='Lat', lon='Long', z='Confirmed', radius=20,zoom=1, hover_data=["Country/Region",'Province/State',"Confirmed"],
                        mapbox_style="carto-positron", animation_frame = 'Date', range_color= [0, 2000],title='Spread of Covid-19')
fig.update_layout(margin={"r":0,"t":30,"l":0,"b":0})
fig.show()