In [30]:
## Query real time data from DingXiangYuan, and keep the latest records every day for each city

## Created on Sat Feb  8 12:41:50 2020
## Author: leebond
#### resource: https://github.com/jianxu305/nCov2019_analysis

import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import pickle as pkl
import numpy as np
import math
import datetime
import warnings
warnings.filterwarnings('ignore')
from googletrans import Translator # package used to translate Chinese into English
translator = Translator()

In [31]:
## Query the latest data: run the project's dataset.py script in a subshell
## (IPython `!` shell escape). The script prints which CSV file it updated.
! python dataset.py

2020-02-14 17:35:15Update records successfully to ../data/DXY_Chinese.csv


In [32]:
# Load the Chinese-language DXY export produced by dataset.py.
DXYArea = pd.read_csv('../data/DXY_Chinese.csv')
# Keep only the columns used downstream, in this order.
DXYArea = DXYArea.loc[:, [
    'date', 'country', 'countryCode', 'province', 'city',
    'confirmed', 'suspected', 'cured', 'dead',
]]

DXYArea.tail(n=2)

Unnamed: 0,date,country,countryCode,province,city,confirmed,suspected,cured,dead
10542,2020-02-14,美国,US,,,15,0,3,0
10543,2020-02-14,越南,VN,,,16,0,5,0


In [33]:
# CN rows that carry no province name (these appear to be the country-level totals).
DXYArea.loc[DXYArea['countryCode'].eq('CN') & DXYArea['province'].isna()].tail(n=5)

Unnamed: 0,date,country,countryCode,province,city,confirmed,suspected,cured,dead
7832,2020-02-10,中国,CN,,,42708,21675,3998,1017
8375,2020-02-11,中国,CN,,,44730,16067,4742,1114
8919,2020-02-12,中国,CN,,,58839,13435,5646,1260
9462,2020-02-13,中国,CN,,,63932,10109,6728,1381
10005,2020-02-14,中国,CN,,,63936,10109,6869,1381


In [34]:
def isNaN(num):
    """Return True when ``num`` compares unequal to itself, which is how a float NaN behaves.

    Strings, ints, None and other ordinary scalars compare equal to themselves
    and therefore yield False.
    """
    differs_from_itself = num != num
    return differs_from_itself

## Resources for Chinese -> English translation.
## prov_dict and city_dict are indexed by the original province / city name
## in the translation helpers (presumably plain dicts; verify the .pkl contents).
## NOTE(review): pickle.load can execute arbitrary code, so only load these
## .pkl files when they come from a trusted source.
with open('chineseProvince_to_EN.pkl','rb') as f:
    prov_dict = pkl.load(f)

    
with open('chineseCity_to_EN.pkl','rb') as f:
    city_dict = pkl.load(f)    
        
def translate_to_English(data, prov_dict, city_dict):
    """Return a copy of ``data`` with province and city names translated to English.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``province`` and ``city`` columns. Missing values are kept as-is.
    prov_dict : mapping
        Lookup from original province name to English name.
    city_dict : mapping
        Lookup from original city name to English name.

    Returns
    -------
    pandas.DataFrame
        A new frame with translated names. Rows whose city is listed in the
        module-level ``unable_translation`` list are dropped. ``data`` itself
        is left untouched, so the call can be repeated safely.

    Raises
    ------
    KeyError
        If a non-missing province name is absent from ``prov_dict``.
    """
    # Build a new frame instead of assigning into ``data``: translating in
    # place meant a second run looked up the already-English names, which are
    # not keys of the lookup tables.
    translated = data.assign(
        province=data['province'].apply(
            lambda name: getProvinceTranslation(name, prov_dict)),
        city=data['city'].apply(
            lambda name: getCityTranslation(name, city_dict)),
    )

    # Drop every row whose city could not be translated, in a single pass.
    untranslated = translated['city'].isin(unable_translation)
    return translated[~untranslated]


def getProvinceTranslation(name, mapping=None):
    """Translate one province name; missing values are returned unchanged.

    ``mapping`` defaults to the module-level ``prov_dict`` so existing
    single-argument callers keep working. Raises ``KeyError`` for a
    non-missing name that the mapping does not contain.
    """
    if pd.isna(name):
        return name
    if mapping is None:
        mapping = prov_dict
    return mapping[name]


# City names for which no English translation was found (each name stored once).
unable_translation = []


def getCityTranslation(name, mapping=None):
    """Translate one city name; missing values are returned unchanged.

    ``mapping`` defaults to the module-level ``city_dict``. A name that is not
    in the mapping is recorded in ``unable_translation`` and returned as-is,
    so the caller can filter those rows out afterwards.
    """
    if pd.isna(name):
        return name
    if mapping is None:
        mapping = city_dict
    try:
        return mapping[name]
    except KeyError:
        # Only a missing key means "no translation"; any other error should surface.
        if name not in unable_translation:
            unable_translation.append(name)
        return name
    

In [35]:
# Translate province / city names with the lookup tables loaded earlier.
daily_frm_DXYArea = translate_to_English(
    data=DXYArea,
    prov_dict=prov_dict,
    city_dict=city_dict,
)

daily_frm_DXYArea.head(n=3)

Unnamed: 0,date,country,countryCode,province,city,confirmed,suspected,cured,dead
0,2019-12-01,中国,CN,,,1,0,0,0
1,2019-12-01,中国,CN,Hubei Province,,1,0,0,0
2,2019-12-01,中国,CN,Hubei Province,Wuhan,1,0,0,0


In [36]:
# Same check on the translated frame: CN rows that carry no province name.
daily_frm_DXYArea.loc[
    daily_frm_DXYArea['countryCode'].eq('CN') & daily_frm_DXYArea['province'].isna()
].tail(n=5)

Unnamed: 0,date,country,countryCode,province,city,confirmed,suspected,cured,dead
7832,2020-02-10,中国,CN,,,42708,21675,3998,1017
8375,2020-02-11,中国,CN,,,44730,16067,4742,1114
8919,2020-02-12,中国,CN,,,58839,13435,5646,1260
9462,2020-02-13,中国,CN,,,63932,10109,6728,1381
10005,2020-02-14,中国,CN,,,63936,10109,6869,1381


In [37]:
def add_days(DXYArea: pd.DataFrame,
             first_day: datetime.datetime = datetime.datetime(2019, 12, 8)) -> pd.DataFrame:
    """Return a copy of ``DXYArea`` with a parsed ``date`` column and a ``Days`` column.

    Parameters
    ----------
    DXYArea : pandas.DataFrame
        Frame with a ``date`` column that ``pd.to_datetime`` can parse.
    first_day : datetime-like, optional
        Reference day counted as day 0. Defaults to 2019-12-08, the day the
        first case was detected.

    Returns
    -------
    pandas.DataFrame
        New frame where ``date`` is datetime-typed and ``Days`` holds the whole
        number of days between ``date`` and ``first_day``. The input frame is
        not modified.
    """
    parsed_dates = pd.to_datetime(DXYArea['date'])
    days_since_first = (parsed_dates - pd.Timestamp(first_day)).dt.days
    # assign() builds a new frame, so a filtered slice passed in is never
    # written to and re-running the cell stays idempotent.
    return DXYArea.assign(date=parsed_dates, Days=days_since_first)

def add_net_confirmed_case(DXYArea: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``DXYArea`` with a ``net_confirmed`` column added.

    ``net_confirmed = confirmed - cured - dead``, computed row by row from the
    columns of the same names. The input frame is not modified.
    """
    net_confirmed = DXYArea['confirmed'] - DXYArea['cured'] - DXYArea['dead']
    # assign() returns a new frame rather than writing into the caller's object.
    return DXYArea.assign(net_confirmed=net_confirmed)

In [38]:
# Append the derived columns: days elapsed since 2019-12-08, then net confirmed cases.
daily_frm_DXYArea = (
    daily_frm_DXYArea
    .pipe(add_days)
    .pipe(add_net_confirmed_case)
)

In [39]:
# Preview the last rows to confirm the Days and net_confirmed columns are present.
daily_frm_DXYArea.tail(n=5)

Unnamed: 0,date,country,countryCode,province,city,confirmed,suspected,cured,dead,Days,net_confirmed
10539,2020-02-14,瑞典,SE,,,1,0,0,0,68,1
10540,2020-02-14,新加坡,SG,,,58,0,15,0,68,43
10541,2020-02-14,泰国,TH,,,33,0,10,0,68,23
10542,2020-02-14,美国,US,,,15,0,3,0,68,12
10543,2020-02-14,越南,VN,,,16,0,5,0,68,11


In [40]:
# Write the enriched table to disk with a header row and without the row index.
# `index` is documented as a boolean, so pass False explicitly rather than
# relying on None being treated as falsy.
daily_frm_DXYArea.to_csv('../data/DXYArea.csv', index=False, header=True)