In [1]:
import pandas as pd
import numpy as np

### Load Data

In [2]:
# Workbook locations, relative to the directory this notebook runs from:
# the three outbreak snapshot exports followed by the SARS reference data.
confirmed_xlsx, recovered_xlsx, deaths_xlsx, sars_xlsx = (
    '../data/confirmed_2020-02-08.xlsx',
    '../data/recovered_2020-02-08.xlsx',
    '../data/deaths_2020-02-08.xlsx',
    '../data/sars_final.xlsx',
)

In [3]:
# Load each workbook into its own DataFrame using read_excel's defaults.
# The paths come from the configuration cell above, in the same order as the
# names on the left-hand side.
confirmed, recovered, deaths, sars = (
    pd.read_excel(workbook_path)
    for workbook_path in (confirmed_xlsx, recovered_xlsx, deaths_xlsx, sars_xlsx)
)

In [4]:
# Preview the first rows of the confirmed-case table as loaded (wide format:
# one column per snapshot timestamp).
confirmed.head()

Unnamed: 0,Province/State,Country/Region,First Confirmed Date,Lat,Long,2020-02-05 23:00:00,2020-02-06 14:20:00,2020-02-07 22:50:00
0,Anhui,Mainland China,2020-01-03,31.82571,117.2264,591,591,733
1,Beijing,Mainland China,2020-01-03,40.18238,116.4142,274,274,315
2,Chongqing,Mainland China,2020-01-03,30.05718,107.874,389,400,426
3,Fujian,Mainland China,2020-01-03,26.07783,117.9895,215,215,239
4,Gansu,Mainland China,2020-01-03,36.0611,103.8343,62,62,71


### Melt each dataframe so date columns and counts are rows

In [15]:
def melt_case_counts(wide, value_name):
    """Reshape a wide case-count frame into long format.

    Parameters
    ----------
    wide : pandas.DataFrame
        Frame containing the descriptive columns listed in ``id_columns``
        below; every remaining column is treated as one snapshot.
    value_name : str
        Name of the column that receives the per-snapshot counts.

    Returns
    -------
    pandas.DataFrame
        One row per (location, snapshot) pair with the snapshot label in a
        ``Date`` column, ordered by ``Province/State`` and then ``Date``.
        Index labels are not reset after sorting.
    """
    id_columns = ["Province/State", "Country/Region", "First Confirmed Date", "Lat", "Long"]
    long_frame = wide.melt(id_vars=id_columns, var_name="Date", value_name=value_name)
    return long_frame.sort_values(by=['Province/State', 'Date'])


# NOTE: each name is rebound to its long-format version, so this cell is meant
# to run once, directly after the load cell; running it a second time would
# try to melt frames that are already long.
confirmed = melt_case_counts(confirmed, "Confirmed Cases")
recovered = melt_case_counts(recovered, "Recovered Cases")
deaths = melt_case_counts(deaths, "Death Cases")

confirmed.head()

Unnamed: 0,Province/State,Country/Region,First Confirmed Date,Lat,Long,Date,Confirmed Cases
0,Anhui,Mainland China,2020-01-03,31.82571,117.2264,2020-02-05 23:00:00,591
72,Anhui,Mainland China,2020-01-03,31.82571,117.2264,2020-02-06 14:20:00,591
144,Anhui,Mainland China,2020-01-03,31.82571,117.2264,2020-02-07 22:50:00,733
1,Beijing,Mainland China,2020-01-03,40.18238,116.4142,2020-02-05 23:00:00,274
73,Beijing,Mainland China,2020-01-03,40.18238,116.4142,2020-02-06 14:20:00,274


### Check to make sure all data frames have the same number of rows

In [16]:
# Entry count, dtypes and non-null counts of the melted confirmed-case frame.
confirmed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 216 entries, 0 to 174
Data columns (total 7 columns):
Province/State          216 non-null object
Country/Region          216 non-null object
First Confirmed Date    216 non-null datetime64[ns]
Lat                     216 non-null float64
Long                    216 non-null float64
Date                    216 non-null datetime64[ns]
Confirmed Cases         216 non-null int64
dtypes: datetime64[ns](2), float64(2), int64(1), object(2)
memory usage: 13.5+ KB


In [17]:
# Entry count, dtypes and non-null counts of the melted recovered-case frame.
recovered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 216 entries, 0 to 174
Data columns (total 7 columns):
Province/State          216 non-null object
Country/Region          216 non-null object
First Confirmed Date    216 non-null datetime64[ns]
Lat                     216 non-null float64
Long                    216 non-null float64
Date                    216 non-null datetime64[ns]
Recovered Cases         216 non-null int64
dtypes: datetime64[ns](2), float64(2), int64(1), object(2)
memory usage: 13.5+ KB


In [18]:
# Entry count, dtypes and non-null counts of the melted death-case frame.
deaths.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 216 entries, 0 to 174
Data columns (total 7 columns):
Province/State          216 non-null object
Country/Region          216 non-null object
First Confirmed Date    216 non-null datetime64[ns]
Lat                     216 non-null float64
Long                    216 non-null float64
Date                    216 non-null datetime64[ns]
Death Cases             216 non-null int64
dtypes: datetime64[ns](2), float64(2), int64(1), object(2)
memory usage: 13.5+ KB


### Merge all data frames into one data frame

In [19]:
# The three long frames are combined side by side, which pairs rows purely by
# index label. Check first that every label refers to the same location and
# snapshot in all three frames, so that mismatched source files fail loudly
# instead of silently attaching counts to the wrong place or time.
key_columns = ['Province/State', 'Country/Region', 'Date']
for frame_name, frame in (('recovered', recovered), ('deaths', deaths)):
    same_length = len(frame) == len(confirmed)
    aligned_keys = frame[key_columns].reindex(confirmed.index)
    if not (same_length and aligned_keys.equals(confirmed[key_columns])):
        raise ValueError(
            "'{}' rows do not line up with 'confirmed' on {}".format(frame_name, key_columns)
        )

# Keep every column of `confirmed` and add only the count column of the other
# two frames. Selecting by name (rather than by position in the concatenated
# frame) stays correct even if a source frame gains or loses a column.
df = pd.concat(
    [confirmed, recovered['Recovered Cases'], deaths['Death Cases']],
    axis=1,
)

In [20]:
# Confirm the combined frame has the expected columns and no missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 216 entries, 0 to 215
Data columns (total 9 columns):
Province/State          216 non-null object
Country/Region          216 non-null object
First Confirmed Date    216 non-null datetime64[ns]
Lat                     216 non-null float64
Long                    216 non-null float64
Date                    216 non-null datetime64[ns]
Confirmed Cases         216 non-null int64
Recovered Cases         216 non-null int64
Death Cases             216 non-null int64
dtypes: datetime64[ns](2), float64(2), int64(3), object(2)
memory usage: 16.9+ KB


### Sort data by Province/State then by Date

In [21]:
# Keep each location's rows together and order them chronologically within
# the location.
location_then_date = ['Province/State', 'Date']
df = df.sort_values(by=location_then_date)

In [22]:
# Preview the combined, sorted frame: one row per location and snapshot with
# confirmed, recovered and death counts side by side.
df.head()

Unnamed: 0,Province/State,Country/Region,First Confirmed Date,Lat,Long,Date,Confirmed Cases,Recovered Cases,Death Cases
0,Anhui,Mainland China,2020-01-03,31.82571,117.2264,2020-02-05 23:00:00,591,23,0
72,Anhui,Mainland China,2020-01-03,31.82571,117.2264,2020-02-06 14:20:00,591,34,0
144,Anhui,Mainland China,2020-01-03,31.82571,117.2264,2020-02-07 22:50:00,733,47,0
1,Beijing,Mainland China,2020-01-03,40.18238,116.4142,2020-02-05 23:00:00,274,31,1
73,Beijing,Mainland China,2020-01-03,40.18238,116.4142,2020-02-06 14:20:00,274,31,1


### Set up MongoDB Connection

In [23]:
import getpass
import os
from urllib.parse import quote_plus

import dns  # not referenced directly in this cell; presumably kept so a missing dnspython install fails here -- TODO confirm
import pymongo

# Credentials are not stored in the notebook. They are read from the
# MONGODB_USERNAME / MONGODB_PASSWORD environment variables, falling back to
# an interactive prompt when a variable is unset or empty.
# Supply the raw (unescaped) values: they are percent-escaped here so that
# characters such as '@' or ':' cannot corrupt the connection URI that is
# assembled from `username` and `password` in the next cell.
# NOTE(review): the credentials that used to be hardcoded in this cell should
# be treated as leaked and rotated.
username = quote_plus(os.environ.get('MONGODB_USERNAME') or input('MongoDB username: '))
password = quote_plus(os.environ.get('MONGODB_PASSWORD') or getpass.getpass('MongoDB password: '))

In [24]:
# Assemble the SRV connection URI around the credentials, then open handles
# to the 'corona_virus' database and its 'cases' collection.
conn = ''.join([
    'mongodb+srv://',
    username,
    ':',
    password,
    '@cluster0-paegd.mongodb.net/test?retryWrites=true&w=majority',
])
client = pymongo.MongoClient(conn)
db = client['corona_virus']
collection = db['cases']

### Iterate through the rows of df, create a post object for each, then post the new documents to the MongoDB database

In [25]:

# Build one document per (location, snapshot) row of df, then upload them all
# in a single batch instead of one network round trip per row.
# NOTE: these are plain inserts, so re-running this cell adds the same
# documents to the collection again.
posts = []
for _, row in df.iterrows():
    post = {
        'location': row['Province/State'],
        'region': row['Country/Region'],
        'firstConfirmedDate': row['First Confirmed Date'],
        'lat': row['Lat'],
        'lng': row['Long'],
        'cases': {
            'date': row['Date'],
            'confirmed': row['Confirmed Cases'],
            'recovered': row['Recovered Cases'],
            'deaths': row['Death Cases']
        }
    }
    posts.append(post)

# insert_many rejects an empty list, so skip the call when df has no rows.
# After the call each dict carries the '_id' it was stored under; `post` still
# names the last document built, which the next cell displays.
if posts:
    collection.insert_many(posts)


In [26]:
# Show the last document built by the cell above, including the '_id' it
# received on insertion.
post

{'location': 'Zhejiang',
 'region': 'Mainland China',
 'firstConfirmedDate': Timestamp('2020-01-03 00:00:00'),
 'lat': 29.18251,
 'lng': 120.0985,
 'cases': {'date': Timestamp('2020-02-07 22:50:00'),
  'confirmed': 1048,
  'recovered': 127,
  'deaths': 0},
 '_id': ObjectId('5e3f25879140a2ab951d9b53')}

### Create a new collection with dates as the indexes

In [27]:
# Point the notebook-wide `collection` handle at the per-date collection.
# NOTE: `collection` is rebound here, so re-running an earlier insert cell
# after this one would write to 'cases_by_date' rather than its original target.
by_date_collection_name = 'cases_by_date'
collection = db[by_date_collection_name]

#### Create DF that groups by dates then by province/state

In [28]:
# Collapse df to one row per (Date, Province/State) pair, keeping the maximum
# of every other column, then turn the group keys back into ordinary columns.
grouping_keys = ["Date", "Province/State"]
per_location_max = df.groupby(grouping_keys).max()
by_date = per_location_max.reset_index()
by_date.head()



Unnamed: 0,Date,Province/State,Country/Region,First Confirmed Date,Lat,Long,Confirmed Cases,Recovered Cases,Death Cases
0,2020-02-05 23:00:00,Anhui,Mainland China,2020-01-03,31.82571,117.2264,591,23,0
1,2020-02-05 23:00:00,Beijing,Mainland China,2020-01-03,40.18238,116.4142,274,31,1
2,2020-02-05 23:00:00,Belgium,Belgium,2020-02-04,50.5039,4.4699,1,0,0
3,2020-02-05 23:00:00,"Boston, MA",US,2020-02-02,42.3601,-71.0589,1,0,0
4,2020-02-05 23:00:00,British Columbia,Canada,2020-01-28,49.2827,-123.121,2,0,0


#### Create DF that groups by dates

In [29]:
# Sum the three count columns across all locations for each snapshot date.
# The columns are selected with a list: indexing a GroupBy with a bare tuple
# of labels is deprecated since pandas 1.0 and raises an error in pandas 2.x.
count_columns = ["Confirmed Cases", "Recovered Cases", "Death Cases"]
totals = by_date.groupby("Date")[count_columns].sum().reset_index()
totals.head()

Unnamed: 0,Date,Confirmed Cases,Recovered Cases,Death Cases
0,2020-02-05 23:00:00,28274,1198,565
1,2020-02-06 14:20:00,28353,1382,565
2,2020-02-07 22:50:00,34899,1995,724


### Create the documents and upload to MongoDB Collection

In [30]:
# Build and upload one document per snapshot date: that date's overall totals
# plus a 'locations' mapping keyed by Province/State.
# NOTE: these are plain inserts, so re-running this cell adds the same
# documents to the collection again.
for _, total_row in totals.iterrows():
    date = total_row['Date']
    # Select this date's rows once, instead of walking every row of by_date
    # and skipping the ones that belong to other dates.
    rows_for_date = by_date[by_date['Date'] == date]
    post = {
        'date': date,
        'total_confirmed': total_row['Confirmed Cases'],
        'total_recovered': total_row['Recovered Cases'],
        'total_deaths': total_row['Death Cases'],
        # One entry per location reported on this date.
        'locations': {
            r['Province/State']: {
                "region": r['Country/Region'],
                "lat": r["Lat"],
                "lng": r["Long"],
                "confirmed": r["Confirmed Cases"],
                "recovered": r["Recovered Cases"],
                "deaths": r["Death Cases"]
            }
            for _, r in rows_for_date.iterrows()
        }
    }
    # Upload the finished document for this date.
    collection.insert_one(post)


### Load SARS data to MongoDB

In [31]:
# Preview the SARS reference data as loaded; note the extra "Unnamed: 0"
# column next to Date, Infected, Mortality and URL.
sars.head()

Unnamed: 0.1,Unnamed: 0,Date,Infected,Mortality,URL
0,0,2003-03-17,167,4,https://www.who.int/csr/sars/country/table/en/
1,1,2003-03-18,219,4,https://www.who.int/csr/sars/country/tablemarc...
2,2,2003-03-19,264,9,https://www.who.int/csr/sars/country/2003_19_0...
3,3,2003-03-20,306,10,https://www.who.int/csr/sars/country/2003_03_2...
4,4,2003-03-21,350,10,https://www.who.int/csr/sars/country/2003_03_2...


In [7]:
# Remove the "Unnamed: 0" column (apparently a row index that was written into
# the workbook). drop(..., errors="ignore") is used instead of `del` so the
# cell can be re-run: a second execution is a no-op rather than a KeyError.
sars = sars.drop(columns=["Unnamed: 0"], errors="ignore")

In [8]:
# Point the notebook-wide `collection` handle at the SARS collection.
# NOTE: `collection` is rebound here, so re-running an earlier insert cell
# after this one would write to 'sars' rather than its original target.
sars_collection_name = 'sars'
collection = db[sars_collection_name]

In [10]:
# Build one document per SARS report row, then upload them in a single batch
# instead of one network round trip per row.
# NOTE: these are plain inserts, so re-running this cell adds the same
# documents to the collection again.
sars_posts = []
for _, row in sars.iterrows():
    post = {
        'date': row['Date'],
        'infected': row['Infected'],
        'deaths': row['Mortality']
    }
    sars_posts.append(post)

# insert_many rejects an empty list, so skip the call when sars has no rows.
if sars_posts:
    collection.insert_many(sars_posts)