In [2]:
import datetime
import io
import json
import time

import matplotlib.pyplot as plt
import pandas as pd
import requests
import seaborn as sns

In [3]:
def main():
    """Interactive entry point: build the dataset from the API or load it from disk.

    Asks on stdin whether API requests should be made.

    * ``y`` - checks the connection, downloads the per-country data and the
      day-by-day United States data, joins both and saves the result
      (``join_countries`` writes the CSV).
    * ``n`` - loads the previously saved CSV through ``skip_api``.
    * anything else - prints a message and does nothing.

    The answer is compared after trimming whitespace and lower-casing.

    Returns:
        The combined DataFrame (``y`` or ``n`` branch), or ``None`` when the
        CSV could not be loaded or the answer was not recognised.
    """
    decision_1 = input("Do you want to do API requests? (requires ~20 minutes)? y/n: ")
    answer = decision_1.strip().lower()

    if answer == "y":
        url = "https://api.covid19api.com"

        # connection check against the API base address (only prints)
        check_connection(base=url)

        # slugs of every country the API knows about
        countries_list_var = request_countries(base=url)
        print_countries(c_list=countries_list_var)

        # whole-period request for every country
        # -> [slugs without data, slugs with data, DataFrame]
        countries_data_var = gather_countries(c_list=countries_list_var, base=url)
        empty_response_list = countries_data_var[0]
        non_empty_response_list = countries_data_var[1]
        world = countries_data_var[2]

        # two date lists shifted by one day; paired element-wise they form
        # the from/to bounds of the day-by-day requests
        dates_list_1 = make_period(from_data=[2020, 4, 1], to_date=[2021, 3, 30])
        dates_list_2 = make_period(from_data=[2020, 4, 2], to_date=[2021, 3, 31])

        # United States data is gathered one day per request
        united_states = gather_day_by_day(periods_1=dates_list_1, periods_2=dates_list_2, base=url)

        # move "united-states" to the list of countries with data and print both lists
        update_to_non_empty_response(non_empty=non_empty_response_list,
                                     empty=empty_response_list,
                                     item="united-states")

        # join world and USA data (also saved to data/) and hand it to the caller
        return join_countries(df1=world, df2=united_states)

    if answer == "n":
        try:
            # load the CSV written by an earlier API run
            return skip_api(path="data/world_and_usa_df.csv")
        except Exception as e:
            print("Error has occurred: ", e, "\n")
            return None

    print("Nothing to do here, go home")
    return None
    

In [4]:
def check_connection(base: str) -> None:
    """Report whether the API base address answers with a successful status.

    Args:
        base: Base URL of the API.

    Prints "Connection is fine!" only when the request completes and the
    HTTP status is not an error; otherwise prints the error. Never raises.
    """
    try:
        # the timeout keeps an unreachable host from blocking the notebook forever
        response = requests.get(base, timeout=10)
        # a 4xx/5xx answer is not a working connection either
        response.raise_for_status()
    except Exception as e:
        print("Error has occurred: ", e, "\n")
    else:
        print("Connection is fine!\n")

In [5]:
def request_countries(base: str, show_response: bool = False) -> list:
    """Fetch the country list from ``<base>/countries`` and return the slugs.

    Args:
        base: Base URL of the API.
        show_response: When true, print the response object before parsing it.

    Returns:
        One entry per item of the JSON payload: the item's ``'Slug'`` value,
        or ``None`` when the item has no such key.
    """
    reply = requests.get(base + "/countries")

    if show_response:
        print(reply)

    return [entry.get('Slug') for entry in reply.json()]

In [6]:
def print_countries(c_list: list) -> None:
    """Print how many entries ``c_list`` holds, then the entries themselves.

    The entries are written on one line separated by ", " and followed by a
    trailing newline element, which leaves a blank line after the listing.
    """
    total = len(c_list)
    print("Number of all countries: ", total)

    listing = ", ".join(str(entry) for entry in (*c_list, "\n"))
    print(listing)

In [7]:
def gather_countries(c_list: list, base: str, save: bool = False) -> list:
    """Download the 2020-04-01 .. 2021-03-31 records for every country slug.

    Runtime noted by the author: ~ 9-15 min (one request per slug, with a
    one-second pause after each).

    Args:
        c_list: Country slugs to request.
        base: Base URL of the API.
        save: When true, also write the frame to ``data/countries_df.csv``.

    Returns:
        ``[countries_with_empty_response, countries_with_response, countries_df]``:
        the slugs whose payload had at most two elements, the slugs with a
        longer payload, and a DataFrame holding the Country / Confirmed /
        Deaths / Recovered / Active / Date values of every returned record.
    """
    fields = ('Country', 'Confirmed', 'Deaths', 'Recovered', 'Active', 'Date')
    period = '?from=2020-04-01T00:00:00Z&to=2021-03-31T00:00:00Z'

    countries_with_empty_response = []
    countries_with_response = []
    records = []

    for country in c_list:
        response = requests.get(base + '/country/' + country + period)
        country_response = response.json()

        # payloads of two or fewer elements are treated as "no data"
        # NOTE(review): presumably this also catches short error objects - confirm
        if len(country_response) <= 2:
            countries_with_empty_response.append(country)
        else:
            countries_with_response.append(country)
            records.extend(
                {field: item.get(field) for field in fields}
                for item in country_response
            )
        # pause between requests
        time.sleep(1)

    # read_json expects a file-like object; passing the JSON text itself is
    # deprecated in recent pandas, hence the StringIO wrapper
    countries_df = pd.read_json(io.StringIO(json.dumps(records)))

    if save:
        countries_df.to_csv('data/countries_df.csv', index=False)

    return [countries_with_empty_response, countries_with_response, countries_df]

In [8]:
def make_period(from_data: list, to_date: list) -> list:
    """Return every calendar day between two dates, both ends included.

    Args:
        from_data: ``[year, month, day]`` of the first day.
        to_date: ``[year, month, day]`` of the last day.

    Returns:
        A list of ``datetime.date`` objects in ascending order; empty when the
        first day lies after the last day.
    """
    first_day = datetime.date(from_data[0], from_data[1], from_data[2])
    last_day = datetime.date(to_date[0], to_date[1], to_date[2])

    span = (last_day - first_day).days
    return [first_day + datetime.timedelta(days=offset) for offset in range(span + 1)]

In [9]:
def gather_day_by_day(periods_1: list, periods_2: list, base: str, save: bool = False) -> pd.DataFrame:
    """Download United States records with one request per (from, to) pair.

    Runtime noted by the author: ~ 27-30 min (one-second pause after each
    request).

    Args:
        periods_1: Dates used as the ``from`` bound of each request.
        periods_2: Dates used as the ``to`` bound; paired element-wise with
            ``periods_1`` (extra elements of the longer list are ignored).
        base: Base URL of the API.
        save: When true, also write the frame to ``data/country_usa_df.csv``.

    Returns:
        DataFrame with the Country / Confirmed / Deaths / Recovered / Active /
        Date values of every returned record.
    """
    fields = ('Country', 'Confirmed', 'Deaths', 'Recovered', 'Active', 'Date')
    records = []

    for from_date, to_date in zip(periods_1, periods_2):
        response = requests.get(base + '/country/united-states' + f'?from={from_date}T00:00:00Z&to={to_date}T00:00:00Z')
        usa_data_json = response.json()

        records.extend(
            {field: item.get(field) for field in fields}
            for item in usa_data_json
        )
        # pause between requests
        time.sleep(1)

    # read_json expects a file-like object; passing the JSON text itself is
    # deprecated in recent pandas, hence the StringIO wrapper
    country_usa_df = pd.read_json(io.StringIO(json.dumps(records)))

    if save:
        country_usa_df.to_csv('data/country_usa_df.csv', index=False)

    return country_usa_df

In [10]:
def update_to_non_empty_response(non_empty: list, empty: list, item: str = "united-states") -> list:
    """Move ``item`` to the "has data" list and print both lists.

    Both lists are modified in place. ``item`` is appended to ``non_empty``
    only if it is not there yet, and removed from ``empty`` only if present,
    so the call is safe whichever list the item started in.

    Args:
        non_empty: Slugs of countries with data.
        empty: Slugs of countries without data.
        item: Slug to move.

    Returns:
        ``[non_empty, empty]`` - the same (updated) list objects.
    """
    if item not in non_empty:
        non_empty.append(item)
    print("Number of countries with data: ", len(non_empty))
    print(*non_empty, "\n", sep = ", ")

    # list.remove raises ValueError for a missing element, so check first
    if item in empty:
        empty.remove(item)
    print("Number of countries without data: ", len(empty))
    print(*empty, "\n", sep = ", ")
    return [non_empty, empty]

In [11]:
def join_countries(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
    """Stack two frames, save the result to ``data/world_and_usa_df.csv`` and return it.

    Args:
        df1: First frame (its rows come first).
        df2: Second frame, placed below ``df1``.

    Returns:
        The stacked frame; the original row labels of both inputs are kept.
    """
    # DataFrame.append was removed in pandas 2.0; concat gives the same result
    countries_and_usa_df = pd.concat([df1, df2])
    countries_and_usa_df.to_csv('data/world_and_usa_df.csv', index=False)
    print("Finished dataframe:\n", countries_and_usa_df)
    return countries_and_usa_df

In [12]:
def skip_api(path : str = "data/world_and_usa_df.csv") -> pd.DataFrame:
    """Load the previously saved dataset from CSV instead of querying the API.

    Args:
        path: Location of the CSV file; defaults to the file written by
            ``join_countries``.

    Returns:
        The loaded DataFrame. Errors raised by ``pd.read_csv`` propagate.
    """
    # honour the argument - a hard-coded path here made the parameter useless
    data = pd.read_csv(path)
    print(" *magic* Data successfully loaded! *magic* ")
    return data

In [16]:
# Run the pipeline: asks on stdin whether to query the API or load the saved CSV.
main()

Do you want to do API requests? (requires ~20 minutes)? y/n:  y


Connection is fine!

Number of all countries:  248
switzerland, tuvalu, solomon-islands, us-minor-outlying-islands, fiji, guatemala, norfolk-island, bulgaria, equatorial-guinea, jordan, japan, korea-south, liberia, martinique, mauritania, andorra, bahamas, gambia, oman, zambia, israel, nepal, sierra-leone, western-sahara, comoros, ghana, hong-kong-sar-china, hungary, new-zealand, united-arab-emirates, barbados, germany, congo-brazzaville, estonia, french-southern-territories, grenada, luxembourg, afghanistan, austria, china, monaco, rwanda, virgin-islands, peru, mali, moldova, slovenia, turkmenistan, brunei, falkland-islands-malvinas, lebanon, guadeloupe, guam, latvia, uzbekistan, bangladesh, holy-see-vatican-city-state, tanzania, macao-sar-china, marshall-islands, saint-pierre-and-miquelon, saint-helena, sao-tome-and-principe, suriname, croatia, iran, niue, guernsey, mexico, burkina-faso, senegal, costa-rica, niger, uganda, serbia, cameroon, kiribati, netherlands, algeria, argentina, 

In [163]:
def top10():
    """Placeholder for the TOP 10 analysis; does nothing and returns ``None``."""

    def data_preparation():
        """Placeholder; not implemented."""

    def data_visualization():
        """Placeholder; not implemented."""

    return None

In [None]:
# Pick the columns needed for the recovered ranking. The explicit copy makes
# the new frame independent, so later column assignments do not write to a
# slice of countries_and_usa_df (SettingWithCopyWarning).
# NOTE(review): countries_and_usa_df is not assigned anywhere in the visible
# cells - it has to exist in the kernel before this cell runs.
top_recovered_df = countries_and_usa_df[['Country', 'Recovered', 'Date']].copy()

In [None]:
# Reduce 'Date' to its first 10 characters (year-month-day) and parse it back
# to datetime. assign() builds a new frame instead of writing a column into a
# frame derived from another one.
top_recovered_df = top_recovered_df.assign(
    Date=pd.to_datetime(top_recovered_df['Date'].astype(str).str[:10])
)
# keep March 2021 only (both bounds inclusive)
date_mask = top_recovered_df['Date'].between('2021-03-01', '2021-03-31')
top_recovered_df = top_recovered_df.loc[date_mask].reset_index(drop=True)

In [None]:
# NOTE(review): this cell repeats the date normalisation and March 2021 filter
# of the cell before it; on an already filtered frame it selects the same rows.

# string slicing - reduce the date to year-month-day
top_recovered_df['Date'] = top_recovered_df['Date'].astype(str).str[:10]
# back to datetime type
top_recovered_df['Date'] = pd.to_datetime(top_recovered_df['Date'])  
# mask selecting 2021-03-01 .. 2021-03-31 (inclusive)
date_mask = (top_recovered_df['Date'] >= '2021-3-1') & (top_recovered_df['Date'] <= '2021-3-31')
# keep the masked rows and renumber them from 0
top_recovered_df = top_recovered_df.loc[date_mask]
top_recovered_df.reset_index(drop=True, inplace=True)

In [None]:
# one row per (Country, Date): rows sharing both keys are summed
top_recovered_df = (
    top_recovered_df[['Country', 'Date', 'Recovered']]
    .groupby(['Country', 'Date'])
    .sum()
    .reset_index()
)

In [None]:
# add 'Test' (a copy of 'Recovered') and a zero-filled 'Recovered_increment'
top_recovered_df = top_recovered_df.assign(
    Test=top_recovered_df['Recovered'],
    Recovered_increment=0,
)
top_recovered_df

In [None]:
# Day-over-day change of 'Recovered', computed separately for every country.
# Differencing the whole column row by row would subtract the last row of one
# country from the first row of the next; grouping by 'Country' prevents that
# and removes the need for a hard-coded row count.
top_recovered_df = top_recovered_df.sort_values(['Country', 'Date']).reset_index(drop=True)
top_recovered_df['Recovered_increment'] = (
    top_recovered_df.groupby('Country')['Recovered']
    .diff()
    .fillna(0)        # first row of each country has no predecessor
    .astype('int64')
)

In [None]:
# display the frame after the increment column has been filled
top_recovered_df

In [None]:
# Total increment per country. Only the numeric column is summed: with
# pandas >= 2.0 a datetime 'Date' column left in the frame makes
# groupby(...).sum() raise instead of being dropped silently.
top_recovered_df = top_recovered_df.groupby(by="Country")[['Recovered_increment']].sum()

In [None]:
# order by increment, largest first, and keep the first ten rows
top_recovered_df = top_recovered_df.sort_values('Recovered_increment', ascending=False).iloc[:10]

In [None]:
# display the ten rows kept by the previous cell
top_recovered_df

In [None]:
# bar chart: one bar per index entry (country), height = Recovered_increment
fig, ax = plt.subplots(figsize=(10, 7))
sns.barplot(data=top_recovered_df, x=top_recovered_df.index, y='Recovered_increment', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_title("TOP 10 państw z największą liczbą wyzdrowień w ostatnim miesiącu")
ax.grid()
plt.tight_layout();

# TOP 10 państw z największą liczbą potwierdzonych nowych przypadków zachorowań w ostatnim miesiącu

In [None]:
# Pick the columns needed for the confirmed-cases ranking. The explicit copy
# makes the new frame independent, so later column assignments do not write
# to a slice of countries_and_usa_df (SettingWithCopyWarning).
# NOTE(review): countries_and_usa_df is not assigned anywhere in the visible
# cells - it has to exist in the kernel before this cell runs.
top_confirmed_df = countries_and_usa_df[['Country', 'Confirmed', 'Date']].copy()

In [None]:
# Reduce 'Date' to its first 10 characters (year-month-day) and parse it back
# to datetime. assign() builds a new frame instead of writing a column into a
# frame derived from another one.
top_confirmed_df = top_confirmed_df.assign(
    Date=pd.to_datetime(top_confirmed_df['Date'].astype(str).str[:10])
)
# keep March 2021 only (both bounds inclusive)
date_mask = top_confirmed_df['Date'].between('2021-03-01', '2021-03-31')
top_confirmed_df = top_confirmed_df.loc[date_mask].reset_index(drop=True)

In [None]:
# one row per (Country, Date): rows sharing both keys are summed
top_confirmed_df = (
    top_confirmed_df[['Country', 'Date', 'Confirmed']]
    .groupby(['Country', 'Date'])
    .sum()
    .reset_index()
)

In [None]:
# add 'Test' (a copy of 'Confirmed') and a zero-filled 'Confirmed_increment'
top_confirmed_df = top_confirmed_df.assign(
    Test=top_confirmed_df['Confirmed'],
    Confirmed_increment=0,
)

In [None]:
# Day-over-day change of 'Confirmed', computed separately for every country.
# Differencing the whole column row by row would subtract the last row of one
# country from the first row of the next; grouping by 'Country' prevents that
# and removes the need for a hard-coded row count.
top_confirmed_df = top_confirmed_df.sort_values(['Country', 'Date']).reset_index(drop=True)
top_confirmed_df['Confirmed_increment'] = (
    top_confirmed_df.groupby('Country')['Confirmed']
    .diff()
    .fillna(0)        # first row of each country has no predecessor
    .astype('int64')
)

In [None]:
# Total increment per country. Only the numeric column is summed: with
# pandas >= 2.0 a datetime 'Date' column left in the frame makes
# groupby(...).sum() raise instead of being dropped silently.
top_confirmed_df = top_confirmed_df.groupby(by="Country")[['Confirmed_increment']].sum()

In [None]:
# order by increment, largest first, and keep the first ten rows
top_confirmed_df = top_confirmed_df.sort_values('Confirmed_increment', ascending=False).iloc[:10]

In [None]:
# display the ten rows kept by the previous cell
top_confirmed_df

In [None]:
# bar chart: one bar per index entry (country), height = Confirmed_increment
fig, ax = plt.subplots(figsize=(10, 7))
sns.barplot(data=top_confirmed_df, x=top_confirmed_df.index, y='Confirmed_increment', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_title("TOP 10 państw z największą liczbą potwierdzonych nowych przypadków zachorowań w ostatnim miesiącu")
ax.grid()
plt.tight_layout()

# TOP 10 państw z największą liczbą przypadków śmiertelnych w ostatnim miesiącu

In [None]:
# Pick the columns needed for the deaths ranking. The explicit copy makes the
# new frame independent, so later column assignments do not write to a slice
# of countries_and_usa_df (SettingWithCopyWarning).
# NOTE(review): countries_and_usa_df is not assigned anywhere in the visible
# cells - it has to exist in the kernel before this cell runs.
top_deaths_df = countries_and_usa_df[['Country', 'Deaths', 'Date']].copy()

In [None]:
# Reduce 'Date' to its first 10 characters (year-month-day) and parse it back
# to datetime. assign() builds a new frame instead of writing a column into a
# frame derived from another one.
top_deaths_df = top_deaths_df.assign(
    Date=pd.to_datetime(top_deaths_df['Date'].astype(str).str[:10])
)
# keep March 2021 only (both bounds inclusive)
date_mask = top_deaths_df['Date'].between('2021-03-01', '2021-03-31')
top_deaths_df = top_deaths_df.loc[date_mask].reset_index(drop=True)

In [None]:
# one row per (Country, Date): rows sharing both keys are summed
top_deaths_df = (
    top_deaths_df[['Country', 'Date', 'Deaths']]
    .groupby(['Country', 'Date'])
    .sum()
    .reset_index()
)

In [None]:
# add 'Test' (a copy of 'Deaths') and a zero-filled 'Deaths_increment'
top_deaths_df = top_deaths_df.assign(
    Test=top_deaths_df['Deaths'],
    Deaths_increment=0,
)

In [None]:
# Day-over-day change of 'Deaths', computed separately for every country.
# Differencing the whole column row by row would subtract the last row of one
# country from the first row of the next; grouping by 'Country' prevents that
# and removes the need for a hard-coded row count.
top_deaths_df = top_deaths_df.sort_values(['Country', 'Date']).reset_index(drop=True)
top_deaths_df['Deaths_increment'] = (
    top_deaths_df.groupby('Country')['Deaths']
    .diff()
    .fillna(0)        # first row of each country has no predecessor
    .astype('int64')
)

In [None]:
# Total increment per country. Only the numeric column is summed: with
# pandas >= 2.0 a datetime 'Date' column left in the frame makes
# groupby(...).sum() raise instead of being dropped silently.
top_deaths_df = top_deaths_df.groupby(by="Country")[['Deaths_increment']].sum()

In [None]:
# order by increment, largest first, and keep the first ten rows
top_deaths_df = top_deaths_df.sort_values('Deaths_increment', ascending=False).iloc[:10]

In [None]:
# display the ten rows kept by the previous cell
top_deaths_df

In [None]:
# bar chart: one bar per index entry (country), height = Deaths_increment
plt.figure(figsize=(10 ,7))
sns.barplot(x=top_deaths_df.index, y='Deaths_increment', data=top_deaths_df)
plt.xticks(rotation=45)
# title fixed: the literal carried a pasted markdown "# " prefix and was
# missing the space in "śmiertelnych w"
plt.title("TOP 10 państw z największą liczbą przypadków śmiertelnych w ostatnim miesiącu")
plt.grid()
plt.tight_layout()

# Statystyki z wyzdrowień, nowych przypadków zachorowań oraz przypadków śmiertelnych dla Polski za ostatni miesiąc

In [None]:
# Keep only the Poland records as an independent, renumbered frame.
# The source is countries_and_usa_df, the frame every other analysis cell
# uses; countries_df only exists as a local inside gather_countries.
# NOTE(review): countries_and_usa_df is not assigned anywhere in the visible
# cells - it has to exist in the kernel before this cell runs.
poland_df = (
    countries_and_usa_df.loc[countries_and_usa_df["Country"] == 'Poland']
    .reset_index(drop=True)
    .copy()
)

In [None]:
# Reduce 'Date' to its first 10 characters (year-month-day) and parse it back
# to datetime. assign() builds a new frame instead of writing a column into a
# frame derived from another one.
poland_df = poland_df.assign(
    Date=pd.to_datetime(poland_df['Date'].astype(str).str[:10])
)
# keep March 2021 only (both bounds inclusive)
date_mask = poland_df['Date'].between('2021-03-01', '2021-03-31')
poland_df = poland_df.loc[date_mask].reset_index(drop=True)

In [None]:
# Repair outliers in 'Active' (values above one million) by recomputing them
# as Confirmed - Deaths - Recovered. Columns are addressed by name, every
# matching row is repaired, and nothing happens when there is no outlier
# (indexing idx[0] would raise IndexError on an empty list).
outlier_mask = poland_df['Active'] > 1e6
idx = poland_df.index[outlier_mask].tolist()
poland_df.loc[outlier_mask, 'Active'] = (
    poland_df.loc[outlier_mask, 'Confirmed']
    - poland_df.loc[outlier_mask, 'Deaths']
    - poland_df.loc[outlier_mask, 'Recovered']
)

In [None]:
# display the Poland frame after the 'Active' correction
poland_df

In [None]:
# four stacked line charts sharing the date axis, one per metric
fig, axes = plt.subplots(4, 1, figsize=(12, 10), sharex=True)

plt.suptitle("Statystyki dla Polski za ostatni miesiąc", fontsize=14)

# (column, line colour, fixed y ticks) for each panel, top to bottom
panel_specs = [
    ('Confirmed', 'orange', range(int(16e5), int(25e5), int(1e5))),
    ('Deaths', 'red', range(42000, 56000, 2000)),
    ('Recovered', 'green', range(int(14e5), int(20e5), int(1e5))),
    ('Active', 'blue', range(200000, 480000, 40000)),
]

for panel, (column, colour, y_ticks) in zip(axes, panel_specs):
    sns.lineplot(ax=panel, data=poland_df, x='Date', y=column, color=colour, linewidth=2)
    panel.set_title(column)
    panel.set_yticks(y_ticks)
    if panel is axes[3]:
        # bottom panel only: one vertical tick label per date
        panel.tick_params(axis='x', rotation=90)
        panel.set_xticks(poland_df['Date'])
    panel.grid(True)

#plt.tight_layout();

# Miesięczny przyrost wyzdrowień w ostatnim roku

In [None]:
# Pick the columns needed for the monthly recovered statistics. The explicit
# copy makes the new frame independent, so later column assignments do not
# write to a slice of countries_and_usa_df (SettingWithCopyWarning).
# NOTE(review): countries_and_usa_df is not assigned anywhere in the visible
# cells - it has to exist in the kernel before this cell runs.
top_recovered_df = countries_and_usa_df[['Country', 'Recovered', 'Date']].copy()

In [None]:
# Reduce the data to one value per country and month, keyed by 'YYYY-MM'.
# Truncating the date alone would make the following groupby add up the
# running totals of every day in a month; instead the rows of one day are
# summed first and the last day of each month is kept.
# NOTE(review): assumes 'Recovered' is a cumulative count - confirm against the API.
recovered_daily = top_recovered_df.assign(
    Date=pd.to_datetime(top_recovered_df['Date'].astype(str).str[:10])
)
recovered_daily = (
    recovered_daily.groupby(['Country', 'Date'], as_index=False)['Recovered'].sum()
    .sort_values(['Country', 'Date'])
)
# month key in the same year-month format the plain string slice produced
recovered_daily['Date'] = recovered_daily['Date'].dt.strftime('%Y-%m')
top_recovered_df = recovered_daily.groupby(['Country', 'Date'], as_index=False)['Recovered'].last()

In [None]:
# one row per (Country, Date): rows sharing both keys are summed
top_recovered_df = (
    top_recovered_df[['Country', 'Date', 'Recovered']]
    .groupby(['Country', 'Date'])
    .sum()
    .reset_index()
)

In [None]:
# add 'Test' (a copy of 'Recovered') and a zero-filled 'Recovered_increment'
top_recovered_df = top_recovered_df.assign(
    Test=top_recovered_df['Recovered'],
    Recovered_increment=0,
)

In [None]:
# Period-over-period change of 'Recovered', computed separately for every
# country. Differencing the whole column row by row would subtract the last
# row of one country from the first row of the next; grouping by 'Country'
# prevents that and removes the need for a hard-coded row count.
top_recovered_df = top_recovered_df.sort_values(['Country', 'Date']).reset_index(drop=True)
top_recovered_df['Recovered_increment'] = (
    top_recovered_df.groupby('Country')['Recovered']
    .diff()
    .fillna(0)        # first row of each country has no predecessor
    .astype('int64')
)

In [None]:
# negative increments are replaced with 0
top_recovered_df['Recovered_increment'] = top_recovered_df['Recovered_increment'].clip(lower=0)

In [None]:
# Totals per date key across all countries. Only the numeric columns are
# summed: with pandas >= 2.0 the string 'Country' column would otherwise be
# concatenated group by group instead of being dropped.
top_recovered_df = top_recovered_df.groupby(by="Date")[['Recovered', 'Recovered_increment']].sum()

In [None]:
# chronological order: 'Date' is the index after the groupby, so sort on it
top_recovered_df = top_recovered_df.sort_index(ascending=True)

In [None]:
# display the per-date totals in chronological order
top_recovered_df

In [None]:
# two stacked line charts sharing the date axis: increment on top, total below
fig, axes = plt.subplots(2, 1, figsize=(10, 7), sharex=True)

plt.suptitle("Miesięczny przyrost wyzdrowień w ostatnim roku", fontsize=14)

# (column, line colour, panel title) for each panel, top to bottom
panel_specs = [
    ('Recovered_increment', 'red', "Increment"),
    ('Recovered', 'green', "Recovered"),
]

for panel, (column, colour, heading) in zip(axes, panel_specs):
    sns.lineplot(ax=panel, data=top_recovered_df, x='Date', y=column, color=colour, linewidth=2)
    panel.set_title(heading)
    if panel is axes[1]:
        # bottom panel only: slanted date labels
        panel.tick_params(axis='x', rotation=45)
    panel.grid(True)

#plt.tight_layout()

In [None]:
# if __name__ == "__main__":
#     main()

In [None]:
# # shell command convert .ipynb to .py
# !jupyter nbconvert --to script 2021_Rozwiązanie_BAA_Intern_Dawid_Jaskulski.ipynb