In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats 

In [None]:
# Read the dataset from 'covid-data.csv' (path is relative to the working directory).
csv_path = 'covid-data.csv'
df = pd.read_csv(csv_path)

In [None]:
# Parse the 'date' column into datetimes so the date comparisons and the
# .dt accessor used in later cells operate on real timestamps.
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

In [None]:
# Let pandas render up to 187 rows and 50 columns before truncating output.
pd.options.display.max_rows = 187
pd.options.display.max_columns = 50

# A1

In [None]:
# A1: which location had the most total cases on 2020-03-23, the day Greece's
# lockdown started? The 'World' aggregate row is excluded.
rows_on_lockdown_day = df[(df['date'] == '2020-03-23') & (df['location'] != 'World')]

# Pick the winning row from the already-filtered frame. Matching the maximum
# value against the whole dataset could instead return a row from another date
# or location (including 'World') that merely shares the same total_cases.
top_idx = rows_on_lockdown_day['total_cases'].idxmax()

# Read the fields by column label instead of by position, so the cell keeps
# working if the column order of the CSV changes.
name = df.loc[top_idx, 'location']
total_cases = df.loc[top_idx, 'total_cases']

print(f"{name} had the most total cases ({int(total_cases)}) when Greece's lockdown started")

# A2

In [None]:
# A2: which location reported the largest single-row new_cases value between
# 2020-03-23 and 2020-03-29 (inclusive)? The 'World' aggregate row is excluded.
# NOTE(review): this is the per-row maximum, not the weekly total per
# location - confirm which one the question asks for.
in_week = (df['date'] >= '2020-03-23') & (df['date'] <= '2020-03-29')
rows_in_week = df[in_week & (df['location'] != 'World')]

# Pick the winning row from the filtered frame. Matching the maximum value
# against the whole dataset could return a row from another week or location
# (including 'World') that happens to have the same new_cases figure.
top_idx = rows_in_week['new_cases'].idxmax()

# Column labels instead of positions, so a changed column order cannot
# silently select the wrong field.
name = df.loc[top_idx, 'location']
new_cases = df.loc[top_idx, 'new_cases']

print(f"{name} had the most new cases ({int(new_cases)}) during 23-29/03/2020")

# A3

In [None]:
# A3: total new cases in Greece during the week before each lockdown.
# `greece` is a boolean row mask that later cells reuse.
greece = df['location'].eq('Greece')

# 16-22 March 2020; Series.between includes both end points by default.
in_first_window = df['date'].between('2020-03-16', '2020-03-22')
dates_before_first_lockdown = df[greece & in_first_window]
sum_of_new_cases_before_first_lockdown = dates_before_first_lockdown['new_cases'].sum()

# 31 October - 6 November 2020, again inclusive on both ends.
in_second_window = df['date'].between('2020-10-31', '2020-11-06')
dates_before_second_lockdown = df[greece & in_second_window]
sum_of_new_cases_before_second_lockdown = dates_before_second_lockdown['new_cases'].sum()

print(f'Before the first lockdown (16-22/3/2020) Greece had: {sum_of_new_cases_before_first_lockdown.astype(int)} cases')
print(f'Before the second lockdown (31/10/2020 - 6/11/2020) Greece had: {sum_of_new_cases_before_second_lockdown.astype(int)} cases')

# A4

In [None]:
# A4: which location reached the highest total_deaths_per_million in the data?
max_total_deaths_per_million = df['total_deaths_per_million'].max()
location = df.loc[df['total_deaths_per_million'] == max_total_deaths_per_million]

# Take the first matching row and read its fields by column label; the
# previous positional lookups (columns 2 and 13) would silently return the
# wrong field if the column order of the CSV ever changed.
name = location['location'].iloc[0]
total_deaths_per_million = location['total_deaths_per_million'].iloc[0]

print(f'{name} had the most total deaths per million: {total_deaths_per_million.round(decimals=2)}')

# A5

In [None]:
# A5: on the most recent date in the data, which location has the highest
# ratio of total deaths to total cases? 'World' and 'International' are left out.
is_latest_date = df['date'] == df['date'].max()
is_excluded = df['location'].isin(['World', 'International'])
all_locations_latest_date = df[is_latest_date & ~is_excluded]

# Compute the ratio once and reuse it for both the maximum and the row mask.
deaths_per_case = all_locations_latest_date['total_deaths'] / all_locations_latest_date['total_cases']
max_percentage = deaths_per_case.max()

location_with_most_deaths_per_case = deaths_per_case == max_percentage
name = all_locations_latest_date.loc[location_with_most_deaths_per_case, 'location'].iloc[0]

print(f'{name} has the most deaths per case: {(max_percentage*100).round(decimals=2)}%')

# B1

In [None]:
# B1: median and standard deviation of new_deaths and new_cases for Greece,
# grouped by month number. `greece` is the row mask defined in the A3 cell.
# Note: dt.month drops the year, so equal months of different years are pooled.
greece_rows = df[greece]
greece_new_deaths_and_cases = (
    greece_rows
    .groupby(greece_rows['date'].dt.month)[['new_deaths', 'new_cases']]
    .agg(['median', 'std'])
    .round(2)
)
greece_new_deaths_and_cases

# B2

In [None]:
# B2: the same monthly median / standard deviation table, for Portugal.
portugal = df['location'].eq("Portugal")
portugal_rows = df[portugal]

# Note: dt.month drops the year, so equal months of different years are pooled.
portugal_new_deaths_and_cases = (
    portugal_rows
    .groupby(portugal_rows['date'].dt.month)[['new_deaths', 'new_cases']]
    .agg(['median', 'std'])
    .round(2)
)
portugal_new_deaths_and_cases

In [None]:
# Column-wise total of Greece's monthly table: one value per (variable, statistic).
# NOTE(review): this adds up monthly medians and standard deviations, which is
# not a standard summary - confirm it is the comparison that is intended.
greece_new_deaths_and_cases.sum(axis='index')

In [None]:
# Column-wise total of Portugal's monthly table: one value per (variable, statistic).
# NOTE(review): this adds up monthly medians and standard deviations, which is
# not a standard summary - confirm it is the comparison that is intended.
portugal_new_deaths_and_cases.sum(axis='index')

In [None]:
# Conclusion drawn from the two column totals displayed in the preceding cells.
b2_conclusion = 'Based on the above data Greece does better than Portugal'
print(b2_conclusion)

# B3

In [None]:
# B3: prepare the Japan data.
# Raise the display limits to 330 rows / 50 columns for the outputs below.
pd.options.display.max_rows = 330
pd.options.display.max_columns = 50

# Boolean row mask for Japan and the matching slice of the dataset.
japan = df['location'].eq('Japan')
japan_rows = df[japan]

# Median and standard deviation of new_deaths / new_cases per month number.
japan_A = (
    japan_rows
    .groupby(japan_rows['date'].dt.month)[['new_deaths', 'new_cases']]
    .agg(['median', 'std'])
    .round(2)
)

# Cumulative series with missing values removed; the original row labels are kept.
japan_total_cases = japan_rows['total_cases'].dropna()
japan_total_tests = japan_rows['total_tests'].dropna()
japan_total_deaths = japan_rows['total_deaths'].dropna()

In [None]:
# Descriptive statistics of Japan's total_cases series.
cases_min = japan_total_cases.min().astype(int)
cases_max = japan_total_cases.max().astype(int)
cases_mean = japan_total_cases.mean().round(decimals=2)
cases_var = japan_total_cases.var().round(decimals=2)
cases_std = japan_total_cases.std().round(decimals=2)

print('Japan Total Cases:\nMinimum Cases:', cases_min)
print('Maximum Cases:', cases_max)
print('Average Number of Cases:', cases_mean)
print('Variance:', cases_var)
print('Standard Deviation:', cases_std)

In [None]:
# Plot Japan's total_cases against the calendar date. The series keeps the row
# labels of df, so they are used to look up the matching dates; plotting the
# bare series would put raw row numbers on the x-axis.
fig, ax = plt.subplots()
ax.plot(df.loc[japan_total_cases.index, 'date'], japan_total_cases)
ax.set(title='Japan: total cases', xlabel='Date', ylabel='Total cases')
fig.autofmt_xdate()  # slant the date tick labels so they do not overlap
plt.show()

In [None]:
# Descriptive statistics of Japan's total_tests series.
# The labels now say "Tests"; they previously said "Cases" although the
# numbers describe tests.
print('Japan Total Tests:\nMinimum Tests:',japan_total_tests.min().astype(int))
print('Maximum Tests:',japan_total_tests.max().astype(int))
print('Average Number of Tests:',japan_total_tests.mean().round(decimals=2))
print('Variance:',japan_total_tests.var().round(decimals=2))
print('Standard Deviation:',japan_total_tests.std().round(decimals=2))

In [None]:
# Plot Japan's total_tests against the calendar date. The series keeps the row
# labels of df, so they are used to look up the matching dates; plotting the
# bare series would put raw row numbers on the x-axis.
fig, ax = plt.subplots()
ax.plot(df.loc[japan_total_tests.index, 'date'], japan_total_tests)
ax.set(title='Japan: total tests', xlabel='Date', ylabel='Total tests')
fig.autofmt_xdate()  # slant the date tick labels so they do not overlap
plt.show()

In [None]:
# Descriptive statistics of Japan's total_deaths series.
# The labels now say "Deaths"; they previously said "Cases" although the
# numbers describe deaths.
print('Japan Total Deaths:\nMinimum Deaths:',japan_total_deaths.min().astype(int))
print('Maximum Deaths:',japan_total_deaths.max().astype(int))
print('Average Number of Deaths:',japan_total_deaths.mean().round(decimals=2))
print('Variance:',japan_total_deaths.var().round(decimals=2))
print('Standard Deviation:',japan_total_deaths.std().round(decimals=2))

In [None]:
# Plot Japan's total_deaths against the calendar date. The series keeps the row
# labels of df, so they are used to look up the matching dates; plotting the
# bare series would put raw row numbers on the x-axis.
fig, ax = plt.subplots()
ax.plot(df.loc[japan_total_deaths.index, 'date'], japan_total_deaths)
ax.set(title='Japan: total deaths', xlabel='Date', ylabel='Total deaths')
fig.autofmt_xdate()  # slant the date tick labels so they do not overlap
plt.show()

# G1

In [None]:
# G1: contingency table relating total_cases to total_deaths.
#
# Cross-tabulating the raw counts would create one category per distinct
# value: a very large, almost entirely empty table that is expensive to build
# and whose tiny expected frequencies invalidate a chi-square test. Both
# variables are therefore grouped into quantile bins first.
# NOTE(review): the number of bins is a judgment call - adjust if needed.
N_BINS = 5

# Keep only rows where both values are present.
cases_and_deaths = df[['total_cases', 'total_deaths']].dropna()

# duplicates='drop' merges bins whose edges coincide (e.g. many identical values).
total_cases_bin = pd.qcut(cases_and_deaths['total_cases'], q=N_BINS, duplicates='drop')
total_deaths_bin = pd.qcut(cases_and_deaths['total_deaths'], q=N_BINS, duplicates='drop')

cont_table = pd.crosstab(total_cases_bin, total_deaths_bin)
cont_table

In [None]:
# Chi-square test of independence on the contingency table from the previous cell.
# The result unpacks, in order, into: test statistic, p-value, degrees of
# freedom and the table of expected frequencies.
chi2_result = stats.chi2_contingency(cont_table)
g, p, dof, expctd = chi2_result

# Show the test statistic as the cell output.
g