### Team member eg2938 NYC dataset 
GitHub: https://github.com/gwoosoft/Bigdatavproject

| file name | local_path | url |  
| :-- | :--- | :--- |
| us-states | /cleandataset/us-states.csv | https://github.com/nytimes/covid-19-data/blob/master/us-states.csv  | 
| us_state_vaccinations | /cleandataset/us_state_vaccinations.csv | https://github.com/owid/covid-19-data/blob/master/public/data/vaccinations/us_state_vaccinations.csv  | 
| owid-covid-data | /cleandataset/owid-covid-data.csv | https://github.com/owid/covid-19-data/blob/master/public/data/owid-covid-data.csv  | 
| ny-data-by-day | /cleandataset/ny-data-by-day.csv | https://github.com/nychealth/coronavirus-data/blob/master/trends/data-by-day.csv  | 

# Analyzing Vaccinations vs Number of Cases

## Analyzing World Data

In [1]:
import altair as alt
import pandas as pd
import warnings
# Suppress every Python warning for the rest of the notebook session.
warnings.filterwarnings("ignore")
# Turn off Altair's max-rows check on the data passed to charts.
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

In [2]:
# Load the OWID world dataset.
world_data = pd.read_csv("owid-covid-data.csv", encoding="latin")

# Store dates as MM/DD/YYYY strings.
world_data['date'] = pd.to_datetime(world_data.date).dt.strftime('%m/%d/%Y')

# Keep only the columns the analysis uses; everything else is dropped,
# leaving the surviving columns in their original order.
columns_to_keep = ['location','date', 'new_cases', 'total_cases', 'new_deaths', 'total_deaths',
                   'new_vaccinations_smoothed', 'total_vaccinations']
world_data_columns = list(world_data.columns)
columns_to_delete = [name for name in world_data_columns if name not in columns_to_keep]
world_data = world_data.drop(columns=columns_to_delete)

# Give the columns the display names used by the charts.
world_data = world_data.rename(columns={
    'new_cases': "daily cases",
    'total_cases': 'total cases',
    'new_deaths': 'daily deaths',
    'total_deaths': 'total deaths',
    'total_vaccinations': 'total vaccinations',
    'new_vaccinations_smoothed': 'daily vaccinations',
    'location': 'country',
})

# Remove the world / continent / EU aggregate rows so only countries remain.
rows_to_delete = ['World', 'Europe', 'North America', 'Asia', 'European Union', 'South America', 'Africa', 'Oceania']
is_aggregate_row = world_data['country'].isin(rows_to_delete)
world_data = world_data.loc[~is_aggregate_row]



In [3]:
# Per-country figures on 05/01/2021, the day this analysis treats as the
# last one with data.
infection_total = world_data.loc[world_data['date'].eq('05/01/2021')]

In [4]:
# Bar chart of the ten countries with the highest cumulative case counts.
top_10_total_cases = (
    alt.Chart(infection_total, title="Top 10 Countries by Number of Cases")
    .mark_bar()
    .encode(
        x=alt.X('country', sort='-y'),
        y=alt.Y('total cases'),
        tooltip=[*world_data.columns],
    )
    .properties(height=200, width=700)
    # Rank rows by total cases (largest first) and keep ranks 1-10.
    .transform_window(
        rank='rank(total cases)',
        sort=[alt.SortField('total cases', order='descending')],
    )
    .transform_filter(alt.datum.rank <= 10)
)

top_10_total_cases

In [6]:
# helper functions

def filter_country(country, start='2020-12-01', end='2021-04-30', source=None):
    """Return one country's rows inside a date window, cleaned for plotting.

    Parameters
    ----------
    country : str
        Value matched exactly against the ``country`` column.
    start, end : str or datetime-like, optional
        Inclusive bounds of the date window. The defaults cover
        1 Dec 2020 through 30 Apr 2021.
    source : pandas.DataFrame, optional
        Frame to filter; it must provide ``country``, ``date`` and
        ``total vaccinations`` columns. Defaults to the notebook-level
        ``world_data``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the matching rows. ``date`` is formatted as
        ``MM/DD/YYYY`` strings, gaps in ``total vaccinations`` carry the
        previous value forward, and any remaining missing numeric values
        are 0.
    """
    if source is None:
        source = world_data

    # Work on copies: assigning into a slice of the source frame raises
    # SettingWithCopyWarning and is a silent no-op under Copy-on-Write.
    data = source[source['country'] == country].copy()

    # Parse the dates so the window comparison is chronological, not textual.
    data['date'] = pd.to_datetime(data['date'])
    data_dec = data[data['date'].between(start, end)].copy()
    data_dec['date'] = data_dec['date'].dt.strftime('%m/%d/%Y')

    # Total vaccinations are cumulative, so a missing day repeats the last
    # known total; rows before the first report fall through to 0 below.
    data_dec['total vaccinations'] = data_dec['total vaccinations'].ffill()
    numeric_columns = data_dec.select_dtypes(include='number').columns
    data_dec[numeric_columns] = data_dec[numeric_columns].fillna(0)

    return data_dec


def draw_sidebyside(country):
    """Return two 300x300 line charts for ``country`` placed side by side.

    The left panel plots daily cases and the right panel plots total
    vaccinations, both against date, using the rows from
    ``filter_country(country)``.
    """
    country_frame = filter_country(country)

    def _panel(y_field, line_colour, title_suffix):
        # One line chart over the shared frame; only y, colour and title vary.
        lines = alt.Chart(country_frame).mark_line(color=line_colour)
        lines = lines.encode(x='date:T', y=y_field)
        return lines.properties(height=300, width=300, title=country + title_suffix)

    cases_panel = _panel('daily cases:Q', '#1F77B4', ' Daily Cases')
    vaccination_panel = _panel('total vaccinations:Q', '#FF7F0E', ' Total Vaccinations')

    return cases_panel | vaccination_panel

def draw_twoaxis(country):
    """Return one layered chart for ``country`` with independent y scales.

    Daily cases and total vaccinations are drawn as two lines over the same
    date axis, using the rows from ``filter_country(country)``.
    """
    country_frame = filter_country(country)

    # Both layers inherit the x encoding and the chart title from this base.
    shared = alt.Chart(country_frame).encode(x='date:T').properties(title=country + ' Data')

    cases_layer = shared.mark_line(color='#1F77B4').encode(y='daily cases')
    vaccination_layer = shared.mark_line(color='#FF7F0E').encode(y='total vaccinations:Q')

    layered = alt.layer(cases_layer, vaccination_layer)
    return layered.resolve_scale(y='independent')

### Israel (Dec 2020 - Apr 2021)

In [7]:
# Israel: daily cases and total vaccinations as two side-by-side line charts.
draw_sidebyside('Israel')

In [8]:
# Israel: the same two series layered in one chart with independent y scales.
draw_twoaxis('Israel')

### United States (Dec 2020 - Apr 2021)

In [9]:
# United States: daily cases and total vaccinations as two side-by-side line charts.
draw_sidebyside('United States')

In [10]:
# United States: the same two series layered in one chart with independent y scales.
draw_twoaxis('United States')

### United Arab Emirates (Dec 2020 - Apr 2021)

In [11]:
# United Arab Emirates: daily cases and total vaccinations as two side-by-side line charts.
draw_sidebyside('United Arab Emirates')

In [12]:
# United Arab Emirates: the same two series layered in one chart with independent y scales.
draw_twoaxis('United Arab Emirates')

### United Kingdom (Dec 2020 - Apr 2021)

In [13]:
# United Kingdom: daily cases and total vaccinations as two side-by-side line charts.
draw_sidebyside('United Kingdom')

In [14]:
# United Kingdom: the same two series layered in one chart with independent y scales.
draw_twoaxis('United Kingdom')

## Analyzing New York Data

### New York (Dec 2020 - Apr 2021)

In [15]:
# Load the us_state_vaccinations dataset from the working directory.
vaccinations = pd.read_csv("us_state_vaccinations.csv", encoding = "latin")

In [16]:
# Drop the columns we don't need from the vaccinations dataset.
vaccinations_columns = list(vaccinations.columns)
columns_to_keep = ['date','location', 'total_vaccinations', 'daily_vaccinations']
columns_to_delete = list(set(vaccinations_columns) - set(columns_to_keep))
vaccinations = vaccinations.drop(columns=columns_to_delete)

# Clean the data. Forward-fill gaps separately for each location: a single
# frame-wide forward fill would copy the previous location's last values into
# the leading gaps of the next one. Whatever is still missing (nothing
# reported yet for that location) becomes 0.
vaccination_counts = ['total_vaccinations', 'daily_vaccinations']
vaccinations[vaccination_counts] = vaccinations.groupby('location')[vaccination_counts].ffill()
vaccinations[vaccination_counts] = vaccinations[vaccination_counts].fillna(0)

# Store dates as MM/DD/YYYY strings.
vaccinations['date'] = pd.to_datetime(vaccinations['date']).dt.strftime('%m/%d/%Y')

# Rename columns to the display names used by the charts.
vaccinations = vaccinations.rename(columns={"location": "state", 'total_vaccinations': 'total vaccinations',
                                            'daily_vaccinations': 'daily vaccinations'})

# Normalise 'New York State' to 'New York'. The result is assigned back to the
# column because an in-place replace on a selected column does not reliably
# update the parent frame.
vaccinations['state'] = vaccinations['state'].replace({'New York State': 'New York'})

In [17]:
# Load the New York daily cases dataset.
# NOTE(review): the file header lists this CSV as coming from the nychealth
# repository, while the vaccination rows are labelled 'New York State'
# upstream - confirm the two series cover the same population.
new_york_daily = pd.read_csv("ny-data-by-day.csv", encoding = "latin")

# Drop the columns we don't need.
ny_daily_columns = list(new_york_daily.columns)
columns_to_keep = ['ALL_CASE_COUNT_7DAY_AVG','date_of_interest']
columns_to_delete = list(set(ny_daily_columns) - set(columns_to_keep))
new_york_daily = new_york_daily.drop(columns=columns_to_delete)

# Rename columns; 'daily cases' here is the 7-day average column.
new_york_daily = new_york_daily.rename(columns={"date_of_interest": "date", 'ALL_CASE_COUNT_7DAY_AVG': 'daily cases'})

# The merge below matches 'date' values as text. Put the case dates in the
# same MM/DD/YYYY form the vaccination dates use, so a formatting difference
# in the raw file cannot silently produce an empty join.
new_york_daily['date'] = pd.to_datetime(new_york_daily['date']).dt.strftime('%m/%d/%Y')

# Keep only the New York vaccination rows and join them to the daily cases.
new_york_vaccinations = vaccinations[vaccinations['state'] == 'New York']
new_york_data = pd.merge(new_york_daily, new_york_vaccinations, on='date')

In [18]:
# Left panel: New York daily cases over time.
chart1 = (
    alt.Chart(new_york_data)
    .mark_line(color='#1F77B4')
    .encode(x='date:T', y='daily cases:Q')
    .properties(height=300, width=300, title='New York Daily Cases')
)

# Right panel: New York total vaccinations over time.
chart2 = (
    alt.Chart(new_york_data)
    .mark_line(color='#FF7F0E')
    .encode(x='date:T', y='total vaccinations:Q')
    .properties(height=300, width=300, title='New York Total Vaccinations')
)

# Show the two panels next to each other.
chart1 | chart2

In [19]:
# Shared base: date on the x axis plus the chart title, inherited by both lines.
base = alt.Chart(new_york_data).encode(x='date:T').properties(title='New York Data')

# One line per series.
line_cases = base.mark_line(color='#1F77B4').encode(y='daily cases')
line_vac = base.mark_line(color='#FF7F0E').encode(y='total vaccinations:Q')

# Overlay the lines, giving each its own y scale.
alt.layer(line_cases, line_vac).resolve_scale(y='independent')