# GENDER CLEAN_UP PROCESS


In [1]:
# Library dependencies
from urllib.parse import quote

import numpy as np
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import inspect
from sqlalchemy.sql import text

#### Original Gender Data

In [2]:
# Location of the raw gender vaccination export, relative to this notebook.
csv_path = "./maryland_covid19-gender_vaccination.csv"
# Load the full CSV using pandas' default parsing options.
df = pd.read_csv(filepath_or_buffer=csv_path)

In [3]:
# Preview the first five rows of the freshly loaded frame.
df.head(n=5)

Unnamed: 0,OBJECTID,VACCINATION_DATE,Gender,FirstDoseDaily,FirstDoseCumulative,SecondDoseDaily,SecondDoseCumulative,SingleDoseDaily,SingleDoseCumulative
0,1,2021/04/13 15:00:00+00,Federal Entities,84295,84295,54201.0,54201,2952.0,2952
1,2,2021/04/14 15:00:00+00,Federal Entities,557,84852,11.0,54212,1.0,2953
2,3,2021/04/15 15:00:00+00,Federal Entities,623,85475,823.0,55035,19.0,2972
3,4,2021/04/16 15:00:00+00,Federal Entities,782,86257,1016.0,56051,8.0,2980
4,5,2021/04/17 15:00:00+00,Federal Entities,1136,87393,1177.0,57228,6.0,2986


#### Resetting Index Values

In [4]:
# Rebuild the index as a clean 0..n-1 range; drop=True discards the old
# labels rather than keeping them as an extra column.
df = df.reset_index(drop=True)
df.head(n=5)

Unnamed: 0,OBJECTID,VACCINATION_DATE,Gender,FirstDoseDaily,FirstDoseCumulative,SecondDoseDaily,SecondDoseCumulative,SingleDoseDaily,SingleDoseCumulative
0,1,2021/04/13 15:00:00+00,Federal Entities,84295,84295,54201.0,54201,2952.0,2952
1,2,2021/04/14 15:00:00+00,Federal Entities,557,84852,11.0,54212,1.0,2953
2,3,2021/04/15 15:00:00+00,Federal Entities,623,85475,823.0,55035,19.0,2972
3,4,2021/04/16 15:00:00+00,Federal Entities,782,86257,1016.0,56051,8.0,2980
4,5,2021/04/17 15:00:00+00,Federal Entities,1136,87393,1177.0,57228,6.0,2986


#### Resetting object ID values

In [5]:
# Renumber OBJECTID as a consecutive 1..n sequence, one value per row.
df["OBJECTID"] = np.arange(1, len(df) + 1)
df.head(n=5)

Unnamed: 0,OBJECTID,VACCINATION_DATE,Gender,FirstDoseDaily,FirstDoseCumulative,SecondDoseDaily,SecondDoseCumulative,SingleDoseDaily,SingleDoseCumulative
0,1,2021/04/13 15:00:00+00,Federal Entities,84295,84295,54201.0,54201,2952.0,2952
1,2,2021/04/14 15:00:00+00,Federal Entities,557,84852,11.0,54212,1.0,2953
2,3,2021/04/15 15:00:00+00,Federal Entities,623,85475,823.0,55035,19.0,2972
3,4,2021/04/16 15:00:00+00,Federal Entities,782,86257,1016.0,56051,8.0,2980
4,5,2021/04/17 15:00:00+00,Federal Entities,1136,87393,1177.0,57228,6.0,2986


In [6]:
# Split each timestamp string at its first space: column 0 receives the
# date part, column 1 the remaining time/offset text.
columnsplit = df["VACCINATION_DATE"].str.split(pat=" ", n=1, expand=True)
# Keep only the date part in VACCINATION_DATE.
df = df.assign(VACCINATION_DATE=columnsplit.iloc[:, 0])
df.head(n=5)

Unnamed: 0,OBJECTID,VACCINATION_DATE,Gender,FirstDoseDaily,FirstDoseCumulative,SecondDoseDaily,SecondDoseCumulative,SingleDoseDaily,SingleDoseCumulative
0,1,2021/04/13,Federal Entities,84295,84295,54201.0,54201,2952.0,2952
1,2,2021/04/14,Federal Entities,557,84852,11.0,54212,1.0,2953
2,3,2021/04/15,Federal Entities,623,85475,823.0,55035,19.0,2972
3,4,2021/04/16,Federal Entities,782,86257,1016.0,56051,8.0,2980
4,5,2021/04/17,Federal Entities,1136,87393,1177.0,57228,6.0,2986


#### Converting date from Object to Date format

In [7]:
# NOTE: `dt` is not referenced in this cell; the import is kept as-is in
# case other cells rely on it.
import datetime as dt
# Parse the "YYYY/MM/DD" strings into datetime64 values.
parsed_dates = pd.to_datetime(df["VACCINATION_DATE"], format="%Y/%m/%d")
df["VACCINATION_DATE"] = parsed_dates
df.head(n=5)

Unnamed: 0,OBJECTID,VACCINATION_DATE,Gender,FirstDoseDaily,FirstDoseCumulative,SecondDoseDaily,SecondDoseCumulative,SingleDoseDaily,SingleDoseCumulative
0,1,2021-04-13,Federal Entities,84295,84295,54201.0,54201,2952.0,2952
1,2,2021-04-14,Federal Entities,557,84852,11.0,54212,1.0,2953
2,3,2021-04-15,Federal Entities,623,85475,823.0,55035,19.0,2972
3,4,2021-04-16,Federal Entities,782,86257,1016.0,56051,8.0,2980
4,5,2021-04-17,Federal Entities,1136,87393,1177.0,57228,6.0,2986


In [8]:
# Inspect the column dtypes to confirm VACCINATION_DATE was converted to datetime64.
df.dtypes

OBJECTID                         int64
VACCINATION_DATE        datetime64[ns]
Gender                          object
FirstDoseDaily                   int64
FirstDoseCumulative              int64
SecondDoseDaily                float64
SecondDoseCumulative             int64
SingleDoseDaily                float64
SingleDoseCumulative             int64
dtype: object

#### Renaming OBJECTID and VACCINATION_DATE columns

In [9]:
# Shorten the key column names: OBJECTID -> ID, VACCINATION_DATE -> DATE.
df = df.rename(columns={"OBJECTID": "ID", "VACCINATION_DATE": "DATE"})
df.head(n=5)

Unnamed: 0,ID,DATE,Gender,FirstDoseDaily,FirstDoseCumulative,SecondDoseDaily,SecondDoseCumulative,SingleDoseDaily,SingleDoseCumulative
0,1,2021-04-13,Federal Entities,84295,84295,54201.0,54201,2952.0,2952
1,2,2021-04-14,Federal Entities,557,84852,11.0,54212,1.0,2953
2,3,2021-04-15,Federal Entities,623,85475,823.0,55035,19.0,2972
3,4,2021-04-16,Federal Entities,782,86257,1016.0,56051,8.0,2980
4,5,2021-04-17,Federal Entities,1136,87393,1177.0,57228,6.0,2986


In [10]:
# Count the missing values in each column.
df.isnull().sum()

ID                        0
DATE                      0
Gender                    0
FirstDoseDaily            0
FirstDoseCumulative       0
SecondDoseDaily          33
SecondDoseCumulative      0
SingleDoseDaily         250
SingleDoseCumulative      0
dtype: int64

#### Replacing NaN values with 0

In [11]:
# Substitute 0 for every NaN in the frame.
df = df.replace(to_replace=np.nan, value=0)
# Display the resulting frame.
df

Unnamed: 0,ID,DATE,Gender,FirstDoseDaily,FirstDoseCumulative,SecondDoseDaily,SecondDoseCumulative,SingleDoseDaily,SingleDoseCumulative
0,1,2021-04-13,Federal Entities,84295,84295,54201.0,54201,2952.0,2952
1,2,2021-04-14,Federal Entities,557,84852,11.0,54212,1.0,2953
2,3,2021-04-15,Federal Entities,623,85475,823.0,55035,19.0,2972
3,4,2021-04-16,Federal Entities,782,86257,1016.0,56051,8.0,2980
4,5,2021-04-17,Federal Entities,1136,87393,1177.0,57228,6.0,2986
...,...,...,...,...,...,...,...,...,...
500,501,2021-05-19,Male,12041,1325837,13859.0,1039781,909.0,108800
501,502,2021-05-20,Male,7734,1333571,12633.0,1052414,1001.0,109801
502,503,2021-05-21,Male,7379,1340950,11452.0,1063866,787.0,110588
503,504,2021-05-22,Male,5826,1346776,6457.0,1070323,613.0,111201


#### Connecting to DB

In [12]:
# The database password lives in a local config module rather than in the notebook.
from config import password

# Percent-encode the password before embedding it in the URL: characters
# such as "@", ":", "/" or "%" would otherwise be read as URL delimiters
# and corrupt the parsed credentials. safe="" encodes every reserved
# character; str() keeps non-string passwords working as the f-string did.
connection_string = f"udxenurz:{quote(str(password), safe='')}@batyr.db.elephantsql.com/udxenurz"
engine = create_engine(f'postgresql://{connection_string}')

In [13]:
# List the tables visible through this engine. Engine.table_names() is
# deprecated in SQLAlchemy 1.4 and removed in 2.0; the Inspector returns
# the same list of names on every version.
inspect(engine).get_table_names()

['cases', 'vaccinations', 'gender']

In [14]:
# Empty the gender table before reloading it. CASCADE also truncates any
# table holding a foreign key that references gender. No identity reset is
# requested here (the statement has no RESTART IDENTITY clause).
#
# engine.begin() opens a transaction that commits when the block exits
# without error. A plain engine.connect() relies on SQLAlchemy 1.x
# autocommit; on 2.0 the TRUNCATE would be rolled back when the connection
# closes, and the append that follows would duplicate rows.
with engine.begin() as con:
    statement = [text("""Truncate table gender CASCADE""")]
    for query in statement:
        con.execute(query)

In [15]:
# Append the cleaned rows to the existing gender table; the DataFrame
# index is not written as a column.
df.to_sql(
    name="gender",
    con=engine,
    index=False,
    if_exists="append",
)