## This Notebook:

1) links to the AWS database

2) cleans the AWS database

3) creates a local path for the output Cases_Cleaned/ML_cases.csv

4) creates a local path for the output Deaths_Cleaned/ML_deaths.csv

5) ML model for cases reads ML_cases.csv

6) ML model for cases saves output to user-defined location

5) ML model for deaths reads ML_deaths.csv

6) ML model for deaths saves output to user-defined location

7) creates PostgresSQL database for machine learning models

8) option to write model results information directly to a csv file


<div style="background:lightblue">

# Navigator

## [Start](#Notebook-and-Run-Count-Information)

## [Database Cleaner](#AWS-Database-Cleaner)

## [Machine Learning Model](#TITLE:-cases)

## [Database Structure](#PostgresSQL-Database)

## [Export to Database](#Dataframes-to-PostgreSQL-tables)

</div>

## Notebook and Run Count Information 

In [2]:
#results database information
# ********************************

name_nb = "ML_pn_rev1"

run_nb = {}
run_counter = 18
run_counter = run_counter + 1
run_nb['notebook'] = run_counter
run_nb

# ********************************

{'notebook': 19}

## AWS Database Cleaner


**RELEVANT DATAFRAMES:  df, df_cases, df_deaths**

**FILE: vax_cases_death.csv**

**SOURCE:  AWS download from SQL database**

In [3]:
# import dependencies

import os
import pandas as pd
import re
from sklearn.preprocessing import OneHotEncoder, LabelEncoder


In [4]:
# Clean file, save information for results database, make csv files for machine learning model

# read file

file_path = "https://initial-datasets.s3.amazonaws.com/vax_cases_deaths.csv"
df = pd.read_csv(file_path)


# results database information
# *********************************

if file_path == "https://initial-datasets.s3.amazonaws.com/vax_cases_deaths.csv":
    source_db = "AWS database csv file"
    file_id = file_path

# the statistics dataset used for the label column (name_statsfile)

name_statsfile = "stats_Xb_cases_ds"

# the statistic used for the setting the label column (name_statistic)

name_statistic = "mean"

# **********************************
 

# Location to string

df["location"] = df["location"].astype(str)



# Delete the column "submission_date."

df.drop(columns = ["date"], inplace = True)


# Add the label columns to df. 

df["2020_mean_cases"] = 0
df["2020_mean_deaths"] = 0


# Make lists for mean values

# mean cases for 2020 and for 2020 and 2021 combined

mean_cases = [90135, 325018]

mean_cases_value = mean_cases[1]

# mean deaths for 2020 and for 2020 and 2021 combined

mean_deaths = [2634, 6365]

mean_deaths_value = mean_deaths[1]



# Populate "2020_mean_cases" with 1 or 0 
# Populate "2020_mean_deaths" with 1 or 0 

# cases

for index, row in df.iterrows():
    x = row["total_cases"]
    if x >= int(mean_cases_value):
        df.loc[index, "2020_mean_cases"]=1
    else:
        df.loc[index, "2020_mean_cases"]=0

# deaths

for index, row in df.iterrows():
    x = row["total_deaths"]
    if x >= int(mean_deaths_value):
        df.loc[index, "2020_mean_deaths"]=1
    else:
        df.loc[index, "2020_mean_deaths"]=0

#delete columns "total_cases" and "total_deaths"

df.drop(columns = ["total_cases", "total_deaths"], inplace = True)



# Perform OneHotEncoding

# import dependencies

obj_list = df.dtypes[df.dtypes == "object"].index.to_list()

# apply OneHotEncoder to objects

enc = OneHotEncoder(sparse = False)
encoded_df = pd.DataFrame(enc.fit_transform(df[obj_list]))
encoded_df.columns = enc.get_feature_names(obj_list)

df = df.merge(encoded_df, left_index = True, right_index = True)
df = df.drop(obj_list, 1)



# Make dataframes for cases and deaths

# make a new dataframe for cases only.

# first reorder columns

columns_cases = ['mmwr_week', 'year', 'distributed', 'administered', '2020_mean_cases',
       '2020_mean_deaths', 'location_AK', 'location_AL', 'location_AR',
       'location_AZ', 'location_CA', 'location_CO', 'location_CT',
       'location_DC', 'location_DE', 'location_FL', 'location_GA',
       'location_HI', 'location_IA', 'location_ID', 'location_IL',
       'location_IN', 'location_KS', 'location_KY', 'location_LA',
       'location_MA', 'location_MD', 'location_ME', 'location_MI',
       'location_MN', 'location_MO', 'location_MS', 'location_MT',
       'location_NC', 'location_ND', 'location_NE', 'location_NH',
       'location_NJ', 'location_NM', 'location_NV', 'location_NY',
       'location_OH', 'location_OK', 'location_OR', 'location_PA',
       'location_RI', 'location_SC', 'location_SD', 'location_TN',
       'location_TX', 'location_UT', 'location_VA', 'location_VI',
       'location_VT', 'location_WA', 'location_WI', 'location_WV',
       'location_WY']

columns_cases_new = [ 'year', 'mmwr_week', 'distributed', 'administered', 
        'location_AK', 'location_AL', 'location_AR',
       'location_AZ', 'location_CA', 'location_CO', 'location_CT',
       'location_DC', 'location_DE', 'location_FL', 'location_GA',
       'location_HI', 'location_IA', 'location_ID', 'location_IL',
       'location_IN', 'location_KS', 'location_KY', 'location_LA',
       'location_MA', 'location_MD', 'location_ME', 'location_MI',
       'location_MN', 'location_MO', 'location_MS', 'location_MT',
       'location_NC', 'location_ND', 'location_NE', 'location_NH',
       'location_NJ', 'location_NM', 'location_NV', 'location_NY',
       'location_OH', 'location_OK', 'location_OR', 'location_PA',
       'location_RI', 'location_SC', 'location_SD', 'location_TN',
       'location_TX', 'location_UT', 'location_VA', 'location_VI',
       'location_VT', 'location_WA', 'location_WI', 'location_WV',
       'location_WY', '2020_mean_cases', '2020_mean_deaths']

df = df.reindex(columns = columns_cases_new )

# next drop out death-related column for df_cases

df_cases = df.copy()
df_cases.drop(columns = ['2020_mean_deaths'], inplace = True)

# next drop out cases-related column for df_deaths

df_deaths = df.copy()
df_deaths.drop(columns = ['2020_mean_cases'], inplace = True)



# Save dataframes as csv files for folders Cases_Cleaned and Deaths_Cleaned

# Save df_cases as csv file.

os.makedirs("Cases_Cleaned/",exist_ok=True)
df_cases.to_csv('Cases_Cleaned/ML_cases_vcd.csv', index = False)

# Save df_deaths as csv file.

os.makedirs("Deaths_Cleaned/",exist_ok=True)
df_deaths.to_csv('Deaths_Cleaned/ML_deaths_vcd.csv', index = False)



#results database information
# *********************************

casesfile_id = f"ML_cases_vcd.csv_{run_counter}"
deathsfile_id = f"ML_deaths_vcd.csv_{run_counter}"

# *********************************


## Machine Learning Model

### TITLE: cases

**MODEL: RandomForest**

**FILE:  Cases_Cleaned/ML_cases_vcd.csv**

**RELEVANT DATAFRAMES:  CR_cases_df, df_importance_cases**


In [5]:
# Initial imports

import pandas as pd
from path import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn import metrics


In [6]:
# Define input parameters, save information for the results database
# Same input parameters are used for both the cases and deaths machine learning parameters
#
# Functions defined here are input_deck, input_params, RandomForestClassifier



#results database information
# **********************************

type_model_cases = "Random Forest"
name_model_cases = "cases"

# *********************************

# Loading data
file_path = Path("Cases_Cleaned/ML_cases_vcd.csv")
df_cases = pd.read_csv(file_path)

# Define the features set.
X = df_cases.copy()
X = X.drop("2020_mean_cases", axis=1)

# Define the target set.
y = df_cases["2020_mean_cases"].ravel()

# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Creating a StandardScaler instance.
scaler = StandardScaler()

# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# set input deck to be used for both cases and deaths

def input_deck(n):
    
# format is [n_estimators, random_state, criterion, max_depth, max_features, min_impurity_decrease, oob_score]

    rf_input = [
        
        [128, 78, 'gini', None, 'auto', 0.0, False],
        [128, 78, 'gini', None, 'auto', 0.0, True],
        [128, 78, 'entropy', None, 'auto', 0.0, False],
        [128, 78, 'entropy', None, 'auto', 0.0, True],
        [128, 78, 'gini', 10, 'sqrt', 0.0, False],
        [128, 78, 'gini', 10, 'sqrt', 0.0, True],
        [128, 78, 'entropy', 10, 'sqrt', 0.0, False],
        [128, 78, 'entropy', 10, 'sqrt', 0.0, True],
        [128, 78, 'gini', None, 'sqrt', 0.02, False],
        [128, 78, 'gini', None, 'sqrt', 0.02, True],
        [128, 78, 'entropy', None, 'sqrt', 0.02, False],
        [128, 78, 'entropy', None, 'sqrt', 0.02, True],
        [128, 78, 'entropy', 10, 'sqrt', 0.0, True],
        [128, 78, 'gini', None, 'sqrt', 0.5, False],
        [128, 78, 'gini', None, 'sqrt', 0.5, True]
        
    ]
    
    rf_input_params = rf_input[n]
    
    return rf_input_params

def input_params(n):

    n_estimators = input_deck(n)[0]
    random_state = input_deck(n)[1]
    criterion = input_deck(n)[2]
    max_depth = input_deck(n)[3]
    max_features = input_deck(n)[4]
    min_impurity_decrease = input_deck(n)[5]
    oob_score = input_deck(n)[6]
    
    return n_estimators, random_state, criterion, max_depth, max_features, min_impurity_decrease, oob_score 

# set the input parameters

n_estimators, random_state, criterion, max_depth, max_features, min_impurity_decrease, oob_score = input_params(5)



In [7]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state,
                                  criterion=criterion,
                                  max_depth=max_depth, max_features =max_features,
                                  min_impurity_decrease = min_impurity_decrease,
                                 oob_score = oob_score) 


#results database information
# ************************************************************

run_dt = pd.to_datetime('now').strftime('%Y-%m-%d %H:%M:%S')

#parameter names used in the arguments
# n_estimators=128
# random_state=78
# criterion = 'gini' or 'entropy'
# max_depth = None or 10
# max_features = 'auto' or 'sqrt'
# min_impurity_decrease = 0.0 or a fraction
# oob_score = False or True

rf_pars = rf_model.get_params()
rf_n_estimators = rf_pars['n_estimators']
rf_random_state = rf_pars['random_state']
rf_criterion = rf_pars['criterion']
rf_max_depth = rf_pars['max_depth']
rf_max_features = rf_pars['max_features']
rf_min_impurity_decrease = rf_pars['min_impurity_decrease']
rf_oob_score = rf_pars['oob_score']


par_name_1 = f"n_estimators={rf_n_estimators}"
par_name_2 = f"random_state={rf_random_state}"
par_name_3 = f"criterion={rf_criterion}"
par_name_4 = f"max_depth={rf_max_depth}"
par_name_5 = f"max_features={rf_max_features}"
par_name_6 = f"max_depth={rf_min_impurity_decrease}"
par_name_7 = f"max_features={rf_oob_score}"

# **********************************************************

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

#results database information
# *************************************************

CM_A0P0_cases= cm_df.loc["Actual 0", "Predicted 0"]
CM_A0P1_cases= cm_df.loc["Actual 0", "Predicted 1"]
CM_A1P0_cases= cm_df.loc["Actual 1", "Predicted 0"]
CM_A1P1_cases= cm_df.loc["Actual 1", "Predicted 1"]

# *************************************************


# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

#results database information
# *********************************

acc_score_cases = acc_score

# make a dataframe from the classification report

def get_classification_report(y_test, y_pred):
    
    # Source: https://
    # stackoverflow.com/questions/39662398/scikit-learn-output-metrics-classification-report-into-csv-
    # tab-delimited-format
    
    report = metrics.classification_report(y_test, y_pred, output_dict=True)
    df_classification_report = pd.DataFrame(report).transpose()
    df_classification_report = df_classification_report.sort_values(by=['f1-score'], ascending=False)
    return df_classification_report

CR_cases_df = get_classification_report(y_test, predictions)

CR_P0_cases = CR_cases_df.loc['0', 'precision']
CR_P1_cases = CR_cases_df.loc['1', 'precision']
CR_R0_cases = CR_cases_df.loc['0', 'recall']
CR_R1_cases = CR_cases_df.loc['1', 'recall']
CR_f1_0_cases = CR_cases_df.loc['0', 'f1-score']
CR_f1_1_cases = CR_cases_df.loc['1', 'f1-score']

# *********************************

# sort the features by their importance

imp_list = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
df_importance_cases = pd.DataFrame(imp_list)
df_importance_cases.rename(columns = {0 :'Importance_cases'}, inplace = True)
df_importance_cases.rename(columns = {1 :'Feature_cases'}, inplace = True)
df_importance_cases['notebook'] = run_nb['notebook']
df_importance_cases['run_dt'] = run_dt
cols_imp = ['notebook', 'run_dt', 'Feature_cases', 'Importance_cases']
df_importance_cases = df_importance_cases.reindex(columns = cols_imp)



In [8]:
# Displaying results

print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
rep = classification_report(y_test, predictions)
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,245,4
Actual 1,21,328


Accuracy Score : 0.9581939799331104
Classification Report
              precision    recall  f1-score   support

           0       0.92      0.98      0.95       249
           1       0.99      0.94      0.96       349

    accuracy                           0.96       598
   macro avg       0.95      0.96      0.96       598
weighted avg       0.96      0.96      0.96       598



## Machine Learning Model

**TITLE: deaths**

**MODEL: RandomForest**

**FILE:  Cases_Cleaned/ML_deaths_vcd.csv**

**RELEVANT DATAFRAMES:  CR_death_df, df_importance_death**

In [9]:
# Save information for the results database
# 
# Functions defined here are RandomForestClassifier



#results database information
# *************************************

model_id = 1

type_model_deaths = "Random Forest"
name_model_deaths = "deaths"

# *************************************

# Loading data
file_path = Path("Deaths_Cleaned/ML_deaths_vcd.csv")
df_deaths = pd.read_csv(file_path)

# Define the features set.
X = df_deaths.copy()
X = X.drop("2020_mean_deaths", axis=1)

# Define the target set.
y = df_deaths["2020_mean_deaths"].ravel()

# Splitting into Train and Test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Create a random forest classifier.

rf_model = RandomForestClassifier(n_estimators=n_estimators, random_state=random_state,
                                  criterion=criterion,
                                  max_depth=max_depth, max_features =max_features,
                                  min_impurity_decrease = min_impurity_decrease,
                                 oob_score = oob_score) 

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)

# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

#results database information
# *************************************************

CM_A0P0_death= cm_df.loc["Actual 0", "Predicted 0"]
CM_A0P1_death= cm_df.loc["Actual 0", "Predicted 1"]
CM_A1P0_death= cm_df.loc["Actual 1", "Predicted 0"]
CM_A1P1_death= cm_df.loc["Actual 1", "Predicted 1"]

# *************************************************

# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

#results database information
# **************************************

acc_score_death = acc_score

# make a dataframe from the classification report

def get_classification_report(y_test, y_pred):
    # Source: https://
    # stackoverflow.com/questions/39662398/scikit-learn-output-metrics-classification-report-into-csv-
    # tab-delimited-format
    report = metrics.classification_report(y_test, y_pred, output_dict=True)
    df_classification_report = pd.DataFrame(report).transpose()
    df_classification_report = df_classification_report.sort_values(by=['f1-score'], ascending=False)
    return df_classification_report

CR_death_df = get_classification_report(y_test, predictions)

CR_P0_death = CR_death_df.loc['0', 'precision']
CR_P1_death = CR_death_df.loc['1', 'precision']
CR_R0_death = CR_death_df.loc['0', 'recall']
CR_R1_death = CR_death_df.loc['1', 'recall']
CR_f1_0_death = CR_death_df.loc['0', 'f1-score']
CR_f1_1_death = CR_death_df.loc['1', 'f1-score']

# ********************************************

# sort the features by their importance

imp_list = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
df_importance_death = pd.DataFrame(imp_list)
df_importance_death.rename(columns = {0 :'Importance_death'}, inplace = True)
df_importance_death.rename(columns = {1 :'Feature_death'}, inplace = True)
df_importance_death['notebook'] = run_nb['notebook']
df_importance_death['run_dt'] = run_dt
cols_imp = ['notebook', 'run_dt', 'Feature_death', 'Importance_death']
df_importance_death = df_importance_death.reindex(columns = cols_imp)



In [10]:
# Displaying results

print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,271,9
Actual 1,38,280


Accuracy Score : 0.9214046822742475
Classification Report
              precision    recall  f1-score   support

           0       0.88      0.97      0.92       280
           1       0.97      0.88      0.92       318

    accuracy                           0.92       598
   macro avg       0.92      0.92      0.92       598
weighted avg       0.93      0.92      0.92       598



## PostgresSQL Database

### Database to hold machine learning results

#### Create 4 Dataframes for Importing into PostgreSQL Database

In [13]:
# option to manually set the run counter here to start at a later notebook number
# all results database information are used to create input for the PostgresSQL database here

if run_counter == 19:

    
# df_model
   
    name_nb_dict = {"name_nb":name_nb}
    run_dt_dict = {"run_dt":run_dt}
    run_nb_dict = run_nb
    source_db_dict = {"source_db":source_db}
    file_id_dict = {"file_id":file_id}
    model_id_dict = {"model_id":model_id}
    type_model_cases_dict = {"type_model_cases":type_model_cases}
    type_model_deaths_dict = {"type_model_deaths":type_model_deaths}
    name_model_cases_dict = {"name_model_cases":name_model_cases}
    name_model_deaths_dict = {"name_model_deaths":name_model_deaths}
    par_name_1_dict = {"par_name_1":par_name_1}
    par_name_2_dict = {"par_name_2":par_name_2}
    par_name_3_dict = {"par_name_3":par_name_3}
    par_name_4_dict = {"par_name_4":par_name_4}
    par_name_5_dict = {"par_name_5":par_name_5}
    par_name_6_dict = {"par_name_6":par_name_6}
    par_name_7_dict = {"par_name_7":par_name_7}
    casesfile_id_dict = {"casesfile_id":casesfile_id}
    deathsfile_id_dict = {"deathsfile_id":deathsfile_id}
    name_statsfile_dict = {"name_statsfile":name_statsfile}
    name_statistic_dict = {"name_statistic":name_statistic}
  
    
    data = [run_nb_dict,name_nb_dict, run_dt_dict, source_db_dict, file_id_dict, model_id_dict,
            type_model_cases_dict, type_model_deaths_dict,  name_model_cases_dict, name_model_deaths_dict,
            par_name_1_dict, par_name_2_dict, par_name_3_dict, par_name_4_dict, par_name_5_dict,
            par_name_6_dict, par_name_7_dict, casesfile_id_dict, deathsfile_id_dict,
            name_statsfile_dict, name_statistic_dict]
    
    data_merged = {}
    for x in data:
        data_merged.update(x)
    data_list = [data_merged]
    
    df_model = pd.DataFrame(data_list)
    
    
# df_model_results

    results_dict ={

        'notebook': run_nb_dict['notebook'],
        'run_dt':run_dt_dict['run_dt'],
        'CM_A0P0_cases':CM_A0P0_cases,
        'CM_A0P1_cases':CM_A0P1_cases,
        'CM_A1P0_cases':CM_A1P0_cases,
        'CM_A1P1_cases':CM_A1P1_cases,
        'CM_A0P0_death':CM_A0P0_death,
        'CM_A0P1_death':CM_A0P1_death,
        'CM_A1P0_death':CM_A1P0_death,
        'CM_A1P1_death':CM_A1P1_death,
        'acc_score_cases':acc_score_cases,
        'acc_score_death':acc_score_death,
        'CR_P0_cases':CR_P0_cases,
        'CR_P1_cases':CR_P1_cases,
        'CR_R0_cases':CR_R0_cases,
        'CR_R1_cases':CR_R1_cases,
        'CR_f1_0_cases':CR_f1_0_cases,
        'CR_f1_1_cases':CR_f1_1_cases,
        'CR_P0_death':CR_P0_death,
        'CR_P1_death':CR_P1_death,
        'CR_R0_death':CR_R0_death,
        'CR_R1_death':CR_R1_death,
        'CR_f1_0_death':CR_f1_0_death,
        'CR_f1_1_death':CR_f1_1_death

    }

    results_list = [results_dict]
    df_model_results = pd.DataFrame(results_list)

# df_model_importances

    df_model_importances = pd.merge(df_importance_cases, df_importance_death, left_index =True, right_index=True)
    df_model_importances.drop(columns=["notebook_y", "Feature_death"], inplace = True)
    df_model_importances.rename(columns = {'notebook_x':'notebook','Feature_cases':"Feature"}, inplace = True)
    df_model_importances.drop(columns = ["run_dt_y"], inplace = True)
    df_model_importances.rename(columns = {'run_dt_x':'run_dt'}, inplace = True)
    

# initialize the new dataframes

    df_model_new = df_model.copy()
    df_model_results_new = df_model_results.copy()
    df_model_importances_new = df_model_importances.copy()

# saved copies for resetting the dataframes

    df_model_first_run = df_model.copy()
    df_model_results_first_run = df_model_results.copy()
    df_model_importances_first_run = df_model_importances.copy()
        
else:
    
# dataframes for run_counter > 1

# df_model

   
    name_nb_dict = {"name_nb":name_nb}
    run_dt_dict = {"run_dt":run_dt}
    run_nb_dict = run_nb
    source_db_dict = {"source_db":source_db}
    file_id_dict = {"file_id":file_id}
    model_id_dict = {"model_id":model_id}
    type_model_cases_dict = {"type_model_cases":type_model_cases}
    type_model_deaths_dict = {"type_model_deaths":type_model_deaths}
    name_model_cases_dict = {"name_model_cases":name_model_cases}
    name_model_deaths_dict = {"name_model_deaths":name_model_deaths}
    par_name_1_dict = {"par_name_1":par_name_1}
    par_name_2_dict = {"par_name_2":par_name_2}
    par_name_3_dict = {"par_name_3":par_name_3}
    par_name_4_dict = {"par_name_4":par_name_4}
    par_name_5_dict = {"par_name_5":par_name_5}
    par_name_6_dict = {"par_name_6":par_name_6}
    par_name_7_dict = {"par_name_7":par_name_7}
    casesfile_id_dict = {"casesfile_id":casesfile_id}
    deathsfile_id_dict = {"deathsfile_id":deathsfile_id}
    name_statsfile_dict = {"name_statsfile":name_statsfile}
    name_statistic_dict = {"name_statistic":name_statistic}
    
    data = [run_nb_dict,name_nb_dict, run_dt_dict, source_db_dict, file_id_dict, model_id_dict,
            type_model_cases_dict, type_model_deaths_dict,  name_model_cases_dict, name_model_deaths_dict,
            par_name_1_dict, par_name_2_dict, par_name_3_dict, par_name_4_dict, par_name_5_dict,
            par_name_6_dict, par_name_7_dict, casesfile_id_dict, deathsfile_id_dict,
            name_statsfile_dict,  name_statistic_dict]
    
    data_merged = {}
    for x in data:
        data_merged.update(x)
    data_list = [data_merged]
    
    df_model = pd.DataFrame(data_list)
    

# df_model_results

    
    results_dict ={

        'notebook': run_nb_dict['notebook'],
        'run_dt':run_dt_dict['run_dt'],
        'CM_A0P0_cases':CM_A0P0_cases,
        'CM_A0P1_cases':CM_A0P1_cases,
        'CM_A1P0_cases':CM_A1P0_cases,
        'CM_A1P1_cases':CM_A1P1_cases,
        'CM_A0P0_death':CM_A0P0_death,
        'CM_A0P1_death':CM_A0P1_death,
        'CM_A1P0_death':CM_A1P0_death,
        'CM_A1P1_death':CM_A1P1_death,
        'acc_score_cases':acc_score_cases,
        'acc_score_death':acc_score_death,
        'CR_P0_cases':CR_P0_cases,
        'CR_P1_cases':CR_P1_cases,
        'CR_R0_cases':CR_R0_cases,
        'CR_R1_cases':CR_R1_cases,
        'CR_f1_0_cases':CR_f1_0_cases,
        'CR_f1_1_cases':CR_f1_1_cases,
        'CR_P0_death':CR_P0_death,
        'CR_P1_death':CR_P1_death,
        'CR_R0_death':CR_R0_death,
        'CR_R1_death':CR_R1_death,
        'CR_f1_0_death':CR_f1_0_death,
        'CR_f1_1_death':CR_f1_1_death

    }

    results_list = [results_dict]
    df_model_results = pd.DataFrame(results_list)


# df_model_importances

    df_model_importances = pd.merge(df_importance_cases, df_importance_death, left_index =True, right_index=True)
    df_model_importances.drop(columns=["notebook_y", "Feature_death"], inplace = True)
    df_model_importances.rename(columns = {'notebook_x':'notebook','Feature_cases':"Feature"}, inplace = True)
    df_model_importances.drop(columns = ["run_dt_y"], inplace = True)
    df_model_importances.rename(columns = {'run_dt_x':'run_dt'}, inplace = True)


# concat dataframes

    df_model_new = pd.concat([df_model_new, df_model], ignore_index = True)
    # df_set_stats_new = pd.concat([df_set_stats_new, df_set_stats], ignore_index = True)
    df_model_results_new = pd.concat([df_model_results_new, df_model_results],ignore_index = True)
    df_model_importances_new = pd.concat([df_model_importances_new, df_model_importances], ignore_index = True)


# the 3 dataframes to be put into PostgresSql:
#
# df_model_new
# df_model_results_new
# df_model_importances_new
#
# Note: df_set_stats_new is NOT used in this notebook



In [14]:
# View dataframes for input into the results database  
    
df_model_new

Unnamed: 0,notebook,name_nb,run_dt,source_db,file_id,model_id,type_model_cases,type_model_deaths,name_model_cases,name_model_deaths,...,par_name_2,par_name_3,par_name_4,par_name_5,par_name_6,par_name_7,casesfile_id,deathsfile_id,name_statsfile,name_statistic
0,19,ML_pn_rev1,2021-11-28 16:59:09,AWS database csv file,https://initial-datasets.s3.amazonaws.com/vax_...,1,Random Forest,Random Forest,cases,deaths,...,random_state=78,criterion=gini,max_depth=10,max_features=sqrt,max_depth=0.0,max_features=True,ML_cases_vcd.csv_19,ML_deaths_vcd.csv_19,stats_Xb_cases_ds,mean


In [131]:

# Option to append the dataframes to a csv file (not used for appending to a csv file created from PostgreSQL)

#df_model_new.to_csv('rfinput_optimal.csv', mode = 'a', index = False, header = False)
#df_model_results_new.to_csv('rfresult_optimal.csv', mode = 'a', index = False, header = False)
#df_model_importances_new.to_csv('rfimportance_optimal.csv', mode = 'a', index = False, header = False)


### Dataframes to PostgreSQL tables


In [405]:
from sqlalchemy import create_engine

In [406]:
from config import db_password

In [407]:
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/MLmodels"

In [408]:
engine = create_engine(db_string)

In [410]:
df_model_new.to_sql(name='mlinputs_2', con=engine)
#df_model_results_new.to_sql(name = 'rfresults', con=engine, if_exists='append')
#df_model_importances_new.to_sql(name = 'rfimportances', con = engine, if_exists='append')

<div style="background:lightblue">
    
# Navigator

## [Start](#Notebook-and-Run-Count-Information)

## [Database Cleaner](#AWS-Database-Cleaner)

## [Machine Learning Model](#TITLE:-cases)

## [Database Structure](#PostgresSQL-Database)

## [Export to Database](#Dataframes-to-PostgreSQL-tables)
    
</div>

<div class="alert alert-block alert-danger">


## STOP BEFORE RUNNING THE NEXT CELL:   RESET OPTION
    
ARE YOU SURE YOU WANT TO RESET?
ALL RUNS AFTER THE FIRST WILL BE GONE!
    
</div>

In [120]:
# RESET OF DATAFRAMES TO THE FIRST RUN.  ONLY RESET IF NEEDED.
#



reset_dataframes = False
if reset_dataframes == True:
    df_model_new = df_model_first_run
    df_model_results_new = df_model_results_first_run
    df_model_importances_new = df_model_importances_first_run 
    
