<a href="https://colab.research.google.com/github/david-garza/final_project/blob/pickle/machine_learning/ml_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import dependencies
import pandas as pd

# Import scikit-learn modules
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Classifier Models
from sklearn.ensemble import AdaBoostClassifier


# Metrics
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Added SQLalchemy
import sqlalchemy as db
from config import password

# Export Model Files
from joblib import dump, load

# Setup Database Connection and Import Data

In [2]:
# Build the PostgreSQL connection URL; the password comes from the local config module.
db_string = f"postgresql://postgres1:{password}@final-project-database.crwsgvv9ibw0.us-east-1.rds.amazonaws.com:5432/final_project_db"

# Create the engine first, then open the connection used by the read cells below.
engine = db.create_engine(db_string)
con = engine.connect()

  """)


## General Data

In [3]:
# Load the joined Galveston table from the database into a DataFrame
data_df = pd.read_sql_table(table_name="galveston_data_join", con=con)

## Load Weather Station Data

In [4]:
# Read the three primary weather-station tables, one DataFrame per station
station_tables = ("weather_station1", "weather_station18", "weather_station54")
wx1_df, wx18_df, wx54_df = (pd.read_sql_table(table, con) for table in station_tables)

# Data Processing and Feature Engineering

## Compute 5 Day averages and sums for WX data

In [5]:
# Rolling 5-row precipitation totals for weather stations 18 and 54.
# NOTE(review): reading 5 rows as 5 days assumes one row per day in date order — confirm.
precip18_window = wx18_df["precipitation18"].rolling(window=5)
wx18_df["5_day_precip18"] = precip18_window.sum()

precip54_window = wx54_df["precipitation54"].rolling(window=5)
wx54_df["5_day_precip54"] = precip54_window.sum()

In [6]:
# Station 1: rolling 5-row precipitation total ...
wx1_df["5_day_precip1"] = wx1_df["precipitation1"].rolling(window=5).sum()

# ... and rolling 5-row means of the average, maximum and minimum temperature columns
temperature_features = (
    ("5_day_temp", "avg_temp1"),
    ("5_day_temp_max", "max_temp1"),
    ("5_day_temp_min", "min_temp1"),
)
for feature_name, source_name in temperature_features:
    wx1_df[feature_name] = wx1_df[source_name].rolling(window=5).mean()

In [7]:
# Drop the raw station-1 measurement columns that are no longer needed.
# `columns=` replaces the positional axis argument (`drop(cols, 1)`), which newer
# pandas releases warn about and pandas 2.0 no longer accepts.
drop_columns = ["avg_temp1", "max_temp1", "min_temp1", "precipitation1", "snowfall1", "snow_depth1"]
wx1_df = wx1_df.drop(columns=drop_columns)

  This is separate from the ipykernel package so we can avoid doing imports until


In [8]:
# Drop the raw precipitation columns from the other two weather-station frames.
# `columns=` replaces the positional axis argument (`drop(col, 1)`), which newer
# pandas releases warn about and pandas 2.0 no longer accepts.
wx18_df = wx18_df.drop(columns="precipitation18")
wx54_df = wx54_df.drop(columns="precipitation54")

  
  This is separate from the ipykernel package so we can avoid doing imports until


## Merge 5 Day Data and Drop Redundant Columns

In [9]:
# Left-join the station 1 frame onto the beach data, matching `date` to `date1`
data_df = pd.merge(data_df, wx1_df, how="left", left_on="date", right_on="date1")
data_df.head(5)

Unnamed: 0,beach_id,beach_name,start_lat,start_long,end_lat,end_long,waterbody_type,station_id,station_name,bacteria_count,...,max_temp1,min_temp1,precipitation1,precipitation54,precipitation18,date1,5_day_precip1,5_day_temp,5_day_temp_max,5_day_temp_min
0,TX767833,Sea Isle,29.157639,-95.011542,29.125974,-95.062028,Open Coast,GAL005,Terramar Beach,40.0,...,53.0,50.0,0.0,,,2007-01-22,1.31,49.8,54.6,47.0
1,TX767833,Sea Isle,29.157639,-95.011542,29.125974,-95.062028,Open Coast,GAL005,Terramar Beach,38.0,...,53.0,50.0,0.0,,,2007-01-22,1.31,49.8,54.6,47.0
2,TX767833,Sea Isle,29.157639,-95.011542,29.125974,-95.062028,Open Coast,GAL007,Sea Isle South,58.0,...,53.0,50.0,0.0,,,2007-01-22,1.31,49.8,54.6,47.0
3,TX767833,Sea Isle,29.157639,-95.011542,29.125974,-95.062028,Open Coast,GAL007,Sea Isle South,48.0,...,53.0,50.0,0.0,,,2007-01-22,1.31,49.8,54.6,47.0
4,TX974690,Jamaica Beach,29.182981,-94.969426,29.176498,-94.980493,Open Coast,GAL014,Jamaica Beach South,64.0,...,53.0,50.0,0.0,,,2007-01-22,1.31,49.8,54.6,47.0


In [10]:
# Left-join the station 18 and station 54 frames, each on its own date column
for station_df, station_date in ((wx18_df, "date18"), (wx54_df, "date54")):
    data_df = data_df.merge(station_df, how="left", left_on="date", right_on=station_date)
data_df.head(5)

Unnamed: 0,beach_id,beach_name,start_lat,start_long,end_lat,end_long,waterbody_type,station_id,station_name,bacteria_count,...,precipitation18,date1,5_day_precip1,5_day_temp,5_day_temp_max,5_day_temp_min,date18,5_day_precip18,date54,5_day_precip54
0,TX767833,Sea Isle,29.157639,-95.011542,29.125974,-95.062028,Open Coast,GAL005,Terramar Beach,40.0,...,,2007-01-22,1.31,49.8,54.6,47.0,NaT,,NaT,
1,TX767833,Sea Isle,29.157639,-95.011542,29.125974,-95.062028,Open Coast,GAL005,Terramar Beach,38.0,...,,2007-01-22,1.31,49.8,54.6,47.0,NaT,,NaT,
2,TX767833,Sea Isle,29.157639,-95.011542,29.125974,-95.062028,Open Coast,GAL007,Sea Isle South,58.0,...,,2007-01-22,1.31,49.8,54.6,47.0,NaT,,NaT,
3,TX767833,Sea Isle,29.157639,-95.011542,29.125974,-95.062028,Open Coast,GAL007,Sea Isle South,48.0,...,,2007-01-22,1.31,49.8,54.6,47.0,NaT,,NaT,
4,TX974690,Jamaica Beach,29.182981,-94.969426,29.176498,-94.980493,Open Coast,GAL014,Jamaica Beach South,64.0,...,,2007-01-22,1.31,49.8,54.6,47.0,NaT,,NaT,


# Reduce Risk Level From 3 to 2 Levels

In [11]:
# Show the distinct risk labels before they are collapsed
data_df.loc[:, "risk_level"].unique()

array(['medium_risk', 'low_risk', 'high_risk'], dtype=object)

In [12]:
# Collapse "medium_risk" into "high_risk" so the target has two levels.
# The result is assigned back to the column: calling replace(..., inplace=True) on
# the Series returned by data_df["risk_level"] is chained in-place mutation, which
# raises a FutureWarning in pandas 2.x and leaves data_df unchanged under Copy-on-Write.
data_df["risk_level"] = data_df["risk_level"].replace("medium_risk", "high_risk")

In [13]:
# Show the distinct risk labels after the replacement
data_df.loc[:, "risk_level"].unique()

array(['high_risk', 'low_risk'], dtype=object)

# Create City Beaches data set

In [14]:
# Restrict the data to the four city beaches (source frame for data sets 4-6)
city_beach_names = ["61st St.", "45th St.", "25th St.", "Stewart Beach"]
is_city_beach = data_df["beach_name"].isin(city_beach_names)
city_beaches_df = data_df[is_city_beach]
city_beaches_df["beach_name"].unique()

array(['45th St.', 'Stewart Beach', '61st St.', '25th St.'], dtype=object)

# Create Datasets for Models

In [15]:
# Data sets 1 (all beaches) and 4 (city beaches) use weather station 1 columns only.
# Rows with missing values and exact duplicate rows are removed.
model_1_columns = [
    "week",
    "station_id",
    "risk_level",
    "5_day_precip1",
    "5_day_temp",
    "5_day_temp_max",
    "5_day_temp_min",
]
model_1_df = data_df.loc[:, model_1_columns].dropna().drop_duplicates()
model_4_df = city_beaches_df.loc[:, model_1_columns].dropna().drop_duplicates()


In [16]:
# Data sets 2 (all beaches) and 5 (city beaches) add the station 18 precipitation column.
# Rows with missing values and exact duplicate rows are removed.
model_2_columns = [
    "week",
    "station_id",
    "risk_level",
    "5_day_precip1",
    "5_day_precip18",
    "5_day_temp",
    "5_day_temp_max",
    "5_day_temp_min",
]
model_2_df = data_df.loc[:, model_2_columns].dropna().drop_duplicates()
model_5_df = city_beaches_df.loc[:, model_2_columns].dropna().drop_duplicates()

In [17]:
# Data sets 3 (all beaches) and 6 (city beaches) use the columns from all three stations.
# Rows with missing values and exact duplicate rows are removed.
model_3_columns = [
    "week",
    "station_id",
    "risk_level",
    "5_day_precip1",
    "5_day_precip18",
    "5_day_precip54",
    "5_day_temp",
    "5_day_temp_max",
    "5_day_temp_min",
]
model_3_df = data_df.loc[:, model_3_columns].dropna().drop_duplicates()
model_6_df = city_beaches_df.loc[:, model_3_columns].dropna().drop_duplicates()

# Preprocessing

## Split Features and Target

In [18]:
# Split every data set into its target (y: the risk_level column) and its
# features (X: all remaining columns).
# `columns=` replaces the positional axis argument (`drop("risk_level", 1)`), which
# newer pandas releases warn about and pandas 2.0 no longer accepts.
y1 = model_1_df["risk_level"]
X1 = model_1_df.drop(columns="risk_level")

y2 = model_2_df["risk_level"]
X2 = model_2_df.drop(columns="risk_level")

y3 = model_3_df["risk_level"]
X3 = model_3_df.drop(columns="risk_level")

y4 = model_4_df["risk_level"]
X4 = model_4_df.drop(columns="risk_level")

y5 = model_5_df["risk_level"]
X5 = model_5_df.drop(columns="risk_level")

y6 = model_6_df["risk_level"]
X6 = model_6_df.drop(columns="risk_level")

  This is separate from the ipykernel package so we can avoid doing imports until
  
  if __name__ == '__main__':
  if sys.path[0] == '':
  from ipykernel import kernelapp as app


In [19]:
# Draw 10 rows of the X4 features (before encoding and scaling) for feature
# testing in the Flask app.
# A fixed random_state makes the sample identical on every run; the unseeded
# call returned different rows each time the notebook was executed.
feature_testing = X4.sample(10, random_state=42)

In [20]:
# Write the sampled feature rows to disk
feature_testing.to_csv(path_or_buf="feature_testing.csv")

## Process Features

In [21]:
# One-hot encode the categorical feature columns (station_id) of every data set
X1, X2, X3, X4, X5, X6 = (
    pd.get_dummies(features) for features in (X1, X2, X3, X4, X5, X6)
)

In [22]:
# One independent StandardScaler per data set
scale1, scale2, scale3, scale4, scale5, scale6 = (StandardScaler() for _ in range(6))

In [23]:
# Fit each scaler on its own data set and replace the features with the scaled array.
# NOTE(review): the scalers are fitted on the full data sets before the train/test
# split further down, so test rows contribute to the scaling statistics — confirm
# this is intended.
X1, X2, X3 = (
    scaler.fit_transform(features)
    for scaler, features in ((scale1, X1), (scale2, X2), (scale3, X3))
)

X4 = scale4.fit_transform(X4)



In [24]:
# Persist the fitted data-set-4 scaler to disk
dump(value=scale4, filename="scale4.joblib")

['scale4.joblib']

In [25]:
# Fit the remaining scalers and scale data sets 5 and 6.
# NOTE(review): as with the other data sets, the scalers see all rows before the
# train/test split — confirm this is intended.
X5, X6 = scale5.fit_transform(X5), scale6.fit_transform(X6)



## Process Target

In [26]:
# Initialize the LabelEncoder that is fitted on the targets, applied to every
# data set and saved to disk in the cells below
le = LabelEncoder()


In [27]:
# Learn the label mapping from y1 only, then apply that same mapping to every target.
# NOTE(review): this relies on y2-y6 containing no label that is absent from y1.
le.fit(y1)

y1, y2, y3, y4, y5, y6 = (
    le.transform(target) for target in (y1, y2, y3, y4, y5, y6)
)

In [28]:
# Persist the fitted LabelEncoder so the Flask app can reuse the same mapping
dump(value=le, filename="labelencoder.joblib")

['labelencoder.joblib']

In [29]:
# Original label names for encoded classes 0 and 1, used as target_names in the
# classification reports
encoded_classes = [0, 1]
target_labels = le.inverse_transform(encoded_classes)

## Train Test Split

In [30]:
# Stratified train/test split for each data set, seeded for repeatability
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, stratify=y1, random_state=42
)
X2_train, X2_test, y2_train, y2_test = train_test_split(
    X2, y2, stratify=y2, random_state=42
)
X3_train, X3_test, y3_train, y3_test = train_test_split(
    X3, y3, stratify=y3, random_state=42
)
X4_train, X4_test, y4_train, y4_test = train_test_split(
    X4, y4, stratify=y4, random_state=42
)
X5_train, X5_test, y5_train, y5_test = train_test_split(
    X5, y5, stratify=y5, random_state=42
)
X6_train, X6_test, y6_train, y6_test = train_test_split(
    X6, y6, stratify=y6, random_state=42
)

# AdaBoost

In [31]:
# Shared hyper-parameters for all six AdaBoost models.
# random_state is set so that repeated runs train identical models; the original
# constructors left it unset even though the train/test split is seeded with 42.
ada_params = {"learning_rate": 1.0, "n_estimators": 4000, "random_state": 42}

# Models 1-3: all beaches
ada1_model = AdaBoostClassifier(**ada_params)
ada2_model = AdaBoostClassifier(**ada_params)
ada3_model = AdaBoostClassifier(**ada_params)

# Models 4-6: city beaches
ada4_model = AdaBoostClassifier(**ada_params)
ada5_model = AdaBoostClassifier(**ada_params)
ada6_model = AdaBoostClassifier(**ada_params)

In [32]:
# All beaches: train each model on its training split, then predict its test split
y1_hat_test = ada1_model.fit(X1_train, y1_train).predict(X1_test)
y2_hat_test = ada2_model.fit(X2_train, y2_train).predict(X2_test)
y3_hat_test = ada3_model.fit(X3_train, y3_train).predict(X3_test)



In [33]:
# City beaches: train each model on its training split, then predict its test split
y4_hat_test = ada4_model.fit(X4_train, y4_train).predict(X4_test)
y5_hat_test = ada5_model.fit(X5_train, y5_train).predict(X5_test)
y6_hat_test = ada6_model.fit(X6_train, y6_train).predict(X6_test)

In [34]:
# Persist the trained data-set-4 model to disk
dump(value=ada4_model, filename="ada4_model.joblib")

['ada4_model.joblib']

# Evaluate Performance

In [35]:
# Print the balanced accuracy of every model with three decimals.
# The third row passes an extra "\n" argument to print so that a blank line
# separates the all-beaches scores from the city-beaches scores.
score_rows = (
    ("All beaches 1 Balanced Accuracy: ", y1_test, y1_hat_test, ()),
    ("All beaches 2 Balanced Accuracy: ", y2_test, y2_hat_test, ()),
    ("All beaches 3 Balanced Accuracy: ", y3_test, y3_hat_test, ("\n",)),
    ("City beaches 4 Balanced Accuracy: ", y4_test, y4_hat_test, ()),
    ("City beaches 5 Balanced Accuracy: ", y5_test, y5_hat_test, ()),
    ("City beaches 6 Balanced Accuracy: ", y6_test, y6_hat_test, ()),
)
for label, actual, predicted, trailing in score_rows:
    score = round(balanced_accuracy_score(actual, predicted), 3)
    print(label, "{:.3f}".format(score), *trailing)

All beaches 1 Balanced Accuracy:  0.600
All beaches 2 Balanced Accuracy:  0.697
All beaches 3 Balanced Accuracy:  0.695 

City beaches 4 Balanced Accuracy:  0.665
City beaches 5 Balanced Accuracy:  0.753
City beaches 6 Balanced Accuracy:  0.672


## Data Set 1 Evaluation

In [36]:
# Confusion matrix for data set 1 (test targets vs. test predictions)
print("Data Set 1 Confusion Matrix")
cm1 = confusion_matrix(y_true=y1_test, y_pred=y1_hat_test)

# Wrap the matrix in a labelled DataFrame for display
cm1_df = pd.DataFrame(
    data=cm1,
    index=["high risk actual", "low risk actual"],
    columns=["high risk predicted", "low risk predicted"],
)
cm1_df

Data Set 1 Confusion Matrix


Unnamed: 0,high risk predicted,low risk predicted
high risk actual,246,792
low risk actual,154,3996


In [37]:
# Per-class precision, recall and f1 for data set 1
print(classification_report(y_true=y1_test, y_pred=y1_hat_test, target_names=target_labels))

              precision    recall  f1-score   support

   high_risk       0.61      0.24      0.34      1038
    low_risk       0.83      0.96      0.89      4150

    accuracy                           0.82      5188
   macro avg       0.72      0.60      0.62      5188
weighted avg       0.79      0.82      0.78      5188



## Data Set 2 Evaluation

In [38]:
# Confusion matrix for data set 2 (test targets vs. test predictions)
print("Data Set 2 Confusion Matrix")
cm2 = confusion_matrix(y_true=y2_test, y_pred=y2_hat_test)

# Wrap the matrix in a labelled DataFrame for display
cm2_df = pd.DataFrame(
    data=cm2,
    index=["high risk actual", "low risk actual"],
    columns=["high risk predicted", "low risk predicted"],
)
cm2_df

Data Set 2 Confusion Matrix


Unnamed: 0,high risk predicted,low risk predicted
high risk actual,210,178
low risk actual,106,615


In [39]:
# Per-class precision, recall and f1 for data set 2
print(classification_report(y_true=y2_test, y_pred=y2_hat_test, target_names=target_labels))

              precision    recall  f1-score   support

   high_risk       0.66      0.54      0.60       388
    low_risk       0.78      0.85      0.81       721

    accuracy                           0.74      1109
   macro avg       0.72      0.70      0.70      1109
weighted avg       0.74      0.74      0.74      1109



## Data Set 3 Evaluation

In [40]:
# Confusion matrix for data set 3 (test targets vs. test predictions)
print("Data Set 3 Confusion Matrix")
cm3 = confusion_matrix(y_true=y3_test, y_pred=y3_hat_test)

# Wrap the matrix in a labelled DataFrame for display
cm3_df = pd.DataFrame(
    data=cm3,
    index=["high risk actual", "low risk actual"],
    columns=["high risk predicted", "low risk predicted"],
)
cm3_df

Data Set 3 Confusion Matrix


Unnamed: 0,high risk predicted,low risk predicted
high risk actual,141,107
low risk actual,74,339


In [41]:
# Per-class precision, recall and f1 for data set 3
print(classification_report(y_true=y3_test, y_pred=y3_hat_test, target_names=target_labels))

              precision    recall  f1-score   support

   high_risk       0.66      0.57      0.61       248
    low_risk       0.76      0.82      0.79       413

    accuracy                           0.73       661
   macro avg       0.71      0.69      0.70       661
weighted avg       0.72      0.73      0.72       661



## Data Set 4 Evaluation

In [42]:
# Confusion matrix for data set 4 (test targets vs. test predictions)
print("Data Set 4 Confusion Matrix")
cm4 = confusion_matrix(y_true=y4_test, y_pred=y4_hat_test)

# Wrap the matrix in a labelled DataFrame for display
cm4_df = pd.DataFrame(
    data=cm4,
    index=["high risk actual", "low risk actual"],
    columns=["high risk predicted", "low risk predicted"],
)
cm4_df

Data Set 4 Confusion Matrix


Unnamed: 0,high risk predicted,low risk predicted
high risk actual,215,347
low risk actual,96,1698


In [43]:
# Per-class precision, recall and f1 for data set 4
print(classification_report(y_true=y4_test, y_pred=y4_hat_test, target_names=target_labels))

              precision    recall  f1-score   support

   high_risk       0.69      0.38      0.49       562
    low_risk       0.83      0.95      0.88      1794

    accuracy                           0.81      2356
   macro avg       0.76      0.66      0.69      2356
weighted avg       0.80      0.81      0.79      2356



## Data Set 5 Evaluation

In [44]:
# Confusion matrix for data set 5 (test targets vs. test predictions)
print("City beaches 5 Confusion Matrix")
cm5 = confusion_matrix(y_true=y5_test, y_pred=y5_hat_test)

# Wrap the matrix in a labelled DataFrame for display
cm5_df = pd.DataFrame(
    data=cm5,
    index=["high risk actual", "low risk actual"],
    columns=["high risk predicted", "low risk predicted"],
)
cm5_df

City beaches 5 Confusion Matrix


Unnamed: 0,high risk predicted,low risk predicted
high risk actual,131,68
low risk actual,45,252


In [45]:
# Per-class precision, recall and f1 for data set 5
print(classification_report(y_true=y5_test, y_pred=y5_hat_test, target_names=target_labels))

              precision    recall  f1-score   support

   high_risk       0.74      0.66      0.70       199
    low_risk       0.79      0.85      0.82       297

    accuracy                           0.77       496
   macro avg       0.77      0.75      0.76       496
weighted avg       0.77      0.77      0.77       496



## Data Set 6 Evaluation

In [46]:
# Confusion matrix for data set 6 (test targets vs. test predictions)
print("Data Set 6 Confusion Matrix")
cm6 = confusion_matrix(y_true=y6_test, y_pred=y6_hat_test)

# Wrap the matrix in a labelled DataFrame for display
cm6_df = pd.DataFrame(
    data=cm6,
    index=["high risk actual", "low risk actual"],
    columns=["high risk predicted", "low risk predicted"],
)
cm6_df

Data Set 6 Confusion Matrix


Unnamed: 0,high risk predicted,low risk predicted
high risk actual,70,57
low risk actual,36,137


In [47]:
# Per-class precision, recall and f1 for data set 6
print(classification_report(y_true=y6_test, y_pred=y6_hat_test, target_names=target_labels))

              precision    recall  f1-score   support

   high_risk       0.66      0.55      0.60       127
    low_risk       0.71      0.79      0.75       173

    accuracy                           0.69       300
   macro avg       0.68      0.67      0.67       300
weighted avg       0.69      0.69      0.68       300



# Test model load and verify it predicts correctly

In [48]:
# Reload the data-set-4 model from the file written earlier in this notebook
clf = load(filename="ada4_model.joblib")

In [49]:
# Predict with the reloaded model on X4_test, the same data used for model evaluation
y_load = clf.predict(X=X4_test)

In [50]:
# Compare the in-memory model's predictions with the reloaded model's predictions;
# a balanced accuracy of 1.0 indicates a perfect match
balanced_accuracy_score(y_true=y4_hat_test, y_pred=y_load)

1.0