<a href="https://colab.research.google.com/github/david-garza/final_project/blob/ml_refinement/machine_learning/ml_classifier_opt.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Import dependencies
import pandas as pd

# Import scikit-learn modules
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Classifier Models
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Metrics
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix

# Added SQLalchemy
import sqlalchemy as db
from config import password

# Setup Database Connection and Import Data

In [2]:
# Open a connection to the project's PostgreSQL database (AWS RDS host).
# The password comes from the local `config` module; user, host, port and
# database name are embedded in the URL below.
db_string = f"postgresql://postgres1:{password}@final-project-database.crwsgvv9ibw0.us-east-1.rds.amazonaws.com:5432/final_project_db"
engine = db.create_engine(db_string)
# NOTE(review): this connection is never closed in the visible cells.
con = engine.connect()

  """)


## General Data

In [3]:
# Read the joined Galveston table into a DataFrame and preview the first rows
data_df = pd.read_sql_table(table_name="galveston_data_join", con=con)
data_df.head(5)

Unnamed: 0,beach_id,beach_name,start_lat,start_long,end_lat,end_long,waterbody_type,station_id,station_name,bacteria_count,...,date,week,month,year,avg_temp1,max_temp1,min_temp1,precipitation1,precipitation54,precipitation18
0,TX767833,Sea Isle,29.157639,-95.011542,29.125974,-95.062028,Open Coast,GAL005,Terramar Beach,40.0,...,2007-01-22,4,1,2007,52.0,53.0,50.0,0.0,,
1,TX767833,Sea Isle,29.157639,-95.011542,29.125974,-95.062028,Open Coast,GAL005,Terramar Beach,38.0,...,2007-01-22,4,1,2007,52.0,53.0,50.0,0.0,,
2,TX767833,Sea Isle,29.157639,-95.011542,29.125974,-95.062028,Open Coast,GAL007,Sea Isle South,58.0,...,2007-01-22,4,1,2007,52.0,53.0,50.0,0.0,,
3,TX767833,Sea Isle,29.157639,-95.011542,29.125974,-95.062028,Open Coast,GAL007,Sea Isle South,48.0,...,2007-01-22,4,1,2007,52.0,53.0,50.0,0.0,,
4,TX974690,Jamaica Beach,29.182981,-94.969426,29.176498,-94.980493,Open Coast,GAL014,Jamaica Beach South,64.0,...,2007-01-22,4,1,2007,52.0,53.0,50.0,0.0,,


## Load Weather Station Data

In [4]:
# Load the three weather-station tables (stations 1, 18 and 54), in that order
wx1_df, wx18_df, wx54_df = (
    pd.read_sql_table(table_name, con)
    for table_name in ("weather_station1", "weather_station18", "weather_station54")
)


# Data Processing and Feature Engineering

## Compute 5 Day averages and sums for WX data

In [8]:
# Add a trailing 5-row rainfall total for weather stations 18 and 54.
# NOTE(review): rolling(5) counts rows, not calendar days; it is a true 5-day
# total only if each table has one row per day, sorted by date, with no gaps
# -- confirm.
for station_df, station_no in ((wx18_df, "18"), (wx54_df, "54")):
    rainfall = station_df["precipitation" + station_no]
    station_df["5_day_precip" + station_no] = rainfall.rolling(5).sum()

In [9]:
# Station 1: trailing 5-row rainfall total plus 5-row means of the average,
# maximum and minimum temperature columns.
wx1_df["5_day_precip1"] = wx1_df["precipitation1"].rolling(5).sum()
for new_column, source_column in (
    ("5_day_temp", "avg_temp1"),
    ("5_day_temp_max", "max_temp1"),
    ("5_day_temp_min", "min_temp1"),
):
    wx1_df[new_column] = wx1_df[source_column].rolling(5).mean()

In [11]:
# Drop the raw station 1 columns now that the 5-day features exist.
# Use the `columns=` keyword: passing the axis positionally (drop(cols, 1)) is
# deprecated and is rejected by pandas 2.x.
drop_columns = [
    "avg_temp1",
    "max_temp1",
    "min_temp1",
    "precipitation1",
    "snowfall1",
    "snow_depth1",
]
wx1_df = wx1_df.drop(columns=drop_columns)
wx1_df.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,date1,5_day_precip1,5_day_temp,5_day_temp_max,5_day_temp_min
0,1946-08-01,,,,
1,1946-08-02,,,,
2,1946-08-03,,,,
3,1946-08-04,,,,
4,1946-08-05,0.0,,87.6,79.2


In [12]:
# Drop the raw precipitation columns from stations 18 and 54; only the 5-day
# totals are kept. `columns=` replaces the positional axis argument
# (drop(col, 1)), which is deprecated and rejected by pandas 2.x.
wx18_df = wx18_df.drop(columns="precipitation18")
wx54_df = wx54_df.drop(columns="precipitation54")

  
  This is separate from the ipykernel package so we can avoid doing imports until


## Merge 5 Day Data and Drop Redundant Columns

In [13]:
# Left-join the station 1 5-day features onto data_df by sample date
# (data_df.date == wx1_df.date1), keeping every row of data_df.
data_df = pd.merge(
    data_df,
    wx1_df,
    how="left",
    left_on="date",
    right_on="date1",
)
data_df.head()

Unnamed: 0,beach_id,beach_name,start_lat,start_long,end_lat,end_long,waterbody_type,station_id,station_name,bacteria_count,...,max_temp1,min_temp1,precipitation1,precipitation54,precipitation18,date1,5_day_precip1,5_day_temp,5_day_temp_max,5_day_temp_min
0,TX767833,Sea Isle,29.157639,-95.011542,29.125974,-95.062028,Open Coast,GAL005,Terramar Beach,40.0,...,53.0,50.0,0.0,,,2007-01-22,1.31,49.8,54.6,47.0
1,TX767833,Sea Isle,29.157639,-95.011542,29.125974,-95.062028,Open Coast,GAL005,Terramar Beach,38.0,...,53.0,50.0,0.0,,,2007-01-22,1.31,49.8,54.6,47.0
2,TX767833,Sea Isle,29.157639,-95.011542,29.125974,-95.062028,Open Coast,GAL007,Sea Isle South,58.0,...,53.0,50.0,0.0,,,2007-01-22,1.31,49.8,54.6,47.0
3,TX767833,Sea Isle,29.157639,-95.011542,29.125974,-95.062028,Open Coast,GAL007,Sea Isle South,48.0,...,53.0,50.0,0.0,,,2007-01-22,1.31,49.8,54.6,47.0
4,TX974690,Jamaica Beach,29.182981,-94.969426,29.176498,-94.980493,Open Coast,GAL014,Jamaica Beach South,64.0,...,53.0,50.0,0.0,,,2007-01-22,1.31,49.8,54.6,47.0


In [14]:
# Left-join the station 18 and then the station 54 5-day rainfall onto data_df,
# matching the sample date against each station's own date column.
data_df = (
    data_df
    .merge(wx18_df, how="left", left_on="date", right_on="date18")
    .merge(wx54_df, how="left", left_on="date", right_on="date54")
)
data_df.head()

Unnamed: 0,beach_id,beach_name,start_lat,start_long,end_lat,end_long,waterbody_type,station_id,station_name,bacteria_count,...,precipitation18,date1,5_day_precip1,5_day_temp,5_day_temp_max,5_day_temp_min,date18,5_day_precip18,date54,5_day_precip54
0,TX767833,Sea Isle,29.157639,-95.011542,29.125974,-95.062028,Open Coast,GAL005,Terramar Beach,40.0,...,,2007-01-22,1.31,49.8,54.6,47.0,NaT,,NaT,
1,TX767833,Sea Isle,29.157639,-95.011542,29.125974,-95.062028,Open Coast,GAL005,Terramar Beach,38.0,...,,2007-01-22,1.31,49.8,54.6,47.0,NaT,,NaT,
2,TX767833,Sea Isle,29.157639,-95.011542,29.125974,-95.062028,Open Coast,GAL007,Sea Isle South,58.0,...,,2007-01-22,1.31,49.8,54.6,47.0,NaT,,NaT,
3,TX767833,Sea Isle,29.157639,-95.011542,29.125974,-95.062028,Open Coast,GAL007,Sea Isle South,48.0,...,,2007-01-22,1.31,49.8,54.6,47.0,NaT,,NaT,
4,TX974690,Jamaica Beach,29.182981,-94.969426,29.176498,-94.980493,Open Coast,GAL014,Jamaica Beach South,64.0,...,,2007-01-22,1.31,49.8,54.6,47.0,NaT,,NaT,


# Reduce Risk Level From 3 to 2 Levels

In [15]:
# List the distinct risk_level labels present before relabeling
data_df["risk_level"].unique()

array(['medium_risk', 'low_risk', 'high_risk'], dtype=object)

In [16]:
# Collapse three risk levels into two by relabeling medium_risk as high_risk.
# Assign the result back instead of calling replace(..., inplace=True) on the
# column selection: that chained in-place form is not guaranteed to update
# data_df (it does not under pandas Copy-on-Write).
data_df["risk_level"] = data_df["risk_level"].replace("medium_risk", "high_risk")

In [17]:
# Confirm which risk_level labels remain after the replacement
data_df["risk_level"].unique()

array(['high_risk', 'low_risk'], dtype=object)

# Create City Beaches Data subset

In [18]:
# Subset of data_df restricted to the four city beaches (used for datasets 4-6)
city_beach_names = ['61st St.', '45th St.', '25th St.', 'Stewart Beach']
is_city_beach = data_df['beach_name'].isin(city_beach_names)
city_beaches_df = data_df[is_city_beach]
city_beaches_df["beach_name"].unique()

array(['45th St.', 'Stewart Beach', '61st St.', '25th St.'], dtype=object)

# Create Datasets for Models

In [20]:
# Datasets 1 (all beaches) and 4 (city beaches): weather station 1 features only.
# Rows with any missing value and exact duplicate rows are removed.
model_1_columns = [
    "week",
    "station_id",
    "risk_level",
    "5_day_precip1",
    "5_day_temp",
    "5_day_temp_max",
    "5_day_temp_min",
]
model_1_df = data_df.loc[:, model_1_columns].dropna().drop_duplicates()
model_4_df = city_beaches_df.loc[:, model_1_columns].dropna().drop_duplicates()


In [21]:
# Datasets 2 (all beaches) and 5 (city beaches): station 1 features plus the
# station 18 5-day rainfall. Rows with any missing value and exact duplicate
# rows are removed.
model_2_columns = [
    "week",
    "station_id",
    "risk_level",
    "5_day_precip1",
    "5_day_precip18",
    "5_day_temp",
    "5_day_temp_max",
    "5_day_temp_min",
]
model_2_df = data_df.loc[:, model_2_columns].dropna().drop_duplicates()
model_5_df = city_beaches_df.loc[:, model_2_columns].dropna().drop_duplicates()

In [22]:
# Datasets 3 (all beaches) and 6 (city beaches): features from all three
# weather stations. Rows with any missing value and exact duplicate rows are
# removed.
model_3_columns = [
    "week",
    "station_id",
    "risk_level",
    "5_day_precip1",
    "5_day_precip18",
    "5_day_precip54",
    "5_day_temp",
    "5_day_temp_max",
    "5_day_temp_min",
]
model_3_df = data_df.loc[:, model_3_columns].dropna().drop_duplicates()
model_6_df = city_beaches_df.loc[:, model_3_columns].dropna().drop_duplicates()

# Preprocessing Model Datasets

## Split Features and Target

In [23]:
# Split each dataset into its target (risk_level) and its feature columns.
# `columns=` replaces the positional axis argument (drop("risk_level", 1)),
# which is deprecated and rejected by pandas 2.x.
y1 = model_1_df["risk_level"]
X1 = model_1_df.drop(columns="risk_level")

y2 = model_2_df["risk_level"]
X2 = model_2_df.drop(columns="risk_level")

y3 = model_3_df["risk_level"]
X3 = model_3_df.drop(columns="risk_level")

y4 = model_4_df["risk_level"]
X4 = model_4_df.drop(columns="risk_level")

y5 = model_5_df["risk_level"]
X5 = model_5_df.drop(columns="risk_level")

y6 = model_6_df["risk_level"]
X6 = model_6_df.drop(columns="risk_level")

  This is separate from the ipykernel package so we can avoid doing imports until
  
  if __name__ == '__main__':
  if sys.path[0] == '':
  from ipykernel import kernelapp as app


## Process Features

In [24]:
# One-hot encode the non-numeric feature columns (station_id) of every dataset
X1, X2, X3, X4, X5, X6 = (
    pd.get_dummies(features) for features in (X1, X2, X3, X4, X5, X6)
)

In [25]:
# Create the StandardScaler instance that is reused for every feature matrix
scale = StandardScaler()

In [26]:
# Standardize each feature matrix; the shared scaler is refit on every dataset,
# so afterwards it holds the statistics of X6.
# NOTE(review): the scaler is fit on the full matrices before the train/test
# split further down, so test rows influence the scaling statistics.
X1, X2, X3, X4, X5, X6 = (
    scale.fit_transform(features) for features in (X1, X2, X3, X4, X5, X6)
)



## Process Target

In [27]:
# Create the LabelEncoder used to turn the risk_level strings into integer codes
le = LabelEncoder()

In [28]:
# Learn the label -> integer mapping from y1, then apply that same mapping to
# every target vector so the codes agree across datasets.
le.fit(y1)

y1, y2, y3, y4, y5, y6 = (
    le.transform(labels) for labels in (y1, y2, y3, y4, y5, y6)
)

In [29]:
# Show which label strings the encoder maps to the integer codes 0 and 1
le.inverse_transform([0,1])

array(['high_risk', 'low_risk'], dtype=object)

## Train Test Split

In [30]:
def _split_stratified(features, labels):
    """Split features/labels with a fixed seed, stratifying on the labels."""
    return train_test_split(features, labels, random_state=42, stratify=labels)


# One stratified train/test split per dataset
X1_train, X1_test, y1_train, y1_test = _split_stratified(X1, y1)
X2_train, X2_test, y2_train, y2_test = _split_stratified(X2, y2)
X3_train, X3_test, y3_train, y3_test = _split_stratified(X3, y3)
X4_train, X4_test, y4_train, y4_test = _split_stratified(X4, y4)
X5_train, X5_test, y5_train, y5_test = _split_stratified(X5, y5)
X6_train, X6_test, y6_train, y6_test = _split_stratified(X6, y6)

# Linear SVC Model

In [31]:
# Linear support-vector classifier; dual=False asks the solver for the primal
# formulation. The same instance is refit on each dataset in the cells below.
lsvc_model = LinearSVC(dual=False)

In [32]:
# Train on dataset 1 and predict its held-out split (fit returns the estimator,
# so predict can be chained onto it)
y1_hat_test = lsvc_model.fit(X1_train, y1_train).predict(X1_test)

In [33]:
# Refit on dataset 2 and predict its held-out split
y2_hat_test = lsvc_model.fit(X2_train, y2_train).predict(X2_test)

In [34]:
# Refit on dataset 3 and predict its held-out split
y3_hat_test = lsvc_model.fit(X3_train, y3_train).predict(X3_test)

In [35]:
# Refit on each city-beach dataset (4-6) in turn and predict its held-out split
y4_hat_test = lsvc_model.fit(X4_train, y4_train).predict(X4_test)
y5_hat_test = lsvc_model.fit(X5_train, y5_train).predict(X5_test)
y6_hat_test = lsvc_model.fit(X6_train, y6_train).predict(X6_test)

# Evaluate Performance

In [36]:
# Balanced accuracy of the LinearSVC predictions, one line per dataset (1-6)
for y_true, y_pred in (
    (y1_test, y1_hat_test),
    (y2_test, y2_hat_test),
    (y3_test, y3_hat_test),
    (y4_test, y4_hat_test),
    (y5_test, y5_hat_test),
    (y6_test, y6_hat_test),
):
    print(balanced_accuracy_score(y_true, y_pred))

0.5124015135687258
0.5339251755151064
0.5229291962821214
0.5118227226381334
0.5441432752990542
0.5875699786081653


# KNeighborsClassifier Model

In [37]:
# K-nearest-neighbours classifier with the library's default settings; the same
# instance is refit on each dataset in the cells below.
knc_model = KNeighborsClassifier()

In [38]:
# Train on dataset 1 and predict its held-out split (fit returns the estimator,
# so predict can be chained onto it)
y1_hat_test = knc_model.fit(X1_train, y1_train).predict(X1_test)

In [39]:
# Refit on dataset 2 and predict its held-out split
y2_hat_test = knc_model.fit(X2_train, y2_train).predict(X2_test)

In [40]:
# Refit on dataset 3 and predict its held-out split
y3_hat_test = knc_model.fit(X3_train, y3_train).predict(X3_test)

In [41]:
# Refit on each city-beach dataset (4-6) in turn and predict its held-out split
y4_hat_test = knc_model.fit(X4_train, y4_train).predict(X4_test)
y5_hat_test = knc_model.fit(X5_train, y5_train).predict(X5_test)
y6_hat_test = knc_model.fit(X6_train, y6_train).predict(X6_test)

# Evaluate Performance

In [42]:
# Balanced accuracy of the KNN predictions, one line per dataset (1-6)
for y_true, y_pred in (
    (y1_test, y1_hat_test),
    (y2_test, y2_hat_test),
    (y3_test, y3_hat_test),
    (y4_test, y4_hat_test),
    (y5_test, y5_hat_test),
    (y6_test, y6_hat_test),
):
    print(balanced_accuracy_score(y_true, y_pred))

0.5209329804768206
0.5127596980139268
0.457134070139811
0.5123672423301079
0.5319019339119841
0.5260798325064858


# SVC Model

In [43]:
# Support-vector classifier with the library's default settings; the same
# instance is refit on each dataset in the next cell.
svc_model = SVC()

In [44]:
# Refit the SVC on each dataset in turn and predict that dataset's held-out
# split (fit returns the estimator, so predict can be chained onto it)
y1_hat_test = svc_model.fit(X1_train, y1_train).predict(X1_test)
y2_hat_test = svc_model.fit(X2_train, y2_train).predict(X2_test)
y3_hat_test = svc_model.fit(X3_train, y3_train).predict(X3_test)
y4_hat_test = svc_model.fit(X4_train, y4_train).predict(X4_test)
y5_hat_test = svc_model.fit(X5_train, y5_train).predict(X5_test)
y6_hat_test = svc_model.fit(X6_train, y6_train).predict(X6_test)

In [45]:
# Balanced accuracy of the SVC predictions, one line per dataset (1-6)
for y_true, y_pred in (
    (y1_test, y1_hat_test),
    (y2_test, y2_hat_test),
    (y3_test, y3_hat_test),
    (y4_test, y4_hat_test),
    (y5_test, y5_hat_test),
    (y6_test, y6_hat_test),
):
    print(balanced_accuracy_score(y_true, y_pred))

0.5010834087796272
0.5244023192301643
0.5019673123486683
0.49921743891262693
0.5282895961287921
0.5778526239133404


# Random Forest

In [52]:
# Random forest with 1000 trees. random_state is fixed (same seed as the
# train/test split) so the bootstrap samples and feature choices, and therefore
# the reported scores, are reproducible across runs.
rfc_model = RandomForestClassifier(n_estimators=1000, random_state=42)

In [53]:
# Refit the random forest on each dataset in turn and predict that dataset's
# held-out split (fit returns the estimator, so predict can be chained onto it)
y1_hat_test = rfc_model.fit(X1_train, y1_train).predict(X1_test)
y2_hat_test = rfc_model.fit(X2_train, y2_train).predict(X2_test)
y3_hat_test = rfc_model.fit(X3_train, y3_train).predict(X3_test)
y4_hat_test = rfc_model.fit(X4_train, y4_train).predict(X4_test)
y5_hat_test = rfc_model.fit(X5_train, y5_train).predict(X5_test)
y6_hat_test = rfc_model.fit(X6_train, y6_train).predict(X6_test)

In [54]:
# Balanced accuracy of the random forest predictions, one line per dataset (1-6)
for y_true, y_pred in (
    (y1_test, y1_hat_test),
    (y2_test, y2_hat_test),
    (y3_test, y3_hat_test),
    (y4_test, y4_hat_test),
    (y5_test, y5_hat_test),
    (y6_test, y6_hat_test),
):
    print(balanced_accuracy_score(y_true, y_pred))

0.6331652622048889
0.6090052475799648
0.5764810981801141
0.651232657692506
0.6777828536622507
0.5489053752673979


# AdaBoost

In [55]:
# AdaBoost with 4000 boosting rounds at learning rate 1.0. random_state is
# fixed (same seed as the train/test split) so repeated runs give the same
# scores.
ada_model = AdaBoostClassifier(learning_rate=1.0, n_estimators=4000, random_state=42)

In [56]:
# Refit AdaBoost on each dataset in turn and predict that dataset's held-out
# split (fit returns the estimator, so predict can be chained onto it)
y1_hat_test = ada_model.fit(X1_train, y1_train).predict(X1_test)
y2_hat_test = ada_model.fit(X2_train, y2_train).predict(X2_test)
y3_hat_test = ada_model.fit(X3_train, y3_train).predict(X3_test)
y4_hat_test = ada_model.fit(X4_train, y4_train).predict(X4_test)
y5_hat_test = ada_model.fit(X5_train, y5_train).predict(X5_test)
y6_hat_test = ada_model.fit(X6_train, y6_train).predict(X6_test)

In [57]:
# Balanced accuracy of the AdaBoost predictions, one line per dataset (1-6)
for y_true, y_pred in (
    (y1_test, y1_hat_test),
    (y2_test, y2_hat_test),
    (y3_test, y3_hat_test),
    (y4_test, y4_hat_test),
    (y5_test, y5_hat_test),
    (y6_test, y6_hat_test),
):
    print(balanced_accuracy_score(y_true, y_pred))

0.5999428929591197
0.6971095414444428
0.6946858158244162
0.6645252859472263
0.7533881528856403
0.6715443084065359


# Gradient Boosting Classifier

In [58]:
# Gradient boosting with 10000 boosting stages. random_state is fixed (same
# seed as the train/test split) so repeated runs give the same scores.
gbc_model = GradientBoostingClassifier(n_estimators=10000, random_state=42)

In [59]:
# Refit gradient boosting on each dataset in turn and predict that dataset's
# held-out split (fit returns the estimator, so predict can be chained onto it)
y1_hat_test = gbc_model.fit(X1_train, y1_train).predict(X1_test)
y2_hat_test = gbc_model.fit(X2_train, y2_train).predict(X2_test)
y3_hat_test = gbc_model.fit(X3_train, y3_train).predict(X3_test)
y4_hat_test = gbc_model.fit(X4_train, y4_train).predict(X4_test)
y5_hat_test = gbc_model.fit(X5_train, y5_train).predict(X5_test)
y6_hat_test = gbc_model.fit(X6_train, y6_train).predict(X6_test)

In [60]:
# Balanced accuracy of the gradient boosting predictions, one line per dataset (1-6)
for y_true, y_pred in (
    (y1_test, y1_hat_test),
    (y2_test, y2_hat_test),
    (y3_test, y3_hat_test),
    (y4_test, y4_hat_test),
    (y5_test, y5_hat_test),
    (y6_test, y6_hat_test),
):
    print(balanced_accuracy_score(y_true, y_pred))

0.656644845277062
0.5929622374422695
0.5438129735218308
0.6597535478086306
0.643334517706377
0.5525920531609849
