<a href="https://colab.research.google.com/github/david-garza/final_project/blob/ml_refinement/machine_learning/ml_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [20]:
# Import dependencies
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# to import log function to transform y variable
import numpy as np

# Added SQLalchemy
import sqlalchemy as db
from config import password

# Setup Database Connection

In [21]:
# Open a connection to the project's PostgreSQL database (hosted on AWS RDS).
# The password comes from the local, untracked config module.
db_string = f"postgresql://postgres1:{password}@final-project-database.crwsgvv9ibw0.us-east-1.rds.amazonaws.com:5432/final_project_db"
engine = db.create_engine(db_string)
con = engine.connect()

# Import Database Table

In [22]:
# Load the full bacteria/weather table into a DataFrame and preview the first rows.
table_name = "galveston_bacteria_data"
data_df = pd.read_sql_table(table_name, con=con)
data_df.head()

Unnamed: 0,beach_id,beach_name,start_lat,start_long,end_lat,end_long,waterbody_type,station_id,station_name,bacteria_count,date1,avg_temp1,max_temp1,min_temp1,precipitation1,precipitation54,precipitation18
0,TX767833,Sea Isle,29.157639,-95.011542,29.125974,-95.062028,Open Coast,GAL005,Terramar Beach,40.0,2007-01-22,52.0,53.0,50.0,0.0,,
1,TX767833,Sea Isle,29.157639,-95.011542,29.125974,-95.062028,Open Coast,GAL005,Terramar Beach,38.0,2007-01-22,52.0,53.0,50.0,0.0,,
2,TX767833,Sea Isle,29.157639,-95.011542,29.125974,-95.062028,Open Coast,GAL007,Sea Isle South,58.0,2007-01-22,52.0,53.0,50.0,0.0,,
3,TX767833,Sea Isle,29.157639,-95.011542,29.125974,-95.062028,Open Coast,GAL007,Sea Isle South,48.0,2007-01-22,52.0,53.0,50.0,0.0,,
4,TX974690,Jamaica Beach,29.182981,-94.969426,29.176498,-94.980493,Open Coast,GAL014,Jamaica Beach South,64.0,2007-01-22,52.0,53.0,50.0,0.0,,


In [23]:
# Keep only the target (bacteria count) and the weather station 1 measurements.
columns = [
    "bacteria_count",
    "avg_temp1",
    "max_temp1",
    "min_temp1",
    "precipitation1",
]
basic_df = data_df.loc[:, columns]
basic_df.head()

Unnamed: 0,bacteria_count,avg_temp1,max_temp1,min_temp1,precipitation1
0,40.0,52.0,53.0,50.0,0.0
1,38.0,52.0,53.0,50.0,0.0
2,58.0,52.0,53.0,50.0,0.0
3,48.0,52.0,53.0,50.0,0.0
4,64.0,52.0,53.0,50.0,0.0


In [24]:
# Row/column count of the selected subset, for comparison after missing values are handled.
basic_df.shape

(29743, 5)

In [25]:
# dropna() returns a new DataFrame rather than modifying in place, so the
# result must be assigned back; otherwise rows with missing values remain in
# basic_df and the NaNs propagate into scaling and model fitting.
basic_df = basic_df.dropna()
# Shape after removing incomplete rows (compare with the count above).
basic_df.shape

(29743, 5)

# Preprocessing 
## View Data Types


In [26]:
# Confirm every column is numeric before scaling and modeling.
basic_df.dtypes

bacteria_count    float64
avg_temp1         float64
max_temp1         float64
min_temp1         float64
precipitation1    float64
dtype: object

## Separate Data into Target and Features

In [27]:
# Target vector: the bacteria count to be predicted.
y = basic_df["bacteria_count"]
# Feature matrix: every remaining column. Use the explicit `columns=` keyword;
# passing the axis positionally (drop(label, 1)) is deprecated since
# pandas 1.3 and raises a TypeError in pandas 2.x.
X = basic_df.drop(columns="bacteria_count")
print(y.shape)
print(X.shape)

(29743,)
(29743, 4)


  


## Split Data Into Training and Testing

In [28]:
# Hold out a test set; random_state fixes the shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=42)

## Scale Data

### Standardize the Features with StandardScaler

In [29]:
# Create the StandardScaler instance used to standardize the feature columns
scale=StandardScaler()

In [30]:
# Fit the scaler on the training features only, so the test set does not
# influence the scaling statistics
scale.fit(X_train)



StandardScaler()

In [31]:
# Apply the training-fitted scaling to both the training and testing feature data
X_train_scale = scale.transform(X_train)
X_test_scale = scale.transform(X_test)



In [32]:
# Convert the scaled arrays back to DataFrames to get the feature names back.
# Also re-attach the original row index: without it the frames receive a fresh
# 0..n-1 RangeIndex and no longer line up by label with y_train / y_test,
# which would silently misalign any later pandas join or arithmetic.
X_train_scale_df = pd.DataFrame(X_train_scale, columns=X.columns, index=X_train.index)
X_test_scale_df = pd.DataFrame(X_test_scale, columns=X.columns, index=X_test.index)

### Log-Transform the y Variable

In [33]:
# Inspect the raw training target; the recorded output shows a strong right skew
# (median 10 vs. max 24200), which motivates the log transform that follows.
y_train.describe()

count    22307.000000
mean        53.612111
std        467.640390
min          0.000000
25%          5.000000
50%         10.000000
75%         26.000000
max      24200.000000
Name: bacteria_count, dtype: float64

In [34]:
# log(1 + y) compresses the heavy right tail while keeping zero counts finite.
# np.log1p is the exact inverse of the np.expm1 used later to back-transform
# predictions, and is more accurate than np.log(y + 1) for small values.
y_train_log = np.log1p(y_train)

In [35]:
# Check the training target distribution after the log transform.
y_train_log.describe()

count    22307.000000
mean         2.604428
std          1.237925
min          0.000000
25%          1.791759
50%          2.397895
75%          3.295837
max         10.094149
Name: bacteria_count, dtype: float64

In [36]:
# Apply the same log(1 + y) transform to the test target so it is scored on the
# same scale as the training target; np.log1p pairs exactly with np.expm1.
y_test_log = np.log1p(y_test)

# Modeling

In [37]:
# Set up the linear regression model. No arguments are passed, so the model
# keeps its default settings (the intercept is NOT disabled here). The features
# are standardized but the log-transformed target is not centered, so an
# intercept term is still needed.
lr_model=LinearRegression()

In [38]:
# Fit the lr_model with the scaled features and the log-transformed y-variable.
# NOTE(review): the saved output of this cell is a ValueError; scikit-learn
# raises one when the inputs contain NaN, so confirm that missing values were
# actually removed upstream before this cell runs.
lr_model.fit(X_train_scale_df,y_train_log)



ValueError: ignored

## Model Coefficients

In [None]:
# Return the coefficients of the linear model, labeled by feature name
base_coef = lr_model.coef_
# reshape(1, -1) yields a single row for however many features X has, instead
# of hard-coding the current feature count of 4.
pd.DataFrame(base_coef.reshape(1, -1), columns=X.columns)

Unnamed: 0,avg_temp1,max_temp1,min_temp1,precipitation1
0,0.435268,-0.637812,0.331752,0.342874


## R-squared scores

In [None]:
# Measure the R-squared value of the model on the training data to see whether
# any relationship was detected. The score is computed against the
# log-transformed target, not the raw bacteria counts.
lr_model.score(X_train_scale_df,y_train_log)



0.10778724992425082

In [None]:
# R-squared on the held-out test data (log-transformed target), for comparison
# with the training score.
lr_model.score(X_test_scale_df,y_test_log)



0.1012132278348995

## Residuals

In [None]:
# First predict the y_hat values for both the training and test sets.
# The model predicts on the log(y + 1) scale, so np.expm1 (exp(x) - 1) converts
# the predictions back to the original bacteria-count scale.
y_hat_train = np.expm1(lr_model.predict(X_train_scale_df))
y_hat_test = np.expm1(lr_model.predict(X_test_scale_df))



In [None]:
# Compute the residuals on the original count scale.
# Sign convention here is predicted minus actual: a positive value means the
# model over-predicted, a negative value means it under-predicted.
residual_train = y_hat_train-y_train
residual_test = y_hat_test - y_test

In [None]:
# Import Plotly
import plotly.express as px

In [None]:
# Scatter the training residuals against the actual bacteria counts.
axis_labels = {"x": "Actual", "y": "Residual"}
fig = px.scatter(
    x=y_train,
    y=residual_train,
    labels=axis_labels,
    title="Residuals of Training Data",
)
fig.show()

In [None]:
# Scatter the testing residuals against the actual bacteria counts.
axis_labels = {"x": "Actual", "y": "Residual"}
fig = px.scatter(
    x=y_test,
    y=residual_test,
    labels=axis_labels,
    title="Residuals of Testing Data",
)
fig.show()