In [12]:
# Import dependencies
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Added SQLalchemy
import sqlalchemy as db
from getpass import getpass

# Setup Database Connection

In [13]:
# create the connection to the PostgreSQL database.
password = getpass('Enter database password')
db_string = f"postgresql://postgres1:{password}@final-project-database.crwsgvv9ibw0.us-east-1.rds.amazonaws.com:5432/final_project_db"
con = db.create_engine(db_string).connect()

Enter database password··········


# Import Database Table

In [14]:
# Read beach attributes and create dataframe
beach_attributes_df = pd.read_sql_table("beach_attributes",con)
beach_attributes_df.head()

Unnamed: 0,beach_id,beach_name,tier,start_lat,start_long,end_lat,end_long,waterbody_type
0,TX710697,25th St.,1,29.298146,-94.777565,29.284662,-94.794776,Open Coast
1,TX214299,45th St.,1,29.284667,-94.79477,29.271917,-94.815865,Open Coast
2,TX486021,61st St.,1,29.271922,-94.815859,29.264091,-94.830244,Open Coast
3,TX327206,Appfel Park,1,29.337451,-94.73301,29.32425,-94.739129,Open Coast
4,TX940700,Caplen,2,29.503046,-94.510477,29.494188,-94.532478,Open Coast


In [15]:
water_quality_df = pd.read_sql_table("water_quality",con)
water_quality_df.head()

Unnamed: 0,date,year,beach_id,beach_name,station_id,station_name,identifier,start_time,zone_code,bacteria_count,result_measure_unit,result_analytical_method_identifier,result_analytical_method_name
0,2021-12-27,2021,TX974690,Jamaica Beach,GAL014,Jamaica Beach South,GAL014_20211227_90752,09:00:00 AM,CDT,7.94,MPN/100ml,19299,ENTEROLERT
1,2021-12-27,2021,TX974690,Jamaica Beach,GAL014,Jamaica Beach South,GAL014_20211227_90752,09:00:00 AM,CDT,20.0,MPN/100ml,19299,ENTEROLERT
2,2021-12-15,2021,TX974690,Jamaica Beach,GAL014,Jamaica Beach South,GAL014_20211215_90599,09:00:00 AM,CDT,6.3,MPN/100ml,19299,ENTEROLERT
3,2021-12-15,2021,TX974690,Jamaica Beach,GAL014,Jamaica Beach South,GAL014_20211215_90599,09:00:00 AM,CDT,5.0,MPN/100ml,19299,ENTEROLERT
4,2021-12-01,2021,TX974690,Jamaica Beach,GAL014,Jamaica Beach South,GAL014_20211201_90435,09:00:00 AM,CDT,7.07,MPN/100ml,19299,ENTEROLERT


In [16]:
weather_station1_df = pd.read_sql_table("weather_station1",con)
weather_station1_df

Unnamed: 0,date1,avg_temp1,max_temp1,min_temp1,precipitation1,snowfall1,snow_depth1
0,1946-08-01,,86.0,77.0,0.00,0.0,0.0
1,1946-08-02,,80.0,78.0,0.00,0.0,0.0
2,1946-08-03,,90.0,80.0,0.00,0.0,0.0
3,1946-08-04,,91.0,81.0,0.00,0.0,0.0
4,1946-08-05,,91.0,80.0,0.00,0.0,0.0
...,...,...,...,...,...,...,...
16978,2022-05-31,86.0,91.0,83.0,0.00,0.0,0.0
16979,2022-06-01,86.0,90.0,81.0,0.00,0.0,0.0
16980,2022-06-02,85.0,93.0,77.0,0.59,0.0,0.0
16981,2022-06-03,83.0,90.0,78.0,0.00,0.0,0.0


# Merge Water Quality and Weather Station 1 on Date

In [17]:
# Attempt to merge water quality table and weather data
bacteria_wx_df = water_quality_df.merge(weather_station1_df,how="left",left_on="date",right_on="date1")
bacteria_wx_df.head()

Unnamed: 0,date,year,beach_id,beach_name,station_id,station_name,identifier,start_time,zone_code,bacteria_count,result_measure_unit,result_analytical_method_identifier,result_analytical_method_name,date1,avg_temp1,max_temp1,min_temp1,precipitation1,snowfall1,snow_depth1
0,2021-12-27,2021,TX974690,Jamaica Beach,GAL014,Jamaica Beach South,GAL014_20211227_90752,09:00:00 AM,CDT,7.94,MPN/100ml,19299,ENTEROLERT,2021-12-27,73.0,77.0,71.0,0.0,0.0,0.0
1,2021-12-27,2021,TX974690,Jamaica Beach,GAL014,Jamaica Beach South,GAL014_20211227_90752,09:00:00 AM,CDT,20.0,MPN/100ml,19299,ENTEROLERT,2021-12-27,73.0,77.0,71.0,0.0,0.0,0.0
2,2021-12-15,2021,TX974690,Jamaica Beach,GAL014,Jamaica Beach South,GAL014_20211215_90599,09:00:00 AM,CDT,6.3,MPN/100ml,19299,ENTEROLERT,2021-12-15,74.0,79.0,70.0,0.0,0.0,0.0
3,2021-12-15,2021,TX974690,Jamaica Beach,GAL014,Jamaica Beach South,GAL014_20211215_90599,09:00:00 AM,CDT,5.0,MPN/100ml,19299,ENTEROLERT,2021-12-15,74.0,79.0,70.0,0.0,0.0,0.0
4,2021-12-01,2021,TX974690,Jamaica Beach,GAL014,Jamaica Beach South,GAL014_20211201_90435,09:00:00 AM,CDT,7.07,MPN/100ml,19299,ENTEROLERT,2021-12-01,67.0,77.0,59.0,0.0,0.0,0.0


# Preprocessing 
## View Data Types


In [None]:
df.dtypes

date              object
beach_name        object
beach_type        object
bacterica_cts      int64
avg_temp           int64
max_temp           int64
min_temp           int64
precipation      float64
dtype: object

# Drop data from the data set.

In [None]:
df.drop("date", 1,inplace=True)

  """Entry point for launching an IPython kernel.


In [None]:
df.head()

Unnamed: 0,beach_name,beach_type,bacterica_cts,avg_temp,max_temp,min_temp,precipation
0,pirate's beach,lagoon,29,59,84,40,0.457607
1,pirate's beach,lagoon,149,64,73,30,0.466136
2,pirate's beach,lagoon,130,63,66,69,0.664975
3,pirate's beach,lagoon,60,48,67,54,0.794012
4,pirate's beach,lagoon,64,46,63,61,1.455917


# Convert Beach Name and Beach type to integers

In [None]:
df_encoded=pd.get_dummies(df,columns=["beach_name","beach_type"])

In [None]:
df_encoded.head()

Unnamed: 0,bacterica_cts,avg_temp,max_temp,min_temp,precipation,beach_name_25th street beach,beach_name_pirate's beach,beach_name_seawall beach,beach_name_west beach,beach_type_lagoon,beach_type_open ocean
0,29,59,84,40,0.457607,0,1,0,0,1,0
1,149,64,73,30,0.466136,0,1,0,0,1,0
2,130,63,66,69,0.664975,0,1,0,0,1,0
3,60,48,67,54,0.794012,0,1,0,0,1,0
4,64,46,63,61,1.455917,0,1,0,0,1,0


In [None]:
y = df_encoded["bacterica_cts"]

In [None]:
X= df_encoded.drop("bacterica_cts",1)

  """Entry point for launching an IPython kernel.


# scale the data using scale.fit_transform
## Scale the data so that all the integers are of similar size. 

In [None]:
scale=StandardScaler()

In [None]:
X_scaled=scale.fit_transform(X)

In [None]:
X_scaled

array([[-0.05375312,  1.54730147, -0.97555623, -0.54584478, -0.57735027,
         1.73205081, -0.57735027, -0.57735027,  1.73205081, -1.73205081],
       [ 0.34738955,  0.02491183, -1.71798867, -0.52550403, -0.57735027,
         1.73205081, -0.57735027, -0.57735027,  1.73205081, -1.73205081],
       [ 0.26716102, -0.94388158,  1.17749785, -0.05128782, -0.57735027,
         1.73205081, -0.57735027, -0.57735027,  1.73205081, -1.73205081],
       [-0.93626699, -0.80548252,  0.06384919,  0.25645423, -0.57735027,
         1.73205081, -0.57735027, -0.57735027,  1.73205081, -1.73205081],
       [-1.09672406, -1.35907875,  0.5835519 ,  1.83504121, -0.57735027,
         1.73205081, -0.57735027, -0.57735027,  1.73205081, -1.73205081],
       [-1.49786673, -1.63587687,  0.43506541,  0.00770414, -0.57735027,
         1.73205081, -0.57735027, -0.57735027,  1.73205081, -1.73205081],
       [-0.93626699, -0.66708346,  1.47447083,  1.93740631, -0.57735027,
         1.73205081, -0.57735027, -0.57735027

# Splitting the data into training and testing Data Sets

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [None]:
X_train.shape


(75, 10)

In [None]:
X_test.shape

(25, 10)

# Modeling
## Fit the Model

In [None]:
lr=LinearRegression()

In [None]:
Mock_model=lr.fit(X_train,y_train)

# Making Predictions Using the Testing Data

In [None]:
predictions = Mock_model.predict(X_test)

# Evaluate the Model using score
## At this time our Mock_model does not show any corrilation because it is a randomized data set.


In [None]:
Mock_model.score(X_test,y_test)

-0.15675499002521587