In [1]:
#Database connection
from sqlalchemy import create_engine
# You must have psycopg2 installed in your chosen environment
#!pip install psycopg2-binary
import psycopg2
from getpass import getpass
from urllib.parse import quote

#Formatting data to remove nulls
import pandas as pd

#Machine learning
# The data appears to be imbalanced after running the counter
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

  """)


In [2]:
# Settings for the RDS database. The password is requested interactively
# instead of being written into the notebook.
password = getpass('Enter database password')

# NOTE(review): `mode`, `jdbc_url` and `config` look like JDBC-style reader
# settings (note the Java driver class name); nothing visible in this notebook
# reads them -- confirm whether they are still needed.
mode, jdbc_url = "read", ""
config = dict(
    user="postgres",
    password=password,
    driver="org.postgresql.Driver",
)

Enter database password········


In [3]:
# Establish connection to database.
# The password is percent-encoded before it is embedded in the URL: characters
# such as "@", ":", "/" or "%" would otherwise be read as URL syntax, and the
# connection would fail or point at the wrong host.
escaped_password = quote(password, safe="")
db_string = f"postgresql://postgres:{escaped_password}@127.0.0.1:5432/shark_bite_db"
engine = create_engine(db_string)
# The connection is left open on purpose: a later cell reads a table through `conn`.
conn = engine.connect()

In [5]:
# Read the whole `completedata` table into a DataFrame over the open connection.
shark_bite_df = pd.read_sql_table(table_name='completedata', con=conn)

In [6]:
# Remove the identifier column, then every row containing a null, then the
# `year` and `day` columns. The order matters: nulls are filtered while `year`
# and `day` are still present, so a missing value in either of them also
# removes the row. The index is not reset afterwards.
shark_bite_df = (
    shark_bite_df
    .drop(columns=["case_number"])
    .dropna(axis=0, how="any")
    .drop(columns=["year", "day"])
)

In [7]:
# Map month numbers to three-letter month names in the `month` column.
# The lookup keys are floats (1.0 .. 12.0) -- presumably matching how the
# column is stored; verify against the table schema.
month_abbreviations = (
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
)
month_dict = {
    float(month_number): abbreviation
    for month_number, abbreviation in enumerate(month_abbreviations, start=1)
}
# The nested mapping restricts the replacement to the `month` column only.
shark_bite_df = shark_bite_df.replace({"month": month_dict})

In [8]:
# Encoding feature columns: one-hot encode each categorical feature.
categorical_columns = [
    "month",
    "country",
    "activity",
    "species",
    "type",
    "people_involved",
    "sex",
]
shark_bite_encoded = pd.get_dummies(shark_bite_df, columns=categorical_columns)

# Encoding target column
# Not fatal (N): 0
# Fatal (Y): 1
# The replacement is applied frame-wide, so any other column holding the
# literal values 'N' or 'Y' is recoded as well.
shark_bite_encoded = shark_bite_encoded.replace({'N': 0, 'Y': 1})
# Cast the target explicitly instead of relying on replace() to downcast the
# object column to integers (pandas has deprecated that implicit downcast).
# This also fails fast if `fatal` holds anything other than N / Y.
shark_bite_encoded['fatal'] = shark_bite_encoded['fatal'].astype('int64')

# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)

# Feature matrix: everything except the target column.
shark_bite_features = shark_bite_encoded.drop(columns='fatal')

In [9]:
# Standardise every feature column (zero mean, unit variance).
# NOTE(review): the scaler is fitted on the full data set, before the
# train/test split further down -- confirm that this leakage is acceptable.
feature_scaler = StandardScaler()
shark_scaled = feature_scaler.fit_transform(shark_bite_features)


In [10]:
# Reduce the standardised features to their first 10 principal components.
# random_state is pinned to the same seed used for the split and the forest:
# scikit-learn may select a randomized SVD solver for large inputs, and without
# a seed the components could differ from run to run.
pca = PCA(n_components=10, random_state=1)
shark_pca = pca.fit_transform(shark_scaled)


In [17]:
# Creating the features
# `shark_pca` is a plain array, so it is wrapped in a DataFrame here (this
# notebook never defined `shark_pca_df` before using it). The frame reuses the
# encoded data's row index so features and target stay aligned by label --
# rows were dropped earlier without resetting the index.
shark_pca_df = pd.DataFrame(
    shark_pca,
    columns=[f"PC{component}" for component in range(1, shark_pca.shape[1] + 1)],
    index=shark_bite_encoded.index,
)
X = shark_pca_df

# Creating the target
y = shark_bite_encoded['fatal']

In [18]:
# Plain (non-stratified) train/test split using scikit-learn's default
# proportions; the fixed seed keeps the split repeatable.
split_parts = train_test_split(X, y, random_state=1)
X_train, X_test, y_train, y_test = split_parts


In [19]:
# Random forest with 128 trees; the fixed seed makes training repeatable.
rf_model = RandomForestClassifier(
    random_state=1,
    n_estimators=128,
)

In [20]:
# Train the forest on the training split. The call stays the last expression
# so the fitted estimator is echoed as the cell output.
rf_model.fit(X=X_train, y=y_train)

RandomForestClassifier(n_estimators=128, random_state=1)

In [21]:
# Predict the held-out rows, then preview the first 20 predictions next to the
# true labels. The preview frame takes its row index from `y_test`.
rf_predictions = rf_model.predict(X_test)
prediction_preview = pd.DataFrame(
    {"Prediction": rf_predictions, "Actual": y_test}
)
prediction_preview.head(20)

Unnamed: 0,Prediction,Actual
1946,0,0
4350,1,0
1057,0,0
576,0,1
5471,1,1
1570,0,0
4819,0,0
3804,0,0
3845,0,0
2236,0,0


In [15]:
# Calculating the confusion matrix.
# The class order is pinned to [0, 1] so the matrix is always 2x2, even if one
# class is missing from the test labels and the predictions; otherwise the
# labelled frame below would fail to build.
class_labels = [0, 1]
rf_cm = confusion_matrix(y_test, rf_predictions, labels=class_labels)

# Create a DataFrame from the confusion matrix (rows = actual, columns = predicted).
rf_cm_df = pd.DataFrame(
    rf_cm,
    index=[f"Actual {label}" for label in class_labels],
    columns=[f"Predicted {label}" for label in class_labels],
)

# Calculating the accuracy score.
rf_acc_score = accuracy_score(y_test, rf_predictions)

# Displaying results
print("Confusion Matrix")
display(rf_cm_df)
print(f"Accuracy Score : {rf_acc_score}")
print("Classification Report")
print(classification_report(y_test, rf_predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,896,94
Actual 1,147,127


Accuracy Score : 0.8093354430379747
Classification Report
              precision    recall  f1-score   support

           0       0.86      0.91      0.88       990
           1       0.57      0.46      0.51       274

    accuracy                           0.81      1264
   macro avg       0.72      0.68      0.70      1264
weighted avg       0.80      0.81      0.80      1264

