In [None]:
#Database connection
from sqlalchemy import create_engine
# You must have psycopg2 installed in your chosen environment
#!pip install psycopg2-binary
import psycopg2
from getpass import getpass

#Formatting data to remove nulls
import pandas as pd

#Machine learning
# The data appears to be imbalanced after running the counter
from imblearn.ensemble import EasyEnsembleClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.combine import SMOTEENN
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.metrics import balanced_accuracy_score
from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report



In [None]:
# Configure settings for RDS
# The password is prompted for at run time so it is never stored in the notebook.
password = getpass('Enter database password')

# JDBC-style settings. None of these three names is read again by the cells
# visible in this notebook; the connection itself is built with SQLAlchemy.
mode = "read"
jdbc_url = ""
config = dict(
    user="postgres",
    password=password,
    driver="org.postgresql.Driver",
)

In [None]:
# Establish connection to database
from urllib.parse import quote

# Percent-encode the password before embedding it in the URL. Without this a
# password containing "@", ":", "/", "+" or a space is mis-parsed (or rejected)
# when SQLAlchemy splits the connection string. safe="" also encodes "/".
encoded_password = quote(password, safe="")
db_string = f"postgresql://postgres:{encoded_password}@127.0.0.1:5432/shark_bite_db"
engine = create_engine(db_string)

# NOTE(review): this connection is never closed in the visible cells; call
# conn.close() (or engine.dispose()) once the table has been loaded.
conn = engine.connect()

In [None]:
# Read the entire 'completedata' table into a DataFrame over the open connection
shark_bite_df = pd.read_sql_table(table_name='completedata', con=conn)


In [None]:
# Discard every row that has a missing value in any column, then remove the
# case_number column. Written as one chain that rebinds the name rather than
# mutating the frame in place.
shark_bite_df = (
    shark_bite_df
    .dropna(axis=0, how='any')
    .drop(columns=["case_number"])
)

In [None]:
# Second null sweep. The previous cleaning cell already removed every row with
# a missing value, so this is expected to leave the frame unchanged.
shark_bite_df = shark_bite_df.dropna(axis=0, how='any')


In [None]:
# Build a separate working frame without the year and day columns;
# shark_bite_df itself is left untouched.
shark_bite_df_copy = shark_bite_df.drop(columns=["year", "day"])
shark_bite_df_copy

In [None]:
# Lookup from month number (float keys 1.0 through 12.0) to a three-letter label
month_abbreviations = [
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
]
month_dict = {
    float(number): label
    for number, label in enumerate(month_abbreviations, start=1)
}

# Swap the numeric codes in the "month" column for their labels
shark_bite_df_copy = shark_bite_df_copy.replace({"month": month_dict})
shark_bite_df_copy

In [None]:
# Encoding feature columns: one-hot encode each categorical feature
categorical_columns = [
    "month",
    "country",
    "activity",
    "species",
    "type",
    "people_involved",
    "sex",
]
shark_bite_encoded = pd.get_dummies(shark_bite_df_copy, columns=categorical_columns)

# Encoding target column
#   Not fatal (N): 0
#   Fatal (Y): 1
# The mapping is applied to the "fatal" column only, so an "N" or "Y" value in
# any other column is not rewritten, and no throwaway dicts are bound to the
# names x / y (y is used for the target vector later on).
# .map(...).astype(int) guarantees an integer target and raises immediately if
# a label other than N / Y is present, instead of passing it on to the models.
fatal_codes = {"N": 0, "Y": 1}
shark_bite_encoded["fatal"] = shark_bite_encoded["fatal"].map(fatal_codes).astype(int)

pd.set_option('display.max_columns', None)
shark_bite_encoded

In [None]:
# Target: the "fatal" column
y = shark_bite_encoded['fatal']

# Features: every column except "fatal"
X = shark_bite_encoded.drop('fatal', axis=1)

In [None]:
# Plain shuffled train/test split. test_size=0.25 and shuffle=True spell out
# the train_test_split defaults; random_state pins the shuffle.
# NOTE(review): the split is not stratified. With an imbalanced target consider
# stratify=y, but confirm first because it changes every downstream score.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    shuffle=True,
    random_state=1,
)


# Easy Ensemble AdaBoost Classifier Model

In [None]:
# Easy Ensemble classifier: 100 estimators, fixed seed for repeatable runs
eec_model = EasyEnsembleClassifier(
    random_state=1,
    n_estimators=100,
)

In [None]:
# Train the Easy Ensemble model on the training split
eec_model.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out test rows
eec_predictions = eec_model.predict(X=X_test)

In [None]:
# Confusion matrix for the Easy Ensemble predictions, wrapped in a labelled
# DataFrame (rows = actual class, columns = predicted class)
eec_cm = confusion_matrix(y_true=y_test, y_pred=eec_predictions)
eec_cm_df = pd.DataFrame(
    data=eec_cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
eec_cm_df

In [None]:
# Plain accuracy of the Easy Ensemble predictions on the test split
eec_acc_score = accuracy_score(y_true=y_test, y_pred=eec_predictions)
print(f"Accuracy Score : {eec_acc_score}")


In [None]:
# Heading followed by the per-class report for the Easy Ensemble model
print(
    "Classification Report",
    classification_report(y_true=y_test, y_pred=eec_predictions),
    sep="\n",
)


# Random Forest Model


In [None]:
# Random forest with 128 trees and a fixed seed for repeatable runs
rf_model = RandomForestClassifier(
    random_state=1,
    n_estimators=128,
)

In [None]:
# Train the random forest on the training split
rf_model.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out test rows with the random forest
rf_predictions = rf_model.predict(X=X_test)

In [None]:
# Calculating the confusion matrix and wrapping it in a labelled DataFrame
# (rows = actual class, columns = predicted class).
rf_cm = confusion_matrix(y_test, rf_predictions)
rf_cm_df = pd.DataFrame(
    rf_cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)

# Calculating the accuracy score.
rf_acc_score = accuracy_score(y_test, rf_predictions)

# Displaying results. The bare `rf_cm_df` / `rf_acc_score` expressions that
# used to sit mid-cell were removed: under default notebook settings only the
# last expression of a cell is rendered, so they produced no output. All
# output is emitted explicitly below.
print("Confusion Matrix")
display(rf_cm_df)
print(f"Accuracy Score : {rf_acc_score}")
print("Classification Report")
print(classification_report(y_test, rf_predictions))


In [None]:
# Pair each feature importance from the fitted forest with its column name
# and list the pairs from most to least important
importances = rf_model.feature_importances_
sorted(zip(importances, X.columns), reverse=True)

# Gradient Boosting Model

In [None]:
# Sweep several learning rates and report accuracy on the training split and
# on the test split (printed as "validation") for each one.
# NOTE(review): max_features=60 is hard-coded; confirm the encoded feature
# matrix has at least that many columns.
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    # Identical hyperparameters on every pass; only the learning rate varies
    classifier = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=100,
        max_depth=3,
        max_features=60,
        random_state=1,
    )
    classifier.fit(X_train, y_train)

    # Score both splits up front, then print the results together
    train_accuracy = classifier.score(X_train, y_train)
    validation_accuracy = classifier.score(X_test, y_test)

    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(train_accuracy))
    print("Accuracy score (validation): {0:.3f}".format(validation_accuracy))
    print()

In [None]:
# Gradient boosting model with learning_rate=0.25 (one of the rates tried in
# the sweep); the other hyperparameters match the sweep's
GB_classifier = GradientBoostingClassifier(
    learning_rate=0.25,
    n_estimators=100,
    max_depth=3,
    max_features=60,
    random_state=1,
)



In [None]:
# Train the gradient boosting model on the training split
GB_classifier.fit(X=X_train, y=y_train)

In [None]:
# Predict the test rows, then preview the first 20 predictions beside the
# actual labels
GB_predictions = GB_classifier.predict(X=X_test)
pd.DataFrame(data={"Prediction": GB_predictions, "Actual": y_test}).head(n=20)


In [None]:
# Accuracy of the gradient boosting predictions on the test split
GB_acc_score = accuracy_score(y_true=y_test, y_pred=GB_predictions)
print(f"Accuracy Score : {GB_acc_score}")

# Confusion matrix as a labelled DataFrame (rows = actual, columns = predicted)
GB_cm = confusion_matrix(y_true=y_test, y_pred=GB_predictions)
GB_cm_df = pd.DataFrame(
    data=GB_cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
display(GB_cm_df)

# Heading followed by the per-class report
print(
    "Classification Report",
    classification_report(y_true=y_test, y_pred=GB_predictions),
    sep="\n",
)


# SMOTEENN

In [None]:
# Class counts for the full target first, then for the training split
for labels in (y, y_train):
    print(Counter(labels))

In [None]:
# Resample the TRAINING split only.
# Fitting SMOTEENN on the full X / y would put the test rows (and synthetic
# points interpolated from them) into the data the model is trained on, and
# the model is later scored on X_test. That is data leakage and inflates the
# reported metrics. The test split must stay untouched by resampling.
smoteenn = SMOTEENN(random_state=0)
X_resampled, y_resampled = smoteenn.fit_resample(X_train, y_train)

# Class counts after resampling
Counter(y_resampled)

In [None]:
# Fit a logistic regression model on the SMOTEENN-resampled data, then predict
# the test split
smoteenn_model = LogisticRegression(max_iter=150, solver='lbfgs', random_state=1)
smoteenn_model.fit(X=X_resampled, y=y_resampled)
smoteenn_predictions = smoteenn_model.predict(X=X_test)

# Confusion matrix as a labelled DataFrame (rows = actual, columns = predicted)
smoteenn_cm = confusion_matrix(y_true=y_test, y_pred=smoteenn_predictions)
smoteenn_cm_df = pd.DataFrame(
    data=smoteenn_cm,
    index=["Actual 0", "Actual 1"],
    columns=["Predicted 0", "Predicted 1"],
)
smoteenn_cm_df

In [None]:
# Balanced accuracy of the logistic regression predictions on the test split.
# The printed label now names the metric actually computed: the other models
# in this notebook report plain accuracy_score, so an unqualified
# "Accuracy Score" here invited a like-for-like comparison of different metrics.
smoteenn_acc_score = balanced_accuracy_score(y_test, smoteenn_predictions)
print(f"Balanced Accuracy Score : {smoteenn_acc_score}")

In [None]:
# Imbalanced-learn classification report for the logistic regression predictions
print(
    classification_report_imbalanced(
        y_true=y_test,
        y_pred=smoteenn_predictions,
    )
)