In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
# https://www.statsmodels.org/stable/index.html
import statsmodels.api as sm

In [4]:
import warnings
# Silence every warning category for the rest of the kernel session.
# NOTE(review): this is a blanket filter, so library warnings raised by the
# modelling cells below are hidden as well — consider narrowing it to specific
# categories once the notebook is stable.
warnings.filterwarnings("ignore")

In [5]:
from functools import partial

In [6]:
from dotenv import load_dotenv

from pathlib import Path

# Location of the optional dotenv file, relative to the notebook's working directory.
env_path = Path("../../.env-live")

if env_path.exists():
    # Load first and report afterwards, so the success message is only printed
    # once the variables have actually been read.
    load_dotenv(dotenv_path=env_path)
    print('envs Loaded')
else:
    # Make the skipped load visible instead of silently continuing.
    print(f'envs NOT loaded: {env_path} not found')
from jrjModelRegistry.jrjModelRegistry import registerAJrjModel

ModuleNotFoundError: No module named 'jrjModelRegistry'

In [None]:
def generalRegressionPredictor(self, transformedData):
    """Return the model's predictions for already-transformed input.

    Args:
        self: Any object exposing a ``predict`` method. In this notebook the
            function is bound to a fitted model with ``functools.partial``.
        transformedData: Input forwarded unchanged to ``self.predict``.

    Returns:
        Whatever ``self.predict(transformedData)`` returns.
    """
    predictions = self.predict(transformedData)
    return predictions

In [None]:
# Read the spam dataset from the workbook that sits next to this notebook.
spamDf = pd.read_excel("./Spam.xlsx")
# Alternative remote copy of the same workbook, kept for reference:
# spamDf = pd.read_excel("https://www.dropbox.com/scl/fi/v24mmhg5hmefmnv99uqsy/Spam.xlsx?rlkey=iq7exnueq84sy7y2b8ud70mp0&dl=1")
spamDf

In [None]:
# Total number of cells (rows * columns) and the (rows, columns) shape of the dataset.
spamDf.size, spamDf.shape

In [None]:
# Three example rows, keyed by feature column, passed as the "sampleData"
# payload when the models are registered further down.
evaluationBinaryCalssifiactionSampleData = dict(
    Recipients=[19, 15, 13],
    Hyperlinks=[1, 1, 11],
    Characters=[47, 58, 88],
)

In [None]:
def evaluationBinaryCalssifiactionTransformer(dataForTransfer = None):
    """Build the model input: the three feature columns plus a constant column.

    Args:
        dataForTransfer: A pandas DataFrame (copied, so the caller's frame is
            left untouched) or anything ``pd.DataFrame`` can be constructed
            from, e.g. a dict of column lists. It must provide the columns
            ``Recipients``, ``Hyperlinks`` and ``Characters``; any other
            columns are ignored.

    Returns:
        The result of ``sm.add_constant(..., has_constant='add')`` applied to
        the three selected feature columns.
    """
    # Imports are local so the function carries its own dependencies.
    import pandas as pd
    import statsmodels.api as sm

    featureColumns = ["Recipients", "Hyperlinks", "Characters"]
    if not isinstance(dataForTransfer, pd.DataFrame):
        frame = pd.DataFrame(dataForTransfer)
    else:
        frame = dataForTransfer.copy()
    selected = frame[featureColumns]
    return sm.add_constant(selected, has_constant='add')

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split the dataset into training and testing sets:
# 30% of the rows are held out, the shuffle is seeded for repeatability, and
# stratifying on 'Spam' keeps the class proportions similar in both parts.
trainSet, testSet = train_test_split(
  spamDf,
  test_size=0.3,
  random_state=1,
  stratify=spamDf['Spam']
)
trainSet.shape, testSet.shape

In [None]:
# Select the predictor columns and the integer-coded targets.
# (The logistic regression itself is fit in the next cell.)
features = ['Recipients', 'Hyperlinks', 'Characters']
xTrain = trainSet[features]
yTrain = trainSet['Spam'].astype(int)
yTest = testSet['Spam'].astype(int)

In [None]:
# Logistic regression of the spam label on the constant-augmented training features.
model1 = sm.Logit(
  yTrain,
  evaluationBinaryCalssifiactionTransformer(xTrain)
)
# Fit with the default optimizer settings and print the coefficient summary.
model1Fit = model1.fit()
print(model1Fit.summary())

In [None]:
# Predicted probabilities for the held-out rows (the transformer keeps only the
# feature columns, so passing the full test frame is fine).
predict1 = model1Fit.predict(evaluationBinaryCalssifiactionTransformer(testSet))
# Rebind rather than write a column in place: testSet is produced by
# train_test_split, and in-place column writes on such frames can trigger
# pandas' SettingWithCopyWarning.
testSet = testSet.assign(predict1=predict1)
# Actual label next to predicted probability, exported for ROC analysis.
sumTable = pd.DataFrame({'A': testSet['Spam'], 'Prob': testSet['predict1']})
sumTable.to_csv("ROC.csv", index=True)


In [None]:
# Inspect the predicted probabilities stored on the test frame.
testSet['predict1']

In [None]:
# Working table for the metric cells: actual label ('A') and predicted probability ('Prob').
sumTable1 = pd.DataFrame({'A': testSet['Spam'], 'Prob': testSet['predict1']})

In [None]:
# Make predictions based on probability threshold of 0.5:
# a row is labelled 1 when its predicted probability is strictly above 0.5.
# assign() rebinds testSet instead of writing a column in place on the frame
# returned by train_test_split (avoids pandas' SettingWithCopyWarning).
testSet = testSet.assign(predictions=(testSet['predict1'] > 0.5).astype(int))
sumTable1['P'] = testSet['predictions']
sumTable1

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, roc_curve

In [None]:
# Calculate accuracy: share of test rows whose thresholded prediction ('P')
# equals the actual label ('A').
accuracy = accuracy_score(sumTable1['A'], sumTable1['P'])
print(f'Accuracy: {accuracy}')

In [None]:
# Calculate recall: share of actual positives that were predicted positive.
recall = recall_score(sumTable1['A'], sumTable1['P'])
print(f'Recall: {recall}')

In [None]:
# Calculate precision: share of predicted positives that are actual positives.
precision = precision_score(sumTable1['A'], sumTable1['P'])
print(f'Precision: {precision}')

In [None]:
# Sensitivity is the recall computed earlier; specificity is the share of
# actual negatives (A == 0) that were also predicted negative (P == 0).
sensitivity = recall
actualNegative = sumTable1['A'] == 0
trueNegative = actualNegative & (sumTable1['P'] == 0)
specificity = trueNegative.sum() / actualNegative.sum()
print(f'Sensitivity: {sensitivity}')
print(f'Specificity: {specificity}')

In [None]:
# Calculate F1 Score (harmonic mean of precision and recall).
# When precision and recall are both 0 the ratio is undefined; report 0.0
# instead of dividing by zero.
precisionPlusRecall = precision + recall
f1Score = 2 * (precision * recall) / precisionPlusRecall if precisionPlusRecall > 0 else 0.0
print(f'F1 Score: {f1Score}')

In [None]:
# Compute the ROC curve points from the predicted probabilities
# (the curve itself is drawn in a later cell; thresholds are discarded).
fpr, tpr, _ = roc_curve(testSet['Spam'], testSet['predict1'])
# Calculate AUC
roc_auc = roc_auc_score(testSet['Spam'], testSet['predict1'])
print(f'AUC: {roc_auc}')


In [None]:
import matplotlib.pyplot as plt

In [None]:

# Draw the ROC curve against the chance diagonal using the explicit Axes interface.
fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Receiver Operating Characteristic')
ax.legend(loc="lower right")
plt.show()


In [None]:
# Attach the transformer and predictor callables to the fitted results object
# before handing it to the registry.
model1Fit.transformer = evaluationBinaryCalssifiactionTransformer
model1Fit.mainPredictor = partial(generalRegressionPredictor, model1Fit)

# Test-set metrics computed in the cells above.
model1Metrics = {
    "accuracy": accuracy,
    "recall": recall,
    "precision": precision,
    "sensitivity": sensitivity,
    "specificity": specificity,
    "f1Score": f1Score,
    "roc_auc": roc_auc,
}

# Fit statistics read from the statsmodels results object, cast to plain floats.
model1LibraryMetadata = {
    "pvalues": model1Fit.pvalues.to_dict(),
    "pseudo_r_squared": float(model1Fit.prsquared),
    "llf": float(model1Fit.llf),
    "aic": float(model1Fit.aic),
    "bic": float(model1Fit.bic),
}

model1Metadata = {
    "modelName": "saadet_doga_hascelik__evaluationBinaryCalssifiactionModelWithExteraMetircs",
    "version": "1.0.1",
    "params": model1Fit.params.to_dict(),
    "score": accuracy,
    "otherEvaluationMetrics": model1Metrics,
    "modelLibrary": "statsmodels.api.Logit",
    "libraryMetadata": model1LibraryMetadata,
    "sampleData": {
        "dataForTransfer": evaluationBinaryCalssifiactionSampleData
    },
}

registerAJrjModel(model1Fit, model1Metadata)

## Decision Tree (DT)

In [None]:
def generalDtPredictor(self, transformedData):
    """Return the second ``predict_proba`` column as a pandas Series.

    Args:
        self: Any object exposing ``predict_proba``. In this notebook the
            function is bound to a fitted classifier with ``functools.partial``.
        transformedData: Input forwarded unchanged to ``self.predict_proba``.

    Returns:
        A new ``pd.Series`` built from column index 1 of the probability
        matrix. The Series gets a fresh default index; the index of
        ``transformedData`` is not carried over.
    """
    import pandas as pd
    secondColumn = self.predict_proba(transformedData)[:, 1]
    return pd.Series(secondColumn)

In [None]:
def generalDtTransformer(dataForTransfer = None):
    """Return the input as an independent pandas DataFrame, otherwise unchanged.

    Args:
        dataForTransfer: A DataFrame (returned as a copy) or anything
            ``pd.DataFrame`` can be constructed from, such as a dict of
            column lists. ``None`` yields an empty DataFrame.

    Returns:
        A DataFrame that does not share state with the caller's object.
    """
    import pandas as pd
    if not isinstance(dataForTransfer, pd.DataFrame):
        return pd.DataFrame(dataForTransfer)
    return dataForTransfer.copy()

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Baseline tree with default hyperparameters. The seed is fixed so that a
# Restart & Run All reproduces the same tree (and the same registered score);
# 42 matches the seed used for the second tree later in the notebook.
dt = DecisionTreeClassifier(random_state=42)


In [None]:
# Train on the same transformed matrix that the logistic model used.
# NOTE(review): the transformer also adds a constant column, which is then a
# feature of the tree — presumably kept so one transformer serves every
# registered model; confirm this is intended.
dt.fit(evaluationBinaryCalssifiactionTransformer(xTrain), yTrain)

In [None]:
# Importing required packages for visualization
from IPython.display import Image  
from six import StringIO  
from sklearn.tree import export_graphviz
import pydotplus, graphviz

In [None]:
# Render the fitted tree `dt` as a PNG via Graphviz.
dot_data = StringIO()

export_graphviz(dt, out_file=dot_data, filled=True, rounded=True,
                # Passed as a plain list of column names rather than a pandas Index.
                feature_names=evaluationBinaryCalssifiactionTransformer(xTrain).columns.tolist(),
                # Labels are matched to the classes in ascending order (0, then 1);
                # assumes 'Spam' is coded 1 for spam — TODO confirm against the data.
                class_names=['Not Spam', 'Spam'])

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Hard class predictions of `dt` on the training features and on the held-out
# rows (the transformer keeps only the feature columns of testSet).
y_train_pred = dt.predict(evaluationBinaryCalssifiactionTransformer(xTrain))
y_test_pred = dt.predict(evaluationBinaryCalssifiactionTransformer(testSet))

In [None]:
# y_train_pred

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Training accuracy, followed by the training confusion matrix
# (rows = actual class, columns = predicted class).
print(accuracy_score(yTrain, y_train_pred))
confusion_matrix(yTrain, y_train_pred)

In [None]:
# Test accuracy, followed by the test confusion matrix
# (rows = actual class, columns = predicted class).
print(accuracy_score(yTest, y_test_pred))
confusion_matrix(yTest, y_test_pred)

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Attach the transformer and a predictor bound to `dt` as attributes on the
# estimator before it is registered.
dt.transformer = evaluationBinaryCalssifiactionTransformer
dt.mainPredictor = partial(generalDtPredictor,dt)

In [None]:
# Score `dt` on the held-out rows. The transformer is applied once: it already
# selects the feature columns and adds the constant, so nesting a second call
# around it adds nothing.
y_pred = dt.predict(evaluationBinaryCalssifiactionTransformer(testSet))
score = accuracy_score(yTest, y_pred)
# Per-class precision/recall/F1 as a nested dict, stored in the registry metadata.
report = classification_report(yTest, y_pred, output_dict=True)

In [None]:
# Registry metadata for the first decision tree `dt`.
# NOTE(review): the modelName prefix here is "saadet_Doga_..." (capital D) while
# the other two registrations in this notebook use "saadet_doga_..." — confirm
# which spelling is intended before registering.
dt_metadata = {
    "modelName": "saadet_Doga_hascelik__evaluationBinaryCalssifiactionModelDt",
    "version": "1.0.1",
    "params": dt.get_params(),  # All model hyperparameters
    "score": float(score),         # Accuracy
    "modelLibrary": "sklearn.tree.DecisionTreeClassifier",
    # Fitted-tree attributes converted to plain Python types.
    "libraryMetadata": {
        "feature_importances": dt.feature_importances_.tolist(),
        "n_features": int(dt.n_features_in_),
        "n_classes": int(dt.n_classes_),
        "classes": dt.classes_.tolist(),
        "depth": int(dt.get_depth()),
        "n_leaves": int(dt.get_n_leaves()),
        "classification_report": report
    },
     "sampleData": {
        "dataForTransfer": evaluationBinaryCalssifiactionSampleData
    }
}

In [None]:
# Register the first decision tree together with its metadata.
registerAJrjModel(dt, dt_metadata)

In [None]:
# Second tree: at least 20 training samples per leaf, entropy split criterion,
# and a fixed seed so the fit is repeatable.
dt1 = DecisionTreeClassifier(min_samples_leaf=20, random_state=42, criterion="entropy")


In [None]:
# Train the second tree on the same transformed training matrix as the other models.
dt1.fit(evaluationBinaryCalssifiactionTransformer(xTrain), yTrain)

In [None]:
# Render the fitted tree `dt1` as a PNG via Graphviz.
dot_data = StringIO()

export_graphviz(dt1, out_file=dot_data, filled=True, rounded=True,
                # Passed as a plain list of column names rather than a pandas Index.
                feature_names=evaluationBinaryCalssifiactionTransformer(xTrain).columns.tolist(),
                # Labels are matched to the classes in ascending order (0, then 1);
                # assumes 'Spam' is coded 1 for spam — TODO confirm against the data.
                class_names=['Not Spam', 'Spam'])

graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
Image(graph.create_png())

In [None]:
# Attach the transformer and a predictor bound to `dt1` before registration.
dt1.transformer = evaluationBinaryCalssifiactionTransformer
dt1.mainPredictor = partial(generalDtPredictor,dt1)

# Predictions must come from dt1 itself so that the score and report stored
# below describe the model being registered. The transformer is applied once;
# it already selects the feature columns and adds the constant.
y_pred = dt1.predict(evaluationBinaryCalssifiactionTransformer(testSet))
score = accuracy_score(yTest, y_pred)
# Per-class precision/recall/F1 as a nested dict for the registry metadata.
report = classification_report(yTest, y_pred, output_dict=True)

# Registry metadata for the second decision tree `dt1`.
dt1_metadata = {
    "modelName": "saadet_doga_hascelik__evaluationBinaryCalssifiactionModelDt1",
    "version": "1.0.1",
    "params": dt1.get_params(),  # All model hyperparameters
    "score": float(score),       # Test-set accuracy of dt1
    "modelLibrary": "sklearn.tree.DecisionTreeClassifier",
    # Fitted-tree attributes converted to plain Python types.
    "libraryMetadata": {
        "feature_importances": dt1.feature_importances_.tolist(),
        "n_features": int(dt1.n_features_in_),
        "n_classes": int(dt1.n_classes_),
        "classes": dt1.classes_.tolist(),
        "depth": int(dt1.get_depth()),
        "n_leaves": int(dt1.get_n_leaves()),
        "classification_report": report
    },
    "sampleData": {
        "dataForTransfer": evaluationBinaryCalssifiactionSampleData
    }
}

In [None]:
# Register the second decision tree together with its metadata.
registerAJrjModel(dt1, dt1_metadata)