In [None]:
import pandas as pd

from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

csv_location = "exrc06p03_nba.csv"

# Load the input CSV (NBA data, judging by the file name) into a pandas.DataFrame
df = pd.read_csv(csv_location)

# Optional exploration:
# print(df.info())  # concise summary of the DataFrame's structure
# print(df.head())  # first five rows (default)

# Separate the features (X) from the target (y).
# "Name" is dropped together with the target because it is not a feature.
X = df.drop(columns=["TARGET_5Yrs", "Name"])
y = df["TARGET_5Yrs"]

# Fill missing values in df's numeric columns with each column's median.
# NOTE: this only updates df for the inspection printout below; the models
# are trained on X_train / X_test, which are imputed separately after the split.
numeric_cols = df.select_dtypes(include=["number"]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

# Show updated data
print(df.head())
print("\nMissing values after filling:\n", df[numeric_cols].isna().sum())

# Optional: confirm the class distribution of the target column
# print(y.value_counts())

# Split train/test 70/30 BEFORE imputing, so that the medians used to fill
# missing values are learned from the training rows only (no test-set leakage).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,   # 70% for training, the remaining 30% for testing
    stratify=y,       # keep class balance since the target is slightly imbalanced
    random_state=42,  # fixed seed so the same rows go to train/test on every run
)

# Impute missing feature values with the per-column median.
# fit_transform learns the medians on the training set; transform re-uses
# those same medians for the test set.
imputer = SimpleImputer(strategy="median")
X_train = imputer.fit_transform(X_train)
X_test = imputer.transform(X_test)

# Create the SVM model (Support Vector Classifier) with the default RBF kernel.
# SVMs are not scale invariant, so the features are standardized first.
# Wrapping both steps in a pipeline means the scaler is fitted on the training
# data only and is applied automatically on every later predict() call.
svm_model = make_pipeline(
    StandardScaler(),
    SVC(kernel="rbf", random_state=42),
)

# Train the scaler + SVM pipeline
svm_model.fit(X_train, y_train)

# Predictions on test data
y_pred_svm = svm_model.predict(X_test)

# Evaluate the model with accuracy score and confusion matrix
acc_score_svm = accuracy_score(y_test, y_pred_svm)
confusion_matrix_svm = confusion_matrix(y_test, y_pred_svm)

print(f"Accuracy Score with SVM Model: {acc_score_svm:.4f}")
print(f"Confusion Matrix with SVM Model:\n{confusion_matrix_svm}")

# Logistic Regression model.
# The liblinear solver is chosen as a good fit for smaller datasets and
# binary classification.
model_lr = LogisticRegression(solver="liblinear", max_iter=2000)
model_lr.fit(X_train, y_train)  # train on the training split

# Predict labels for the held-out test split
y_pred_lr = model_lr.predict(X_test)

# Evaluation metrics: confusion matrix and accuracy on the test split
confusion_matrix_lr = confusion_matrix(y_test, y_pred_lr)
acc_score_lr = accuracy_score(y_test, y_pred_lr)

print(f"\nAccuracy Score with Logistic Regression Model: {acc_score_lr:.4f}")
print(f"Confusion Matrix with Logistic Regression Model:\n{confusion_matrix_lr}")


TARGET_5Yrs
1.0    831
0.0    509
Name: count, dtype: int64
Accuracy Score with SVM Model: 0.6998
Confusion Matrix with SVM Model:
[[ 73  80]
 [ 41 209]]

Accuracy Score with Logistic Regression Model: 0.7097
Confusion Matrix with Logistic Regression Model:
[[ 83  70]
 [ 47 203]]
