In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

csv_location = "exrc06p03_nba.csv"

# Read the input CSV into a pandas.DataFrame.
# NOTE(review): the file name suggests NBA player data rather than voice data — confirm.
df = pd.read_csv(csv_location)

# Optional quick look at the data while exploring:
#   df.info()  -> concise summary of the DataFrame's structure
#   df.head()  -> first five rows (the default)

# Separate the target (y) from the feature matrix (X).
# "Name" is treated as a non-feature column and is dropped together with the target.
target_column = "TARGET_5Yrs"
non_feature_columns = [target_column, "Name"]

y = df[target_column]
X = df.drop(non_feature_columns, axis=1)

# Optional check of the class distribution of the target column:
#   y.value_counts()

# Split train/test 70/30 % BEFORE fitting any preprocessing step, so that the
# imputer (like the scaler below) learns its statistics from training rows only.
# Fitting the imputer on the full data would leak test-set information into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,   # 70% of the rows for training, the remaining 30% for testing
    stratify=y,       # keep the same class ratio in the training and test sets
    random_state=42,  # same rows go to train and test on every run
)

# Impute missing values with per-column medians computed on the training split,
# then apply those same medians to the test split.
imputer = SimpleImputer(strategy="median")
X_train = imputer.fit_transform(X_train_raw)
X_test = imputer.transform(X_test_raw)

# Full imputed matrix, kept so that any later cell referencing `X_imputed`
# still works; it is filled with the training-split medians.
X_imputed = imputer.transform(X)

# Standardize features for the SVM and Logistic Regression models.
# The scaler is fit on the training split only and reused for the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build the Support Vector Classifier and train it on the scaled training data.
# `fit` returns the estimator itself, so construction and training are chained.
svm_model = SVC(
    kernel="rbf",             # RBF kernel for non-linear decision boundaries
    class_weight="balanced",  # re-weight classes to compensate for class imbalance
    random_state=42,          # fixed seed for consistent results between runs
).fit(X_train_scaled, y_train)

# Predict labels for the held-out test rows
y_pred_svm = svm_model.predict(X_test_scaled)

# Score the predictions: confusion matrix and overall accuracy
confusion_matrix_svm = confusion_matrix(y_test, y_pred_svm)
acc_score_svm = accuracy_score(y_test, y_pred_svm)

# Report both metrics (one line for accuracy, then the matrix)
print(
    f"Accuracy Score with SVM Model: {acc_score_svm:.4f}",
    f"Confusion Matrix with SVM Model:\n{confusion_matrix_svm}",
    sep="\n",
)

# Logistic Regression settings, collected in one place for readability
lr_settings = {
    "solver": "liblinear",       # solver chosen here for a small, binary-classification dataset
    "max_iter": 2000,            # generous iteration budget so the model can converge
    "class_weight": "balanced",  # re-weight classes to compensate for class imbalance
}

# Build the model and train it on the scaled training data
model_lr = LogisticRegression(**lr_settings)
model_lr.fit(X_train_scaled, y_train)

# Predict labels for the held-out test rows
y_pred_lr = model_lr.predict(X_test_scaled)

# Score the predictions: confusion matrix and overall accuracy
confusion_matrix_lr = confusion_matrix(y_test, y_pred_lr)
acc_score_lr = accuracy_score(y_test, y_pred_lr)

# Report both metrics; the leading newline separates this report from the previous one
print(f"\nAccuracy Score with Logistic Regression Model: {acc_score_lr:.4f}")
print(f"Confusion Matrix with Logistic Regression Model:\n{confusion_matrix_lr}")

Accuracy Score with SVM Model: 0.6898
Confusion Matrix with SVM Model:
[[108  45]
 [ 80 170]]

Accuracy Score with Logistic Regression Model: 0.6774
Confusion Matrix with Logistic Regression Model:
[[103  50]
 [ 80 170]]
