# In [None]:
# Step 1: Understand Data
import pandas as pd

# Load the dataset
df = pd.read_csv("train_loan.csv")

# Print dataset properties
print("Head:\n", df.head())
print("\nShape:", df.shape)
print("\nColumns:", df.columns)
print("\nData Types:\n", df.dtypes)
# df.info() writes its summary directly to stdout and returns None, so it is
# called as a statement of its own; passing it to print() would show the
# summary before the heading and then print a stray "None".
print("\nInfo:")
df.info()
print("\nValue Counts:\n", df['Loan_Status'].value_counts())

# Step 2: Data Cleaning
# Fill missing data in categorical columns by mode.
# The filled Series is assigned back to the frame instead of calling
# fillna(inplace=True) on df[col]: that chained form is deprecated and does
# not modify df when pandas Copy-on-Write is enabled.
categorical_columns = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Credit_History']
for col in categorical_columns:
    df[col] = df[col].fillna(df[col].mode()[0])

# Replace numbers as string by integer in "Dependents" column.
# This must run after the mode fill above, because astype(int) raises on a
# column that still contains NaN.
df['Dependents'] = df['Dependents'].replace(to_replace='3+', value='3').astype(int)

# Handle missing values in numerical columns with the column median, so that
# no NaN reaches the scaler and estimators later on. The target column is
# skipped so labels are never imputed.
numerical_columns = [
    col for col in df.select_dtypes(include='number').columns
    if col != 'Loan_Status'
]
for col in numerical_columns:
    df[col] = df[col].fillna(df[col].median())

# Drop Loan ID column (an identifier, not a feature)
df.drop('Loan_ID', axis=1, inplace=True)

# Step 3: Exploratory Data Analysis (Optional)
# Draw count plots for Married, Dependents, Graduates, Self-employed using seaborn or matplotlib

# Step 4: Separate the target (y) from the feature columns (X)
y = df['Loan_Status']
features = df.drop(columns=['Loan_Status'])

# Step 5: One Hot Encoding of the feature columns
X = pd.get_dummies(features)

# Step 6: Model Building
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Split X and y for training and testing (80% train / 20% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Using StandardScaler: the scaling statistics are learned from the training
# split only and then reused for the test split, so no test-set information
# leaks into the fitted scaler.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create LinearSVC model, train and test.
# random_state matches the seed used for the split above: LinearSVC's dual
# solver draws on a random number generator, so seeding it keeps the
# baseline numbers reproducible from run to run.
linear_svc_model = LinearSVC(random_state=42)
linear_svc_model.fit(X_train_scaled, y_train)
y_pred = linear_svc_model.predict(X_test_scaled)

# Print accuracy value
print("Accuracy:", accuracy_score(y_test, y_pred))

# Print confusion matrix
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Print classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Step 7: Performance Comparisons
# Compare with LogisticRegression and SGDClassifier, and SVC with various kernels
# (you'll need to import these models and perform similar steps as above)
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

# Fit each comparison model on the same scaled training split and report its
# accuracy on the same scaled test split, so the figures are comparable.

# 1) Compare the performance of LinearSVC against LogisticRegression
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X_train_scaled, y_train)
y_pred_lr = logistic_regression_model.predict(X_test_scaled)

print("\nLogistic Regression - Accuracy:", accuracy_score(y_test, y_pred_lr))

# 2) Compare the performance of LinearSVC against SGDClassifier
# SGDClassifier shuffles the training data on every epoch; without a seed its
# accuracy changes from run to run, which makes the comparison meaningless.
# The seed matches the one used for train_test_split.
sgd_classifier_model = SGDClassifier(random_state=42)
sgd_classifier_model.fit(X_train_scaled, y_train)
y_pred_sgd = sgd_classifier_model.predict(X_test_scaled)

print("\nSGD Classifier - Accuracy:", accuracy_score(y_test, y_pred_sgd))

# 3) Compare LinearSVC against SVC with various kernels: 'linear', 'poly', 'rbf', 'sigmoid'
kernels = ['linear', 'poly', 'rbf', 'sigmoid']

for kernel in kernels:
    # svc_model / y_pred_svc are overwritten on each pass; after the loop
    # they hold the model and predictions for the last kernel only.
    svc_model = SVC(kernel=kernel)
    svc_model.fit(X_train_scaled, y_train)
    y_pred_svc = svc_model.predict(X_test_scaled)

    print(f"\nSVC with {kernel} kernel - Accuracy:", accuracy_score(y_test, y_pred_svc))
