In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Load the datasets
train_data_path = 'C:\\Users\\kowsi\\Desktop\\Code Version 3\\train.csv'  # Path to the train dataset
test_data_path = 'C:\\Users\\kowsi\\Desktop\\Code Version 3\\test.csv'  # Path to the test dataset

# Load the train and test data
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)

# Step 2: Preprocess the data
# Encode categorical variables with LabelEncoder for simplicity
label_encoders = {}

# Columns that need encoding
categorical_columns = ['Gender', 'Married', 'Education', 'Self_Employed', 'Property_Area', 'Dependents']

for col in categorical_columns:
    le = LabelEncoder()
    train_data[col] = le.fit_transform(train_data[col].astype(str))  # Fit and transform train data
    test_data[col] = le.transform(test_data[col].astype(str))  # Transform test data
    label_encoders[col] = le  # Save the encoder if needed later

# Fill missing values ONLY for numeric columns
numeric_columns = train_data.select_dtypes(include=['float64', 'int64']).columns
train_data[numeric_columns] = train_data[numeric_columns].fillna(train_data[numeric_columns].mean())
test_data[numeric_columns] = test_data[numeric_columns].fillna(test_data[numeric_columns].mean())

# Step 3: Prepare the features (X) and target (y) for training
X_train = train_data.drop(columns=['Loan_ID', 'Loan_Status'])  # Features for training (excluding Loan_ID and Loan_Status)
y_train = train_data['Loan_Status'].map({'Y': 1, 'N': 0})  # Target variable (mapping Y to 1 and N to 0)

X_test = test_data.drop(columns=['Loan_ID'])  # Features for testing (excluding Loan_ID)

# Step 4: Train the Decision Tree Model
model_dt = DecisionTreeClassifier(max_depth=5, random_state=42)
model_dt.fit(X_train, y_train)

# Step 5: Predict on the training set and check accuracy
y_train_pred = model_dt.predict(X_train)
print("Training Accuracy:", accuracy_score(y_train, y_train_pred))
print("Training Classification Report:\n", classification_report(y_train, y_train_pred))

# Step 6: Predict on the test set
test_predictions = model_dt.predict(X_test)

# Step 7: Save the predictions to a CSV for submission or further use
test_data['Loan_Status'] = ['Y' if pred == 1 else 'N' for pred in test_predictions]
submission = test_data[['Loan_ID', 'Loan_Status']]
submission.to_csv('loan_predictions.csv', index=False)

# Optional: Show the first few rows of the predictions
submission.head()


Training Accuracy: 0.8322475570032574
Training Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.58      0.68       192
           1       0.83      0.95      0.89       422

    accuracy                           0.83       614
   macro avg       0.83      0.76      0.78       614
weighted avg       0.83      0.83      0.82       614



Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,Y
4,LP001051,Y
