In [4]:
import pandas as pd

data_train = pd.read_csv('master_data_train_final_scaled_with_ids.csv')
data_test = pd.read_csv('master_data_test_final_scaled_with_ids.csv')

In [6]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Replace invalid characters in feature names across the whole dataset
data_train.columns = data_train.columns.str.replace('<', '_').str.replace('>', '_').str.replace(' ', '_')
data_test.columns = data_test.columns.str.replace('<', '_').str.replace('>', '_').str.replace(' ', '_')

# Now proceed with splitting
X = data_train.drop(columns=['TARGET'])
y = data_train['TARGET']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Feature names cleaned and datasets split successfully.")

Feature names cleaned and datasets split successfully.


In [7]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

# Step 1: Split the dataset into training and testing subsets
X = data_train.drop(columns=['TARGET'])
y = data_train['TARGET']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Data split into training and testing subsets.")

# Step 2: Apply SMOTE to the training data
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print("SMOTE applied to training data.")
print("Class Distribution After SMOTE:")
print(y_train_smote.value_counts())

# Step 3: Train the XGBoost model with the best hyperparameters
xgb_model = XGBClassifier(
    learning_rate=0.2,
    max_depth=7,
    n_estimators=200,
    subsample=1.0,
    random_state=42
)
xgb_model.fit(X_train_smote, y_train_smote)

print("Model trained with SMOTE-balanced training data.")

# Step 4: Evaluate on the testing subset
y_pred_test = xgb_model.predict(X_test)

accuracy_test = accuracy_score(y_test, y_pred_test)
report_test = classification_report(y_test, y_pred_test)

print("\nEvaluation on Testing Subset:")
print("Test Accuracy:", accuracy_test)
print(report_test)

Data split into training and testing subsets.
SMOTE applied to training data.
Class Distribution After SMOTE:
0.0    226132
1.0    226132
Name: TARGET, dtype: int64
Model trained with SMOTE-balanced training data.

Evaluation on Testing Subset:
Test Accuracy: 0.9184267434108905
              precision    recall  f1-score   support

         0.0       0.92      1.00      0.96     56554
         1.0       0.38      0.02      0.04      4949

    accuracy                           0.92     61503
   macro avg       0.65      0.51      0.50     61503
weighted avg       0.88      0.92      0.88     61503



In [8]:
from sklearn.metrics import accuracy_score, classification_report

# Evaluate on the training subset
y_pred_train = xgb_model.predict(X_train)

accuracy_train = accuracy_score(y_train, y_pred_train)
report_train = classification_report(y_train, y_pred_train)

print("\nEvaluation on Training Subset:")
print("Train Accuracy:", accuracy_train)
print(report_train)

# Evaluate on the testing subset
y_pred_test = xgb_model.predict(X_test)

accuracy_test = accuracy_score(y_test, y_pred_test)
report_test = classification_report(y_test, y_pred_test)

print("\nEvaluation on Testing Subset:")
print("Test Accuracy:", accuracy_test)
print(report_test)


Evaluation on Training Subset:
Train Accuracy: 0.9250146336704498
              precision    recall  f1-score   support

         0.0       0.93      1.00      0.96    226132
         1.0       0.91      0.08      0.15     19876

    accuracy                           0.93    246008
   macro avg       0.92      0.54      0.55    246008
weighted avg       0.92      0.93      0.90    246008


Evaluation on Testing Subset:
Test Accuracy: 0.9184267434108905
              precision    recall  f1-score   support

         0.0       0.92      1.00      0.96     56554
         1.0       0.38      0.02      0.04      4949

    accuracy                           0.92     61503
   macro avg       0.65      0.51      0.50     61503
weighted avg       0.88      0.92      0.88     61503



In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Train the Logistic Regression model
logistic_model = LogisticRegression(penalty='l2', C=1.0, solver='lbfgs', max_iter=1000, random_state=42)
logistic_model.fit(X_train_smote, y_train_smote)

# Step 2: Evaluate on the training subset
y_pred_train_log = logistic_model.predict(X_train)
accuracy_train_log = accuracy_score(y_train, y_pred_train_log)
report_train_log = classification_report(y_train, y_pred_train_log)

print("\nLogistic Regression Evaluation on Training Subset:")
print("Train Accuracy:", accuracy_train_log)
print(report_train_log)

# Step 3: Evaluate on the testing subset
y_pred_test_log = logistic_model.predict(X_test)
accuracy_test_log = accuracy_score(y_test, y_pred_test_log)
report_test_log = classification_report(y_test, y_pred_test_log)

print("\nLogistic Regression Evaluation on Testing Subset:")
print("Test Accuracy:", accuracy_test_log)
print(report_test_log)


Logistic Regression Evaluation on Training Subset:
Train Accuracy: 0.5569940814932848
              precision    recall  f1-score   support

         0.0       0.92      0.56      0.70    226132
         1.0       0.09      0.48      0.15     19876

    accuracy                           0.56    246008
   macro avg       0.51      0.52      0.42    246008
weighted avg       0.86      0.56      0.66    246008


Logistic Regression Evaluation on Testing Subset:
Test Accuracy: 0.5543469424255727
              precision    recall  f1-score   support

         0.0       0.92      0.56      0.70     56554
         1.0       0.09      0.47      0.15      4949

    accuracy                           0.55     61503
   macro avg       0.51      0.52      0.42     61503
weighted avg       0.86      0.55      0.65     61503



In [10]:
Y_test = xgb_model.predict(data_test)

In [11]:
# Check predictions
print("Predictions for the test data:")
print(Y_test[:10])  # Show the first 10 predictions
print(f"Total predictions: {len(Y_test)}")

Predictions for the test data:
[0 0 0 0 0 0 0 0 0 0]
Total predictions: 48744


In [12]:
# Combine predictions with SK_ID_CURR
submission = pd.DataFrame({
    'SK_ID_CURR': data_test['SK_ID_CURR'],
    'TARGET': Y_test
})

# Save to CSV
submission.to_csv('final_predictions.csv', index=False)

print("Predictions saved as 'final_predictions.csv'")

Predictions saved as 'final_predictions.csv'
