In [34]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay, precision_recall_curve
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [35]:
df = pd.read_csv('new_train2.csv')

In [36]:
# Preprocess
df['label'] = df['y'].apply(lambda x: 1 if x == 'yes' else 0)
del df['y']

most_frequent = df[df['default'] != 'unknown']['default'].mode()[0]
df['default'].replace('unknown', most_frequent, inplace=True)

df = df[(df['housing'] != 'unknown') & (df['loan'] != 'unknown') & (df['marital'] != 'unknown')]

X = df.drop('label', axis=1)
y = df['label']

categorical_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
numerical_features = ['age', 'duration', 'campaign']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(sparse_output=False), categorical_features)  # Set sparse_output=False
    ])

In [37]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [39]:
print("Unique values in training set:", X_train['marital'].unique())
print("Unique values in testing set:", X_test['marital'].unique())

Unique values in training set: ['married' 'single' 'divorced']
Unique values in testing set: ['single' 'married' 'divorced']


In [38]:
model = LogisticRegression(solver='liblinear', max_iter=1000)
name = "Logistic Regression"

pipeline = Pipeline(steps=[('preprocessor', preprocessor), ('classifier', model)])

# Training 
pipeline.fit(X_train, y_train)

# Validate: cross-validation
cv_score = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='accuracy')
print(f"Cross-Validation Accuracy for {name}: {np.mean(cv_score):.4f} ± {np.std(cv_score):.4f}")

Traceback (most recent call last):
  File "/Users/xiaolan/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 115, in __call__
    def __init__(self, *, scorers, raise_exc=True):
                            ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/xiaolan/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 276, in _score
    ``self._kwargs`` and ``kwargs`` passed as metadata.
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/xiaolan/anaconda3/lib/python3.11/site-packages/sklearn/metrics/_scorer.py", line 73, in _cached_call
    homogeneity_score,
               ^^^^^^^^
  File "/Users/xiaolan/anaconda3/lib/python3.11/site-packages/sklearn/pipeline.py", line 480, in predict
    Call `transform` of each transformer in the pipeline. The transformed
             ^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/xiaolan/anaconda3/lib/python3.11/site-packages/sklearn/utils/_set_output.py", line 140, in wrapped
    data_to_wrap = f(self, X, *args

Cross-Validation Accuracy for Logistic Regression: nan ± nan
