# MODEL BUILDING

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier, LogisticRegression, LogisticRegressionCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report, f1_score

In [2]:
# Load data: read the train and test CSV files into DataFrames
train_data, test_data = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

# Preprocessing

In [3]:
# Split the training frame into the label column and the remaining feature columns
y = train_data['target']
X = train_data.drop(columns=['target'])

# Prepare Features
def prepare_features(df):
    """Return a copy of ``df`` with the raw columns turned into model features.

    * ``pdays`` values of -1 are replaced with NaN.
    * ``last contact date`` is parsed with ``pd.to_datetime`` and expanded into
      ``year``, ``month``, ``day``, ``day of week`` (pandas ``dayofweek``:
      Monday=0 ... Sunday=6) and ``is weekend`` (1 when ``day of week`` >= 5,
      otherwise 0). The raw date column is then removed.

    The input frame is not modified, so the function can safely be called
    again on the same frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``pdays`` and ``last contact date``.

    Returns
    -------
    pandas.DataFrame
        New frame with the derived columns appended and
        ``last contact date`` dropped.
    """
    # Work on a copy: assigning columns directly on `df` would leak the
    # derived columns and the pdays replacement into the caller's frame.
    out = df.copy()
    out['pdays'] = out['pdays'].replace(-1, np.nan)

    contact_date = pd.to_datetime(out['last contact date'])
    out['year'] = contact_date.dt.year
    out['month'] = contact_date.dt.month
    out['day'] = contact_date.dt.day
    out['day of week'] = contact_date.dt.dayofweek
    # Vectorised weekend flag (Saturday=5, Sunday=6) instead of a per-row lambda.
    out['is weekend'] = (out['day of week'] >= 5).astype('int64')
    return out.drop(columns=['last contact date'])

# Derive the engineered feature columns before splitting
X = prepare_features(X)

# Stratified 80/20 hold-out split with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)


In [4]:
# Column groups routed to each branch of the preprocessor
numeric_columns = ['age', 'balance', 'duration', 'campaign', 'pdays', 'previous']
categorical_columns = [
    'job', 'marital', 'default', 'housing', 'loan', 'contact', 'poutcome',
    'year', 'month', 'day', 'day of week', 'is weekend',
]
ordinal_columns = ['education']

# Numeric branch: fill missing values with the column mean, then standardise
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode; categories not seen during fit are ignored at transform time
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Ordinal branch: fill missing values with the most frequent value, then encode
# education in the listed order.
# NOTE(review): unlike the one-hot branch, no handle_unknown is set here, so an
# education value outside these three presumably raises at transform time - confirm
# the test data contains none.
ordinal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=[['primary', 'secondary', 'tertiary']])),
])

# Apply each branch to its own column group
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_columns),
    ('cat', categorical_transformer, categorical_columns),
    ('ord', ordinal_transformer, ordinal_columns),
])

In [5]:
# Encode the label as integers. The encoder is fitted on the training labels only;
# the hold-out labels are mapped with that same fitted encoder (transform, not
# fit_transform) so both splits share one class-to-integer mapping and `label_enc`
# keeps the training mapping for the later inverse_transform call.
label_enc = LabelEncoder()
y_train = label_enc.fit_transform(y_train)
y_test = label_enc.transform(y_test)

# Models

In [6]:
# # Perceptron
# pipe = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('classifier', Perceptron(alpha=0.01, max_iter=2000, tol=0.0001, eta0=0.001, class_weight='balanced'))
# ])

# # f1_macro_average, 0.6923850345260515

In [7]:
# # LogisticRegression
# pipe = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('classifier', LogisticRegression(class_weight='balanced'))
# ])

# # f1_macro_average, 0.7297559870521964

In [8]:
# # KNeighborsClassifier
# pipe = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ('classifier', KNeighborsClassifier(n_neighbors=3))
# ])

# # f1_macro_average, 0.6545377907966893

In [9]:
# # SVC
# pipe = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ("classifier", SVC(class_weight='balanced', kernel = 'linear'))
# ])

# # f1_macro_average, 0.7502859645079138

In [10]:
# # DecisionTreeClassifier
# pipe = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ("classifier", DecisionTreeClassifier(class_weight='balanced', max_depth=5, min_samples_leaf=10, random_state=42))
# ])

# # f1_macro_average, 0.7336741752248984

In [11]:
# RandomForestClassifier: preprocessing followed by a class-balanced random forest
pipe = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=300,
        max_depth=20,
        min_samples_split=10,
        min_samples_leaf=2,
        class_weight='balanced',
        random_state=42,
    )),
])

# f1_macro_average, 0.7709318845337377

In [12]:
# # AdaBoostClassifier
# dtc = DecisionTreeClassifier(class_weight='balanced', max_depth=5, min_samples_leaf=10, random_state=42)
# pipe = Pipeline(steps=[
#     ('preprocessor', preprocessor),
#     ("adaboost", AdaBoostClassifier(estimator=dtc, n_estimators=50, learning_rate=0.1, algorithm='SAMME.R', random_state=42))
# ])

# # f1_macro_average, 0.7448168489599839

In [13]:
# Fit the pipeline on the training split, then predict the hold-out split
model = pipe.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Macro-averaged F1 on the hold-out split (displayed as the cell output)
f1_score(y_true=y_test, y_pred=y_pred, average='macro')

0.7709318845337377

### Other Metrics

In [14]:
# Confusion matrix of hold-out labels (rows) against predictions (columns)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[5793,  884],
       [ 225,  941]])

In [15]:
# Per-class precision / recall / F1 on the hold-out split
report_text = classification_report(y_true=y_test, y_pred=y_pred)
print(report_text)

              precision    recall  f1-score   support

           0       0.96      0.87      0.91      6677
           1       0.52      0.81      0.63      1166

    accuracy                           0.86      7843
   macro avg       0.74      0.84      0.77      7843
weighted avg       0.90      0.86      0.87      7843



# Hyperparameter Tuning (HPT)

In [None]:
# param_grid_perceptron = {
#     'classifier__alpha': [0.0001, 0.001, 0.01],
#     'classifier__eta0': [0.001, 0.01, 0.1],
#     'classifier__max_iter': [1000, 1500, 2000],
#     'classifier__tol': [1e-3, 1e-4],
#     'classifier__class_weight': ['balanced', None]
# }

# param_grid_lr = {
#     'classifier__C': [0.1, 1, 10],
#     'classifier__solver': ['liblinear', 'saga'],
#     'classifier__max_iter': [100, 200],
#     'classifier__class_weight': ['balanced', None]
# }

# param_grid_knn = {
#     'classifier__n_neighbors': [3, 5, 7, 9, 11],
#     'classifier__weights': ['uniform', 'distance'],
#     'classifier__metric': ['euclidean', 'manhattan']
# }

# param_grid_svc = {
#     'classifier__C': [0.1, 1, 10],
#     'classifier__kernel': ['linear', 'rbf'],
#     'classifier__gamma': ['scale', 'auto'],
#     'classifier__class_weight': ['balanced', None]
# }

# param_grid_dt = {
#     'classifier__max_depth': [3, 5, 7],
#     'classifier__min_samples_split': [1, 2, 5],
#     'classifier__min_samples_leaf': [1, 2, 4, 8, 10, 12],
#     'classifier__class_weight': ['balanced', None]
# }

# param_grid_rf = {
#     "classifier__n_estimators": [100, 200, 300, 400, 500],
#     "classifier__max_depth": [5, 10, 20, 30, None],
#     "classifier__min_samples_split": [2, 5, 10],
#     "classifier__min_samples_leaf": [1, 2, 5],
#     "classifier__bootstrap": [True, False],
#     'classifier__class_weight': ['balanced', None]

# }

In [None]:
# cv = StratifiedShuffleSplit(n_splits=3, test_size=0.2, random_state=42)

In [None]:
# grid_search = GridSearchCV(
#     pipe, 
#     param_grid=param_grid_svc, 
#     scoring='f1', 
#     n_jobs=-1, 
#     cv=5
# )

# grid_search.fit(X_train, y_train)

# y_pred = grid_search.predict(X_test)
# f1_score(y_test, y_pred, average='macro')

In [None]:
# random_search = RandomizedSearchCV(
#     pipe,                 
#     param_distributions=param_grid_dt,           
#     scoring='f1',        
#     cv=5,                  
#     n_jobs=-1,            
#     random_state=42 
# )

# random_search.fit(X_train, y_train)

# y_pred = random_search.predict(X_test)
# f1_score(y_test, y_pred, average='macro')

In [None]:
# grid_search.best_params_

In [None]:
# best_model = grid_search.best_estimator_

# Submission

In [16]:
# Build features for the provided test data (test.csv). A separate name is used so
# the hold-out `X_test` from the train/test split is not overwritten (re-running the
# evaluation cell would otherwise score submission rows against hold-out labels),
# and a copy is passed so `test_data` itself is left unchanged.
X_submission = prepare_features(test_data.copy())

# Predict on the provided test set
y_test_pred = model.predict(X_submission)

# Map the integer predictions back to the original label values
y_pred_mapped = label_enc.inverse_transform(y_test_pred)

# Create the submission frame; ids are the 0-based row positions of the test set
submission = pd.DataFrame({"id": range(len(X_submission)), "target": y_pred_mapped})

# Save the submission file
submission.to_csv('submission.csv', index=False)