In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.preprocessing import LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.impute import KNNImputer
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

# Load the dataset
file_path = 'bank-additional-full.csv'
df = pd.read_csv(file_path, sep=';')

# Replacing specific values with NaN
search_values = ['unknown', 'nonexistent', np.nan]
df.replace(search_values, np.nan, inplace=True)

# Label encoding
columns_to_encode = df.columns[df.dtypes==object].tolist()  # Encode only object type columns
label_encoder = LabelEncoder()
for column in columns_to_encode:
    df[column] = label_encoder.fit_transform(df[column].astype(str))

# Imputation with mode
columns_to_impute = df.columns[df.isna().any()].tolist()  # Columns with missing values
for col in columns_to_impute:
    mode = df[col].mode()[0]
    df[col].fillna(mode, inplace=True)

# Splitting the dataset
X = df.drop(['y'], axis=1)
y = df['y']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
models = {
    'GaussianNB': GaussianNB(),
    'SVM': SVC(),
    'RandomForest': RandomForestClassifier(n_estimators=100, random_state=42),
    'KNN': KNeighborsClassifier(),
    'XGBoost': XGBClassifier(),
    'LogisticRegression': LogisticRegression(max_iter=1000)
}

trained_models = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    trained_models[name] = model  # Storing the trained models

# Let's serialize the models next.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [5]:
import pickle

for name, model in trained_models.items():
    with open(f'models/{name}_model.pkl', 'wb') as file:
        pickle.dump(model, file)


In [12]:
import json

In [13]:
input_data = json.loads(X_train.to_json(orient='records'))

In [15]:
input_data[0]

{'age': 40,
 'job': 1,
 'marital': 1,
 'education': 2,
 'default': 0,
 'housing': 2,
 'loan': 1,
 'contact': 1,
 'month': 3,
 'day_of_week': 1,
 'duration': 94,
 'campaign': 2,
 'pdays': 999,
 'previous': 0,
 'poutcome': 1,
 'emp.var.rate': 1.4,
 'cons.price.idx': 93.918,
 'cons.conf.idx': -42.7,
 'euribor3m': 4.96,
 'nr.employed': 5228.1}

In [44]:
# Load the dataset
file_path = 'bank-additional-full.csv'
df = pd.read_csv(file_path, sep=';')

In [45]:
df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [16]:
label_encoders = {}  # Dictionary to store label encoders

for column in columns_to_encode:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column].astype(str))
    label_encoders[column] = le  # Store the encoder

In [17]:
def reverse_encoding(encoded_df, encoders):
    """
    Translates the numerical values in a DataFrame back to categorical names.
    
    :param encoded_df: DataFrame with encoded numerical values.
    :param encoders: Dictionary of LabelEncoder objects used for initial encoding.
    :return: DataFrame with original categorical values.
    """
    decoded_df = encoded_df.copy()
    for column, encoder in encoders.items():
        if column in decoded_df.columns:
            decoded_df[column] = encoder.inverse_transform(decoded_df[column].astype(int))
    return decoded_df


In [18]:
label_encoders

{'job': LabelEncoder(),
 'marital': LabelEncoder(),
 'education': LabelEncoder(),
 'default': LabelEncoder(),
 'housing': LabelEncoder(),
 'loan': LabelEncoder(),
 'contact': LabelEncoder(),
 'month': LabelEncoder(),
 'day_of_week': LabelEncoder(),
 'poutcome': LabelEncoder(),
 'y': LabelEncoder()}

In [22]:
# Load the dataset
file_path = 'bank-additional-full.csv'

categorical_columns = pd.read_csv(file_path, sep=';').select_dtypes(include=['object']).columns.tolist()
non_categorical_columns = pd.read_csv(file_path, sep=';').select_dtypes(exclude=['object']).columns.tolist()


In [25]:
categorical_columns, non_categorical_columns

(['job',
  'marital',
  'education',
  'default',
  'housing',
  'loan',
  'contact',
  'month',
  'day_of_week',
  'poutcome',
  'y'],
 ['age',
  'duration',
  'campaign',
  'pdays',
  'previous',
  'emp.var.rate',
  'cons.price.idx',
  'cons.conf.idx',
  'euribor3m',
  'nr.employed'])

In [26]:
categorical_data_dict = {column: df[column].unique().tolist() for column in categorical_columns}

In [27]:
categorical_data_dict

{'job': [5, 10, 0, 1, 2, 8, 6, 3, 9, 7, 4, 11],
 'marital': [1, 3, 0, 2],
 'education': [0, 3, 1, 2, 6, 5, 7, 4],
 'default': [1, 0, 2],
 'housing': [1, 2, 0],
 'loan': [1, 2, 0],
 'contact': [1, 0],
 'month': [6, 4, 3, 1, 8, 7, 2, 5, 0, 9],
 'day_of_week': [1, 3, 4, 2, 0],
 'poutcome': [1, 0, 2],
 'y': [0, 1]}

In [37]:
label_encoders = {}  # Dictionary to store label encoders

for column in columns_to_encode:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column].astype(str))
    label_encoders[column] = le  # Store the encoder


In [38]:
category_mappings = {}

for column, encoder in label_encoders.items():
    # Inverse transform the encoded labels to get original labels
    original_labels = encoder.inverse_transform(np.arange(len(encoder.classes_)))
    # Map encoded numbers to original labels
    mapping = dict(zip(np.arange(len(encoder.classes_)), original_labels))
    category_mappings[column] = mapping


In [39]:
category_mappings

{'job': {0: '0',
  1: '1',
  2: '10',
  3: '11',
  4: '2',
  5: '3',
  6: '4',
  7: '5',
  8: '6',
  9: '7',
  10: '8',
  11: '9'},
 'marital': {0: '0', 1: '1', 2: '2', 3: '3'},
 'education': {0: '0', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6', 7: '7'},
 'default': {0: '0', 1: '1', 2: '2'},
 'housing': {0: '0', 1: '1', 2: '2'},
 'loan': {0: '0', 1: '1', 2: '2'},
 'contact': {0: '0', 1: '1'},
 'month': {0: '0',
  1: '1',
  2: '2',
  3: '3',
  4: '4',
  5: '5',
  6: '6',
  7: '7',
  8: '8',
  9: '9'},
 'day_of_week': {0: '0', 1: '1', 2: '2', 3: '3', 4: '4'},
 'poutcome': {0: '0', 1: '1', 2: '2'},
 'y': {0: '0', 1: '1'}}

In [41]:
# Convert int64 keys to str keys
category_mappings = {str(key): value for key, value in category_mappings.items()}

In [42]:
category_mappings

{'job': {0: '0',
  1: '1',
  2: '10',
  3: '11',
  4: '2',
  5: '3',
  6: '4',
  7: '5',
  8: '6',
  9: '7',
  10: '8',
  11: '9'},
 'marital': {0: '0', 1: '1', 2: '2', 3: '3'},
 'education': {0: '0', 1: '1', 2: '2', 3: '3', 4: '4', 5: '5', 6: '6', 7: '7'},
 'default': {0: '0', 1: '1', 2: '2'},
 'housing': {0: '0', 1: '1', 2: '2'},
 'loan': {0: '0', 1: '1', 2: '2'},
 'contact': {0: '0', 1: '1'},
 'month': {0: '0',
  1: '1',
  2: '2',
  3: '3',
  4: '4',
  5: '5',
  6: '6',
  7: '7',
  8: '8',
  9: '9'},
 'day_of_week': {0: '0', 1: '1', 2: '2', 3: '3', 4: '4'},
 'poutcome': {0: '0', 1: '1', 2: '2'},
 'y': {0: '0', 1: '1'}}

In [43]:
import json

json_data = json.dumps(category_mappings)
# json.dumps(category_mappings) then copy and paste the output to the file
# Path: category_mappings.json
# # Output JSON file
# file_path = 'category_mappings.json'
# with open(file_path, 'w') as file:
#     json.dump(category_mappings, file)






TypeError: keys must be str, int, float, bool or None, not numpy.int64

In [46]:
# Original dictionary
category_mappings = {
    'job': {
        0: 'admin.',
        2: 'entrepreneur',
        1: 'blue-collar',
        4: 'management',
        6: 'self-employed',
        5: 'retired',
        3: 'housemaid',
        8: 'student',
        7: 'services',
        9: 'technician',
        10: 'unemployed',
        11: None  # Represents missing data
    },
    'marital': {
        0: 'divorced',
        2: 'single',
        1: 'married',
        3: None
    },
    'education': {
        0: 'basic.4y',
        1: 'basic.6y',
        2: 'basic.9y',
        3: 'high.school',
        4: 'illiterate',
        5: 'professional.course',
        6: 'university.degree',
        7: None
    },
    'default': {
        0: 'no',
        1: 'yes',
        2: None
    },
    'housing': {
        0: 'no',
        1: 'yes',
        2: None
    },
    'loan': {
        0: 'no',
        1: 'yes',
        2: None
    },
    'contact': {
        0: 'cellular',
        1: 'telephone'
    },
    'month': {
        0: 'apr',
        1: 'aug',
        2: 'dec',
        3: 'jul',
        4: 'jun',
        5: 'mar',
        6: 'may',
        7: 'nov',
        8: 'oct',
        9: 'sep'
    },
    'day_of_week': {
        0: 'fri',
        1: 'mon',
        2: 'thu',
        3: 'tue',
        4: 'wed'
    },
    'poutcome': {
        0: 'failure',
        1: 'success',
        2: None
    },
    'y': {
        0: 'no',
        1: 'yes'
    }
}

# Inverted dictionary
inverted_category_mappings = {key: {value: k for k, value in sub_dict.items()} for key, sub_dict in category_mappings.items()}

# Print the inverted dictionary
print(inverted_category_mappings)

{'job': {'admin.': 0, 'entrepreneur': 2, 'blue-collar': 1, 'management': 4, 'self-employed': 6, 'retired': 5, 'housemaid': 3, 'student': 8, 'services': 7, 'technician': 9, 'unemployed': 10, None: 11}, 'marital': {'divorced': 0, 'single': 2, 'married': 1, None: 3}, 'education': {'basic.4y': 0, 'basic.6y': 1, 'basic.9y': 2, 'high.school': 3, 'illiterate': 4, 'professional.course': 5, 'university.degree': 6, None: 7}, 'default': {'no': 0, 'yes': 1, None: 2}, 'housing': {'no': 0, 'yes': 1, None: 2}, 'loan': {'no': 0, 'yes': 1, None: 2}, 'contact': {'cellular': 0, 'telephone': 1}, 'month': {'apr': 0, 'aug': 1, 'dec': 2, 'jul': 3, 'jun': 4, 'mar': 5, 'may': 6, 'nov': 7, 'oct': 8, 'sep': 9}, 'day_of_week': {'fri': 0, 'mon': 1, 'thu': 2, 'tue': 3, 'wed': 4}, 'poutcome': {'failure': 0, 'success': 1, None: 2}, 'y': {'no': 0, 'yes': 1}}


In [47]:
category_mappings = {'decode':{
    'job': {
        0: 'admin.',
        2: 'entrepreneur',
        1: 'blue-collar',
        4: 'management',
        6: 'self-employed',
        5: 'retired',
        3: 'housemaid',
        8: 'student',
        7: 'services',
        9: 'technician',
        10: 'unemployed',
        11: None  # Represents missing data
    },
    'marital': {
        0: 'divorced',
        2: 'single',
        1: 'married',
        3: None
    },
    'education': {
        0: 'basic.4y',
        1: 'basic.6y',
        2: 'basic.9y',
        3: 'high.school',
        4: 'illiterate',
        5: 'professional.course',
        6: 'university.degree',
        7: None
    },
    'default': {
        0: 'no',
        1: 'yes',
        2: None
    },
    'housing': {
        0: 'no',
        1: 'yes',
        2: None
    },
    'loan': {
        0: 'no',
        1: 'yes',
        2: None
    },
    'contact': {
        0: 'cellular',
        1: 'telephone'
    },
    'month': {
        0: 'apr',
        1: 'aug',
        2: 'dec',
        3: 'jul',
        4: 'jun',
        5: 'mar',
        6: 'may',
        7: 'nov',
        8: 'oct',
        9: 'sep'
    },
    'day_of_week': {
        0: 'fri',
        1: 'mon',
        2: 'thu',
        3: 'tue',
        4: 'wed'
    },
    'poutcome': {
        0: 'failure',
        1: 'success',
        2: None
    },
    'y': {
        0: 'no',
        1: 'yes'
    }
},
'encode': {
    'job': {
        'admin.': 0,
        'entrepreneur': 2,
        'blue-collar': 1,
        'man`age`ment': 4,
        'self-employed': 6,
        'retired': 5,
        'housemaid': 3,
        'student': 8,
        'services': 7,
        'technician': 9,
        'unemployed': 10,
        None: 11
    },
    'marital': {
        'divorced': 0,
        'single': 2,
        'married': 1,
        None: 3
    },
    'education': {
        'basic.4y': 0,
        'basic.6y': 1,
        'basic.9y': 2,
        'high.school': 3,
        'illiterate': 4,
        'professional.course': 5,
        'university.degree': 6,
        None: 7
    },
    'default': {
        'no': 0,
        'yes': 1,
        None: 2
    },
    'housing': {
        'no': 0,
        'yes': 1,
        None: 2
    },
    'loan': {
        'no': 0,
        'yes': 1,
        None: 2
    },
    'contact': {
        'cellular': 0,
        'telephone': 1
    },
    'month': {
        'apr': 0,
        'aug': 1,
        'dec': 2,
        'jul': 3,
        'jun': 4,
        'mar': 5,
        'may': 6,
        'nov': 7,
        'oct': 8,
        'sep': 9
    },
    'day_of_week': {
        'fri': 0,
        'mon': 1,
        'thu': 2,
        'tue': 3,
        'wed': 4
    },
    'poutcome': {
        'failure': 0,
        'success': 1,
        None: 2
    },
    'y': {
        'no': 0,
        'yes': 1
    }
}

}


def decode_data(data):
    decoded_data = []
    for category, values_dict in category_mappings['decode'].items():
        decoded_category_data = [values_dict[value] for value in data[category]]
        decoded_data.append(decoded_category_data)
    return decoded_data

def encode_data(data):
    encoded_data = {category: [] for category in category_mappings['encode']}
    for category, values_dict in category_mappings['encode'].items():
        encoded_category_data = [values_dict[value] for value in data[category]]
        encoded_data[category] = encoded_category_data
    return encoded_data

In [48]:
df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
