In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("bank-additional/bank-additional-full.csv", header=0, delimiter=";")

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [4]:
df.default.value_counts()

default
no         32588
unknown     8597
yes            3
Name: count, dtype: int64

column "default" is extremely unbalanced therefore can be excluded from dataset

In [5]:
import pandas as pd
def clean_data(df : pd.DataFrame) -> pd.DataFrame:
    df_cleaned = df.drop("default", axis=1)
    df_cleaned['y'] = df_cleaned['y'].map({'yes': 1, 'no': 0})
    return df_cleaned


## Data Preprocessing

### Encoding Categorical Data

In [6]:
import pandas as pd
def preprocess_data(data : pd.DataFrame) -> pd.DataFrame:
    dummies_contact = pd.get_dummies(data['contact'], prefix='contact', drop_first=True)

    dummies_house = pd.get_dummies(data['housing'], prefix='housing')
    dummies_house = dummies_house.drop('housing_unknown', axis=1)

    dummies_loan = pd.get_dummies(data['loan'], prefix='loan')
    dummies_loan  = dummies_loan.drop('loan_unknown', axis=1)


    dummies_marital = pd.get_dummies(data['marital'], prefix='marital')
    dummies_marital  = dummies_marital.drop('marital_unknown', axis=1)

    dummies_education = pd.get_dummies(data['education'], prefix='education')
    dummies_education  = dummies_education.drop('education_unknown', axis=1)

    dummies_job = pd.get_dummies(data['job'], prefix='job')
    dummies_job  = dummies_job.drop('job_unknown', axis=1)


    dummies_poutcome = pd.get_dummies(data['poutcome'], prefix='poutcome')
    dummies_poutcome  = dummies_poutcome.drop('poutcome_nonexistent', axis=1)


    df_encoded = pd.concat([
        data,
        dummies_contact,
        dummies_house,
        dummies_loan,
        dummies_marital,
        dummies_education,
        dummies_job,
        dummies_poutcome
    ], axis=1).drop([
        'contact',
        'housing',
        'loan',
        'marital', 
        'education', 
        'job', 
        'poutcome', 
        'month',
        'day_of_week'
    ], axis=1)

    # dummies_month = pd.get_dummies(data['month'], prefix='month', drop_first=True)
    # dummies_day_of_week = pd.get_dummies(data['day_of_week'], prefix='day_of_week', drop_first=True)

    # df_cleaned_date = pd.concat([
    #     df_cleaned,
    #     dummies_month,
    #     dummies_day_of_week
    # ])

    return df_encoded

### Data Scaling

In [7]:
from sklearn.model_selection import train_test_split

clean = clean_data(df)
df_encoded = preprocess_data(clean)

X_train, X_val, y_train, y_val = train_test_split(df_encoded.drop('y', axis=1), df_encoded['y'], test_size=0.2, random_state=42)


In [8]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

X_val_scaled = scaler.transform(X_val)

## Logistic Regression

In [9]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()

model.fit(X_train_scaled, y_train)

y_pred = model.predict(X_val_scaled)

### Logistic Regression Metrics

In [10]:
from sklearn.metrics import accuracy_score

accuracy_score_val = accuracy_score(y_val, y_pred)

print(accuracy_score_val)

0.9093226511289147


In [12]:

df_test = pd.read_csv('bank-additional/bank-additional.csv', delimiter=";")
clean_test = clean_data(df_test)
df_encoded_test = preprocess_data(clean)
X_test_scaled = scaler.transform(df_encoded_test.drop('y', axis=1))

y_pred = model.predict(X_test_scaled)

accuracy_score_test = accuracy_score(df_encoded_test['y'], y_pred)


print(accuracy_score_test)

0.9106535884238127
