# Predictive Insights from Portuguese Bank Marketing Data

## Feature Importance

In [18]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder

from tabulate import tabulate

In [7]:
# Read the bank marketing CSV (path is relative to the working directory) into a DataFrame
data_bank = pd.read_csv(filepath_or_buffer='data_bank.csv')

In [8]:
# List the column labels of the loaded frame (predictors plus `target`)
print(data_bank.columns)

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'target'],
      dtype='object')


In [9]:
# Show each column's dtype; the `object` columns are the ones one-hot encoded later
print(data_bank.dtypes)

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
target        int64
dtype: object


In [14]:
# Summarise row count, per-column non-null counts and dtypes, and memory usage
data_bank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  target     45211 non-null  int64 
dtypes: int64(8), object(9)
memory usage: 5.9+ MB


# Feature Selection

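Feature selection: rank the one-hot-encoded features by Random Forest feature importance, then build four candidate feature sets — the top 5, top 10, and top 20 features, plus a set containing all features.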

In [12]:
# Split the frame into predictors and the `target` column
X = data_bank.drop('target', axis=1)
y = data_bank['target']

# One-hot encode the object-typed columns; numeric columns pass through unchanged
X_encoded = pd.get_dummies(X)

# Fit a Random Forest on the encoded data purely to obtain feature importances.
# NOTE(review): the forest is fit on every row, so the ranking has already seen
# any data later used for validation. Scores computed on feature sets derived
# from this ranking should be read as optimistic.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_encoded, y)

# Importances labelled by encoded column name, then ordered most-important first
feature_importances = pd.Series(rf.feature_importances_, index=X_encoded.columns)
feature_ranking = feature_importances.sort_values(ascending=False)

# Sizes of the candidate feature sets; the last entry selects every encoded column
top_features = [5, 10, 20, len(X_encoded.columns)]

# Map a display label to the names of the top-N ranked features
feature_sets = {
    f'Top {num_features} Features': feature_ranking[:num_features].index.tolist()
    for num_features in top_features
}

# Print the full contents of each feature set
for feature_set, features in feature_sets.items():
    print(f"{feature_set}: {features}")

# Build a compact summary table. Every row has exactly one cell per header:
# the set label, its first three features, and a count of the remaining ones.
# Rows of unequal length would make tabulate shift the headers and emit one
# column per feature, so the preview is padded/truncated to a fixed width.
PREVIEW_WIDTH = 3
table_data = []
for feature_set, features in feature_sets.items():
    preview = features[:PREVIEW_WIDTH]
    preview = preview + [''] * (PREVIEW_WIDTH - len(preview))
    remaining = len(features) - PREVIEW_WIDTH
    overflow = f'+{remaining} more' if remaining > 0 else ''
    table_data.append([feature_set] + preview + [overflow])

# Print the table output
headers = ['Feature Set', 'Feature 1', 'Feature 2', 'Feature 3', '...']
print(tabulate(table_data, headers=headers, tablefmt='grid'))

Top 5 Features: ['duration', 'balance', 'age', 'day', 'poutcome_success']
Top 10 Features: ['duration', 'balance', 'age', 'day', 'poutcome_success', 'pdays', 'campaign', 'previous', 'month_mar', 'housing_yes']
Top 20 Features: ['duration', 'balance', 'age', 'day', 'poutcome_success', 'pdays', 'campaign', 'previous', 'month_mar', 'housing_yes', 'month_apr', 'housing_no', 'education_secondary', 'month_jun', 'education_tertiary', 'job_technician', 'poutcome_failure', 'month_oct', 'marital_married', 'job_management']
Top 50 Features: ['duration', 'balance', 'age', 'day', 'poutcome_success', 'pdays', 'campaign', 'previous', 'month_mar', 'housing_yes', 'month_apr', 'housing_no', 'education_secondary', 'month_jun', 'education_tertiary', 'job_technician', 'poutcome_failure', 'month_oct', 'marital_married', 'job_management', 'poutcome_other', 'contact_other', 'marital_single', 'month_aug', 'month_may', 'job_admin.', 'contact_cellular', 'job_blue-collar', 'month_jul', 'month_nov', 'month_feb', '

## Evaluating Feature Importance with Random Forest and Logistic Regression

In [22]:
# Compare two classifiers on each ranked feature subset using 5-fold
# cross-validated accuracy (the default scorer for classifiers).
# All names used here are imported in the notebook's import cell.

# Estimators to compare on each feature subset
models = [
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=42))
]

# Sizes of the ranked-feature subsets to evaluate; the last entry is "all features"
top_features = [5, 10, 20, len(X_encoded.columns)]

# Unfitted scaler used as a pipeline step. cross_val_score clones the pipeline
# for every fold, so the scaler is fit on that fold's training rows only and
# validation rows never influence the scaling statistics.
scaler = StandardScaler()

# feature-set label -> list of (model name, mean accuracy, std of accuracy)
results = {}

for num_features in top_features:
    selected_features = feature_ranking[:num_features].index.tolist()
    feature_set_name = f'Top {num_features} Features'

    # Unscaled subset; scaling happens inside the pipeline during cross-validation
    X_selected = X_encoded[selected_features]

    model_results = []

    for model_name, model in models:
        pipeline = make_pipeline(scaler, model)
        # NOTE(review): an integer `cv` on a classifier gives unshuffled
        # stratified folds; if the rows are ordered (e.g. by contact date),
        # fold scores can vary a lot — consider a shuffled splitter.
        scores = cross_val_score(pipeline, X_selected, y, cv=5)
        model_results.append((model_name, scores.mean(), scores.std()))

    results[feature_set_name] = model_results

# NOTE(review): `feature_ranking` was derived from a model fit on all rows, so
# the feature selection itself is not cross-validated; treat scores as optimistic.

# Print one table per feature set
headers = ['Model', 'Mean Accuracy', 'Standard Deviation']
for feature_set_name, model_results in results.items():
    print(f"Feature Set: {feature_set_name}")
    print(tabulate(model_results, headers=headers, tablefmt='grid'))
    print()


Feature Set: Top 5 Features
+---------------------+-----------------+----------------------+
| Model               |   Mean Accuracy |   Standard Deviation |
| Random Forest       |        0.873615 |            0.0170845 |
+---------------------+-----------------+----------------------+
| Logistic Regression |        0.893079 |            0.0089383 |
+---------------------+-----------------+----------------------+

Feature Set: Top 10 Features
+---------------------+-----------------+----------------------+
| Model               |   Mean Accuracy |   Standard Deviation |
| Random Forest       |        0.837693 |           0.0603912  |
+---------------------+-----------------+----------------------+
| Logistic Regression |        0.890093 |           0.00871695 |
+---------------------+-----------------+----------------------+

Feature Set: Top 20 Features
+---------------------+-----------------+----------------------+
| Model               |   Mean Accuracy |   Standard Deviation |
| 

In [26]:
# Flatten `results` ({feature set: [(model, mean, std), ...]}) into a tidy frame
# with one row per (feature set, model). Passing the nested dict to
# pd.DataFrame directly would yield tuple-valued cells, which are written to
# CSV as opaque strings and lose the model/metric column structure.
results_df = pd.DataFrame(
    [
        (feature_set_name, model_name, mean_accuracy, std_accuracy)
        for feature_set_name, model_results in results.items()
        for model_name, mean_accuracy, std_accuracy in model_results
    ],
    columns=['Feature Set', 'Model', 'Mean Accuracy', 'Standard Deviation'],
)

# Save the results to a CSV file (the default RangeIndex carries no information)
results_df.to_csv("feature_selection_results.csv", index=False)

In [20]:
# Display the five highest-ranked features with their importance scores
feature_ranking.head(n=5)

duration            0.264959
balance             0.093675
age                 0.090709
day                 0.081387
poutcome_success    0.049913
dtype: float64

In [21]:
# Preview the first five rows of the one-hot-encoded predictor matrix
X_encoded.head(n=5)

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success
0,58,2143,5,261,1,999,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
1,44,29,5,151,1,999,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0
2,33,2,5,76,1,999,0,0,0,1,...,0,0,0,1,0,0,0,0,1,0
3,47,1506,5,92,1,999,0,0,1,0,...,0,0,0,1,0,0,0,0,1,0
4,33,1,5,198,1,999,0,0,0,0,...,0,0,0,1,0,0,0,0,1,0


# Modeling