### Predictive Insights from Portuguese Bank Marketing Data

## Feature Importance

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

from tabulate import tabulate

In [2]:
# Load the data
# Reads the bank marketing CSV via a relative path, so the file must be
# in the notebook's working directory.
data_bank = pd.read_csv('data_bank.csv')

In [3]:
# List the column labels of the loaded frame
print(data_bank.columns)

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'target'],
      dtype='object')


In [4]:
# Show the dtype pandas inferred for each column
print(data_bank.dtypes)

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
target        int64
dtype: object


In [5]:
# Summarise the frame: index range, per-column non-null counts, dtypes and memory usage
data_bank.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 17 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   default    45211 non-null  object
 5   balance    45211 non-null  int64 
 6   housing    45211 non-null  object
 7   loan       45211 non-null  object
 8   contact    45211 non-null  object
 9   day        45211 non-null  int64 
 10  month      45211 non-null  object
 11  duration   45211 non-null  int64 
 12  campaign   45211 non-null  int64 
 13  pdays      45211 non-null  int64 
 14  previous   45211 non-null  int64 
 15  poutcome   45211 non-null  object
 16  target     45211 non-null  int64 
dtypes: int64(8), object(9)
memory usage: 5.9+ MB


# Feature Selection

Feature selection: use random-forest feature importance to build feature sets containing the top 5, 10, and 20 features, plus one set containing all features.

In [6]:
#### Data preparation

# The label goes into y. 'duration' is only available after the call,
# so it is removed from the predictors together with the label itself.
y = data_bank['target']
X = data_bank.drop(columns=['target', 'duration'])

# One-hot encode the categorical columns so every predictor is numeric
X_encoded = pd.get_dummies(X)


#### Random forest: create and fit on the encoded predictors
rf = RandomForestClassifier(random_state=42)
rf.fit(X_encoded, y)


#### Feature importance, ranked

# One importance value per encoded column, labelled with the column name
feature_importances = pd.Series(rf.feature_importances_, index=X_encoded.columns)

# Most important feature first
feature_ranking = feature_importances.sort_values(ascending=False)

# Sizes of the feature sets to build; the last entry covers every encoded column
top_features = [5, 10, 20, X_encoded.shape[1]]


#### Selecting top features

# Map a label such as 'Top 5 Features' to the names of the best-ranked columns
feature_sets = {}
for num_features in top_features:
    selected_features = list(feature_ranking.index[:num_features])
    feature_sets[f'Top {num_features} Features'] = selected_features
  

 # Print the selected feature sets
for feature_set, features in feature_sets.items():
    print(f"{feature_set}: {features}")
    
   
 # Create a list of lists to store the feature importance table data
table_data = []
for feature_set, features in feature_sets.items():
    table_data.append([feature_set] + features)

# Print the table output
headers = ['Feature Set', 'Feature 1', 'Feature 2', 'Feature 3', '...']
print(tabulate(table_data, headers=headers, tablefmt='grid'))    

Top 5 Features: ['balance', 'age', 'day', 'campaign', 'pdays']
Top 10 Features: ['balance', 'age', 'day', 'campaign', 'pdays', 'poutcome_success', 'previous', 'education_secondary', 'education_tertiary', 'month_apr']
Top 20 Features: ['balance', 'age', 'day', 'campaign', 'pdays', 'poutcome_success', 'previous', 'education_secondary', 'education_tertiary', 'month_apr', 'job_technician', 'job_management', 'poutcome_failure', 'month_mar', 'housing_no', 'marital_married', 'job_admin.', 'poutcome_other', 'month_jun', 'month_oct']
Top 49 Features: ['balance', 'age', 'day', 'campaign', 'pdays', 'poutcome_success', 'previous', 'education_secondary', 'education_tertiary', 'month_apr', 'job_technician', 'job_management', 'poutcome_failure', 'month_mar', 'housing_no', 'marital_married', 'job_admin.', 'poutcome_other', 'month_jun', 'month_oct', 'job_blue-collar', 'housing_yes', 'marital_single', 'month_aug', 'contact_cellular', 'month_may', 'month_jul', 'education_primary', 'month_feb', 'job_servi

## Evaluating Feature Importance with Random Forest and Logistic Regression

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from tabulate import tabulate
import warnings

# Ignore all warnings
warnings.filterwarnings("ignore")

# Create a list of models to evaluate
models = [
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Logistic Regression', LogisticRegression(max_iter=1000, random_state=42))
]

# Define the number of top features to select
top_features = [5, 10, 20, len(X_encoded.columns)]

# Metrics computed for every model / feature-set combination
scoring = ['precision', 'recall', 'f1']

# Scaler template. It is placed inside a pipeline below, and cross_validate
# clones that pipeline, so the scaler is fitted on the training part of each
# fold only. Fitting it on the full matrix up front (as before) lets the
# validation folds influence the scaling statistics.
scaler = StandardScaler()

# feature set name -> list of (model name, mean precision, mean recall, mean F1)
results = {}

# Fit the Random Forest model to calculate feature importance.
# NOTE: this ranking is derived from a forest fitted on the full data set, so
# the choice of features has still seen every validation fold. Treat the
# scores below as optimistic with respect to feature selection.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_encoded, y)
feature_importances = rf_model.feature_importances_
feature_importances_df = pd.DataFrame({'Feature': X_encoded.columns, 'Importance': feature_importances})
feature_importances_df = feature_importances_df.sort_values(by='Importance', ascending=False)

for num_features in top_features:
    selected_features = feature_importances_df['Feature'].head(num_features).tolist()
    feature_set_name = f'Top {num_features} Features'

    # Unscaled columns; scaling happens inside the pipeline, per fold
    X_selected = X_encoded[selected_features]

    model_results = []

    for model_name, model in models:
        pipeline = make_pipeline(scaler, model)

        # One cross-validation run yields all three metrics on the same 5
        # folds, instead of refitting the model once per metric.
        cv_scores = cross_validate(pipeline, X_selected, y, cv=5, scoring=scoring)

        model_results.append((
            model_name,
            cv_scores['test_precision'].mean(),
            cv_scores['test_recall'].mean(),
            cv_scores['test_f1'].mean(),
        ))

    results[feature_set_name] = model_results

# Print the results
for feature_set_name, model_results in results.items():
    print(f"Feature Set: {feature_set_name}")
    headers = ['Model', 'Mean Precision', 'Mean Recall', 'Mean F1 Score']
    print(tabulate(model_results, headers=headers, tablefmt='grid'))
    print()

Feature Set: Top 5 Features
+---------------------+------------------+---------------+-----------------+
| Model               |   Mean Precision |   Mean Recall |   Mean F1 Score |
| Random Forest       |         0.11047  |    0.16938    |      0.101975   |
+---------------------+------------------+---------------+-----------------+
| Logistic Regression |         0.157143 |    0.00151229 |      0.00299348 |
+---------------------+------------------+---------------+-----------------+

Feature Set: Top 10 Features
+---------------------+------------------+---------------+-----------------+
| Model               |   Mean Precision |   Mean Recall |   Mean F1 Score |
| Random Forest       |         0.109575 |      0.187905 |       0.0987039 |
+---------------------+------------------+---------------+-----------------+
| Logistic Regression |         0.504107 |      0.162386 |       0.173225  |
+---------------------+------------------+---------------+-----------------+

Feature Set: Top 

In [14]:
import csv

# Destination file for the evaluation summary
csv_file = 'model_results.csv'

# Column titles, followed by one line per (feature set, model) pair.
# Each entry in `results` is a list of tuples; the feature-set label is
# prepended to every tuple to form a CSV row.
with open(csv_file, 'w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Feature Set', 'Model', 'Mean Precision', 'Mean Recall', 'Mean F1 Score'])
    writer.writerows(
        [feature_set_name, *model_result]
        for feature_set_name, model_results in results.items()
        for model_result in model_results
    )