In [38]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import date
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.feature_selection import RFE

In [24]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:

# Carregar o dataset
marketing_data = pd.read_csv('/content/drive/MyDrive/AM/marketing_campaign.csv')

In [26]:
# Transformações nos Dados
marketing_data['Age'] = date.today().year - marketing_data['Year_Birth']
marketing_data['Customer_Days'] = (pd.to_datetime('now', utc=True) - pd.to_datetime(marketing_data['Dt_Customer'], utc=True)).dt.days

In [27]:
# Conversão de variáveis categóricas usando variáveis dummy
marketing_data = pd.get_dummies(marketing_data, columns=['Marital_Status', 'Education'])


In [28]:
# Remover colunas que não serão mais usadas
columns_to_drop = ['Year_Birth', 'Dt_Customer']
marketing_data.drop(columns=columns_to_drop, inplace=True)


In [29]:
# Tratamento de outliers e dados faltantes para 'Income' e 'Age'
Q1_income = marketing_data['Income'].quantile(0.25)
Q3_income = marketing_data['Income'].quantile(0.75)
IQR_income = Q3_income - Q1_income
marketing_data = marketing_data[(marketing_data['Income'] < Q3_income + 1.5 * IQR_income) & (marketing_data['Income'] > Q1_income - 1.5 * IQR_income)]

Q1_age = marketing_data['Age'].quantile(0.25)
Q3_age = marketing_data['Age'].quantile(0.75)
IQR_age = Q3_age - Q1_age
marketing_data = marketing_data[(marketing_data['Age'] < Q3_age + 1.5 * IQR_age) & (marketing_data['Age'] > Q1_age - 1.5 * IQR_age)]


In [8]:
# Dividir o dataset em características e rótulos
features = marketing_data.drop(['Response'], axis=1)
labels = marketing_data['Response']


In [30]:
# Escalamento dos dados
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)


In [31]:
# Dividir o dataset em conjunto de treino e teste
X_train, X_test, y_train, y_test = train_test_split(features_scaled, labels, test_size=0.40, random_state=5)


In [34]:
# Regressão Logística
param_grid_lr = {'C': [0.01, 0.1, 1, 10], 'penalty': ['l2'], 'solver': ['liblinear', 'lbfgs']}
lr_model = GridSearchCV(LogisticRegression(random_state=5), param_grid=param_grid_lr, cv=5, verbose=1)
lr_model.fit(X_train, y_train)
predictions_lr = lr_model.predict(X_test)
print("\nLogistic Regression Model:")
print("Accuracy:", accuracy_score(y_test, predictions_lr))
print(classification_report(y_test, predictions_lr))


Fitting 5 folds for each of 8 candidates, totalling 40 fits

Logistic Regression Model:
Accuracy: 0.8798185941043084
              precision    recall  f1-score   support

           0       0.90      0.97      0.93       750
           1       0.68      0.37      0.48       132

    accuracy                           0.88       882
   macro avg       0.79      0.67      0.71       882
weighted avg       0.87      0.88      0.86       882



In [35]:
# Árvore de Decisão
param_grid_dt = {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 4, 6], 'min_samples_leaf': [1, 2, 3]}
dt_model = GridSearchCV(DecisionTreeClassifier(random_state=5), param_grid=param_grid_dt, cv=5, verbose=1)
dt_model.fit(X_train, y_train)
predictions_dt = dt_model.predict(X_test)
print("\nDecision Tree Model:")
print("Accuracy:", accuracy_score(y_test, predictions_dt))
print(classification_report(y_test, predictions_dt))

Fitting 5 folds for each of 36 candidates, totalling 180 fits

Decision Tree Model:
Accuracy: 0.8378684807256236
              precision    recall  f1-score   support

           0       0.89      0.92      0.91       750
           1       0.45      0.38      0.41       132

    accuracy                           0.84       882
   macro avg       0.67      0.65      0.66       882
weighted avg       0.83      0.84      0.83       882



In [36]:
# SVM com K-Fold Cross Validation
param_grid_svm = {'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01], 'kernel': ['rbf', 'linear']}
svm_model = GridSearchCV(SVC(), param_grid_svm, cv=5, verbose=1)
svm_model.fit(X_train, y_train)
predictions_svm = svm_model.predict(X_test)
print("\nSVM Model:")
print("Accuracy:", accuracy_score(y_test, predictions_svm))
print(classification_report(y_test, predictions_svm))

Fitting 5 folds for each of 18 candidates, totalling 90 fits

SVM Model:
Accuracy: 0.8866213151927438
              precision    recall  f1-score   support

           0       0.90      0.97      0.94       750
           1       0.71      0.41      0.52       132

    accuracy                           0.89       882
   macro avg       0.81      0.69      0.73       882
weighted avg       0.87      0.89      0.87       882



In [39]:
# RFE com o modelo SVM para identificar as características mais importantes
rfe_svm = RFE(estimator=SVC(kernel='linear'), n_features_to_select=10)
rfe_svm.fit(X_train, y_train)

print("\nTop 10 Features (RFE with SVM):")
top_features_rfe_svm = pd.DataFrame({'Feature': features.columns, 'Ranking': rfe_svm.ranking_}).sort_values('Ranking')
print(top_features_rfe_svm.head(10))


Top 10 Features (RFE with SVM):
                    Feature  Ranking
19             AcceptedCmp1        1
31  Marital_Status_Together        1
29   Marital_Status_Married        1
4                   Recency        1
25            Customer_Days        1
18             AcceptedCmp5        1
7           MntMeatProducts        1
16             AcceptedCmp3        1
14        NumStorePurchases        1
11        NumDealsPurchases        1


In [32]:
# RandomForest
param_grid_rf = {'n_estimators': [50, 100, 200], 'max_features': ['sqrt'], 'max_depth': [None, 3, 5, 8], 'criterion': ['gini'], 'min_samples_split': [2, 3, 4]}
rf_model = GridSearchCV(RandomForestClassifier(random_state=5), param_grid=param_grid_rf, cv=5, verbose=1)
rf_model.fit(X_train, y_train)
predictions_rf = rf_model.predict(X_test)
print("RandomForest Model:")
print("Accuracy:", accuracy_score(y_test, predictions_rf))
print(classification_report(y_test, predictions_rf))
feature_importance_rf = pd.DataFrame({'feature': features.columns, 'importance': rf_model.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
print(feature_importance_rf.head(10))


Fitting 5 folds for each of 36 candidates, totalling 180 fits
RandomForest Model:
Accuracy: 0.8798185941043084
              precision    recall  f1-score   support

           0       0.88      0.99      0.93       750
           1       0.80      0.27      0.40       132

    accuracy                           0.88       882
   macro avg       0.84      0.63      0.67       882
weighted avg       0.87      0.88      0.85       882

              feature  importance
4             Recency    0.086517
25      Customer_Days    0.074622
5            MntWines    0.073286
1              Income    0.062150
7     MntMeatProducts    0.056384
19       AcceptedCmp1    0.049899
18       AcceptedCmp5    0.048275
10       MntGoldProds    0.046737
15  NumWebVisitsMonth    0.045837
0                  ID    0.045662


In [None]:
#Melhorando parâmetros com optuna

In [42]:
import optuna

def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 3, 30)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    model = RandomForestClassifier(
        n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, random_state=5)
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    return accuracy

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print("Best trial:")
trial = study.best_trial
print("  Accuracy: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))


[I 2023-12-12 01:33:15,183] A new study created in memory with name: no-name-fd405b9e-984c-4a61-9cd4-55eee49c5a0d
[I 2023-12-12 01:33:17,175] Trial 0 finished with value: 0.873015873015873 and parameters: {'n_estimators': 147, 'max_depth': 12, 'min_samples_split': 5}. Best is trial 0 with value: 0.873015873015873.
[I 2023-12-12 01:33:17,795] Trial 1 finished with value: 0.8752834467120182 and parameters: {'n_estimators': 84, 'max_depth': 17, 'min_samples_split': 4}. Best is trial 1 with value: 0.8752834467120182.
[I 2023-12-12 01:33:19,163] Trial 2 finished with value: 0.8786848072562359 and parameters: {'n_estimators': 190, 'max_depth': 20, 'min_samples_split': 7}. Best is trial 2 with value: 0.8786848072562359.
[I 2023-12-12 01:33:20,749] Trial 3 finished with value: 0.8775510204081632 and parameters: {'n_estimators': 178, 'max_depth': 11, 'min_samples_split': 6}. Best is trial 2 with value: 0.8786848072562359.
[I 2023-12-12 01:33:22,772] Trial 4 finished with value: 0.87414965986394

Best trial:
  Accuracy: 0.8798185941043084
  Params: 
    n_estimators: 200
    max_depth: 23
    min_samples_split: 9


In [43]:
# Parâmetros obtidos do Optuna
optuna_params = {
    'n_estimators': 200,
    'max_depth': 23,
    'min_samples_split': 9,
    'random_state': 5
}

# Criação do modelo RandomForest com os parâmetros do Optuna
rf_optuna_model = RandomForestClassifier(**optuna_params)

# Treinando o modelo
rf_optuna_model.fit(X_train, y_train)

# Fazendo previsões no conjunto de teste
predictions_rf_optuna = rf_optuna_model.predict(X_test)

# Avaliando o modelo
print("RandomForest Model with Optuna Parameters:")
print("Accuracy:", accuracy_score(y_test, predictions_rf_optuna))
print(classification_report(y_test, predictions_rf_optuna))

# Importância das características (Feature Importance)
feature_importance_rf_optuna = pd.DataFrame({
    'feature': features.columns,
    'importance': rf_optuna_model.feature_importances_
}).sort_values('importance', ascending=False)

print(feature_importance_rf_optuna.head(10))

RandomForest Model with Optuna Parameters:
Accuracy: 0.8798185941043084
              precision    recall  f1-score   support

           0       0.88      0.99      0.93       750
           1       0.80      0.27      0.40       132

    accuracy                           0.88       882
   macro avg       0.84      0.63      0.67       882
weighted avg       0.87      0.88      0.85       882

              feature  importance
4             Recency    0.089412
25      Customer_Days    0.086889
5            MntWines    0.074015
19       AcceptedCmp1    0.066912
1              Income    0.059678
18       AcceptedCmp5    0.053534
7     MntMeatProducts    0.053341
10       MntGoldProds    0.043286
15  NumWebVisitsMonth    0.041900
24                Age    0.038320
