#Importações

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import date
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.feature_selection import RFE
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score


In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Carregar o dataset
marketing_data = pd.read_csv('/content/drive/MyDrive/AM/marketing_campaign.csv')

In [6]:
# Transformações nos Dados
marketing_data['Age'] = date.today().year - marketing_data['Year_Birth']
marketing_data['Customer_Days'] = (pd.to_datetime('now', utc=True) - pd.to_datetime(marketing_data['Dt_Customer'], utc=True)).dt.days

In [7]:
# Conversão de variáveis categóricas usando variáveis dummy
marketing_data = pd.get_dummies(marketing_data, columns=['Marital_Status', 'Education'])


In [8]:
# Remover colunas que não serão mais usadas
columns_to_drop = ['Year_Birth', 'Dt_Customer']
marketing_data.drop(columns=columns_to_drop, inplace=True)


In [9]:
# Tratamento de outliers e dados faltantes para 'Income' e 'Age'
Q1_income = marketing_data['Income'].quantile(0.25)
Q3_income = marketing_data['Income'].quantile(0.75)
IQR_income = Q3_income - Q1_income
marketing_data = marketing_data[(marketing_data['Income'] < Q3_income + 1.5 * IQR_income) & (marketing_data['Income'] > Q1_income - 1.5 * IQR_income)]

Q1_age = marketing_data['Age'].quantile(0.25)
Q3_age = marketing_data['Age'].quantile(0.75)
IQR_age = Q3_age - Q1_age
marketing_data = marketing_data[(marketing_data['Age'] < Q3_age + 1.5 * IQR_age) & (marketing_data['Age'] > Q1_age - 1.5 * IQR_age)]


In [10]:
# Dividir o dataset em características e rótulos
features = marketing_data.drop(['Response'], axis=1)
labels = marketing_data['Response']


In [11]:
# Escalamento dos dados
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)


In [12]:
# Dividir o dataset em conjunto de treino e teste
X_train, X_test, y_train, y_test = train_test_split(features_scaled, labels, test_size=0.40, random_state=5)


In [13]:
# Aplicando SMOTE para balancear o dataset
smote = SMOTE(random_state=5)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

In [14]:
# Regressão Logística
param_grid_lr = {'C': [0.01, 0.1, 1, 10], 'penalty': ['l2'], 'solver': ['liblinear', 'lbfgs']}
lr_model = GridSearchCV(LogisticRegression(random_state=5), param_grid=param_grid_lr, cv=5, verbose=1)
lr_model.fit(X_train_smote, y_train_smote)
predictions_lr = lr_model.predict(X_test)
print("\nLogistic Regression Model:")
print("Accuracy:", accuracy_score(y_test, predictions_lr))
print(classification_report(y_test, predictions_lr))


Fitting 5 folds for each of 8 candidates, totalling 40 fits

Logistic Regression Model:
Accuracy: 0.8163265306122449
              precision    recall  f1-score   support

           0       0.94      0.84      0.89       750
           1       0.43      0.70      0.53       132

    accuracy                           0.82       882
   macro avg       0.69      0.77      0.71       882
weighted avg       0.86      0.82      0.83       882



In [15]:
scores_lr = cross_val_score(lr_model.best_estimator_, features_scaled, labels, cv=5)
print("\nRegressão Logística - Média da Acurácia com CV:", scores_lr.mean())
print("Desvio padrão da acurácia com CV:", scores_lr.std())



Regressão Logística - Média da Acurácia com CV: 0.8893424036281179
Desvio padrão da acurácia com CV: 0.006316729377407769


In [16]:
# Árvore de Decisão
param_grid_dt = {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 4, 6], 'min_samples_leaf': [1, 2, 3]}
dt_model = GridSearchCV(DecisionTreeClassifier(random_state=5), param_grid=param_grid_dt, cv=5, verbose=1)
dt_model.fit(X_train_smote, y_train_smote)
predictions_dt = dt_model.predict(X_test)
print("\nDecision Tree Model:")
print("Accuracy:", accuracy_score(y_test, predictions_dt))
print(classification_report(y_test, predictions_dt))

Fitting 5 folds for each of 36 candidates, totalling 180 fits

Decision Tree Model:
Accuracy: 0.808390022675737
              precision    recall  f1-score   support

           0       0.90      0.87      0.89       750
           1       0.38      0.46      0.42       132

    accuracy                           0.81       882
   macro avg       0.64      0.67      0.65       882
weighted avg       0.82      0.81      0.82       882



In [17]:
scores_dt = cross_val_score(dt_model.best_estimator_, features_scaled, labels, cv=5)
print("\nÁrvore de Decisão - Média da Acurácia com CV:", scores_dt.mean())
print("Desvio padrão da acurácia com CV:", scores_dt.std())



Árvore de Decisão - Média da Acurácia com CV: 0.8408163265306122
Desvio padrão da acurácia com CV: 0.008163265306122438


In [18]:
# SVM com K-Fold Cross Validation
param_grid_svm = {'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01], 'kernel': ['rbf', 'linear']}
svm_model = GridSearchCV(SVC(), param_grid_svm, cv=5, verbose=1)
svm_model.fit(X_train_smote, y_train_smote)
predictions_svm = svm_model.predict(X_test)
print("\nSVM Model:")
print("Accuracy:", accuracy_score(y_test, predictions_svm))
print(classification_report(y_test, predictions_svm))

Fitting 5 folds for each of 18 candidates, totalling 90 fits

SVM Model:
Accuracy: 0.8401360544217688
              precision    recall  f1-score   support

           0       0.87      0.95      0.91       750
           1       0.43      0.20      0.27       132

    accuracy                           0.84       882
   macro avg       0.65      0.58      0.59       882
weighted avg       0.80      0.84      0.81       882



In [19]:
scores_svm = cross_val_score(svm_model.best_estimator_, features_scaled, labels, cv=5)
print("\nSVM - Média da Acurácia com CV:", scores_svm.mean())
print("Desvio padrão da acurácia com CV:", scores_svm.std())



SVM - Média da Acurácia com CV: 0.8562358276643991
Desvio padrão da acurácia com CV: 0.009249922020122979


In [20]:
# RandomForest
param_grid_rf = {'n_estimators': [50, 100, 200], 'max_features': ['sqrt'], 'max_depth': [None, 3, 5, 8], 'criterion': ['gini'], 'min_samples_split': [2, 3, 4]}
rf_model = GridSearchCV(RandomForestClassifier(random_state=5), param_grid=param_grid_rf, cv=5, verbose=1)
rf_model.fit(X_train_smote, y_train_smote)
predictions_rf = rf_model.predict(X_test)
print("RandomForest Model:")
print("Accuracy:", accuracy_score(y_test, predictions_rf))
print(classification_report(y_test, predictions_rf))
feature_importance_rf = pd.DataFrame({'feature': features.columns, 'importance': rf_model.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
print(feature_importance_rf.head(10))


Fitting 5 folds for each of 36 candidates, totalling 180 fits
RandomForest Model:
Accuracy: 0.8764172335600907
              precision    recall  f1-score   support

           0       0.91      0.95      0.93       750
           1       0.61      0.47      0.53       132

    accuracy                           0.88       882
   macro avg       0.76      0.71      0.73       882
weighted avg       0.87      0.88      0.87       882

                feature  importance
4               Recency    0.090586
25        Customer_Days    0.074798
13  NumCatalogPurchases    0.068654
15    NumWebVisitsMonth    0.057745
14    NumStorePurchases    0.056647
7       MntMeatProducts    0.052066
5              MntWines    0.042046
1                Income    0.041556
3              Teenhome    0.041553
12      NumWebPurchases    0.041245


In [21]:
scores_rf = cross_val_score(rf_model.best_estimator_, features_scaled, labels, cv=5)
print("\nRandom Forest - Média da Acurácia com CV:", scores_rf.mean())
print("Desvio padrão da acurácia com CV:", scores_rf.std())



Random Forest - Média da Acurácia com CV: 0.8757369614512471
Desvio padrão da acurácia com CV: 0.005442176870748306


#Melhorando parâmetros com optuna

In [24]:
import optuna

def objective(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 3, 30)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    model = RandomForestClassifier(
        n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split, random_state=5)
    model.fit(X_train_smote, y_train_smote)
    preds = model.predict(X_test)
    accuracy = accuracy_score(y_test, preds)
    return accuracy

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

print("Best trial:")
trial = study.best_trial
print("  Accuracy: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))


[I 2023-12-13 19:53:15,749] A new study created in memory with name: no-name-33cef522-b1fd-459c-a11e-00cf138ebd55
[I 2023-12-13 19:53:17,058] Trial 0 finished with value: 0.8707482993197279 and parameters: {'n_estimators': 135, 'max_depth': 8, 'min_samples_split': 5}. Best is trial 0 with value: 0.8707482993197279.
[I 2023-12-13 19:53:19,302] Trial 1 finished with value: 0.8752834467120182 and parameters: {'n_estimators': 136, 'max_depth': 8, 'min_samples_split': 4}. Best is trial 1 with value: 0.8752834467120182.
[I 2023-12-13 19:53:20,335] Trial 2 finished with value: 0.8764172335600907 and parameters: {'n_estimators': 126, 'max_depth': 26, 'min_samples_split': 8}. Best is trial 2 with value: 0.8764172335600907.
[I 2023-12-13 19:53:21,258] Trial 3 finished with value: 0.873015873015873 and parameters: {'n_estimators': 143, 'max_depth': 7, 'min_samples_split': 8}. Best is trial 2 with value: 0.8764172335600907.
[I 2023-12-13 19:53:22,987] Trial 4 finished with value: 0.875283446712018

Best trial:
  Accuracy: 0.8843537414965986
  Params: 
    n_estimators: 242
    max_depth: 18
    min_samples_split: 5


In [60]:
# Parâmetros obtidos do Optuna
optuna_params = {
    'n_estimators': 206,
    'max_depth': 24,
    'min_samples_split': 8,
    'random_state': 5
}

# Criação do modelo RandomForest com os parâmetros do Optuna
rf_optuna_model = RandomForestClassifier(**optuna_params)

# Treinando o modelo
rf_optuna_model.fit(X_train_smote, y_train_smote)

# Fazendo previsões no conjunto de teste
predictions_rf_optuna = rf_optuna_model.predict(X_test)

# Avaliando o modelo
print("RandomForest Model with Optuna Parameters:")
print("Accuracy:", accuracy_score(y_test, predictions_rf_optuna))
print(classification_report(y_test, predictions_rf_optuna))

# Importância das características (Feature Importance)
feature_importance_rf_optuna = pd.DataFrame({
    'feature': features.columns,
    'importance': rf_optuna_model.feature_importances_
}).sort_values('importance', ascending=False)

print(feature_importance_rf_optuna.head(10))

RandomForest Model with Optuna Parameters:
Accuracy: 0.8832199546485261
              precision    recall  f1-score   support

           0       0.91      0.96      0.93       750
           1       0.66      0.45      0.54       132

    accuracy                           0.88       882
   macro avg       0.78      0.71      0.74       882
weighted avg       0.87      0.88      0.87       882

                feature  importance
4               Recency    0.095371
25        Customer_Days    0.075780
13  NumCatalogPurchases    0.070509
15    NumWebVisitsMonth    0.053606
14    NumStorePurchases    0.052535
3              Teenhome    0.049254
12      NumWebPurchases    0.046665
5              MntWines    0.045623
19         AcceptedCmp1    0.045384
7       MntMeatProducts    0.043512
