In [23]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import date
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn import metrics

In [24]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [25]:

# Carregar o dataset
marketing_data = pd.read_csv('/content/drive/MyDrive/AM/marketing_campaign.csv')

In [26]:
# Transformações nos Dados
marketing_data['Age'] = date.today().year - marketing_data['Year_Birth']
marketing_data['Customer_Days'] = (pd.to_datetime('now', utc=True) - pd.to_datetime(marketing_data['Dt_Customer'], utc=True)).dt.days

In [27]:
# Conversão de variáveis categóricas usando variáveis dummy
marketing_data = pd.get_dummies(marketing_data, columns=['Marital_Status', 'Education'])


In [28]:
# Remover colunas que não serão mais usadas
columns_to_drop = ['Year_Birth', 'Dt_Customer']
marketing_data.drop(columns=columns_to_drop, inplace=True)


In [29]:
# Tratamento de outliers e dados faltantes para 'Income' e 'Age'
Q1_income = marketing_data['Income'].quantile(0.25)
Q3_income = marketing_data['Income'].quantile(0.75)
IQR_income = Q3_income - Q1_income
marketing_data = marketing_data[(marketing_data['Income'] < Q3_income + 1.5 * IQR_income) & (marketing_data['Income'] > Q1_income - 1.5 * IQR_income)]

Q1_age = marketing_data['Age'].quantile(0.25)
Q3_age = marketing_data['Age'].quantile(0.75)
IQR_age = Q3_age - Q1_age
marketing_data = marketing_data[(marketing_data['Age'] < Q3_age + 1.5 * IQR_age) & (marketing_data['Age'] > Q1_age - 1.5 * IQR_age)]


In [8]:
# Dividir o dataset em características e rótulos
features = marketing_data.drop(['Response'], axis=1)
labels = marketing_data['Response']


In [30]:
# Escalamento dos dados
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)


In [31]:
# Dividir o dataset em conjunto de treino e teste
X_train, X_test, y_train, y_test = train_test_split(features_scaled, labels, test_size=0.40, random_state=5)


In [34]:
# Regressão Logística
param_grid_lr = {'C': [0.01, 0.1, 1, 10], 'penalty': ['l2'], 'solver': ['liblinear', 'lbfgs']}
lr_model = GridSearchCV(LogisticRegression(random_state=5), param_grid=param_grid_lr, cv=5, verbose=1)
lr_model.fit(X_train, y_train)
predictions_lr = lr_model.predict(X_test)
print("\nLogistic Regression Model:")
print("Accuracy:", accuracy_score(y_test, predictions_lr))
print(classification_report(y_test, predictions_lr))


Fitting 5 folds for each of 8 candidates, totalling 40 fits

Logistic Regression Model:
Accuracy: 0.8798185941043084
              precision    recall  f1-score   support

           0       0.90      0.97      0.93       750
           1       0.68      0.37      0.48       132

    accuracy                           0.88       882
   macro avg       0.79      0.67      0.71       882
weighted avg       0.87      0.88      0.86       882



In [35]:
# Árvore de Decisão
param_grid_dt = {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 4, 6], 'min_samples_leaf': [1, 2, 3]}
dt_model = GridSearchCV(DecisionTreeClassifier(random_state=5), param_grid=param_grid_dt, cv=5, verbose=1)
dt_model.fit(X_train, y_train)
predictions_dt = dt_model.predict(X_test)
print("\nDecision Tree Model:")
print("Accuracy:", accuracy_score(y_test, predictions_dt))
print(classification_report(y_test, predictions_dt))

Fitting 5 folds for each of 36 candidates, totalling 180 fits

Decision Tree Model:
Accuracy: 0.8378684807256236
              precision    recall  f1-score   support

           0       0.89      0.92      0.91       750
           1       0.45      0.38      0.41       132

    accuracy                           0.84       882
   macro avg       0.67      0.65      0.66       882
weighted avg       0.83      0.84      0.83       882



In [36]:
# SVM com K-Fold Cross Validation
param_grid_svm = {'C': [0.1, 1, 10], 'gamma': [1, 0.1, 0.01], 'kernel': ['rbf', 'linear']}
svm_model = GridSearchCV(SVC(), param_grid_svm, cv=5, verbose=1)
svm_model.fit(X_train, y_train)
predictions_svm = svm_model.predict(X_test)
print("\nSVM Model:")
print("Accuracy:", accuracy_score(y_test, predictions_svm))
print(classification_report(y_test, predictions_svm))

Fitting 5 folds for each of 18 candidates, totalling 90 fits

SVM Model:
Accuracy: 0.8866213151927438
              precision    recall  f1-score   support

           0       0.90      0.97      0.94       750
           1       0.71      0.41      0.52       132

    accuracy                           0.89       882
   macro avg       0.81      0.69      0.73       882
weighted avg       0.87      0.89      0.87       882



In [32]:
# RandomForest
param_grid_rf = {'n_estimators': [50, 100, 200], 'max_features': ['sqrt'], 'max_depth': [None, 3, 5, 8], 'criterion': ['gini'], 'min_samples_split': [2, 3, 4]}
rf_model = GridSearchCV(RandomForestClassifier(random_state=5), param_grid=param_grid_rf, cv=5, verbose=1)
rf_model.fit(X_train, y_train)
predictions_rf = rf_model.predict(X_test)
print("RandomForest Model:")
print("Accuracy:", accuracy_score(y_test, predictions_rf))
print(classification_report(y_test, predictions_rf))
feature_importance_rf = pd.DataFrame({'feature': features.columns, 'importance': rf_model.best_estimator_.feature_importances_}).sort_values('importance', ascending=False)
print(feature_importance_rf.head(10))


Fitting 5 folds for each of 36 candidates, totalling 180 fits
RandomForest Model:
Accuracy: 0.8798185941043084
              precision    recall  f1-score   support

           0       0.88      0.99      0.93       750
           1       0.80      0.27      0.40       132

    accuracy                           0.88       882
   macro avg       0.84      0.63      0.67       882
weighted avg       0.87      0.88      0.85       882

              feature  importance
4             Recency    0.086517
25      Customer_Days    0.074622
5            MntWines    0.073286
1              Income    0.062150
7     MntMeatProducts    0.056384
19       AcceptedCmp1    0.049899
18       AcceptedCmp5    0.048275
10       MntGoldProds    0.046737
15  NumWebVisitsMonth    0.045837
0                  ID    0.045662
