<a href="https://colab.research.google.com/github/filipemcbarros/modelos_classificadores_conciliacao/blob/main/algoritmos_classicos_conciliacao.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Bibliotecas utilizadas**

In [None]:
import pandas as pd
from google.colab import drive
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix,accuracy_score,classification_report,roc_auc_score,roc_curve,accuracy_score,matthews_corrcoef,f1_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud

In [None]:
# Mount Google Drive at /content/drive; later cells read and write paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

# **Carregamento do dataset**

In [None]:
# Write `dataset` to a CSV file on the mounted Drive, then show it as the cell output.
# NOTE(review): `dataset` is not defined in any cell visible before this one, and this section is
# titled as dataset loading while the code only saves — presumably a read step is missing; confirm.
# NOTE(review): `features` and `classes`, consumed by the later split cells, are not defined in the
# visible cells either — presumably derived from `dataset`; confirm.
dataset.to_csv("/content/drive/MyDrive/Colab Notebooks/base_processos/dataset_final.csv")
dataset

# **Treinamento e geração dos modelos com TF-IDF**

In [None]:
# TF-IDF vectoriser limited to the 10000 highest-ranked vocabulary terms
tfidf_vectorizer = TfidfVectorizer(max_features=10000)

# 70/30 train/test split with a fixed seed so the partition is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    features.values,
    classes,
    test_size=0.3,
    random_state=1234,
)

# Learn vocabulary and IDF weights on the training texts only, then reuse them for the test texts
tfidf_train_vectors = tfidf_vectorizer.fit_transform(X_train)
tfidf_test_vectors = tfidf_vectorizer.transform(X_test)

# **Treinamento e geração dos modelos com Word2Vec**

In [None]:
from gensim.models import KeyedVectors

# Pre-trained embeddings in word2vec text format; point this at the file with the desired
# dimensionality and architecture (CBOW or Skip-Gram).
word2vec_model = KeyedVectors.load_word2vec_format(
    '/content/drive/Shared drives/cbow_s600.txt'
)

# 70/30 train/test split with a fixed seed.
# NOTE: rebinds X_train / X_test / y_train / y_test created by the TF-IDF cell.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    classes,
    test_size=0.3,
    random_state=1234,
)

# Mean Word2Vec embedding of a document
def calculate_mean_vector(words, model):
    """Return the average embedding of the in-vocabulary tokens of a document.

    Parameters
    ----------
    words : str or iterable of str
        The document. A raw string is split on whitespace into tokens; any
        other iterable is treated as an already-tokenised sequence.
    model : mapping-like
        Embedding lookup supporting ``token in model``, ``model[token]`` and
        a ``vector_size`` attribute.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the tokens found in ``model``, or
        a zero vector of length ``model.vector_size`` when none is found.
    """
    # Iterating a str yields single characters, so an untokenised document would be
    # averaged over its letters instead of its words.
    if isinstance(words, str):
        words = words.split()

    vectors = [model[word] for word in words if word in model]
    if vectors:
        return np.mean(vectors, axis=0)
    # No known token: fall back to a zero vector
    return np.zeros(model.vector_size)

# Turn every document into its mean Word2Vec embedding and stack the results into arrays
word2vec_train_vectors = np.array(
    [calculate_mean_vector(document, word2vec_model) for document in X_train]
)
word2vec_test_vectors = np.array(
    [calculate_mean_vector(document, word2vec_model) for document in X_test]
)

# **Modelos TF-IDF**

In [None]:
!pip install scikit-learn
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

#Seleção de parâmetros via GridSearch SVC
model = SVC()
parameters = {
    'C':[1.0, 10.0, 100.0, 1000.0]
}

grid_search = GridSearchCV(model, parameters, scoring='f1_macro', cv=5, return_train_score=True, verbose=1)
grid_search.fit(X_train, y_train)

In [None]:
# Linear-kernel SVC trained on the TF-IDF features; predictions are for the test split
from sklearn.svm import SVC

svc = SVC(kernel='linear').fit(tfidf_train_vectors, y_train)
target_svc = svc.predict(tfidf_test_vectors)

In [None]:
# Polynomial-kernel (degree 5) SVC trained on the TF-IDF features
from sklearn.svm import SVC

svcPoly = SVC(kernel='poly', degree=5).fit(tfidf_train_vectors, y_train)
target_svc_poly = svcPoly.predict(tfidf_test_vectors)

In [None]:
# Gaussian (RBF) kernel SVC with C=1000 trained on the TF-IDF features
from sklearn.svm import SVC

svcGaussian = SVC(kernel='rbf', C=1000.0).fit(tfidf_train_vectors, y_train)
target_svc_gaussian = svcGaussian.predict(tfidf_test_vectors)

In [None]:
# Hyper-parameter selection for Logistic Regression via grid search (5-fold CV, macro-F1 scoring)
# Imported here because this cell runs before the training cell that imports it.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
parameters = {
    'C': [1.0, 10.0, 100.0, 1000.0],
    'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga']
}

grid_search = GridSearchCV(model, parameters, scoring='f1_macro', cv=5, return_train_score=True, verbose=1)
# Fit on the TF-IDF matrix: X_train holds the raw documents, which the estimator cannot consume.
grid_search.fit(tfidf_train_vectors, y_train)

In [None]:
# Logistic Regression (C=10, lbfgs solver) trained on the TF-IDF features
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='lbfgs', C=10.0).fit(tfidf_train_vectors, y_train)
target_lr = lr.predict(tfidf_test_vectors)

In [None]:
# Hyper-parameter selection for the Decision Tree via grid search (5-fold CV, macro-F1 scoring)
# Imported here because this cell runs before the training cell that imports it.
from sklearn.tree import DecisionTreeClassifier

# `criterion` is a DecisionTreeClassifier parameter, so the search must run over that
# estimator; LogisticRegression has no such parameter.
model = DecisionTreeClassifier()
parameters = {
    'criterion': ['gini', 'entropy', 'log_loss']
}

grid_search = GridSearchCV(model, parameters, scoring='f1_macro', cv=5, return_train_score=True, verbose=1)
# Fit on the TF-IDF matrix: X_train holds the raw documents, which the estimator cannot consume.
grid_search.fit(tfidf_train_vectors, y_train)

In [None]:
# Decision Tree (entropy split criterion) trained on the TF-IDF features
from sklearn.tree import DecisionTreeClassifier

dtc = DecisionTreeClassifier(criterion='entropy').fit(tfidf_train_vectors, y_train)
target_dtc = dtc.predict(tfidf_test_vectors)

In [None]:
# Train an XGBoost booster on the TF-IDF features
import xgboost as xgb

# The native XGBoost training API consumes DMatrix objects
dtrain = xgb.DMatrix(tfidf_train_vectors, label=y_train)
dtest = xgb.DMatrix(tfidf_test_vectors, label=y_test)

# Values found by manually building models with several parameter combinations
# (GridSearch was not applied here).
params = {
    'objective': 'reg:squarederror',
    'max_depth': 7,
    'learning_rate': 0.3,
    'n_estimators': 100
}

# `n_estimators` is a scikit-learn-wrapper parameter: xgb.train does not use it from `params`
# and builds only its default number of rounds. Pass the tree count via `num_boost_round`.
booster_params = {key: value for key, value in params.items() if key != 'n_estimators'}
bst = xgb.train(booster_params, dtrain, num_boost_round=params['n_estimators'])

# Continuous scores for the test set; the evaluation cell thresholds them into classes.
# NOTE(review): the objective is a regression loss — consider 'binary:logistic' if the
# labels are 0/1; confirm before changing.
target_xgb = bst.predict(dtest)

# **Modelos Word2Vec**

In [None]:
# Linear-kernel SVC trained on the mean Word2Vec embeddings
# NOTE: rebinds `svc` / `target_svc` from the TF-IDF section.
from sklearn.svm import SVC

svc = SVC(kernel='linear').fit(word2vec_train_vectors, y_train)
target_svc = svc.predict(word2vec_test_vectors)

In [None]:
# Polynomial-kernel (degree 5) SVC trained on the mean Word2Vec embeddings
# NOTE: rebinds `svcPoly` / `target_svc_poly` from the TF-IDF section.
from sklearn.svm import SVC

svcPoly = SVC(kernel='poly', degree=5).fit(word2vec_train_vectors, y_train)
target_svc_poly = svcPoly.predict(word2vec_test_vectors)

In [None]:
# Gaussian (RBF) kernel SVC with C=1000 trained on the mean Word2Vec embeddings
# NOTE: rebinds `svcGaussian` / `target_svc_gaussian` from the TF-IDF section.
from sklearn.svm import SVC

svcGaussian = SVC(kernel='rbf', C=1000.0).fit(word2vec_train_vectors, y_train)
target_svc_gaussian = svcGaussian.predict(word2vec_test_vectors)

In [None]:
# Logistic Regression (C=10, lbfgs solver) trained on the mean Word2Vec embeddings
# NOTE: rebinds `lr` / `target_lr` from the TF-IDF section.
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(solver='lbfgs', C=10.0).fit(word2vec_train_vectors, y_train)
target_lr = lr.predict(word2vec_test_vectors)

In [None]:
# Decision Tree trained on the mean Word2Vec embeddings
# NOTE: rebinds `dtc` / `target_dtc` from the TF-IDF section.
from sklearn.tree import DecisionTreeClassifier

# criterion options: "gini" (default), "entropy", "log_loss"; splitter options: "best" (default), "random"
dtc = DecisionTreeClassifier(criterion='entropy').fit(word2vec_train_vectors, y_train)
target_dtc = dtc.predict(word2vec_test_vectors)

In [None]:
# Train an XGBoost booster on the mean Word2Vec embeddings
# NOTE: rebinds `dtrain` / `dtest` / `bst` / `target_xgb` from the TF-IDF section.
import xgboost as xgb

# The native XGBoost training API consumes DMatrix objects
dtrain = xgb.DMatrix(word2vec_train_vectors, label=y_train)
dtest = xgb.DMatrix(word2vec_test_vectors, label=y_test)

# Booster configuration
params = {
    'objective': 'reg:squarederror',
    'max_depth': 7,
    'learning_rate': 0.3,
    'n_estimators': 100
}

# `n_estimators` is a scikit-learn-wrapper parameter: xgb.train does not use it from `params`
# and builds only its default number of rounds. Pass the tree count via `num_boost_round`.
booster_params = {key: value for key, value in params.items() if key != 'n_estimators'}
bst = xgb.train(booster_params, dtrain, num_boost_round=params['n_estimators'])

# Continuous scores for the test set; the evaluation cell thresholds them into classes.
# NOTE(review): the objective is a regression loss — consider 'binary:logistic' if the
# labels are 0/1; confirm before changing.
target_xgb = bst.predict(dtest)

# **Modelo SVC:**

In [None]:
# Classification report and summary metrics for the linear-kernel SVC.
# Evaluates `target_svc` (the linear model's predictions); the Gaussian-kernel results
# (`target_svc_gaussian`) have their own section.
print('############# Relatório de Classificação - Modelo SVC #############')
print()

print("Relatório de Classificação:\n", classification_report(y_test, target_svc, digits=4))
# Accuracy
print("Acurácia: {:.4f}\n".format(accuracy_score(y_test, target_svc)))
# Area under the ROC curve (computed here from hard class predictions)
print("AUC: {:.4f}\n".format(roc_auc_score(y_test, target_svc)))
# Matthews correlation coefficient
print("MCC:  {:.4f}\n".format(matthews_corrcoef(y_test, target_svc)))
print("F1-Score:  {:.4f}\n".format(f1_score(y_test, target_svc)))

In [None]:
# Confusion-matrix heatmap for the linear-kernel SVC.
# Uses `target_svc` (the linear model's predictions), not the Gaussian-kernel predictions.
print('#### Matriz de Confusão - Modelo SVC ####')
print()
cf_matrix = confusion_matrix(y_test, target_svc)
# fmt='.0f' renders the cell counts without decimals
sns.heatmap(cf_matrix, annot=True, fmt='.0f')
plt.show()

# **Modelo SVC Polynomial:**

In [None]:
# Classification report and summary metrics for the polynomial-kernel SVC
print('############# Relatório de Classificação - Modelo SVC Polynomial #############')
print()

print(
    "Relatório de Classificação:\n",
    classification_report(y_true=y_test, y_pred=target_svc_poly, digits=4),
)
# Accuracy
print("Acurácia: {:.4f}\n".format(accuracy_score(y_true=y_test, y_pred=target_svc_poly)))
# Area under the ROC curve (fed with hard class predictions)
print("AUC: {:.4f}\n".format(roc_auc_score(y_true=y_test, y_score=target_svc_poly)))
# Matthews correlation coefficient
print("MCC:  {:.4f}\n".format(matthews_corrcoef(y_true=y_test, y_pred=target_svc_poly)))
# F1 score
print("F1-Score:  {:.4f}\n".format(f1_score(y_true=y_test, y_pred=target_svc_poly)))

In [None]:
# Confusion-matrix heatmap for the polynomial-kernel SVC
print('#### Matriz de Confusão - Modelo SVC Polynomial ####')
print()
cf_matrix = confusion_matrix(y_true=y_test, y_pred=target_svc_poly)
# Annotate each cell with its count, rendered without decimals
sns.heatmap(data=cf_matrix, annot=True, fmt='.0f')
plt.show()

# **Modelo SVC Gaussian:**

In [None]:
# Classification report and summary metrics for the Gaussian-kernel SVC
print('############# Relatório de Classificação - Modelo SVC Gaussian #############')
print()

print(
    "Relatório de Classificação:\n",
    classification_report(y_true=y_test, y_pred=target_svc_gaussian, digits=4),
)
# Accuracy
print("Acurácia: {:.4f}\n".format(accuracy_score(y_true=y_test, y_pred=target_svc_gaussian)))
# Area under the ROC curve (fed with hard class predictions)
print("AUC: {:.4f}\n".format(roc_auc_score(y_true=y_test, y_score=target_svc_gaussian)))
# Matthews correlation coefficient
print("MCC:  {:.4f}\n".format(matthews_corrcoef(y_true=y_test, y_pred=target_svc_gaussian)))
# F1 score
print("F1-Score:  {:.4f}\n".format(f1_score(y_true=y_test, y_pred=target_svc_gaussian)))

In [None]:
# Confusion-matrix heatmap for the Gaussian-kernel SVC
print('#### Matriz de Confusão - Modelo SVC Gaussian ####')
print()
cf_matrix = confusion_matrix(y_true=y_test, y_pred=target_svc_gaussian)
# Annotate each cell with its count, rendered without decimals
sns.heatmap(data=cf_matrix, annot=True, fmt='.0f')
plt.show()

# **Modelo Regressão Logística:**

In [None]:
# Classification report and summary metrics for Logistic Regression
print('############# Relatório de Classificação - Modelo Regressão Logística #############')
print()

print(
    "Relatório de Classificação:\n",
    classification_report(y_true=y_test, y_pred=target_lr, digits=4),
)
# Accuracy
print("Acurácia: {:.4f}\n".format(accuracy_score(y_true=y_test, y_pred=target_lr)))
# Area under the ROC curve (fed with hard class predictions)
print("AUC: {:.4f}\n".format(roc_auc_score(y_true=y_test, y_score=target_lr)))
# Matthews correlation coefficient
print("MCC:  {:.4f}\n".format(matthews_corrcoef(y_true=y_test, y_pred=target_lr)))
# F1 score
print("F1-Score:  {:.4f}\n".format(f1_score(y_true=y_test, y_pred=target_lr)))

In [None]:
# Confusion-matrix heatmap for Logistic Regression
print('#### Matriz de Confusão - Modelo Regressão Logística ####')
print()
cf_matrix = confusion_matrix(y_true=y_test, y_pred=target_lr)
# Annotate each cell with its count, rendered without decimals
sns.heatmap(data=cf_matrix, annot=True, fmt='.0f')
plt.show()

# **Modelo Árvore de Decisão:**

In [None]:
# Classification report and summary metrics for the Decision Tree
print('############# Relatório de Classificação - Modelo Árvore de Decisão #############')
print()

print(
    "Relatório de Classificação:\n",
    classification_report(y_true=y_test, y_pred=target_dtc, digits=4),
)
# Accuracy
print("Acurácia: {:.4f}\n".format(accuracy_score(y_true=y_test, y_pred=target_dtc)))
# Area under the ROC curve (fed with hard class predictions)
print("AUC: {:.4f}\n".format(roc_auc_score(y_true=y_test, y_score=target_dtc)))
# Matthews correlation coefficient
print("MCC:  {:.4f}\n".format(matthews_corrcoef(y_true=y_test, y_pred=target_dtc)))
# F1 score
print("F1-Score:  {:.4f}\n".format(f1_score(y_true=y_test, y_pred=target_dtc)))

In [None]:
# Confusion-matrix heatmap for the Decision Tree
print('#### Matriz de Confusão - Modelo  Árvore de Decisão ####')
print()
cf_matrix = confusion_matrix(y_true=y_test, y_pred=target_dtc)
# Annotate each cell with its count, rendered without decimals
sns.heatmap(data=cf_matrix, annot=True, fmt='.0f')
plt.show()

# **Modelo Boosted Tree (XGBoost):**

In [None]:
# Decision threshold applied to the continuous XGBoost outputs (example value: 0.5)
limite_decisao = 0.5

# Binarise the predictions: 1 when strictly above the threshold, otherwise 0
previsoes_binarias = [int(score > limite_decisao) for score in target_xgb]

# Binarise the true labels with the same rule
rotulos_binarios = [int(label > limite_decisao) for label in y_test]

print('############# Relatório de Classificação - Modelo XGBoost #############')
print()

print(
    "Relatório de Classificação:\n",
    classification_report(y_true=rotulos_binarios, y_pred=previsoes_binarias, digits=4),
)

# Accuracy
print("Acurácia: {:.4f}\n".format(accuracy_score(y_true=rotulos_binarios, y_pred=previsoes_binarias)))
# Area under the ROC curve (fed with the thresholded predictions)
print("AUC: {:.4f}\n".format(roc_auc_score(y_true=rotulos_binarios, y_score=previsoes_binarias)))
# Matthews correlation coefficient
print("MCC:  {:.4f}\n".format(matthews_corrcoef(y_true=rotulos_binarios, y_pred=previsoes_binarias)))
# F1 score
print("F1-Score:  {:.4f}\n".format(f1_score(y_true=rotulos_binarios, y_pred=previsoes_binarias)))

In [None]:
# Confusion-matrix heatmap for XGBoost.
# Compares the thresholded predictions against the thresholded labels (`rotulos_binarios`),
# the same pair the XGBoost report cell evaluates, so matrix and metrics agree.
print('#### Matriz de Confusão - Modelo XGBoost ####')
print()
cf_matrix = confusion_matrix(rotulos_binarios, previsoes_binarias)
# fmt='.0f' renders the cell counts without decimals
sns.heatmap(cf_matrix, annot=True, fmt='.0f')
plt.show()