In [10]:
import nltk
import re
import numpy as np
import csv
import unicodedata
import warnings

from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

## Clasificadores
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from skopt import BayesSearchCV

#from sklearn.datasets import load_digits


# import pandas as pd
# import numpy as np
# #load data
# df = pd.read_csv("./csv/0_500_union_salida_clasificada.csv")

warnings.filterwarnings('ignore')

file_name="./csv/0_500_union_salida_clasificada.csv"
stop_file="custom_stopwords.txt"    # Nombre del archivo de stopwords.

# Regex para emoticones en texto.
emoticons_str = r"""
(?:
    [:=;]               # Ojos
    [oO\-]?             # Nariz (optional)
    [D\)\]\(\]/\\OpP]   # Bocas
)"""

# Regex para tokenizar correctamente.
regex_str = [
    emoticons_str,
    r'(?:[\w_]+)',                                        # Otras palabras
    r'(?:\S)'                                             # Cualquier otra cosa
]

# Se arman objetos para regular expresions.
tokens_re   = re.compile(r'('+'|'.join(regex_str)+')', re.VERBOSE | re.IGNORECASE)

# Se carga archivos de STOPWORDS
with open(stop_file, newline='') as file:
    stopwords = file.read().splitlines()

X = []
y = []
    
# Se abre archivo con tweets y se lo recorre    
with open(file_name, newline='') as csvfile:

    reader = csv.reader(csvfile, delimiter=',', quotechar='\"')
    header = next(reader)
    
    for row in reader:

        y.append(row[0].lower())
        
        tweet = row[1].lower()                    # Se normaliza texto, todo a minusculas.
        tweet = re.sub(r'@[a-z0-9_]+', '', tweet) # Se quitan menciones. @xxxxxxxx
        tweet = re.sub(r'http[s]?://(?:[a-z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-f][0-9a-f]))+', '', tweet)
        #tweet = re.sub(r'\#+[\w_]+[\w\'_\-]*[\w_]+', '', tweet)
        #tweet = re.sub(r'(#)', '', tweet)
        tweet = tweet.translate(str.maketrans('','', '.~¡!-_—#$%¿?:+-/°);(/",*“”‘’'))
        
        # Manejo de emoticones.
        emoticones = [
                      ['😆',' risa'],['😂','risa'],['😱',' asombro'],['🥳',' felicidad'],['💙',' amor'],['😍',' amor'],
                      ['😀',' sonreir'],['👍',' ok'],['🤔',' ok'],['🎊',' piñata'],['🙏',' ojala'],['💪🏻',' fuerza'],
                      ['😡',' enojo'],['😛',' broma'],['😮',' asombro'],['🤮',' desagradable'],['👏🏻',' aplauso'],
                      ['😏',' canchero'],['😩',' decepcion'],['😳',' verguenza'],['😊',' contento'],['😥',' triste'],
                      ['😤',' furioso'],['🖕',' enojo'],['👏',' aplauso'],['💪',' fuerza'],['🤦‍','increible'],
                      ['🙄','duda']
                     ]
        for emoji in emoticones:
            tweet = tweet.replace(emoji[0], emoji[1])

        # Manejo de acentos.
        dict_acentos = [['á','a'],['é','e'],['í','i'],['ó','o'],['ú','u']]
        for acento in dict_acentos:
            tweet = tweet.replace(acento[0], acento[1])
            
        # Remueve letras repetidas y deja una sola.
        for letra in ['a','e','i','o','u','s','c']:
            pattern = letra + '{2,}'
            tweet = re.sub(pattern, letra, tweet)
        
        tweet = tweet.translate(str.maketrans('','', '🥝🐧🐳🖐🛫😑✈🇦🇷🇵🇾👇🙃▶💻►→⬇️😒🔫🔝🔥💀🚫😞♂❤❤❤😎👊🤞🏻'))
        tweet = re.sub(r'\d+', '', tweet)         # Se quitan numeros.
        tweet = tweet.strip()
    
        # Tokenizado
        tokens = tokens_re.findall(tweet)

        # Remocion de stopwords
        tokens = [token for token in tokens if token not in stopwords]

        s = ' '
        X.append(s.join(tokens))
        #print(tokens)


In [11]:
# --- Pipelines ---
pipeline1 = Pipeline([('vectorizer', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', SVC())
                     ])
pipeline2 = Pipeline([('vectorizer', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', DecisionTreeClassifier())
                     ])
pipeline3 = Pipeline([('vectorizer', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MLPClassifier())
                     ])
pipeline4 = Pipeline([('vectorizer', CountVectorizer()),
                      ('tfidf', TfidfTransformer()),
                      ('clf', MultinomialNB())
                     ])


# --- Parameters ---
param_svm = [{ 'vectorizer__ngram_range': [(1, 1),(1, 2),(2, 2)],
               'tfidf__use_idf': (True, False),
               'clf__kernel': ['linear', 'poly', 'rbf'],  # categorical parameter
               'clf__gamma': (1e-6, 1e+1, 'log-uniform'),
               'clf__C': (1e-6, 1e+6, 'log-uniform'),  
               'clf__degree': (1, 8),  # integer valued parameter
             }]
param_tree = [{ 'vectorizer__ngram_range': [(1, 1),(1, 2),(2, 2)],
                'tfidf__use_idf': (True, False),
                'clf__criterion': ['gini', 'entropy'],
                'clf__max_depth': [ 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],
                'clf__min_samples_leaf': [1, 5, 10, 20]
              }]    
param_red =  [{ #'vectorizer__ngram_range': [(1, 2),(2, 2)],
                'tfidf__use_idf': (True, False),
                'clf__solver': ['lbfgs'], 
                'clf__max_iter': [300], 
                'clf__alpha': 10.0 ** -np.arange(1, 7), 
                'clf__hidden_layer_sizes':np.arange(5, 10), 
                'clf__random_state':[8] 
              }]
param_nb  =  [{ #'vectorizer__ngram_range': [(1, 1),(1, 2)],
                'tfidf__use_idf': (True, False),
                #'clf__alpha': np.linspace(0.5, 1.5, 6),
                'clf__alpha': (0.5,0.7,0.9,1.1,1.3,1.5),
                'clf__fit_prior': [True, False] 
              }]


# --- Scores ---
scores = ['precision', 'recall']
model = ["Naive Bayes", "Random Forest", "Neural Network", "SVM"]
pips = [pipeline4, pipeline2, pipeline3, pipeline1] 
pars = [param_nb, param_tree, param_red, param_svm] 


# --- Cros Validate ---
cvNum = 10

print("Comenzando BayesSearchCV")

for i in range(len(pars)):
    print(model[i])
    for score in scores:
        
        gs_clf_svm = BayesSearchCV( pips[i], pars[i], cv=cvNum, scoring='%s_weighted' % score, n_jobs=-1, verbose=0, refit=False, n_iter=9)
        gs_clf_svm = gs_clf_svm.fit(X, y)
        
        print(score, gs_clf_svm.best_score_)
        print("Parametros: ", gs_clf_svm.best_params_)

Comenzando BayesSearchCV
Naive Bayes
precision 0.7291229481998011
Parametros:  {'clf__alpha': 0.9, 'clf__fit_prior': False, 'tfidf__use_idf': True}
recall 0.720558882235529
Parametros:  {'clf__alpha': 0.9, 'clf__fit_prior': False, 'tfidf__use_idf': True}
Random Forest
precision 0.6464402986604582
Parametros:  {'clf__criterion': 'entropy', 'clf__max_depth': 10, 'clf__min_samples_leaf': 10, 'tfidf__use_idf': True, 'vectorizer__ngram_range': (1, 2)}
recall 0.5808383233532934
Parametros:  {'clf__criterion': 'gini', 'clf__max_depth': 4, 'clf__min_samples_leaf': 10, 'tfidf__use_idf': False, 'vectorizer__ngram_range': (1, 1)}
Neural Network
precision 0.7038338419414977
Parametros:  {'clf__alpha': 0.1, 'clf__hidden_layer_sizes': 6, 'clf__max_iter': 300, 'clf__random_state': 8, 'clf__solver': 'lbfgs', 'tfidf__use_idf': False}
recall 0.7125748502994012
Parametros:  {'clf__alpha': 0.1, 'clf__hidden_layer_sizes': 9, 'clf__max_iter': 300, 'clf__random_state': 8, 'clf__solver': 'lbfgs', 'tfidf__use_