In [415]:
import pandas as pd
import re
from nltk.stem.snowball import SnowballStemmer

def PreProcessing():
    df = pd.read_csv("comments.csv", encoding = "utf-8")
    
    df["text"] = df["full_text"]
    df["text"] = df["text"].str.lower()
    df["text"] = [re.sub(r"(?:\@|https?\://)\S+", "", text) for text in df["text"]]
    
    intab = "áéíóúïü"
    outtab = "aeiouiu"
    trantab = str.maketrans(intab, outtab)
    df["text"] = [text.translate(trantab) for text in df["text"]]
    
    df["text"] = df["text"].str.replace("[^a-zñ\s]", "")
    
    st = SnowballStemmer("spanish")
    df["text"] = df["text"].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
    
    return df

In [416]:
import pandas as pd
import warnings
import numpy as np
from gensim.models import word2vec

def GetFVs(comment):
    words = str(comment).split()
    vectors = []
    for w in words:
        if w in w2v_model.wv.vocab:
            vectors.append(w2v_model[w].tolist())
    
    return vectors

def FeatureExtraction(df):
    warnings.filterwarnings("ignore")
    
    df2 = pd.DataFrame(df["text"].values)
    np.savetxt("comments.txt", df2, fmt = "%s", encoding = "utf-8")
    comments = word2vec.Text8Corpus("comments.txt")
    
    size = 100
    w2v_model = word2vec.Word2Vec(sentences = comments, size = size, min_count = 3)
    y = df["class"]
    
    comments_fvs = []
    for comment in df["text"]:
        fvs = GetFVs(comment)
        comments_fvs.append(fvs)

    new_comment_fvs = []
    for comment_fvs in comments_fvs:
        new_comment_fv = np.zeros(size)
        k = 0
        for comment_fs in comment_fvs:
            new_comment_fv =  new_comment_fv + comment_fs
            k = k + 1

        if k > 0:
            new_comment_fvs.append(new_comment_fv / k)
        else:
            new_comment_fvs.append(new_comment_fv)

    column_names = ["f" + str(i) for i in range(1, size + 1)]
    column_names.append("class")

    x = new_comment_fvs
    df = pd.DataFrame(x) 
    df["class"] = y
    df.columns = column_names

    return df

In [417]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import f1_score

def MachineLearning(df):
    df.drop(df[ df["class"] == "neutro"].index, inplace = True)
    
    x = df.drop("class", axis = 1, inplace = False)
    df["class"] = [5 if tag == "positivo"  else 1 for tag in df["class"]]
    y = df["class"]
    
    scaler = StandardScaler()
    normalized_x = scaler.fit_transform(x)
    x_train, x_test, y_train, y_test = train_test_split(normalized_x,
                                                        y,
                                                        test_size = 0.2,
                                                        random_state = 0,
                                                        stratify = y)

    param_grid = {  
        'C': [1, 10, 100],
        'gamma': [0.1, 0.01, 0.001, 0.0001],
        'kernel': ['rbf'],
        'tol': [0.01, 0.001, 0.0001, 0.00001]
    }

    param_grid = {  
        'C': [100],
        'gamma': [0.001],
        'kernel': ['rbf'],
        'tol': [0.001]
    }

    grid_search = GridSearchCV(estimator = SVC(),
                               param_grid = param_grid,
                               scoring = 'f1',
                               cv = 5)

    grid_search.fit(x_train, y_train)

    print('Tuneando los hiper-parámetros para f1')
    print()

    print('Los mejores hiper-parámetros encontrados con validación cruzada:')
    print()
    print(grid_search.best_params_)
    print()
    print('Puntajes de la métrica f1 en el conjunto de validación:')
    print()

    means = grid_search.cv_results_['mean_test_score']
    stds = grid_search.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, grid_search.cv_results_['params']):
        print('%0.4f (+/-%0.04f) for %r'
              % (mean, std * 2, params))

    support_vector_model = grid_search
    support_vector_model_train_prediction = support_vector_model.predict(x_train)
    support_vector_model_test_prediction = support_vector_model.predict(x_test)

    print('Máquina de soporte vectorial')
    print('\t-Mejores hiper-parámetros: ' + str(support_vector_model.best_params_))
    print('\t-Puntaje f1 en entrenamiento: %.4f' % f1_score(y_train, support_vector_model_train_prediction, average = 'binary'))
    print('\t-Puntaje f1 en pruebas: %.4f' % f1_score(y_test, support_vector_model_test_prediction, average = 'binary'))
    
    return support_vector_model

In [418]:
def TrainModel():
    df = PreProcessing()
    df = FeatureExtraction(df)
    model = MachineLearning(df)
    
    return model

In [419]:
model = TrainModel()

Tuneando los hiper-parámetros para f1

Los mejores hiper-parámetros encontrados con validación cruzada:

{'C': 100, 'gamma': 0.001, 'kernel': 'rbf', 'tol': 0.001}

Puntajes de la métrica f1 en el conjunto de validación:

0.8219 (+/-0.0224) for {'C': 100, 'gamma': 0.001, 'kernel': 'rbf', 'tol': 0.001}
Máquina de soporte vectorial
	-Mejores hiper-parámetros: {'C': 100, 'gamma': 0.001, 'kernel': 'rbf', 'tol': 0.001}
	-Puntaje f1 en entrenamiento: 0.8354
	-Puntaje f1 en pruebas: 0.8177


In [409]:
model.predict(GetFVs("buena atencion").reshape(1,-1))

AttributeError: 'list' object has no attribute 'reshape'