# CNN + SVM para clasificar texto

Este notebook está inspirado en el trabajo de este paper https://www.researchgate.net/publication/331701896_Short_Text_Classification_With_A_Convolutional_Neural_Networks_Based_Method

In [166]:
import numpy as np
import tensorflow as tf
import random as python_random

np.random.seed(123)

python_random.seed(123)

tf.random.set_seed(1234)

import pandas as pd
import re
from datetime import datetime
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from keras.models import Model
from keras.layers import Dense, Dropout, Embedding, concatenate, Input
from keras.layers import Conv1D, GlobalMaxPool1D, SpatialDropout1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.optimizers import Adam
from sklearn.metrics import f1_score, confusion_matrix, precision_score, recall_score
from keras.utils import plot_model
from keras import backend as K
from matplotlib import pyplot as plt
from gensim.models import KeyedVectors
from sklearn import svm
from sklearn.model_selection import RandomizedSearchCV
from sklearn.utils.fixes import loguniform

%matplotlib inline

def metrics(predictions, y_test):
    tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
    print(f'Verdaderos Negativos: {tn}')
    print(f'Falsos Negativos: {fn}')
    print(f'Verdaderos Positivos: {tp}')
    print(f'Falsos Positivos: {fp}')
    print()
    print(f'precision score: {precision_score(y_test, predictions)}')
    print(f'recall score: {recall_score(y_test, predictions)}')
    print(f'f1 score: {f1_score(y_test,  predictions)}')

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [113]:
url_train = 'https://raw.githubusercontent.com/fsicardir/datos-tp2/master/dataset/train.csv?token=AFVAIUW66UE3NA5X2SYXNPC7GHGJY'
url_test = 'https://raw.githubusercontent.com/fsicardir/datos-tp2/master/dataset/test.csv?token=AFVAIUUSBVEOOMDIFV4GU6C7GHGNK'

read_train = pd.read_csv(url_train)
read_test = pd.read_csv(url_test)

In [3]:
# Vamos a usar estos embeddings primero, y luego probar los de GloVe.
!wget -c "https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz"

EMBEDDING_FILE = 'GoogleNews-vectors-negative300.bin.gz'
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

--2020-08-08 19:34:53--  https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.88.37
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.88.37|:443... connected.
HTTP request sent, awaiting response... 416 Requested Range Not Satisfiable

    The file is already fully retrieved; nothing to do.



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [114]:
# Créditos a este notebook https://www.kaggle.com/nmaguette/up-to-date-list-of-slangs-for-text-preprocessing
abbreviations = {
    "$" : " dollar ",
    "€" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk", 
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart", 
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "IG" : "instagram",
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens", #"que pasa",
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet", 
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
    "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously", 
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespooful",
    "tbsp" : "tablespooful",
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u" : "you",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtf" : "what the fuck",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired"
}


def convert_abbrev(word):
    return abbreviations[word.lower()] if word.lower() in abbreviations.keys() else word

# Esta lista de contractions la obtuvimos de un notebook de Kaggle también, el cual pone como fuente al siguiente
# post de stackoverflow http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are",
"thx"   : "thanks"
}


def remove_contractions(text):
    return contractions[text.lower()] if text.lower() in contractions.keys() else text

df_train = read_train[['id', 'text', 'target']]
df_test = read_test[['id', 'text']]

# Limpiamos los datos de la forma usual
# Quitamos las urls
df_train['text'] = df_train['text'].str.replace(r'http:\/\/.*', '', regex=True).replace(r'https:\/\/.*', '', regex=True)
df_test['text'] = df_test['text'].str.replace(r'http:\/\/.*', '', regex=True).replace(r'https:\/\/.*', '', regex=True)

# Quitamos user mentions, signos de puntuación, hashtags y stopwords.
def clean_text(text):
    words = text.lower().split(' ')
    words = [convert_abbrev(word) for word in words]
    words = [remove_contractions(word) for word in words]
    text = ' '.join([word for word in words if not word.startswith('@') and word not in stopwords.words('english')])
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

df_train['text'] = df_train['text'].apply(clean_text)
df_test['text'] = df_test['text'].apply(clean_text)

train_tweets = df_train['text'].tolist()
train_target = df_train['target']
test_tweets = df_test['text'].tolist()
len(train_tweets), len(train_target), len(test_tweets)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user

(7613, 7613, 3263)

In [115]:
oov_token = "<UNK>"

tokenizer = Tokenizer(oov_token=oov_token)
tokenizer.fit_on_texts(train_tweets)
vocabulary_size = len(tokenizer.word_index) + 1

sequences_train = tokenizer.texts_to_sequences(train_tweets)
sequence_test = tokenizer.texts_to_sequences(test_tweets)

max_padding = 25
padded_vecs_train = pad_sequences(sequences_train, maxlen=max_padding, padding='post')
padded_vecs_test = pad_sequences(sequence_test, maxlen=max_padding, padding='post')

# Ahora vamos a crear una matriz que tendrá los embeddings de Google
# correspondientes a cada palabra de nuestro vocabulario.
# Esto se lo pasaremos como pesos a la capa de Embedding del modelo a entrenar.
embedding_dim = 300
embedding_matrix = np.zeros((vocabulary_size, embedding_dim))
oov_words = 0
for word, i in tokenizer.word_index.items():
  try:
    embedding_vector = word2vec[word]
    embedding_matrix[i] = embedding_vector
  except:
    oov_words += 1
    continue

oov_words

4603

In [116]:
X_train, X_test, y_train, y_test = train_test_split(padded_vecs_train, train_target, test_size=0.2, random_state=31)

X_train.shape, X_test.shape, y_train.shape, y_test.shape 

((6090, 25), (1523, 25), (6090,), (1523,))

In [161]:
N_FILTERS = 128
DENSE_DROPOUT = 0.5

input_text = Input(shape=(max_padding))
emb_text = Embedding(input_dim=vocabulary_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_padding, trainable=False)(input_text)

conv1 = Conv1D(N_FILTERS, 3, padding='same', activation='relu')(emb_text)
conv1 = GlobalMaxPool1D()(conv1)

conv2 = Conv1D(N_FILTERS, 4, padding='same', activation='relu')(emb_text)
conv2 = GlobalMaxPool1D()(conv2)

conv3 = Conv1D(N_FILTERS, 5, padding='same', activation='relu')(emb_text)
conv3 = GlobalMaxPool1D()(conv3)

conv_output = concatenate([conv1, conv2, conv3], axis=1, name='concat_pooling')

conv_output = Dense(64, activation='relu',  name='output_for_svc')(conv_output)
conv_output = Dropout(DENSE_DROPOUT)(conv_output)

prediction = Dense(1, activation='sigmoid')(conv_output)

model_conv = Model(input_text, prediction)
opt = Adam(learning_rate=0.001)
model_conv.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy', f1_m])
model_conv.summary()

Model: "functional_82"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_25 (InputLayer)           [(None, 25)]         0                                            
__________________________________________________________________________________________________
embedding_24 (Embedding)        (None, 25, 300)      4609200     input_25[0][0]                   
__________________________________________________________________________________________________
conv1d_72 (Conv1D)              (None, 25, 128)      115328      embedding_24[0][0]               
__________________________________________________________________________________________________
conv1d_73 (Conv1D)              (None, 25, 128)      153728      embedding_24[0][0]               
______________________________________________________________________________________

In [162]:
history = model_conv.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=64, verbose=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [163]:
extractor = Model(model_conv.inputs, model_conv.get_layer('output_for_svc').output)
features_train = extractor.predict(X_train)
features_val = extractor.predict(X_test)

In [164]:
features_train.shape, features_val.shape

((6090, 64), (1523, 64))

In [171]:
svc = svm.SVC()

params = {'C': loguniform(1e0, 1e3),
          'gamma': loguniform(1e-4, 1e-3),
          'kernel': ['rbf', 'linear', 'poly']
}

grid = RandomizedSearchCV(svc, param_distributions=params, verbose=True, cv=5, n_iter=10, n_jobs=-1)
grid.fit(features_train, y_train)

grid.best_params_, grid.best_score_

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   49.6s
[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:   51.5s finished


({'C': 2.724380743522747, 'gamma': 0.0004788032177166067, 'kernel': 'rbf'},
 0.9674876847290641)

In [173]:
preds = grid.predict(features_val)
metrics(preds, y_test)

Verdaderos Negativos: 711
Falsos Negativos: 159
Verdaderos Positivos: 510
Falsos Positivos: 143

precision score: 0.781010719754977
recall score: 0.7623318385650224
f1 score: 0.7715582450832071


In [174]:
features_test = extractor.predict(padded_vecs_test)
kaggle_preds = grid.predict(features_test)

results = df_test[['id']]
results['target'] = kaggle_preds
results.to_csv('cnn-svm-v1.csv', index=False)

# Otra CNN al momento + SVM

Ahora vamos a intentar correr el modelo que mejor nos dio pero con otros embeddings, y aplicar la misma idea de utilizar los pesos de la última capa fully connected para usar a modo de features al entrenar SVM.

In [177]:
# Este modelo sacó 0.81274 en Kaggle.
# Pasamos a la api funcional para poder meter filtros en paralelo.
# No hay justificación para el dropout en la capa de Embedding, it just worked.
n_filters = 200

inputs = Input(shape=(max_padding))
emb = Embedding(input_dim=vocabulary_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_padding, trainable=False)(inputs)
drop = Dropout(0.5)(emb)

conv1 = Conv1D(n_filters, 5, padding='same', activation='relu')(drop)
conv1 = GlobalMaxPool1D()(conv1)
conv1 = Dropout(0.5)(conv1)

conv2 = Conv1D(n_filters, 4, padding='same', activation='relu')(drop)
conv2 =  GlobalMaxPool1D()(conv2)
conv2 = Dropout(0.5)(conv2)

conv3 = Conv1D(n_filters, 3, padding='same', activation='relu')(drop)
conv3 = GlobalMaxPool1D()(conv3)
conv3 = Dropout(0.5)(conv3)

concat = concatenate([conv1, conv2, conv3], axis=1)
drop_concat = Dropout(0.25)(concat)
out = Dense(128, activation='relu', name='output_for_svc')(drop_concat)
out = Dropout(0.25)(out)
out = Dense(1, activation='sigmoid')(out)

model = Model(inputs, out)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [178]:
epochs = 20
model.fit(X_train, y_train, epochs=epochs, validation_data=(X_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f8fa9c8be10>

In [180]:
extractor_v2 = Model(model.inputs, model.get_layer('output_for_svc').output)
features_train_v2 = extractor_v2.predict(X_train)
features_val_v2 = extractor_v2.predict(X_test)
features_train_v2.shape, features_val_v2.shape

((6090, 128), (1523, 128))

In [194]:
svc_v2 = svm.SVC()

params = {'C': loguniform(1e0, 1e3),
          'gamma': loguniform(1e-4, 1e-3)
}

grid_v2 = RandomizedSearchCV(svc_v2, param_distributions=params, verbose=True, cv=5, n_iter=20, n_jobs=-1)
grid_v2.fit(features_train_v2, y_train)

grid_v2.best_params_, grid_v2.best_score_

Fitting 5 folds for each of 20 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   20.1s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:   45.9s finished


({'C': 208.3547464007065, 'gamma': 0.00020416800005588378}, 0.964367816091954)

In [195]:
preds_v2 = grid_v2.predict(features_val_v2)
metrics(preds_v2, y_test)

Verdaderos Negativos: 730
Falsos Negativos: 174
Verdaderos Positivos: 495
Falsos Positivos: 124

precision score: 0.7996768982229402
recall score: 0.7399103139013453
f1 score: 0.7686335403726708


In [199]:
features_test_v2 = extractor_v2.predict(padded_vecs_test)
kaggle_preds_v2 = grid_v2.predict(features_test_v2)

results = df_test[['id']]
results['target'] = kaggle_preds_v2
results.to_csv('cnn-svm-v2.csv', index=False)