# Universal Sentence Encoder + clasificadores

Paper https://arxiv.org/pdf/1803.11175.pdf

In [17]:
import tensorflow as tf
import numpy as np
import pandas as pd

from sklearn import svm
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix, precision_score, recall_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import VotingClassifier
import tensorflow_hub as hub 
from keras.layers import Dense, Dropout, Input, Lambda, SpatialDropout1D
from keras.models import  Model
from keras.callbacks import EarlyStopping


import string

In [7]:
def metrics(predictions, y_test):
    tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
    print(f'Verdaderos Negativos: {tn}')
    print(f'Falsos Negativos: {fn}')
    print(f'Verdaderos Positivos: {tp}')
    print(f'Falsos Positivos: {fp}')
    print()
    print(f'precision score: {precision_score(y_test, predictions)}')
    print(f'recall score: {recall_score(y_test, predictions)}')
    print(f'f1 score: {f1_score(y_test,  predictions)}')

In [8]:
# Cargamos USE
embed = hub.load('https://tfhub.dev/google/universal-sentence-encoder-large/5')

# Obtenemos el embedding correspondiente para el tweet
def transform(text):
    vectors = [tf.reshape(embed([line]), [-1]).numpy() for line in text]
    return vectors

In [9]:
# Créditos a este notebook https://www.kaggle.com/nmaguette/up-to-date-list-of-slangs-for-text-preprocessing
abbreviations = {
    "$" : " dollar ",
    "€" : " euro ",
    "4ao" : "for adults only",
    "a.m" : "before midday",
    "a3" : "anytime anywhere anyplace",
    "aamof" : "as a matter of fact",
    "acct" : "account",
    "adih" : "another day in hell",
    "afaic" : "as far as i am concerned",
    "afaict" : "as far as i can tell",
    "afaik" : "as far as i know",
    "afair" : "as far as i remember",
    "afk" : "away from keyboard",
    "app" : "application",
    "approx" : "approximately",
    "apps" : "applications",
    "asap" : "as soon as possible",
    "asl" : "age, sex, location",
    "atk" : "at the keyboard",
    "ave." : "avenue",
    "aymm" : "are you my mother",
    "ayor" : "at your own risk", 
    "b&b" : "bed and breakfast",
    "b+b" : "bed and breakfast",
    "b.c" : "before christ",
    "b2b" : "business to business",
    "b2c" : "business to customer",
    "b4" : "before",
    "b4n" : "bye for now",
    "b@u" : "back at you",
    "bae" : "before anyone else",
    "bak" : "back at keyboard",
    "bbbg" : "bye bye be good",
    "bbc" : "british broadcasting corporation",
    "bbias" : "be back in a second",
    "bbl" : "be back later",
    "bbs" : "be back soon",
    "be4" : "before",
    "bfn" : "bye for now",
    "blvd" : "boulevard",
    "bout" : "about",
    "brb" : "be right back",
    "bros" : "brothers",
    "brt" : "be right there",
    "bsaaw" : "big smile and a wink",
    "btw" : "by the way",
    "bwl" : "bursting with laughter",
    "c/o" : "care of",
    "cet" : "central european time",
    "cf" : "compare",
    "cia" : "central intelligence agency",
    "csl" : "can not stop laughing",
    "cu" : "see you",
    "cul8r" : "see you later",
    "cv" : "curriculum vitae",
    "cwot" : "complete waste of time",
    "cya" : "see you",
    "cyt" : "see you tomorrow",
    "dae" : "does anyone else",
    "dbmib" : "do not bother me i am busy",
    "diy" : "do it yourself",
    "dm" : "direct message",
    "dwh" : "during work hours",
    "e123" : "easy as one two three",
    "eet" : "eastern european time",
    "eg" : "example",
    "embm" : "early morning business meeting",
    "encl" : "enclosed",
    "encl." : "enclosed",
    "etc" : "and so on",
    "faq" : "frequently asked questions",
    "fawc" : "for anyone who cares",
    "fb" : "facebook",
    "fc" : "fingers crossed",
    "fig" : "figure",
    "fimh" : "forever in my heart", 
    "ft." : "feet",
    "ft" : "featuring",
    "ftl" : "for the loss",
    "ftw" : "for the win",
    "fwiw" : "for what it is worth",
    "fyi" : "for your information",
    "g9" : "genius",
    "gahoy" : "get a hold of yourself",
    "gal" : "get a life",
    "gcse" : "general certificate of secondary education",
    "gfn" : "gone for now",
    "gg" : "good game",
    "gl" : "good luck",
    "glhf" : "good luck have fun",
    "gmt" : "greenwich mean time",
    "gmta" : "great minds think alike",
    "gn" : "good night",
    "g.o.a.t" : "greatest of all time",
    "goat" : "greatest of all time",
    "goi" : "get over it",
    "gps" : "global positioning system",
    "gr8" : "great",
    "gratz" : "congratulations",
    "gyal" : "girl",
    "h&c" : "hot and cold",
    "hp" : "horsepower",
    "hr" : "hour",
    "hrh" : "his royal highness",
    "ht" : "height",
    "ibrb" : "i will be right back",
    "ic" : "i see",
    "icq" : "i seek you",
    "icymi" : "in case you missed it",
    "idc" : "i do not care",
    "idgadf" : "i do not give a damn fuck",
    "idgaf" : "i do not give a fuck",
    "idk" : "i do not know",
    "ie" : "that is",
    "i.e" : "that is",
    "ifyp" : "i feel your pain",
    "IG" : "instagram",
    "iirc" : "if i remember correctly",
    "ilu" : "i love you",
    "ily" : "i love you",
    "imho" : "in my humble opinion",
    "imo" : "in my opinion",
    "imu" : "i miss you",
    "iow" : "in other words",
    "irl" : "in real life",
    "j4f" : "just for fun",
    "jic" : "just in case",
    "jk" : "just kidding",
    "jsyk" : "just so you know",
    "l8r" : "later",
    "lb" : "pound",
    "lbs" : "pounds",
    "ldr" : "long distance relationship",
    "lmao" : "laugh my ass off",
    "lmfao" : "laugh my fucking ass off",
    "lol" : "laughing out loud",
    "ltd" : "limited",
    "ltns" : "long time no see",
    "m8" : "mate",
    "mf" : "motherfucker",
    "mfs" : "motherfuckers",
    "mfw" : "my face when",
    "mofo" : "motherfucker",
    "mph" : "miles per hour",
    "mr" : "mister",
    "mrw" : "my reaction when",
    "ms" : "miss",
    "mte" : "my thoughts exactly",
    "nagi" : "not a good idea",
    "nbc" : "national broadcasting company",
    "nbd" : "not big deal",
    "nfs" : "not for sale",
    "ngl" : "not going to lie",
    "nhs" : "national health service",
    "nrn" : "no reply necessary",
    "nsfl" : "not safe for life",
    "nsfw" : "not safe for work",
    "nth" : "nice to have",
    "nvr" : "never",
    "nyc" : "new york city",
    "oc" : "original content",
    "og" : "original",
    "ohp" : "overhead projector",
    "oic" : "oh i see",
    "omdb" : "over my dead body",
    "omg" : "oh my god",
    "omw" : "on my way",
    "p.a" : "per annum",
    "p.m" : "after midday",
    "pm" : "prime minister",
    "poc" : "people of color",
    "pov" : "point of view",
    "pp" : "pages",
    "ppl" : "people",
    "prw" : "parents are watching",
    "ps" : "postscript",
    "pt" : "point",
    "ptb" : "please text back",
    "pto" : "please turn over",
    "qpsa" : "what happens", #"que pasa",
    "ratchet" : "rude",
    "rbtl" : "read between the lines",
    "rlrt" : "real life retweet", 
    "rofl" : "rolling on the floor laughing",
    "roflol" : "rolling on the floor laughing out loud",
    "rotflmao" : "rolling on the floor laughing my ass off",
    "rt" : "retweet",
    "ruok" : "are you ok",
    "sfw" : "safe for work",
    "sk8" : "skate",
    "smh" : "shake my head",
    "sq" : "square",
    "srsly" : "seriously", 
    "ssdd" : "same stuff different day",
    "tbh" : "to be honest",
    "tbs" : "tablespooful",
    "tbsp" : "tablespooful",
    "tfw" : "that feeling when",
    "thks" : "thank you",
    "tho" : "though",
    "thx" : "thank you",
    "tia" : "thanks in advance",
    "til" : "today i learned",
    "tl;dr" : "too long i did not read",
    "tldr" : "too long i did not read",
    "tmb" : "tweet me back",
    "tntl" : "trying not to laugh",
    "ttyl" : "talk to you later",
    "u" : "you",
    "u2" : "you too",
    "u4e" : "yours for ever",
    "utc" : "coordinated universal time",
    "w/" : "with",
    "w/o" : "without",
    "w8" : "wait",
    "wassup" : "what is up",
    "wb" : "welcome back",
    "wtf" : "what the fuck",
    "wtg" : "way to go",
    "wtpa" : "where the party at",
    "wuf" : "where are you from",
    "wuzup" : "what is up",
    "wywh" : "wish you were here",
    "yd" : "yard",
    "ygtr" : "you got that right",
    "ynk" : "you never know",
    "zzz" : "sleeping bored and tired"
}


def convert_abbrev(word):
    return abbreviations[word.lower()] if word.lower() in abbreviations.keys() else word

# Esta lista de contractions la obtuvimos de un notebook de Kaggle también, el cual pone como fuente al siguiente
# post de stackoverflow http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are",
"thx"   : "thanks",
"didnt" : "did not"
}


def remove_contractions(text):
    return contractions[text.lower()] if text.lower() in contractions.keys() else text

def clean_text(text):
    words = text.lower().split(' ')
    words = [convert_abbrev(word) for word in words]
    words = [remove_contractions(word) for word in words]
    text = ' '.join([word for word in words if not word.startswith('@')])
    return text

In [10]:
url_train = 'https://raw.githubusercontent.com/fsicardir/datos-tp2/master/dataset/train.csv?token=AFVAIUVCNNLG2DE4LNMEN2C7HMHQE'
url_test = 'https://raw.githubusercontent.com/fsicardir/datos-tp2/master/dataset/test.csv?token=AFVAIUWNQDPWBVOREJGS2727HMHPG'

df_train = pd.read_csv(url_train)
df_test = pd.read_csv(url_test)

# Quitamos las urls
df_train['text'] = df_train['text'].str.replace(r'http:\/\/.*', '', regex=True).replace(r'https:\/\/.*', '', regex=True)
df_test['text'] = df_test['text'].str.replace(r'http:\/\/.*', '', regex=True).replace(r'https:\/\/.*', '', regex=True)

# Con las keywords concatenadas no dio mejor resultado.
# df_train['keyword'].fillna("unknown", inplace=True)
# df_test['keyword'].fillna("unknown", inplace=True)
# df_train['keyword'] = df_train['keyword'].apply(lambda x: x.replace("%20", " "))
# df_test['keyword'] = df_test['keyword'].apply(lambda x: x.replace("%20", " "))

# df_train['text'] = df_train['text'] + df_train['keyword']
# df_test['text'] = df_test['text'] + df_test['keyword']

df_train['text'] = df_train['text'].apply(clean_text)
df_test['text'] = df_test['text'].apply(clean_text)

In [11]:
vectors_train = transform(df_train.text)
vectors_test = transform(df_test.text)
# Con random_state=1337 da .81 de f1 score.
# El submit de kaggle que dio 0.83 fue hecho con random_state=42 y sin limpiar los datos, ni siquiera URLs!

In [12]:
X_train, X_test, y_train, y_test = train_test_split(vectors_train, df_train.target, test_size=0.2, random_state=42)

In [16]:
svc = svm.SVC(random_state=42)
svc.fit(X_train, y_train)

preds = svc.predict(X_test)
metrics(preds, y_test)

Verdaderos Negativos: 801
Falsos Negativos: 173
Verdaderos Positivos: 476
Falsos Positivos: 73

precision score: 0.8670309653916212
recall score: 0.7334360554699538
f1 score: 0.7946577629382304


In [19]:
kaggle_preds = svc.predict(vectors_test)
df_test['target'] = kaggle_preds
df_test[['id', 'target']].to_csv('use-svm-clean-text.csv', index=False)

# Probemos utilizar KNN y otros clasificadores

In [65]:
knn = KNeighborsClassifier()
params = {
    'n_neighbors': [5, 25, 50],
    'weights': ['uniform', 'distance']
}

grid_knn = GridSearchCV(knn, param_grid=params, n_jobs=-1, cv=5)
grid_knn.fit(X_train, y_train)
knn_preds = grid_knn.predict(X_test)
metrics(knn_preds, y_test)

Verdaderos Negativos: 693
Falsos Negativos: 128
Verdaderos Positivos: 521
Falsos Positivos: 181

precision score: 0.7421652421652422
recall score: 0.802773497688752
f1 score: 0.771280532938564


In [66]:
regr = LogisticRegression(max_iter=1000)
params = {
    'C': [0.001,0.01,0.1,1,10,100,1000]
}
grid_regr = GridSearchCV(regr, param_grid=params)
grid_regr.fit(X_train, y_train)
regr_preds = grid_regr.predict(X_test)
metrics(regr_preds, y_test)
grid_regr.best_params_

Verdaderos Negativos: 778
Falsos Negativos: 167
Verdaderos Positivos: 482
Falsos Positivos: 96

precision score: 0.8339100346020761
recall score: 0.7426810477657936
f1 score: 0.7856560717196415


{'C': 1}

In [67]:
ridge = RidgeClassifier(alpha=12, solver='saga')
ridge.fit(X_train, y_train)
ridge_preds = ridge.predict(X_test)
metrics(ridge_preds, y_test)

Verdaderos Negativos: 778
Falsos Negativos: 176
Verdaderos Positivos: 473
Falsos Positivos: 96

precision score: 0.8312829525483304
recall score: 0.7288135593220338
f1 score: 0.7766830870279147


In [68]:
vcf = VotingClassifier(estimators=[('svc', svc), ('knn', grid_knn.best_estimator_),
                                   ('lr', grid_regr.best_estimator_), ('ridge', ridge)], weights=[2, 1, 1, 1])

In [69]:
vcf.fit(X_train, y_train)
vcf_preds = vcf.predict(X_test)
metrics(vcf_preds, y_test)

Verdaderos Negativos: 789
Falsos Negativos: 166
Verdaderos Positivos: 483
Falsos Positivos: 85

precision score: 0.8503521126760564
recall score: 0.7442218798151001
f1 score: 0.7937551355792933


In [70]:
# 0.82 en Kaggle :()
kaggle_preds = vcf.predict(vectors_test)
df_test['target'] = kaggle_preds
df_test[['id', 'target']].to_csv('use-ensemble.csv', index=False)

# NN

Vamos a intentar usar una red neuronal tradicional de dos capas.

In [71]:
X_train, X_test, y_train, y_test = train_test_split(df_train.text, df_train.target, test_size=0.2, random_state=1337)

In [72]:
DROPOUT_RATE = 0.5
NEURONS = 128
# Mejor resultado hasta ahora con dos capas densas de 128 neuronas y dropout 0.5
input = Input(shape=[], dtype=tf.string)
embedding = hub.KerasLayer('https://tfhub.dev/google/universal-sentence-encoder-large/5', 
                    trainable=False)(input)
dense = Dense(NEURONS, activation='relu')(embedding)
dense = Dropout(DROPOUT_RATE)(dense)
dense = Dense(NEURONS, activation='relu', name='output_for_svc')(dense)
dense = Dropout(DROPOUT_RATE)(dense)                   
prediction = Dense(1, activation='sigmoid')(dense)
model = Model(input, prediction)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None,)]                 0         
_________________________________________________________________
keras_layer (KerasLayer)     (None, 512)               147354880 
_________________________________________________________________
dense (Dense)                (None, 128)               65664     
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
output_for_svc (Dense)       (None, 128)               16512     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                

In [73]:
es = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')
epochs = 10
model.fit(X_train, y_train, epochs=epochs, validation_data=(X_test, y_test), callbacks=[es])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10


<tensorflow.python.keras.callbacks.History at 0x7f7656789d30>

In [74]:
nn_preds = model.predict(X_test)
nn_preds = [1 if x >= 0.5 else 0 for x in nn_preds]
metrics(nn_preds, y_test)

Verdaderos Negativos: 790
Falsos Negativos: 174
Verdaderos Positivos: 490
Falsos Positivos: 69

precision score: 0.8765652951699463
recall score: 0.7379518072289156
f1 score: 0.8013082583810303


In [79]:
nn_kaggle_preds = model.predict(df_test.text)
df_test['target'] = [1 if x >= 0.5 else 0 for x in nn_kaggle_preds]
df_test[['id', 'target']].to_csv('use-nn-v2.csv', index=False)

# SVM + NN


In [76]:
extractor = Model(model.inputs, model.get_layer('output_for_svc').output)
features_train = extractor.predict(X_train)
features_val = extractor.predict(X_test)

In [77]:
svc_nn = svm.SVC()
svc_nn.fit(features_train, y_train)
preds = svc_nn.predict(features_val)
metrics(preds, y_test)

Verdaderos Negativos: 773
Falsos Negativos: 169
Verdaderos Positivos: 495
Falsos Positivos: 86

precision score: 0.8519793459552496
recall score: 0.7454819277108434
f1 score: 0.7951807228915664


In [None]:
#model.save_weights('use-nn.h5')