In [None]:
!pip install gensim

In [84]:
import pandas as pd
from datetime import datetime
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import tensorflow as tf
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Embedding
from keras.layers import Conv1D, GlobalMaxPool1D
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
!wget -c http://nlp.stanford.edu/data/glove.6B.zip

--2020-08-04 00:15:36--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-08-04 00:15:36--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-08-04 00:15:36--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2020-0

In [30]:
!unzip glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [31]:
url_train = 'https://raw.githubusercontent.com/fsicardir/datos-tp2/master/dataset/train.csv?token=AFVAIUW66UE3NA5X2SYXNPC7GHGJY'
url_test = 'https://raw.githubusercontent.com/fsicardir/datos-tp2/master/dataset/test.csv?token=AFVAIUUSBVEOOMDIFV4GU6C7GHGNK'

read_train = pd.read_csv(url_train)
read_test = pd.read_csv(url_test)

In [32]:
df_train = read_train[['id', 'text', 'target']]
df_test = read_test[['id', 'text']]

In [None]:
# Limpiamos los datos de la forma usual
# Quitamos las urls
df_train['text'] = df_train['text'].str.replace(r'http:\/\/.*', '', regex=True).replace(r'https:\/\/.*', '', regex=True)
df_test['text'] = df_test['text'].str.replace(r'http:\/\/.*', '', regex=True).replace(r'https:\/\/.*', '', regex=True)

# Quitamos user mentions, signos de puntuación, hashtags y stopwords.
def clean_text(text):
    words = text.lower().split(' ')
    text = ' '.join([word for word in words if not word.startswith('@') and word not in stopwords.words('english')])
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

df_train['text'] = df_train['text'].apply(clean_text)
df_test['text'] = df_test['text'].apply(clean_text)


In [37]:
train_tweets = df_train['text'].tolist()
train_target = df_train['target']
test_tweets = df_test['text'].tolist()
len(train_tweets), len(train_target), len(test_tweets)

(7613, 7613, 3263)

In [39]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_tweets)
vocabulary_size = len(tokenizer.word_index) + 1

In [44]:
sequences_train = tokenizer.texts_to_sequences(train_tweets)
len(sequences)

7613

In [45]:
# Sabemos que el tweet más largo en palabras tiene 25 palabras.
# Vamos a agregar el padding "al final"
max_padding = 25
padded_vecs_train = pad_sequences(sequences_train, maxlen=max_padding, padding='post')

In [46]:
# Cargamos los embeddings de GloVe.
glove_dict = {}
with open('glove.6B.100d.txt') as f:
  for line in f:
      values = line.split()
      word = values[0]
      vector = np.asarray(values[1:], dtype='float32')
      glove_dict[word] = vector

In [47]:
# Ahora vamos a crear una matriz que tendrá los embeddings de GloVe
# correspondientes a cada palabra de nuestro vocabulario.
# Esto se lo pasaremos como pesos a la capa de Embedding del modelo a entrenar.
embedding_dim = 100
embedding_matrix = np.zeros((vocabulary_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    embedding_vector = glove_dict.get(word, None)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [65]:
# Vamos a dividir el set de entrenamiento para obtener un set de validación
# y así comenzar a entrenar nuestro modelo.
X_train, X_test, y_train, y_test = train_test_split(padded_vecs_train, train_target, test_size=0.2, random_state=17)

X_train.shape, y_train.shape, X_test.shape, y_test.shape

((6090, 25), (6090,), (1523, 25), (1523,))

In [68]:
# Ahora creemos nuestro modelo
model = Sequential()
emb_layer = Embedding(input_dim=vocabulary_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_padding, trainable=False)

model.add(emb_layer)
model.add(Conv1D(100, 10, activation='relu'))
model.add(GlobalMaxPool1D())
model.add(Dense(1, activation='sigmoid'))
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 25, 100)           1540300   
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 16, 100)           100100    
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 100)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 1,640,501
Trainable params: 100,201
Non-trainable params: 1,540,300
_________________________________________________________________


In [70]:
epochs = 5
model.fit(X_train, y_train, epochs=epochs, validation_data=(X_test, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f23b355d5f8>

In [86]:
def metrics(predictions, y_test):
    tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
    print(f'Verdaderos Negativos: {tn}')
    print(f'Falsos Negativos: {fn}')
    print(f'Verdaderos Positivos: {tp}')
    print(f'Falsos Positivos: {fp}')
    print()
    print(f'f1 score: {f1_score(y_test,  predictions)}')

In [87]:
preds = model.predict_classes(X_test)

metrics(preds, y_test)

Verdaderos Negativos: 766
Falsos Negativos: 242
Verdaderos Positivos: 425
Falsos Positivos: 90

f1 score: 0.71912013536379


In [74]:
# Ahora haremos una predicción para la competencia.
sequences_test = tokenizer.texts_to_sequences(test_tweets)
padded_vecs_test = pad_sequences(sequences_test, maxlen=max_padding, padding='post')

kaggle_preds = model.predict_classes(padded_vecs_test)
len(kaggle_preds)

3263

In [79]:
result = df_test[['id']]
result['target'] = kaggle_preds
result.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1


In [85]:
now = datetime.now(tz=None).strftime('%Y-%m-%dT%H:%M:%S')
result.to_csv(f'prediction_{now}.csv', encoding='utf-8', index=False)