[View in Colaboratory](https://colab.research.google.com/github/diego-carvalho/tweet-classifier/blob/master/Tweet_classifier_Ngram_model.ipynb)

Para baixar os dados e executar:

Copie este arquivo para o seu Google Drive: https://drive.google.com/open?id=1cEAp9cCI7Z-FZL00gIzDcz1Lq2Agxotp

Troque o valor de `file_id`, na função `load_dataset` da primeira célula de código, pelo id que aparece no link compartilhável do Google Drive.

Para executar uma célula, use `shift-enter` (executa e vai para a próxima célula) ou `ctrl-enter` (só executa a célula).


In [0]:
!pip install -U -q PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from nltk.tokenize import RegexpTokenizer
import re

# Compiled once at import time: clean_tweet runs once per tweet (via .apply),
# so nothing should be rebuilt on every call.
_WORD_RE = re.compile(r'\w+')
_DIGITS_RE = re.compile(r'\d+')


def clean_tweet(x):
  """Normalize a raw tweet into lowercase word tokens without digits.

  The text is split into runs of word characters (``\\w+``), which drops
  punctuation and symbols such as ``@``, ``#`` and ``'``. The tokens are
  joined with single spaces, lowercased, and every run of digits is removed.

  Digits are removed after joining, so a token made only of digits leaves
  two consecutive spaces behind and a mixed token such as ``user123``
  becomes ``user``.

  Params:
    x: The raw tweet text (str).

  Return:
    The cleaned tweet (str).
  """
  # Same tokens as nltk's RegexpTokenizer(r'\w+').tokenize(x), which is a
  # plain findall when gaps=False (the default).
  words = _WORD_RE.findall(x)
  return _DIGITS_RE.sub('', " ".join(words).lower())
  


def load_dataset(seed=42, test_size=0.2, binary=False,
                 file_id="1nHhy8X2MN85qvBUR4PlJZS8j9j-gwp61"):
  """Download the labeled tweets from Google Drive and split them into train and test.

  Params:
    seed: Random seed for the train/test split, so the split is reproducible.
    test_size: Fraction of the data held out for testing.
    binary: If True, the "class" column is collapsed to two labels: rows whose
            class is 1 become 1 and every other row becomes 0. If False the
            original class values are kept.
            NOTE(review): confirm that class 1 is the intended positive class.
    file_id: Google Drive id of the pickled dataset to download.

  Return:
    ((train_tweets, train_labels), (test_tweets, test_labels)) where the
    tweets are the "tweet" column after clean_tweet was applied to each entry
    and the labels are numpy arrays built from the "class" column.
  """
  # Colab authentication; the resulting credentials are handed to PyDrive.
  auth.authenticate_user()
  gauth = GoogleAuth()
  gauth.credentials = GoogleCredentials.get_application_default()
  drive = GoogleDrive(gauth)

  downloaded = drive.CreateFile({'id': file_id})
  downloaded.GetContentFile('labeled_data.p')

  # NOTE(review): unpickling can execute arbitrary code, so only point
  # file_id at a file you trust.
  data_set = pd.read_pickle("labeled_data.p")
  if binary:
    data_set["class"] = data_set["class"].apply(lambda x: 1 if x==1 else 0)
  # random_state makes the shuffle deterministic for a given seed.
  train, test = train_test_split(data_set, test_size=test_size, shuffle=True,
                                 random_state=seed)
  train_tweets = train.tweet.apply(clean_tweet)
  train_labels = train["class"]
  test_tweets = test.tweet.apply(clean_tweet)
  test_labels = test["class"]
  return ((train_tweets, np.array(train_labels)),
          (test_tweets, np.array(test_labels)))
# Download the data and build the binary (two-label) train/test split used by the cells below.
(train, train_labels), (test, test_labels) = load_dataset(binary=True)

In [0]:
# Tokenization + vectorization + TF-IDF encoding
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# Settings read by ngram_vectorizer below.
# Range of n-gram sizes passed to TfidfVectorizer: (1, 2) means unigrams and bigrams.
NGRAM_RANGE = (1,2)
# Upper bound on the number of features kept by SelectKBest.
TOP_K = 20000
# Value passed as TfidfVectorizer's `analyzer`: n-grams are built from words.
TOKEN_MODE = 'word'
# Value passed as TfidfVectorizer's `min_df`: terms found in fewer documents are dropped.
MIN_DOCUMENT_FREQUENCY = 29

def ngram_vectorizer(train, train_labels, test):
  """Encode tweets as TF-IDF n-gram vectors and keep the best-scoring features.

  The vectorizer is configured from the module-level constants NGRAM_RANGE
  (n-gram sizes), TOKEN_MODE (analyzer) and MIN_DOCUMENT_FREQUENCY (min_df).
  Accents are stripped with the 'unicode' strategy and decoding errors are
  replaced. Both the vectorizer and the feature selector are fitted on the
  training data only and then applied to the test data.

  Params:
    train: Training tweets (iterable of strings).
    train_labels: Labels of the training tweets, used to score the features.
    test: Test tweets (iterable of strings).

  Return:
    X_train: Selected TF-IDF features of the training tweets, cast to float32.
    X_test: Selected TF-IDF features of the test tweets, cast to float32.
    vectorizer: The fitted TfidfVectorizer.
    selector: The fitted SelectKBest feature selector.
  """
  vectorizer = TfidfVectorizer(ngram_range=NGRAM_RANGE,
                               strip_accents='unicode', decode_error='replace',
                               analyzer=TOKEN_MODE,
                               min_df=MIN_DOCUMENT_FREQUENCY)

  X_train = vectorizer.fit_transform(train)
  X_test = vectorizer.transform(test)

  # Keep at most TOP_K features, ranked by the f_classif score; the min()
  # guards against vocabularies smaller than TOP_K.
  selector = SelectKBest(f_classif, k=min(TOP_K, X_train.shape[1]))
  selector.fit(X_train, train_labels)
  X_train = selector.transform(X_train).astype('float32')
  X_test = selector.transform(X_test).astype('float32')
  return X_train, X_test, vectorizer, selector



In [0]:
from tensorflow.python.keras import models
from tensorflow.python.keras.layers import Dense, Dropout

def mlp_model(layers, units, input_shape, num_classes,  dropout_rate=0.0):
  """Build the multi-layer perceptron used as the classifier.

  The network is: input dropout, then `layers - 1` hidden blocks
  (Dense with relu followed by dropout), then the output Dense layer.

  Params:
    layers: Total number of Dense layers, counting the output layer.
    units: Number of neurons of the hidden layers. Either a single number
           (used for every hidden layer) or a sequence with one entry per
           hidden layer; entries beyond the first `layers - 1` are ignored.
    input_shape: Shape of one input sample.
    num_classes: Number of classes to predict. Two classes use a single
                 sigmoid output unit; any other count uses one softmax unit
                 per class.
    dropout_rate: Probability of a neuron being dropped; helps against
                  overfitting.

  Return:
    The (uncompiled) Sequential model.

  Raises:
    ValueError: If `units` is a sequence with fewer than `layers - 1` entries.
  """
  hidden_layers = max(layers - 1, 0)
  try:
    hidden_units = list(units)
  except TypeError:
    # A scalar such as units=64: use the same width for every hidden layer.
    hidden_units = [units] * hidden_layers
  if len(hidden_units) < hidden_layers:
    raise ValueError(
        "units has {} entries but {} hidden layers were requested".format(
            len(hidden_units), hidden_layers))

  output_units = num_classes
  if output_units==2:
    output_units=1
    output_activation="sigmoid"
  else:
    output_activation = "softmax"
  # Kept from the original: shows which output activation was selected.
  print(output_activation)
  model = models.Sequential()
  model.add(Dropout(rate=dropout_rate, input_shape=input_shape))

  for layer_units in hidden_units[:hidden_layers]:
    model.add(Dense(units=layer_units, activation='relu',
                    kernel_initializer="lecun_uniform",
                    bias_initializer="zeros"))

    model.add(Dropout(rate=dropout_rate))
  model.add(Dense(units=output_units, activation=output_activation,
                  kernel_initializer="lecun_uniform",
                  bias_initializer="zeros"))
  return model


In [0]:
import tensorflow as tf
def train_ngram_model(data,
                      learning_rate=1e-3,
                      epochs=1000,
                      batch_size=128,
                      layers=2,
                      units=64,
                      dropout_rate=0.0
                     ):
  """Train the n-gram model end to end: vectorize, build, fit and save.

  Params:
    data: Tuple of tuples: ((train, train_labels), (test, test_labels)).
    learning_rate: Step size of the Adam optimizer.
    epochs: Maximum number of passes over the training data; early stopping
            ends training once val_loss stops improving for 2 epochs.
    batch_size: Number of examples used per gradient update.
    layers: Total number of Dense layers, counting the output layer.
    units: Number of neurons of the hidden layers. Either a single number
           (used for every hidden layer) or a sequence with one entry per
           hidden layer.
    dropout_rate: Probability of a neuron being dropped; helps against
                  overfitting.

  Return:
    model: The trained Keras model (also saved to disk).
    vectorizer: The fitted TF-IDF vectorizer.
    selector: The fitted feature selector.
  """

  (X_train, train_labels), (X_test, test_labels) = data
  X_train, X_test, vectorizer, selector = ngram_vectorizer(X_train, train_labels, X_test)
  num_classes = len(np.unique(train_labels))

  # mlp_model reads one width per hidden layer, so a scalar `units` (the
  # default) is broadcast to every hidden layer before the call.
  try:
    iter(units)
  except TypeError:
    units = (units,) * (layers - 1)

  model = mlp_model(layers=layers, units=units, dropout_rate=dropout_rate,
                  input_shape=X_train.shape[1:], num_classes=num_classes)

  # Two classes are trained with a single sigmoid unit, more with softmax.
  if num_classes==2:
    loss="binary_crossentropy"
  else:
    loss="sparse_categorical_crossentropy"
  optimizer = tf.keras.optimizers.Adam(lr=learning_rate)
  model.compile(optimizer=optimizer, loss=loss, metrics=["acc"])

  # NOTE(review): the test split doubles as the validation set that drives
  # early stopping, so the reported validation metrics are not an unbiased
  # estimate on unseen data.
  callbacks= [tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=2)]
  history = model.fit(X_train, train_labels,
                      epochs=epochs, callbacks=callbacks,
                      validation_data=(X_test, test_labels),
                      verbose=1, batch_size=batch_size)
  history = history.history
  # Metrics of the last epoch that ran.
  print('Validation accuracy: {acc}, loss: {loss}'.format(
      acc=history['val_acc'][-1], loss=history['val_loss'][-1]))
  # NOTE(review): the file name looks misspelled and ".md5" is not a usual
  # Keras model extension; kept unchanged so existing loaders still find it.
  model.save("hate_sppech_model.md5")
  return model, vectorizer, selector

In [5]:
# Train on the split loaded above. With layers=3 the network has two hidden
# layers plus the output layer, so only the first two entries of `units` are used.
model, vectorizer, selector = train_ngram_model(((train, train_labels), (test, test_labels)), layers=3, units=(256, 128, 64))

sigmoid
Train on 19826 samples, validate on 4957 samples
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Validation accuracy: 0.8900544680797792, loss: 0.32540685554377513


In [6]:
# Sanity check: score a single hand-written tweet with the trained model.
sample_tweet = "Someone in SF just honked and yelled \"learn to fucking drive.\" It was a driveless car"
# sample_tweet = "fucking hispanics stealing our jobs"
# Apply the same steps as in training: clean the text, vectorize it, then select features.
transformed_tweet = selector.transform(vectorizer.transform([clean_tweet(sample_tweet)]))
# The binary model ends in one sigmoid unit, so the prediction is a single score per tweet.
model.predict(transformed_tweet)

array([[0.91333956]], dtype=float32)