In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

# HatEval Spanish Task A

Put a description

## Load data and exploration

First of all, we need to load the dataset in format CSV to a python DataFrame

In [3]:
from lectura import load_data
df = load_data('../../data/train_es_A.tsv')
df

Unnamed: 0,id,text,HS,TR,AG
0,20001,Easyjet quiere duplicar el número de mujeres p...,1,0,0
1,20002,El gobierno debe crear un control estricto de ...,1,0,0
2,20003,Yo veo a mujeres destruidas por acoso laboral ...,0,0,0
3,20004,"— Yo soy respetuoso con los demás, sólamente l...",0,0,0
4,20007,Antonio Caballero y como ser de mal gusto e ig...,0,0,0
5,20008,@Zurine3 Cuánto hijo de puta sobra en el mundo...,0,0,0
6,20009,Pablo Casado debería estudiar un máster en inm...,0,0,0
7,20010,vos me haces eso te pego una patada en la cara...,0,0,0
8,20012,#Noticiasdelalinea: Detienen en Guatapanal mot...,0,0,0
9,20013,@MaivePerez Llora te lo.mereces por zorra,1,1,1


Let's see the dataset size

In [4]:
df.shape

(4469, 5)

A total of 4469 tweets. We can examinate the distribution of the dataset for binary hate speech

In [5]:
df.groupby('HS').size() / df.shape[0]

HS
0    0.588722
1    0.411278
dtype: float64

* 2631 (59%) no hate speech
* 1838 (41%) hate speech

We see if there is any null data

In [6]:
df.isnull().sum()

id      0
text    0
HS      0
TR      0
AG      0
dtype: int64

There isn't any null data

We start seeing general information about the data

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4469 entries, 0 to 4468
Data columns (total 5 columns):
id      4469 non-null int64
text    4469 non-null object
HS      4469 non-null int64
TR      4469 non-null int64
AG      4469 non-null int64
dtypes: int64(4), object(1)
memory usage: 174.6+ KB


In [8]:
# We can list non numerical properties, with a boolean indexing of the Series df.dtypes
df.dtypes[df.dtypes == object]

text    object
dtype: object

### gsitk preprocessing

In [9]:
from gsitk.preprocess import pprocess_twitter, normalize
import string
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.corpus import stopwords
import re
import emoji
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
class gsitkTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, stemming):
        self.stemming = stemming
        self.emojis_list = emoji.UNICODE_EMOJI
        self.emoji_pattern = emoji.get_emoji_regexp()
        self.trash = ['“', '”', '—', '’', '–', '«', '»', '・・・', 'tq', '€', '…', '‘', 'elonghashtag', 'âž\x9d', 'ðÿ',
                     '¦', '‹ðÿ\x8f', 'â‚¬', '„', '¯', 'ðÿ‡\xadðÿ‡·', 'âšï¸\x8f', '‰', 'ðÿ¤', 'ª', '\x8f', 
                     'ðÿ¥\x81ðÿ¥\x81', 'ðÿ˜', '♪', '►', '͡°', '•', 'âž\x9d', '¤', '〝']
    stop = stopwords.words('spanish')
    def tokenize_doc(self, doc):
        text = pprocess_twitter.preprocess(doc).replace('<allcaps>', '').replace('<elong>', '')
        text = text.replace('<hastag>', '<hashtag>').replace('<user>', '<usuario>').replace('<repeat>', '<repetición>')
        text = text.replace('<number>', '<número>').replace('\u2060', '').replace('\u2066', '').replace('¤', '')
        text = text.replace('<', '').replace('>', '').replace('.user', 'user').replace('¿', '').replace('—', '').replace('¡', '')
        text = text.replace('xdd', 'xd').replace('\u2069', '').replace('\u2069', '').replace('…', '').replace('˜', '')
        text = text.replace('ª', '')
        trash = ['“', '”', '—', '’', '–', '«', '»', '・・・', 'tq', '€', '…']        
        tokens = normalize.preprocess(text)
        if self.stemming:
            snow = SnowballStemmer('spanish')
            tokens = [snow.stem(t) for t in tokens]
        tokens = [w for w in tokens if w not in string.punctuation and w not in trash 
                  and w not in self.stop and w not in self.emojis_list
                  and not self.emoji_pattern.search(w)]
        return tokens
    
    def fit(self, docs, y=None):
        return self
    
    def transform(self, docs):
        return [self.tokenize_doc(doc) for doc in docs]        

In [8]:
from collections import Counter
textrf = gsitkTransformer()
tokens = textrf.transform(df['text'].values)
tokens = [word for doc in tokens for word in doc]

In [9]:
contador = Counter(tokens)
comunes = contador.most_common()

In [262]:
comunes[:1000]

[('usuario', 321),
 ('puta', 168),
 ('url', 158),
 ('repetición', 121),
 ('hashtag', 120),
 ('perra', 66),
 ('callate', 53),
 ('si', 53),
 ('número', 50),
 ('zorra', 48),
 ('q', 43),
 ('cállate', 41),
 ('madre', 36),
 ('hijo', 31),
 ('ser', 27),
 ('mierda', 26),
 ('árabe', 25),
 ('mereces', 24),
 ('árabes', 22),
 ('bien', 20),
 ('inmigrantes', 19),
 ('polla', 18),
 ('mejor', 17),
 ('acoso', 17),
 ('solo', 16),
 ('voy', 16),
 ('mujer', 16),
 ('guarra', 15),
 ('enseñar', 15),
 ('subsaharianos', 14),
 ('va', 14),
 ('hace', 14),
 ('vos', 14),
 ('mas', 14),
 ('hacer', 13),
 ('ver', 13),
 ('novia', 13),
 ('mujeres', 13),
 ('españa', 13),
 ('pues', 12),
 ('así', 12),
 ('cara', 12),
 ('día', 12),
 ('gente', 12),
 ('escoria', 12),
 ('vida', 11),
 ('coño', 11),
 ('país', 11),
 ('tan', 11),
 ('refugiados', 11),
 ('merecía', 10),
 ('papeles', 10),
 ('sos', 10),
 ('mundo', 10),
 ('calles', 10),
 ('mal', 10),
 ('basura', 9),
 ('mira', 9),
 ('gran', 9),
 ('pinche', 9),
 ('españoles', 9),
 ('pasa', 9)

In [263]:
"ok" in set(embedding_model.vocab.keys())

True

## Features

### Word Embeddings

In [10]:
from gsitk.features.word2vec import Word2VecFeatures
w2v_extractor = Word2VecFeatures(w2v_model_path='../../../../NOBACKUP/SBW-vectors-300-min5.txt', w2v_format='google_txt', convolution=[1,0,0])

### Simon Features

In [11]:
from lectura import lexicon_generation
lexicon = lexicon_generation(df)

In [74]:
from collections import Counter
# lexicon mejorado
def generate_lexicon(df):
    textrf = gsitkTransformer(False)

    # HS
    hs = df['text'][df['HS'] == 1].values
    hs_tokens = textrf.transform(hs)
    hs_tokens = [word for doc in hs_tokens for word in doc]
    hs_lexicon = []

    contador = Counter(hs_tokens)
    comunes = contador.most_common()

    for tupla in comunes:
        hs_lexicon.append(tupla[0])

    # NO HS
    no_hs = df['text'][df['HS'] == 0].values
    no_hs_tokens = textrf.transform(no_hs)
    no_hs_tokens = [word for doc in no_hs_tokens for word in doc]
    no_hs_lexicon = []

    contador = Counter(no_hs_tokens)
    comunes = contador.most_common()

    for tupla in comunes:
        no_hs_lexicon.append(tupla[0])
    no_hs_lexicon = list(set(no_hs_lexicon) - set(hs_lexicon))
    return [hs_lexicon, no_hs_lexicon]

#### Hatebase lexicon

Inmigrants lexicon

In [87]:
import requests
import json
inmigrants_lexicon = []
url = "https://api.hatebase.org/4-0/get_vocabulary"

for i in range(1,3):
    params = {'token': "WWZiZwNyjwfzcvenJyxikbjHizuxdRmf", 'language': "SPA", 'page': i}
    s = requests.post(url, data=params).content.decode('utf-8')
    s = json.loads(s)
    for result in s['result']:
        if result['is_about_ethnicity']:
            inmigrants_lexicon.append(result['term'])
#inmigrants_lexicon = [word.lower() for word in inmigrants_lexicon if len(word.split(' ')) < 2]
inmigrants_lexicon = [word.lower() for word in inmigrants_lexicon]

Gender Lexicon

In [88]:
gender_lexicon = []
for i in range(1,3):
    params = {'token': "WWZiZwNyjwfzcvenJyxikbjHizuxdRmf", 'language': "SPA", 'page': i}
    s = requests.post(url, data=params).content.decode('utf-8')
    s = json.loads(s)
    for result in s['result']:
        if result['is_about_gender']:
            gender_lexicon.append(result['term'])
#gender_lexicon = [word.lower() for word in gender_lexicon if len(word.split(' ')) < 2]
gender_lexicon = [word.lower() for word in gender_lexicon]

In [91]:
inmigrants_lexicon = list(set(inmigrants_lexicon) - set(gender_lexicon))
hatebase = inmigrants_lexicon + gender_lexicon
len(hatebase)

99

In [92]:
bound = int(len(hatebase)/2)
hatebase_lexicon = [hatebase[:bound], hatebase[bound:]]

In [123]:
from gsitk.features import simon
from gensim.models.keyedvectors import KeyedVectors

#embedding_model = KeyedVectors.load_word2vec_format('../../../../NOBACKUP/SBW-vectors-300-min5.txt', binary=False)
simon_model = simon.Simon(lexicon=hatebase_lexicon, embedding=embedding_model, n_lexicon_words=100)
similarity_model = simon.simon_pipeline(simon_transformer=simon_model, percentile=100)

#### Sentiment lexicon

In [14]:
with open('../../data/positive_words_es.txt') as f:
    positive_words = f.readlines()
positive_words = [word.strip() for word in positive_words]
with open('../../data/negative_words_es.txt') as f:
    negative_words = f.readlines()
negative_words = [word.strip() for word in negative_words]

In [15]:
sentiment_lexicon = [positive_words, negative_words]
sentiment_model = simon.Simon(lexicon=sentiment_lexicon, embedding=embedding_model, n_lexicon_words=250)
senti_model = simon.simon_pipeline(simon_transformer=sentiment_model, percentile=25)

In [16]:
sentiment_vocabulary = [word for sublexicon in sentiment_lexicon for word in sublexicon]

#### Hashtag_lexicon

In [106]:
from lectura import hashtag_lexicon_generation
hashtag_lexicon = hashtag_lexicon_generation(df)

### Surface Features

We include the TF-IDF vectorizer

In [18]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.feature_extraction import DictVectorizer
from lectura import spanish_tokenizer

#count_vect = CountVectorizer(ngram_range=(1, 2), stop_words='english')
#X_counts = count_vect.fit_transform(transcription_train)
#X_test = count_vect.transform(transcription_test)
#X_counts.shape
#tfidf_transformer = TfidfTransformer()
#vectors_train = tfidf_transformer.fit_transform(X_counts)
#vectors_test = tfidf_transformer.transform(X_test)
#vectors_train.toarray()
gsitk_tokenizer = gsitkTransformer(True)


ngrams_featurizer = Pipeline([
  ('count_vectorizer',  CountVectorizer(ngram_range = (2, 3), encoding = 'ISO-8859-1', 
                                        tokenizer=gsitk_tokenizer.tokenize_doc, binary=True)),
  #('tfidf_transformer', TfidfTransformer())
])

## Classification

The estimators to be used will be:

In [19]:
from sklearn import svm, linear_model, tree, ensemble, naive_bayes, neighbors, gaussian_process, multiclass
import numpy as np
less_than_1 = [.0001,.001,.01,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
classifiers = {'logistic_regression': 
                   {'model': linear_model.LogisticRegression(), 
                    'params': 
                    {'clf__penalty': ['l1','l2'], 'clf__tol': [0.0001,0.001,0.01,0.1], 
                     'clf__C': np.append(less_than_1, np.arange(1,16))}}, 
               'linear_SVC': 
                   {'model': svm.LinearSVC(dual=False), 
                    'params': 
                    {'clf__penalty': ['l1','l2'], 'clf__tol': [0.0001,0.001,0.01,0.1], 
                     'clf__C': np.append(less_than_1, np.arange(1,16))}}, 
               'random_forest': 
                   {'model': ensemble.RandomForestClassifier(), 
                    'params': 
                    {'clf__n_estimators': np.arange(10, 110, 10), 'clf__max_depth': np.append(np.arange(1, 11), None)}} 
}

In [20]:
#vocab_set = set(embedding_model.vocab.keys())
#print(df.shape)
#df = df[df['text'].str.split(' ').apply(lambda toks: len(set(toks) & vocab_set)) > 0]
#print(df.shape)
# Execute this if you use simon or word embeddings (Tweet with problems)
df.drop([3356], inplace=True)

In [21]:
df.groupby('HS').size()/df.shape[0]

HS
0    0.58863
1    0.41137
dtype: float64

Load test data and exploration in the same way than train data.

In [22]:
df_test = load_data('../../data/dev_es_A.tsv')
df_test

Unnamed: 0,id,text,HS,TR,AG
0,20005,Me estoy comiendo la picada árabe más rica de ...,0,0,0
1,20006,@Haryachyzaychyk Callate zorra y mama duro! 😍,1,1,1
2,20011,Acabo de escuchar a Casado diciendo que hay DE...,0,0,0
3,20019,Y NADIE SE HA PREGUNTADO LO QUE LE VA A COSTAR...,1,0,1
4,20033,"@Fed_Durand Callate come sobra, más zorra son ...",1,1,1
5,20039,te quiero hacer mi reina árabe bomboncitocaramel,0,0,0
6,20046,@andreaacata cállate perra JAJAJAJAJAJA,1,1,1
7,20047,"En ""La Reina del Sur"" de @perezreverte , el pe...",0,0,0
8,20063,@AnderssonBoscan Vieja zorra imbécil y MENTIRO...,1,1,1
9,20101,@rjimenez_perez tu eres un hijo de puta perver...,0,0,0


In [23]:
df_test.shape

(500, 5)

In [24]:
df_test.groupby('HS').size() / df_test.shape[0]

HS
0    0.556
1    0.444
dtype: float64

In [25]:
# Train Test splitting
X_train = df['text'].values
y_train = df['HS'].values
X_test = df_test['text'].values
y_test = df_test['HS'].values

In [65]:
df_evaluation = load_data('../../data/test_es.tsv')
df_evaluation.shape

(1600, 2)

In [66]:
X_evaluation = df_evaluation['text'].values

In [67]:
import pandas as pd
df_dev = pd.concat([df, df_test])
df_dev.shape

(4968, 5)

In [75]:
dev_lexicon = generate_lexicon(df_dev)
dev_hashtags = hashtag_lexicon_generation(df_dev)

In [144]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import metrics
from sklearn.metrics import make_scorer, accuracy_score
from lectura import TextTransformer, LexicalStats, spanish_tokenizer, LowerTransformer, SubjectivityStats
from sklearn.decomposition import LatentDirichletAllocation
from lectura import TwitterStats, DenseTransformer
import emoji
from nltk.tokenize import TweetTokenizer
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_classif, chi2
from sklearn.decomposition import PCA
from sklearn.preprocessing import MaxAbsScaler

tweet_tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)
gsitk_tokenizer = gsitkTransformer(True)

textFeatures = Pipeline([
    ('features', FeatureUnion([
        ('lexical_stats', Pipeline([
            ('stats', LexicalStats()),
            ('vectors', DictVectorizer()),
            ('scaler', MaxAbsScaler())
        ])),
        ('word_embeddings', Pipeline([
            ('text_to_tokens', gsitkTransformer(False)),
            ('embedding_features', w2v_extractor)
        ])),
        ('simon_features', Pipeline([
            ('text_to_tokens', gsitkTransformer(False)),
            ('similarity', similarity_model)
        ])),
        #('hatebase', Pipeline([
        #    ('lowecase_transformation', LowerTransformer()),
        #    ('hatebase_vectorizer', TfidfVectorizer(vocabulary=hatebase, tokenizer=tweet_tokenizer.tokenize))
        #])),
        #('sentiments', Pipeline([
        #   ('text_to_tokens', gsitkTransformer()),
        #   ('sentiment', senti_model)
        #])),
        ('sentiments', TfidfVectorizer(vocabulary=sentiment_vocabulary, tokenizer=tweet_tokenizer.tokenize)),
        ('hashtags', Pipeline([
            ('lowercase_transformation', LowerTransformer()),
            ('hashtag_vectorizer', CountVectorizer(vocabulary=hashtag_lexicon, tokenizer=tweet_tokenizer.tokenize))            
        ])),
        ('twitter stats', Pipeline([
            ('stats', TwitterStats()),
            ('vectors', DictVectorizer())
        ])),
        ('emojis', TfidfVectorizer(token_pattern=emoji.get_emoji_regexp(), tokenizer=tweet_tokenizer.tokenize)),
        ('words', TfidfVectorizer(tokenizer=gsitk_tokenizer.tokenize_doc)),
        ('ngrams', ngrams_featurizer),
        ('character_ngram', TfidfVectorizer(analyzer='char', ngram_range=(2,7))),
        #('subjectivity_stats', Pipeline([
        #    ('stats', SubjectivityStats()),
        #    ('vectors', DictVectorizer())
        #])),
        ('lda', Pipeline([
            ('count', CountVectorizer(tokenizer=gsitk_tokenizer.tokenize_doc)),
            ('lda', LatentDirichletAllocation(n_topics=10, max_iter=5, learning_method='online', learning_offset=50,
                                             random_state=0))
        ]))
    ])),
    ('KBest', SelectKBest(f_classif, k=4000)),
    #('to_dense', DenseTransformer()),    
    #('Percentile', SelectPercentile(f_classif, percentile=0.75))
    #('PCA', PCA(n_components=0.90, svd_solver='full'))
])

In [121]:
X_train[3356]

'- ¡¡Mi novia es sevillista y es una puta más oeee!! - Casi que prefiero procrear con la serpiente, Adán. - ¡¡Ha sido penalti, Eva!! - Paraíso mis ovarios.'

In [145]:
from scipy.sparse import hstack, vstack, coo_matrix

features_train = textFeatures.fit_transform(X_train, y_train)
features_train.shape

(4468, 4000)

In [146]:
features_test = textFeatures.transform(X_test)
features_test.shape

(500, 4000)

In [79]:
X_dev = df_dev['text'].values
y_dev = df_dev['HS'].values

features_dev = textFeatures.fit_transform(X_dev, y_dev)
features_dev.shape

(4968, 3000)

In [80]:
features_evaluation = textFeatures.transform(X_evaluation)
features_evaluation.shape

(1600, 3000)

In [33]:
% pdb

Automatic pdb calling has been turned OFF


We define our model

### Evaluation

In [57]:
import multiprocessing
n_jobs = int(multiprocessing.cpu_count() * 0.8)

In [147]:
import warnings 
warnings.filterwarnings("ignore")
from evaluation import evaluate
results = evaluate(features_train, y_train, features_test, y_test, classifiers, n_jobs=n_jobs)
results

  0%|          | 0/3 [00:00<?, ?it/s]

logistic_regression
Best score:  0.841060250391
Best params:  {'clf__C': 12.0, 'clf__penalty': 'l2', 'clf__tol': 0.1}




 33%|███▎      | 1/3 [01:49<03:38, 109.19s/it]

linear_SVC
Best score:  0.830179028133
Best params:  {'clf__C': 0.40000000000000002, 'clf__penalty': 'l2', 'clf__tol': 0.0001}




 67%|██████▋   | 2/3 [06:48<02:46, 166.22s/it]

random_forest
Best score:  0.821545050644
Best params:  {'clf__max_depth': None, 'clf__n_estimators': 100}




100%|██████████| 3/3 [08:04<00:00, 139.29s/it]


{'logistic_regression': {'params': {'memory': None,
   'steps': [('clf',
     LogisticRegression(C=12.0, class_weight=None, dual=False, fit_intercept=True,
               intercept_scaling=1, max_iter=100, multi_class='warn',
               n_jobs=None, penalty='l2', random_state=None, solver='warn',
               tol=0.1, verbose=0, warm_start=False))],
   'clf': LogisticRegression(C=12.0, class_weight=None, dual=False, fit_intercept=True,
             intercept_scaling=1, max_iter=100, multi_class='warn',
             n_jobs=None, penalty='l2', random_state=None, solver='warn',
             tol=0.1, verbose=0, warm_start=False),
   'clf__C': 12.0,
   'clf__class_weight': None,
   'clf__dual': False,
   'clf__fit_intercept': True,
   'clf__intercept_scaling': 1,
   'clf__max_iter': 100,
   'clf__multi_class': 'warn',
   'clf__n_jobs': None,
   'clf__penalty': 'l2',
   'clf__random_state': None,
   'clf__solver': 'warn',
   'clf__tol': 0.1,
   'clf__verbose': 0,
   'clf__warm_start': 

### Outputting the results

In [81]:
from evaluation import createModel
classifier = svm.LinearSVC(dual=False, penalty='l2', tol=0.0001, C=0.8)
model = createModel(classifier)

In [82]:
model.fit(features_dev, y_dev)

predicted = model.predict(features_evaluation)

In [84]:
df_results = df_evaluation.copy()
df_results.insert(2, 'HS', df_predictions['HS'].values)
df_results['TR'] = pd.Series(np.zeros(df_results.shape[0])-1, dtype='int64')
df_results['AG'] = pd.Series(np.zeros(df_results.shape[0])-1, dtype='int64')