In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

# HatEval Spanish Task B

Put a description

## Load data and exploration

First of all, we need to load the dataset in format CSV to a python DataFrame

In [326]:
from lectura import load_data
df = load_data('../../data/train_es_B.tsv')
df

Unnamed: 0,id,text,HS,TR,AG
0,20001,Easyjet quiere duplicar el número de mujeres p...,1,0,0
1,20002,El gobierno debe crear un control estricto de ...,1,0,0
2,20003,Yo veo a mujeres destruidas por acoso laboral ...,0,0,0
3,20004,"— Yo soy respetuoso con los demás, sólamente l...",0,0,0
4,20007,Antonio Caballero y como ser de mal gusto e ig...,0,0,0
5,20008,@Zurine3 Cuánto hijo de puta sobra en el mundo...,0,0,0
6,20009,Pablo Casado debería estudiar un máster en inm...,0,0,0
7,20010,vos me haces eso te pego una patada en la cara...,0,0,0
8,20012,#Noticiasdelalinea: Detienen en Guatapanal mot...,0,0,0
9,20013,@MaivePerez Llora te lo.mereces por zorra,1,1,1


Let's see the datset size

In [4]:
df.shape

(4469, 5)

A total of 4469 tweets. We can examinate the distribution of the dataset for binary hate speech

In [5]:
df.groupby('HS').size() / df.shape[0]

HS
0    0.588722
1    0.411278
dtype: float64

* 2631 (59%) no hate speech
* 1838 (41%) hate speech

In [6]:
df[df['HS'] == 1].groupby('AG').size() / 1838

AG
0    0.192057
1    0.807943
dtype: float64

* 353 (19%) no agressive
* 1485 (81%) agressive

In [7]:
df[df['HS'] == 1].groupby('TR').size() / 1838

TR
0    0.392274
1    0.607726
dtype: float64

* 721 (39%) no agressive
* 1117 (61%) agressive

We see if there is any null data

In [8]:
df.isnull().sum()

id      0
text    0
HS      0
TR      0
AG      0
dtype: int64

There isn't any null data

We start seeing general information about the data

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4469 entries, 0 to 4468
Data columns (total 5 columns):
id      4469 non-null int64
text    4469 non-null object
HS      4469 non-null int64
TR      4469 non-null int64
AG      4469 non-null int64
dtypes: int64(4), object(1)
memory usage: 174.6+ KB


In [10]:
# We can list non numerical properties, with a boolean indexing of the Series df.dtypes
df.dtypes[df.dtypes == object]

text    object
dtype: object

#### gsitk preprocessing

In [53]:
from gsitk.preprocess import pprocess_twitter, normalize
import string
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.corpus import stopwords
import re
import emoji
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
class gsitkTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, stemming):
        self.stemming = stemming
        self.emojis_list = emoji.UNICODE_EMOJI
        self.emoji_pattern = emoji.get_emoji_regexp()
        self.trash = ['“', '”', '—', '’', '–', '«', '»', '・・・', 'tq', '€', '…', '‘', 'elonghashtag', 'âž\x9d', 'ðÿ',
                     '¦', '‹ðÿ\x8f', 'â‚¬', '„', '¯', 'ðÿ‡\xadðÿ‡·', 'âšï¸\x8f', '‰', 'ðÿ¤', 'ª', '\x8f', 
                     'ðÿ¥\x81ðÿ¥\x81', 'ðÿ˜', '♪', '►', '͡°', '•', 'âž\x9d', '¤', '〝']
    stop = stopwords.words('spanish')
    def tokenize_doc(self, doc):
        text = pprocess_twitter.preprocess(doc).replace('<allcaps>', '').replace('<elong>', '')
        text = text.replace('<hastag>', '<hashtag>').replace('<user>', '<usuario>').replace('<repeat>', '<repetición>')
        text = text.replace('<number>', '<número>').replace('\u2060', '').replace('\u2066', '').replace('¤', '')
        text = text.replace('<', '').replace('>', '').replace('.user', 'user').replace('¿', '').replace('—', '').replace('¡', '')
        text = text.replace('xdd', 'xd').replace('\u2069', '').replace('\u2069', '').replace('…', '').replace('˜', '')
        text = text.replace('ª', '')
        trash = ['“', '”', '—', '’', '–', '«', '»', '・・・', 'tq', '€', '…']        
        tokens = normalize.preprocess(text)
        if self.stemming:
            snow = SnowballStemmer('spanish')
            tokens = [snow.stem(t) for t in tokens]
        tokens = [w for w in tokens if w not in string.punctuation and w not in trash 
                  and w not in self.stop and w not in self.emojis_list
                  and not self.emoji_pattern.search(w)]
        return tokens
    
    def fit(self, docs, y=None):
        return self
    
    def transform(self, docs):
        return [self.tokenize_doc(doc) for doc in docs]        

## Features

### Word Embeddings

In [12]:
from gsitk.features.word2vec import Word2VecFeatures
w2v_extractor = Word2VecFeatures(w2v_model_path='../../../../NOBACKUP/SBW-vectors-300-min5.txt', w2v_format='google_txt', convolution=[1,0,0])

### Simon Features

In [13]:
from lectura import lexicon_generation
lexicon = lexicon_generation(df)

In [498]:
from collections import Counter
# lexicon mejorado
def generate_lexicon(df):
    textrf = gsitkTransformer(False)

    # HS
    hs = df['text'][df['HS'] == 1].values
    hs_tokens = textrf.transform(hs)
    hs_tokens = [word for doc in hs_tokens for word in doc]
    hs_lexicon = []

    contador = Counter(hs_tokens)
    comunes = contador.most_common()

    for tupla in comunes:
        hs_lexicon.append(tupla[0])

    # NO HS
    no_hs = df['text'][df['HS'] == 0].values
    no_hs_tokens = textrf.transform(no_hs)
    no_hs_tokens = [word for doc in no_hs_tokens for word in doc]
    no_hs_lexicon = []

    contador = Counter(no_hs_tokens)
    comunes = contador.most_common()

    for tupla in comunes:
        no_hs_lexicon.append(tupla[0])
    no_hs_lexicon = list(set(no_hs_lexicon) - set(hs_lexicon))
    return [hs_lexicon, no_hs_lexicon]

In [624]:
from gsitk.features import simon
from gensim.models.keyedvectors import KeyedVectors

#embedding_model = KeyedVectors.load_word2vec_format('../../../../NOBACKUP/SBW-vectors-300-min5.txt', binary=False)
simon_model = simon.Simon(lexicon=dev_lexicon, embedding=embedding_model, n_lexicon_words=100)
similarity_model = simon.simon_pipeline(simon_transformer=simon_model, percentile=100)

#### Sentiment lexicon

In [16]:
with open('../../data/positive_words_es.txt') as f:
    positive_words = f.readlines()
positive_words = [word.strip() for word in positive_words]
with open('../../data/negative_words_es.txt') as f:
    negative_words = f.readlines()
negative_words = [word.strip() for word in negative_words]

In [17]:
sentiment_lexicon = [positive_words, negative_words]
sentiment_model = simon.Simon(lexicon=sentiment_lexicon, embedding=embedding_model, n_lexicon_words=250)
senti_model = simon.simon_pipeline(simon_transformer=sentiment_model, percentile=25)

In [18]:
sentiment_vocabulary = [word for sublexicon in sentiment_lexicon for word in sublexicon]

#### Hashtag lexicon

In [19]:
from lectura import hashtag_lexicon_generation
hashtag_lexicon = hashtag_lexicon_generation(df)

### Surface Features

We include the TF-IDF vectorizer

In [20]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.feature_extraction import DictVectorizer
from lectura import spanish_tokenizer

#count_vect = CountVectorizer(ngram_range=(1, 2), stop_words='english')
#X_counts = count_vect.fit_transform(transcription_train)
#X_test = count_vect.transform(transcription_test)
#X_counts.shape
#tfidf_transformer = TfidfTransformer()
#vectors_train = tfidf_transformer.fit_transform(X_counts)
#vectors_test = tfidf_transformer.transform(X_test)
#vectors_train.toarray()

ngrams_featurizer = Pipeline([
  ('count_vectorizer',  CountVectorizer(ngram_range = (2, 3), encoding = 'ISO-8859-1', 
                                        tokenizer=spanish_tokenizer)),
  ('tfidf_transformer', TfidfTransformer())
])

## Classification

The estimators to be used will be:

In [21]:
from sklearn import svm, linear_model, tree, ensemble, naive_bayes, neighbors, gaussian_process, multiclass
import numpy as np
less_than_1 = [.0001,.001,.01,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
classifiers = {'logistic_regression': 
                   {'model': linear_model.LogisticRegression(), 
                    'params': 
                    {'clf__penalty': ['l1','l2'], 'clf__tol': [0.0001,0.001,0.01,0.1], 
                     'clf__C': np.append(less_than_1, np.arange(1,16))}}, 
               'linear_SVC': 
                   {'model': svm.LinearSVC(dual=False), 
                    'params': 
                    {'clf__penalty': ['l1','l2'], 'clf__tol': [0.0001,0.001,0.01,0.1], 
                     'clf__C': np.append(less_than_1, np.arange(1,16))}}, 
               'random_forest': 
                   {'model': ensemble.RandomForestClassifier(), 
                    'params': 
                    {'clf__n_estimators': np.arange(10, 110, 10), 'clf__max_depth': np.append(np.arange(1, 11), None)}} 
}

In [22]:
# Training and test spliting
from sklearn import preprocessing

# Evaluation
from sklearn import metrics
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [24]:
df.groupby('HS').size()/df.shape[0]

HS
0    0.588722
1    0.411278
dtype: float64

Load test data and exploration in the same way than train data.

In [25]:
df_test = load_data('../../data/dev_es_A.tsv')
df_test

Unnamed: 0,id,text,HS,TR,AG
0,20005,Me estoy comiendo la picada árabe más rica de ...,0,0,0
1,20006,@Haryachyzaychyk Callate zorra y mama duro! 😍,1,1,1
2,20011,Acabo de escuchar a Casado diciendo que hay DE...,0,0,0
3,20019,Y NADIE SE HA PREGUNTADO LO QUE LE VA A COSTAR...,1,0,1
4,20033,"@Fed_Durand Callate come sobra, más zorra son ...",1,1,1
5,20039,te quiero hacer mi reina árabe bomboncitocaramel,0,0,0
6,20046,@andreaacata cállate perra JAJAJAJAJAJA,1,1,1
7,20047,"En ""La Reina del Sur"" de @perezreverte , el pe...",0,0,0
8,20063,@AnderssonBoscan Vieja zorra imbécil y MENTIRO...,1,1,1
9,20101,@rjimenez_perez tu eres un hijo de puta perver...,0,0,0


In [26]:
df_test.shape

(500, 5)

In [33]:
df_test.groupby('HS').size() / df_test.shape[0]

HS
0    0.556
1    0.444
dtype: float64

In [34]:
df_test[df_test['HS'] == 1].groupby('AG').size() / 222

AG
0    0.207207
1    0.792793
dtype: float64

In [35]:
df_test[df_test['HS'] == 1].groupby('TR').size() / 222

TR
0    0.382883
1    0.617117
dtype: float64

In [308]:
#vocab_set = set(embedding_model.vocab.keys())
#print(df.shape)
#df = df[df['text'].str.split(' ').apply(lambda toks: len(set(toks) & vocab_set)) > 0]
#print(df.shape)
# Execute this if you use simon or word embeddings (Tweet with problems)
df.drop([2664], inplace=True)

In [330]:
df['text'].values[2665]

'@Manaka_msyra @BrianD_Campbell callate zorra malnacida esta el armario empotrado tmb y tengo mil cosas ahi dentro'

In [338]:
df_train = df[df['HS'] == 1]
df_test = df_test[df_test['HS'] == 1]

In [471]:
# Train Test splitting
X_train = df_train['text'].values
y_train = df_train['TR'].values
X_test = df_test['text'].values
y_test = df_test['TR'].values

In [472]:
print(sorted(Counter(y_train).items()))

[(0, 721), (1, 1117)]


In [473]:
items = Counter(y_train)
diff_train = items[1] - items[0]
diff_train

396

In [474]:
print(sorted(Counter(y_test).items()))

[(0, 85), (1, 137)]


In [475]:
items = Counter(y_test)
diff_test = items[1] - items[0]
diff_test

52

In [544]:
df_evaluation = load_data('../../data/test_es.tsv')
df_evaluation.shape

(1600, 2)

In [545]:
X_evaluation = df_evaluation['text'].values

In [493]:
df_competition_train = load_data('../../data/train_es_B.tsv')
df_competition_test = load_data('../../data/dev_es_B.tsv')
df_competition_train.drop([3356], inplace=True)

In [494]:
import pandas as pd
df_dev = pd.concat([df_competition_train, df_competition_test])
df_dev.shape

(4968, 5)

In [622]:
dev_lexicon = generate_lexicon(df_dev)
dev_hashtags = hashtag_lexicon_generation(df_dev)

In [500]:
df_HS = df_dev[df_dev['HS'] == 1]

#### Oversampling

In [476]:
df_extra = df[df['HS'] == 0]
X_extra = df_extra['text'].values
y_extra = df_extra['TR'].values

In [477]:
X_extra = np.random.choice(X_extra, size=diff_train)
y_extra = y_extra[:diff_train]

In [478]:
X_train = np.append(X_train, X_extra)
y_train = np.append(y_train, y_extra)

In [479]:
assert len(X_train) == len(y_train)
permutation = np.random.permutation(len(X_train))
X_train = X_train[permutation]
y_train = y_train[permutation]

In [480]:
print(sorted(Counter(y_train).items()))

[(0, 1117), (1, 1117)]


In [278]:
df_extra_test = df_test[df['HS'] == 0]
X_extra_test = df_extra_test['text'].values
y_extra_test = df_extra_test['AG'].values
X_extra_test = np.random.choice(X_extra, size=diff_test)
y_extra_test = y_extra[:diff_test]
X_test = np.append(X_test, X_extra_test)
y_test = np.append(y_test, y_extra_test)
assert len(X_test) == len(y_test)
permutation = np.random.permutation(len(X_test))
X_test = X_test[permutation]
y_test = y_test[permutation]

In [481]:
print(sorted(Counter(y_test).items()))

[(0, 85), (1, 137)]


In [634]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import metrics
from sklearn.metrics import make_scorer, accuracy_score
from lectura import TextTransformer, LexicalStats, spanish_tokenizer, LowerTransformer, SubjectivityStats
from sklearn.decomposition import LatentDirichletAllocation
from lectura import TwitterStats, DenseTransformer
import emoji
from nltk.tokenize import TweetTokenizer
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_classif, chi2
from sklearn.decomposition import PCA
from sklearn.preprocessing import MaxAbsScaler

tweet_tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)
gsitk_tokenizer = gsitkTransformer(True)

textFeatures = Pipeline([
    ('features', FeatureUnion([
        ('lexical_stats', Pipeline([
            ('stats', LexicalStats()),
            ('vectors', DictVectorizer()),
            ('scaler', MaxAbsScaler())
        ])),
        ('word_embeddings', Pipeline([
            ('text_to_tokens', gsitkTransformer(False)),
            ('embedding_features', w2v_extractor)
        ])),
        ('simon_features', Pipeline([
            ('text_to_tokens', gsitkTransformer(False)),
            ('similarity', similarity_model)
        ])),
        #('sentiments', Pipeline([
        #   ('text_to_tokens', gsitkTransformer()),
        #   ('sentiment', senti_model)
        #])),
        ('sentiments', TfidfVectorizer(vocabulary=sentiment_vocabulary, tokenizer=tweet_tokenizer.tokenize)),
        ('hashtags', Pipeline([
            ('lowercase_transformation', LowerTransformer()),
            ('hashtag_vectorizer', CountVectorizer(vocabulary=dev_hashtags, tokenizer=tweet_tokenizer.tokenize))            
        ])),
        ('twitter stats', Pipeline([
            ('stats', TwitterStats()),
            ('vectors', DictVectorizer())
        ])),
        ('emojis', TfidfVectorizer(token_pattern=emoji.get_emoji_regexp(), tokenizer=tweet_tokenizer.tokenize)),
        ('words', TfidfVectorizer(tokenizer=gsitk_tokenizer.tokenize_doc)),
        ('ngrams', ngrams_featurizer),
        ('character_ngram', TfidfVectorizer(analyzer='char', ngram_range=(2,7))),
        #('subjectivity_stats', Pipeline([
        #    ('stats', SubjectivityStats()),
        #    ('vectors', DictVectorizer())
        #])),
        ('lda', Pipeline([
            ('count', CountVectorizer(tokenizer=gsitk_tokenizer.tokenize_doc)),
            ('lda', LatentDirichletAllocation(n_topics=10, max_iter=5, learning_method='online', learning_offset=50,
                                             random_state=0))
        ]))
    ])),
    ('KBest', SelectKBest(f_classif, k=2500)),
    #('to_dense', DenseTransformer()),    
    #('Percentile', SelectPercentile(f_classif, percentile=0.75))
    #('PCA', PCA(n_components=0.90, svd_solver='full'))
])

In [337]:
tweet_malo = X_train[382]

In [486]:
from scipy.sparse import hstack, vstack, coo_matrix

features_train = textFeatures.fit_transform(X_train, y_train)
features_train.shape

(2234, 3000)

In [487]:
features_test = textFeatures.transform(X_test)
features_test.shape

(222, 3000)

In [522]:
X_ag = df_HS['text'].values
y_ag = df_HS['AG'].values
X_tr = df_HS['text'].values
y_tr = df_HS['TR'].values

In [524]:
print(sorted(Counter(y_ag).items()))

[(0, 399), (1, 1661)]


In [525]:
items = Counter(y_ag)
diff_ag = items[1] - items[0]
diff_ag

1262

In [526]:
items = Counter(y_tr)
diff_tr = items[1] - items[0]
diff_tr

448

In [527]:
# AG
df_dev_extra = df_dev[df_dev['HS'] == 0]
X_extra = df_dev_extra['text'].values
y_extra = df_dev_extra['AG'].values
X_extra = np.random.choice(X_extra, size=diff_ag)
y_extra = y_extra[:diff_ag]
X_ag = np.append(X_ag, X_extra)
y_ag = np.append(y_ag, y_extra)
assert len(X_ag) == len(y_ag)
permutation = np.random.permutation(len(X_ag))
X_ag = X_ag[permutation]
y_ag = y_ag[permutation]

# TR
X_extra = df_dev_extra['text'].values
y_extra = df_dev_extra['TR'].values
X_extra = np.random.choice(X_extra, size=diff_tr)
y_extra = y_extra[:diff_tr]
X_tr = np.append(X_tr, X_extra)
y_tr = np.append(y_tr, y_extra)
assert len(X_tr) == len(y_tr)
permutation = np.random.permutation(len(X_tr))
X_tr = X_tr[permutation]
y_tr = y_tr[permutation]

In [627]:
print(sorted(Counter(y_tr).items()))

[(0, 1254), (1, 1254)]


In [None]:
features_dev = textFeatures.fit_transform(X_ag, y_ag)
features_dev.shape

In [445]:
% pdb

Automatic pdb calling has been turned OFF


We define our model

### Evaluation

In [115]:
import multiprocessing
n_jobs = int(multiprocessing.cpu_count() * 0.8)

In [488]:
import warnings 
warnings.filterwarnings("ignore")
from evaluation import evaluate
results = evaluate(features_train, y_train, features_test, y_test, classifiers, n_jobs=n_jobs)
results

  0%|          | 0/3 [00:00<?, ?it/s]

logistic_regression
Best score:  0.925252525253
Best params:  {'clf__C': 0.10000000000000001, 'clf__penalty': 'l1', 'clf__tol': 0.0001}




 33%|███▎      | 1/3 [01:13<02:27, 73.94s/it]

linear_SVC
Best score:  0.923744096179
Best params:  {'clf__C': 0.69999999999999996, 'clf__penalty': 'l1', 'clf__tol': 0.1}




 67%|██████▋   | 2/3 [04:12<01:45, 105.29s/it]

random_forest
Best score:  0.925252525253
Best params:  {'clf__max_depth': 8, 'clf__n_estimators': 40}




100%|██████████| 3/3 [05:08<00:00, 90.44s/it] 


{'logistic_regression': {'params': {'memory': None,
   'steps': [('clf',
     LogisticRegression(C=0.10000000000000001, class_weight=None, dual=False,
               fit_intercept=True, intercept_scaling=1, max_iter=100,
               multi_class='warn', n_jobs=None, penalty='l1', random_state=None,
               solver='warn', tol=0.0001, verbose=0, warm_start=False))],
   'clf': LogisticRegression(C=0.10000000000000001, class_weight=None, dual=False,
             fit_intercept=True, intercept_scaling=1, max_iter=100,
             multi_class='warn', n_jobs=None, penalty='l1', random_state=None,
             solver='warn', tol=0.0001, verbose=0, warm_start=False),
   'clf__C': 0.10000000000000001,
   'clf__class_weight': None,
   'clf__dual': False,
   'clf__fit_intercept': True,
   'clf__intercept_scaling': 1,
   'clf__max_iter': 100,
   'clf__multi_class': 'warn',
   'clf__n_jobs': None,
   'clf__penalty': 'l1',
   'clf__random_state': None,
   'clf__solver': 'warn',
   'clf__tol'

### Outputting the results

In [None]:
from evaluation import createModel
classifier = svm.LinearSVC(dual=False, penalty='l2', tol=0.0001, C=0.8)
#classifier = linear_model.LogisticRegression(penalty='l1', C=0.1, tol=0.0001)

model = createModel(classifier)

In [None]:
model.fit(features_dev, y_ag)

In [539]:
df_predictions = load_data('es_a.tsv')
df_predictions

Unnamed: 0,id,HS
0,30344,0
1,30466,0
2,31084,0
3,34552,1
4,32538,1
5,32951,0
6,33564,1
7,31367,1
8,34047,0
9,33491,1


In [646]:
df_results = df_evaluation.copy()
df_results.insert(2, 'HS', df_predictions['HS'].values)
df_results['TR'] = pd.Series(np.zeros(df_results.shape[0])-1, dtype='int64')
df_results['AG'] = pd.Series(np.zeros(df_results.shape[0])-1, dtype='int64')

In [None]:
from tqdm import tqdm
for row in tqdm(df_results.itertuples()):
        df_results.at[row.Index, 'AG'] = 0 if row[3] == 0 else model.predict(textFeatures.transform([row[2]]))

In [647]:
df_results

Unnamed: 0,id,text,HS,TR,AG
0,30344,#CadaMañana cállate la puta que me pario Kohan...,0,-1,-1
1,30466,Estas navidades mi polla mereces,0,-1,-1
2,31084,Si no aprobas te pego una cojida que no te la ...,0,-1,-1
3,34552,"@AlecitoGamer @falklands_utd A, no entendiste ...",1,-1,-1
4,32538,"@deborahhq1973 Es normal, en Cataluña los Inde...",1,-1,-1
5,32951,@Zurine3 Incluyendo mundo educativo. Conociend...,0,-1,-1
6,33564,Tu eres una hija de puta en la cama que hace p...,1,-1,-1
7,31367,@Kaikot Víctimas son los subsaharianos que vie...,1,-1,-1
8,34047,@itsbreeen Soy la puta ama y lo demás son tont...,0,-1,-1
9,33491,La zorra asiática termina con una abundante co...,1,-1,-1


In [645]:
df_results.drop(columns=['text'], inplace=True)
df_results.to_csv('es_b.tsv', index=False, sep="\t", header=None)