In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

# HatEval English Task B

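This notebook explores the HatEval English Task B data, builds text features, and evaluates classifiers for the target (TR) and aggressiveness (AG) labels of hate-speech tweets.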

## Load data and exploration

First of all, we need to load the dataset from its TSV file into a pandas DataFrame.

In [3]:
from lectura import load_data

# Training split of the English Task B data (tab-separated file).
df = load_data('../../data/train_en_B.tsv')
# Preview only the first rows instead of rendering the whole 9000-row frame.
df.head(10)

Unnamed: 0,id,text,HS,TR,AG
0,201,"Hurray, saving us $$$ in so many ways @potus @...",1,0,0
1,202,Why would young fighting age men be the vast m...,1,0,0
2,203,@KamalaHarris Illegals Dump their Kids at the ...,1,0,0
3,204,NY Times: 'Nearly All White' States Pose 'an A...,0,0,0
4,205,Orban in Brussels: European leaders are ignori...,0,0,0
5,206,@KurtSchlichter LEGAL is. Not illegal. #BuildT...,1,0,0
6,207,@RitaPanahi @826Maureen @RealCandaceO Antifa a...,0,0,0
7,208,Ex-Teacher Pleads Not guilty To Rape Charges h...,0,0,0
8,209,still places on our Bengali (Sylheti) class! i...,0,0,0
9,210,DFID Africa Regional Profile: July 2018 https:...,0,0,0


Let's see the dataset size

In [4]:
# (number of rows, number of columns) of the training frame.
df.shape

(9000, 5)

There are 9000 tweets in total. We can examine how the dataset is distributed over the binary hate-speech label (HS).

In [5]:
# Fraction of tweets carrying each value of the binary hate-speech label (HS).
df.groupby('HS').size() / len(df)

HS
0    0.579667
1    0.420333
dtype: float64

* 5217 (58%) no hate speech
* 3783 (42%) hate speech

In [6]:
# Share of each AG value among the hate-speech tweets (HS == 1) only.
# The denominator is taken from the data instead of being hard-coded, so the
# fractions stay correct if rows are dropped or a different split is loaded.
hs_tweets = df[df['HS'] == 1]
hs_tweets.groupby('AG').size() / len(hs_tweets)

AG
0    0.587893
1    0.412107
dtype: float64

* 2224 (59%) not aggressive
* 1559 (41%) aggressive

In [7]:
# Share of each TR value among the hate-speech tweets (HS == 1) only.
# The denominator is taken from the data instead of being hard-coded, so the
# fractions stay correct if rows are dropped or a different split is loaded.
hs_tweets = df[df['HS'] == 1]
hs_tweets.groupby('TR').size() / len(hs_tweets)

TR
0    0.645519
1    0.354481
dtype: float64

* 2442 (65%) no target
* 1341 (35%) target

We check whether any values are missing.

In [8]:
# Number of missing values in each column.
df.isnull().sum()

id      0
text    0
HS      0
TR      0
AG      0
dtype: int64

Next, we look at general information about the data.

In [9]:
# Column dtypes, non-null counts and memory usage of the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 5 columns):
id      9000 non-null int64
text    9000 non-null object
HS      9000 non-null int64
TR      9000 non-null int64
AG      9000 non-null int64
dtypes: int64(4), object(1)
memory usage: 351.6+ KB


Classifiers work with numeric values, so we check whether any label is stored with dtype `object`; such a label would have to be converted to `int64`.

In [10]:
# Non-numerical columns: keep only the entries of df.dtypes whose dtype is `object`.
df.dtypes.loc[lambda column_dtypes: column_dtypes == object]

text    object
dtype: object

Only the `text` column has dtype `object`, so none of the labels needs to be encoded.

### gsitk preprocessing

In [11]:
from gsitk.preprocess import pprocess_twitter, normalize, simple
import string
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.corpus import stopwords
import re
import emoji
class gsitkTransformer(BaseEstimator, TransformerMixin):
    """Stateless scikit-learn transformer mapping raw documents to token lists.

    Each document is passed through ``pprocess_twitter.preprocess``, stripped
    of a fixed set of markers and characters, tokenized with
    ``normalize.preprocess`` and finally filtered: tokens found in
    ``string.punctuation``, in ``trash``, in ``stop`` or in ``emojis_list``,
    and tokens matched by ``emoji_pattern``, are discarded.
    """

    # Emoji inventory and emoji-matching regex provided by the `emoji` package.
    emojis_list = emoji.UNICODE_EMOJI
    emoji_pattern = emoji.get_emoji_regexp()
    # Tokens that are always discarded (typographic symbols and mis-encoded residue).
    trash = ['“', '”', '—', '’', '–', '«', '»', '・・・', 'tq', '€', '…', '‘', 'elonghashtag', 'âž\x9d', 'ðÿ',
             '¦', '‹ðÿ\x8f', 'â‚¬', '„', '¯', 'ðÿ‡\xadðÿ‡·', 'âšï¸\x8f', '‰', 'ðÿ¤', 'ª', '\x8f',
             'ðÿ¥\x81ðÿ¥\x81', 'ðÿ˜', '♪', '►', '•', '͡°', 'âž\x9d', '¤', '〝']
    # English stop words from the NLTK corpus.
    stop = stopwords.words('english')

    def tokenize_doc(self, doc):
        """Return the list of tokens of ``doc`` that survive cleaning and filtering."""
        cleaned = pprocess_twitter.preprocess(doc)

        # Substitutions are applied strictly in this order; for instance
        # '<hastag>' is rewritten to '<hashtag>' before '<' and '>' are removed.
        substitutions = (
            ('<allcaps>', ''), ('<elong>', ''),
            ('\u2060', ''), ('\u2066', ''), ('¤', ''), ('ª', ''),
            ('<hastag>', '<hashtag>'), ('\u2069', ''), ('…', ''), ('˜', ''),
            ('<', ''), ('>', ''), ('¿', ''), ('—', ''),
            ('¡', ''), ('.user', 'user'),
        )
        for old, new in substitutions:
            cleaned = cleaned.replace(old, new)

        kept = []
        for token in normalize.preprocess(cleaned):
            # `in string.punctuation` is a substring test against the punctuation string.
            if token in string.punctuation or token in self.trash:
                continue
            if token in self.stop or token in self.emojis_list:
                continue
            if self.emoji_pattern.search(token):
                continue
            kept.append(token)
        return kept

    def fit(self, docs, y=None):
        """Nothing is learned; return the transformer itself."""
        return self

    def transform(self, docs):
        """Tokenize every document in ``docs``; returns one token list per document."""
        return list(map(self.tokenize_doc, docs))

## Features

### Word Embeddings

In [12]:
from gsitk.features.word2vec import Word2VecFeatures
# Word-embedding feature extractor backed by the vector file at `w2v_model_path`.
# NOTE(review): the path points into a local NOBACKUP directory, so the file must
# exist at that relative location on the machine running the notebook.
w2v_extractor = Word2VecFeatures(w2v_model_path='../../../../NOBACKUP/crawl-300d-2M.vec', w2v_format='google_txt', convolution=[1,0,0])

### Simon Features

In [13]:
from lectura import lexicon_generation
# Lexicon derived from the training frame by the project helper.
lexicon = lexicon_generation(df)

In [14]:
from collections import Counter

# Improved lexicon: per-class vocabularies ranked from most to least frequent.
textrf = gsitkTransformer()

# Aggressive: every token of the AG == 1 tweets.
agressive = df['text'][df['AG'] == 1].values
agressive_tokens = textrf.transform(agressive)
agressive_tokens = [word for doc in agressive_tokens for word in doc]

contador = Counter(agressive_tokens)
comunes = contador.most_common()
agressive_lexicon = [word for word, _ in comunes]

# Target: every token of the TR == 1 tweets.
target = df['text'][df['TR'] == 1].values
target_tokens = textrf.transform(target)
target_tokens = [word for doc in target_tokens for word in doc]

contador = Counter(target_tokens)
comunes = contador.most_common()

# Keep only the words that are not in the aggressive lexicon. Filtering the
# ranked list (instead of taking a set difference) preserves the frequency
# order and makes the result identical from one run to the next; a set of
# strings has no stable iteration order across interpreter sessions.
agressive_vocabulary = set(agressive_lexicon)
target_lexicon = [word for word, _ in comunes if word not in agressive_vocabulary]
lexicon_mejorado = [agressive_lexicon, target_lexicon]

In [15]:
from collections import Counter

# Improved lexicon: per-class vocabularies ranked from most to least frequent.
# This assignment of `lexicon_mejorado` is the one in effect for later cells.
textrf = gsitkTransformer()

# HS: every token of the hate-speech tweets (HS == 1).
hs = df['text'][df['HS'] == 1].values
hs_tokens = textrf.transform(hs)
hs_tokens = [word for doc in hs_tokens for word in doc]

contador = Counter(hs_tokens)
comunes = contador.most_common()
hs_lexicon = [word for word, _ in comunes]

# NO HS: every token of the non-hate-speech tweets (HS == 0).
no_hs = df['text'][df['HS'] == 0].values
no_hs_tokens = textrf.transform(no_hs)
no_hs_tokens = [word for doc in no_hs_tokens for word in doc]

contador = Counter(no_hs_tokens)
comunes = contador.most_common()

# Keep only the words that never occur in hate-speech tweets. Filtering the
# ranked list (instead of taking a set difference) preserves the frequency
# order and makes the result identical from one run to the next; a set of
# strings has no stable iteration order across interpreter sessions.
hs_vocabulary = set(hs_lexicon)
no_hs_lexicon = [word for word, _ in comunes if word not in hs_vocabulary]
lexicon_mejorado = [hs_lexicon, no_hs_lexicon]

In [None]:
from gsitk.features import simon
from gensim.models.keyedvectors import KeyedVectors

# Load the text-format (binary=False) word vectors; this is the same file that
# is handed to the Word2VecFeatures extractor.
embedding_model = KeyedVectors.load_word2vec_format('../../../../NOBACKUP/crawl-300d-2M.vec', binary=False)
# Simon transformer over the class lexicons in `lexicon_mejorado`, limited to
# 100 words per lexicon, wrapped in the gsitk Simon pipeline.
simon_model = simon.Simon(lexicon=lexicon_mejorado, embedding=embedding_model, n_lexicon_words=100)
similarity_model = simon.simon_pipeline(simon_transformer=simon_model, percentile=100)

#### Sentiment Lexicons

nltk

In [17]:
from nltk.corpus import opinion_lexicon

# Positive and negative word lists from the NLTK opinion lexicon.
postive_words = opinion_lexicon.positive()
negative_words = opinion_lexicon.negative()

# Remove from the negative list every word that is also listed as positive.
# The filter keeps the corpus order (dict.fromkeys also drops repeats), whereas
# a set difference would return the words in an order that changes between
# interpreter sessions and therefore change which words a size-limited
# lexicon ends up using.
positive_vocabulary = set(postive_words)
negative_words = list(dict.fromkeys(
    word for word in negative_words if word not in positive_vocabulary
))
sentiment_lexicon = [postive_words, negative_words]

Afinn

In [None]:
from lectura import read_afinn

# NOTE(review): the doubled '.txt.txt' extension is kept as found — confirm it
# matches the file on disk.
afinn = read_afinn('../../data/AFINN-111.txt.txt')

# The two containers were never created before being indexed, which raised a
# NameError on a fresh kernel; build them here.
# 'afinn' -> [words with a positive value, words with a negative value]
afinn_lexicon = {
    'afinn': [list(afinn.index[afinn['value'] > 0].values), list(afinn.index[afinn['value'] < 0].values)],
}
# 'afinn' -> {word: value}
afinn_lexicon_values = {'afinn': afinn['value'].to_dict()}

ANEW

In [None]:
from lectura import read_anew

anew = read_anew('../../data/all.csv')

# The two containers were never created before being indexed, which raised a
# NameError on a fresh kernel; build them here.
# 'anew' -> [descriptions with a positive value, descriptions with a negative value]
anew_lexicon = {
    'anew': [list(anew.Description[anew['value'] > 0].values), list(anew.Description[anew['value'] < 0].values)],
}
# 'anew' -> {description: value}
anew_lexicon_values = {'anew': anew.set_index('Description')['value'].to_dict()}

Sentiwordnet

In [None]:
# NOTE(review): read_swn was never imported in this notebook; it is presumed to
# live in `lectura` next to read_afinn / read_anew — confirm.
from lectura import read_swn

sentiwn = read_swn()

# The containers were never created before being indexed, and the second one
# was misspelled ('lswn_exicon_values'); build both under consistent names.
# 'swn' -> [words with a positive value, words with a negative value]
swn_lexicon = {
    'swn': [list(sentiwn.index[sentiwn['value'] > 0].values), list(sentiwn.index[sentiwn['value'] < 0].values)],
}
# 'swn' -> {word: value}
swn_lexicon_values = {'swn': sentiwn['value'].to_dict()}

In [22]:
# Simon transformer over the positive/negative opinion lexicon, limited to 250
# words per lexicon, wrapped in the gsitk Simon pipeline with percentile=25.
sentiment_model = simon.Simon(lexicon=sentiment_lexicon, embedding=embedding_model, n_lexicon_words=250)
senti_model = simon.simon_pipeline(simon_transformer=sentiment_model, percentile=25)

In [19]:
# Flatten the sentiment sub-lexicons into one word list, keeping their order.
sentiment_vocabulary = []
for sublexicon in sentiment_lexicon:
    sentiment_vocabulary.extend(sublexicon)

#### Subjectivity Lexicon

In [23]:
from lectura import subjectivity_lexicon_generation
# Subjectivity lexicon produced by the project helper (a collection of
# sub-lexicons, as it is flattened sub-lexicon by sub-lexicon in the next cell).
subjectivity_lexicon = subjectivity_lexicon_generation()
# Simon transformer over that lexicon, limited to 100 words per sub-lexicon.
subjectivity_model = simon.Simon(lexicon=subjectivity_lexicon, embedding=embedding_model, n_lexicon_words=100)
subj_model = simon.simon_pipeline(simon_transformer=subjectivity_model, percentile=100)

In [24]:
# Flatten the subjectivity sub-lexicons into one word list, keeping their order.
subjectivity_vocabulary = []
for sublexicon in subjectivity_lexicon:
    subjectivity_vocabulary.extend(sublexicon)

#### hashtag lexicon

In [25]:
from lectura import hashtag_lexicon_generation
# Hashtag lexicon derived from the training frame; used later as the fixed
# vocabulary of the hashtag TF-IDF vectorizer.
hashtag_lexicon = hashtag_lexicon_generation(df)

### Surface Features

In [26]:
from lectura import english_tokenizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# Word bigram/trigram counts (tokens produced by the project's english_tokenizer)
# re-weighted with TF-IDF.
ngrams_featurizer = Pipeline([
  ('count_vectorizer',  CountVectorizer(ngram_range = (2, 3), encoding = 'ISO-8859-1', 
                                        tokenizer=english_tokenizer)),
  ('tfidf_transformer', TfidfTransformer())
])

## Classification

The estimators and hyperparams to be used will be:

In [722]:
from sklearn import svm, linear_model, tree, ensemble, naive_bayes, neighbors, gaussian_process, multiclass
import numpy as np
from sklearn.neural_network import MLPClassifier
less_than_1 = [.0001,.001,.01,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]

# Candidate regularisation strengths. `less_than_1` already ends with 1.0, so
# the integer range starts at 2; starting it at 1 listed C=1.0 twice and made
# every search over the grid evaluate that setting a second time.
c_grid = np.append(less_than_1, np.arange(2, 16))
penalty_grid = ['l1', 'l2']
tol_grid = [0.0001, 0.001, 0.01, 0.1]

# Estimators and hyper-parameter grids, keyed by a readable name. The 'clf__'
# prefix addresses a pipeline step named 'clf' (presumably created inside the
# project's evaluation helpers — confirm).
classifiers = {
    'logistic_regression':
                   # liblinear is selected explicitly because it accepts both
                   # penalties in the grid; the default solver of recent
                   # scikit-learn releases rejects 'l1'.
                   {'model': linear_model.LogisticRegression(solver='liblinear'),
                    'params':
                    {'clf__penalty': penalty_grid, 'clf__tol': tol_grid,
                     'clf__C': c_grid}},
               'linear_SVC':
                   # dual=False is required for the 'l1' penalty.
                   {'model': svm.LinearSVC(dual=False),
                    'params':
                    {'clf__penalty': penalty_grid, 'clf__tol': tol_grid,
                     'clf__C': c_grid}},
               'random_forest':
                   {'model': ensemble.RandomForestClassifier(),
                    'params':
                    # max_depth candidates 1..10 plus None (unlimited depth).
                    {'clf__n_estimators': np.arange(10, 110, 10), 'clf__max_depth': np.append(np.arange(1, 11), None)}},
}

Now that we have all the features and the estimators it's time to split the dataset into training and test

In [28]:
# Training and test spliting
from sklearn import preprocessing

# Evaluation
from sklearn import metrics
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

Load the test data and explore it in the same way as the training data.

In [29]:
# Held-out split used as test data.
# NOTE(review): the training file is 'train_en_B.tsv' while this one is
# 'dev_en_A.tsv'; it does contain the TR and AG columns — confirm it is the
# intended file for Task B.
df_test = load_data('../../data/dev_en_A.tsv')
# Preview only the first rows instead of rendering the whole frame.
df_test.head(10)

Unnamed: 0,id,text,HS,TR,AG
0,18201,I swear I’m getting to places just in the nick...,0,0,0
1,18202,I’m an immigrant — and Trump is right on immig...,0,0,0
2,18203,#IllegalImmigrants #IllegalAliens #ElectoralSy...,1,0,1
3,18204,@DRUDGE_REPORT We have our own invasion issues...,1,0,1
4,18205,Worker Charged With Sexually Molesting Eight C...,0,0,0
5,18206,"UN picks refugees for UK, picks mostly Syrian ...",0,0,0
6,18207,@htTweets Delhi will be much better under Prez...,0,0,0
7,18208,Global Compact on Migration moves forward http...,0,0,0
8,18209,"Violence toward tourists in Barcelona, Spain -...",0,0,0
9,18210,Immigration decline costing UK economy billion...,0,0,0


In [30]:
# (number of rows, number of columns) of the test frame.
df_test.shape

(1000, 5)

In [31]:
# Fraction of test tweets carrying each value of the hate-speech label (HS).
df_test.groupby('HS').size() / len(df_test)

HS
0    0.573
1    0.427
dtype: float64

In [32]:
# Share of each AG value among the hate-speech test tweets (HS == 1) only.
# The denominator is taken from the data instead of being hard-coded, so the
# fractions stay correct whichever split is loaded.
hs_test_tweets = df_test[df_test['HS'] == 1]
hs_test_tweets.groupby('AG').size() / len(hs_test_tweets)

AG
0    0.522248
1    0.477752
dtype: float64

In [33]:
# Share of each TR value among the hate-speech test tweets (HS == 1) only.
# The denominator is taken from the data instead of being hard-coded, so the
# fractions stay correct whichever split is loaded.
hs_test_tweets = df_test[df_test['HS'] == 1]
hs_test_tweets.groupby('TR').size() / len(hs_test_tweets)

TR
0    0.487119
1    0.512881
dtype: float64

In [34]:
# Tweets that are problematic for the gsitk preprocessing are removed by index label.
# errors='ignore' keeps the cell re-runnable: the in-place drop raised a
# KeyError on the second execution because the row was already gone.
df = df.drop(index=[2149], errors='ignore')
# Row 7122 was also considered but is currently kept:
#df.drop([7122], inplace=True)

In [35]:
# From here on only hate-speech tweets (HS == 1) are used.
# Note that df_test is rebound to its filtered version.
df_train = df.loc[df['HS'] == 1]
df_test = df_test.loc[df_test['HS'] == 1]

In [663]:
# Train Test splitting
X_train = df_train['text'].values
y_train = df_train['TR'].values
X_test = df_test['text'].values
y_test = df_test['TR'].values

In [664]:
# Class balance of the training labels as sorted (label, count) pairs.
print(sorted(Counter(y_train).items()))

[(0, 2441), (1, 1341)]


In [None]:
# Load the 'test_en.tsv' file and show its (rows, columns).
df_evaluation = load_data('../../data/test_en.tsv')
df_evaluation.shape

In [None]:
# NOTE(review): these are the '_es_' files although the notebook is titled
# "English Task B" — confirm the language of the competition data.
df_competition_train = load_data('../../data/train_es_B.tsv')
df_competition_test = load_data('../../data/dev_es_B.tsv')

In [None]:
import pandas as pd
# Stack the competition train and dev frames; the original row labels are
# kept, so labels may repeat in the combined frame.
df_dev = pd.concat([df_competition_train, df_competition_test])
df_dev.shape

In [None]:
# Lexicons rebuilt from the combined competition data.
# The helper imported from `lectura` is called `lexicon_generation`; the name
# `generate_lexicon` used here before is not defined anywhere in the notebook.
dev_lexicon = lexicon_generation(df_dev)
dev_hashtags = hashtag_lexicon_generation(df_dev)

In [None]:
# Keep only the hate-speech rows (HS == 1) of the combined frame.
df_HS = df_dev[df_dev['HS'] == 1]

Feature extraction

In [814]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import metrics
from sklearn.metrics import make_scorer, accuracy_score
from lectura import TextTransformer, LexicalStats, english_tokenizer, PosStats, LowerTransformer, SubjectivityStats
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from lectura import TwitterStats
from nltk.tokenize import TweetTokenizer
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_classif, chi2, RFE
from sklearn.decomposition import PCA

tweet_tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)
gsitk_tokenizer = gsitkTransformer()

# Full feature-extraction pipeline: every branch of the FeatureUnion receives
# the raw tweet texts, the branch outputs are concatenated column-wise, and the
# 2500 best-scoring columns (ANOVA F-test against the labels) are kept.
textFeatures = Pipeline([
    ('features', FeatureUnion([
        # Project-defined statistics, vectorized from dicts.
        ('lexical_stats', Pipeline([
            ('stats', LexicalStats()),
            ('vectors', DictVectorizer()),
            ('scaler', MaxAbsScaler())
        ])),
        ('pos_stats', Pipeline([
            ('pos_stats', PosStats()),
            ('vectors', DictVectorizer())
        ])),
        # Embedding-based branches: tokenize with gsitkTransformer, then apply
        # the word2vec extractor or one of the Simon pipelines.
        ('word_embeddings', Pipeline([
            ('text_to_tokens', gsitkTransformer()),
            ('embedding_features', w2v_extractor),
        ])),
        ('simon_features', Pipeline([
            ('text_to_tokens', gsitkTransformer()),
            ('similarity', similarity_model),
        ])),
        ('sentiments_simon', Pipeline([
            ('text_to_tokens', gsitkTransformer()),
            ('sentiment', senti_model),
        ])),
        ('subjectivity_simon', Pipeline([
            ('text_to_tokens', gsitkTransformer()),
            ('subjectivity_simon', subj_model),
        ])),
        # TF-IDF branches over fixed vocabularies, words, n-grams and characters.
        ('subjectivity', TfidfVectorizer(vocabulary=subjectivity_vocabulary, tokenizer=tweet_tokenizer.tokenize)),
        ('words', TfidfVectorizer(tokenizer=english_tokenizer)),
        ('ngrams', ngrams_featurizer),
        ('character_ngram', TfidfVectorizer(analyzer='char', ngram_range=(2,9))),
        ('sentiments', TfidfVectorizer(vocabulary=sentiment_vocabulary, tokenizer=tweet_tokenizer.tokenize)),
        # NOTE(review): both token_pattern and tokenizer are given here; verify
        # which of the two scikit-learn actually applies for this branch.
        ('emojis', TfidfVectorizer(token_pattern=emoji.get_emoji_regexp(), tokenizer=tweet_tokenizer.tokenize)),
        ('hashtags', Pipeline([
            ('lowercase_transformation', LowerTransformer()),
            ('hashtag_vectorizer', TfidfVectorizer(vocabulary=hashtag_lexicon, tokenizer=tweet_tokenizer.tokenize))
        ])),
        ('twitter stats', Pipeline([
            ('stats', TwitterStats()),
            ('vectors', DictVectorizer()),
            ('scaler', MaxAbsScaler())
        ])),
        ('subjectivity_stats', Pipeline([
            ('stats', SubjectivityStats()),
            ('subjectivity_vectors', DictVectorizer())
        ])),
        # Topic features. The number of topics is passed as `n_components`:
        # the former `n_topics` keyword was deprecated and later removed from
        # LatentDirichletAllocation.
        ('lda', Pipeline([
            ('count', CountVectorizer(tokenizer=english_tokenizer)),
            ('lda', LatentDirichletAllocation(n_components=10, max_iter=5, learning_method='online', learning_offset=50,
                                             random_state=0))
        ]))
    ])),
    ('KBest', SelectKBest(f_classif, k=2500)),
    # Alternative reduction steps that were tried:
    #('Percentile', SelectPercentile(f_classif, percentile=0.75))
    #('PCA', PCA(n_components=0.90, svd_solver='full'))
    #('RFE', RFE(estimator, n_features_to_select=10000, step=100))
])


In [459]:
# Spot check: show the raw text of one training tweet.
X_train[866]

"More of Australia's rejected migrants headed to your American city https://t.co/dV3WozlXTX via @wordpressdotcom What's up with this ? Is USA the dumping ground ?"

In [None]:
from scipy.sparse import hstack, vstack, coo_matrix

# Fit every feature extractor and the feature selection on the training
# tweets (the labels are needed by SelectKBest) and transform them.
features_train = textFeatures.fit_transform(X_train, y_train)
features_train.shape

In [None]:
# Transform the test tweets with the already fitted pipeline (no refitting).
features_test = textFeatures.transform(X_test)
features_test.shape

In [None]:
# Inputs and labels for the two sub-tasks on the combined hate-speech frame.
# X_ag and X_tr hold the same texts; only the label column differs (AG vs TR).
X_ag = df_HS['text'].values
y_ag = df_HS['AG'].values
X_tr = df_HS['text'].values
y_tr = df_HS['TR'].values

In [None]:
# Class balance of the AG labels as sorted (label, count) pairs.
print(sorted(Counter(y_ag).items()))

In [None]:
# NOTE(review): fit_transform refits the shared `textFeatures` pipeline on the
# combined data with the AG labels, replacing the state fitted on X_train.
features_dev = textFeatures.fit_transform(X_ag, y_ag)
features_dev.shape

In [291]:
% pdb

Automatic pdb calling has been turned OFF


#### Under-sampling

In [512]:
from imblearn.under_sampling import RandomUnderSampler
# Randomly under-sample with a fixed seed.
rus = RandomUnderSampler(random_state=42)
# features_train and y_train are overwritten with the resampled versions.
# NOTE(review): the recorded output below shows 10000 columns while the
# pipeline currently selects k=2500 features — the output looks stale.
features_train, y_train = rus.fit_resample(features_train, y_train)
features_train.shape

(3116, 10000)

In [513]:
# Class balance after under-sampling, as sorted (label, count) pairs.
print(sorted(Counter(y_train).items()))

[(0, 1558), (1, 1558)]


### Evaluation

In [287]:
import multiprocessing

# Use about 80% of the available cores for the evaluation, but never fewer
# than one: on a single-core machine int(1 * 0.8) is 0, which is not a valid
# number of parallel jobs.
n_jobs = max(1, int(multiprocessing.cpu_count() * 0.8))

In [None]:
import warnings 
# Silences every warning for the rest of the session, not only during evaluation.
warnings.filterwarnings("ignore")
from evaluation import evaluate
# Run the project's evaluation helper over all configured classifiers using
# the extracted train/test features, then display what it returns.
results = evaluate(features_train, y_train, features_test, y_test, classifiers, n_jobs=n_jobs)
results

### Outputting the results

In [None]:
from evaluation import createModel

# `classifier` was never bound (both candidate assignments were commented
# out), so createModel(classifier) raised a NameError. The aggressive
# configuration is enabled because the model is fitted on the AG labels in
# the next cell.

# Aggressive classifier: k=32500 and non-negative features 0.72
classifier = svm.LinearSVC(dual=False, penalty='l1', tol=0.01, C=0.5)

# Target Classifier
# classifier = ...

model = createModel(classifier)

In [None]:
# Train the final model on the features of the combined data with the AG labels.
model.fit(features_dev, y_ag)

In [None]:
# Load 'en_a.tsv' from the working directory and display it.
# NOTE(review): presumably the Task A predictions whose HS column gates the
# Task B predictions — confirm.
df_predictions = load_data('en_a.tsv')
df_predictions

In [None]:
from tqdm import tqdm

# Fill the AG column row by row. In an itertuples() row, position 0 is the
# index, so row[2] and row[3] are the second and third columns.
# NOTE(review): assumes those columns are the tweet text and the HS label —
# confirm against the frame that `df_results` holds when this cell runs.
for row in tqdm(df_results.itertuples()):
    if row[3] == 0:
        # Tweets not flagged as hate speech are never aggressive.
        aggressive = 0
    else:
        # The prediction for the single tweet is reduced to a scalar label;
        # assigning the raw result stored the whole predict() output in the cell.
        aggressive = np.ravel(model.predict(textFeatures.transform([row[2]])))[0]
    df_results.at[row.Index, 'AG'] = aggressive

In [27]:
# Build the results frame from a copy of the test frame, dropping the text
# and the three label columns.
df_results = df_test.copy()
df_results.drop(columns=['text', 'HS', 'TR', 'AG'], inplace=True)
# NOTE(review): `predicted` is not defined in any visible cell of this
# notebook — it must be computed before this cell can run.
df_results.insert(1, 'prediction', predicted)

In [None]:
# Remove the tweet text before exporting. errors='ignore' makes this a no-op
# when the column is already gone (it is dropped when df_results is built from
# df_test), where the unconditional in-place drop raised a KeyError.
df_results = df_results.drop(columns=['text'], errors='ignore')
# Tab-separated output without header or index column.
# NOTE(review): the file name says 'es' although the notebook is the English
# task — confirm the intended output name.
df_results.to_csv('es_b.tsv', index=False, sep="\t", header=None)