In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

# HatEval English Task A

Put a description

## Load data and exploratin

First of all, we need to load the dataset in format CSV to a python DataFrame

In [3]:
from lectura import load_data
df = load_data('../../data/train_en_A.tsv')
df

Unnamed: 0,id,text,HS,TR,AG
0,201,"Hurray, saving us $$$ in so many ways @potus @...",1,0,0
1,202,Why would young fighting age men be the vast m...,1,0,0
2,203,@KamalaHarris Illegals Dump their Kids at the ...,1,0,0
3,204,NY Times: 'Nearly All White' States Pose 'an A...,0,0,0
4,205,Orban in Brussels: European leaders are ignori...,0,0,0
5,206,@KurtSchlichter LEGAL is. Not illegal. #BuildT...,1,0,0
6,207,@RitaPanahi @826Maureen @RealCandaceO Antifa a...,0,0,0
7,208,Ex-Teacher Pleads Not guilty To Rape Charges h...,0,0,0
8,209,still places on our Bengali (Sylheti) class! i...,0,0,0
9,210,DFID Africa Regional Profile: July 2018 https:...,0,0,0


Let's see the dataset size

In [4]:
df.shape

(9000, 5)

9000 tweets in total. We can examinate the distribution of the dataset for binary hate speech

In [5]:
df.groupby('HS').size() / df.shape[0]

HS
0    0.579667
1    0.420333
dtype: float64

* 5217 (58%) no hate speech
* 3783 (42%) hate speech

We see if there is any null data

In [6]:
df.isnull().sum()

id      0
text    0
HS      0
TR      0
AG      0
dtype: int64

We start seeing general information about the data

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9000 entries, 0 to 8999
Data columns (total 5 columns):
id      9000 non-null int64
text    9000 non-null object
HS      9000 non-null int64
TR      9000 non-null int64
AG      9000 non-null int64
dtypes: int64(4), object(1)
memory usage: 351.6+ KB


Classifiers must work with numbers type so we see if any label is encoded as object in order to transform object types into int64 types.

In [8]:
# We can list non numerical properties, with a boolean indexing of the Series df.dtypes
df.dtypes[df.dtypes == object]

text    object
dtype: object

We do not have to make any encoding

### gsitk preprocessing

In [9]:
from gsitk.preprocess import pprocess_twitter, normalize, simple
import string
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.corpus import stopwords
import re
import emoji
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
class gsitkTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, stemming):
        self.stemming = stemming
        self.emojis_list = emoji.UNICODE_EMOJI
        self.emoji_pattern = emoji.get_emoji_regexp()
        self.trash = ['“', '”', '—', '’', '–', '«', '»', '・・・', 'tq', '€', '…', '‘', 'elonghashtag', 'âž\x9d', 'ðÿ',
                 '¦', '‹ðÿ\x8f', 'â‚¬', '„', '¯', 'ðÿ‡\xadðÿ‡·', 'âšï¸\x8f', '‰', 'ðÿ¤', 'ª', '\x8f', 
                 'ðÿ¥\x81ðÿ¥\x81', 'ðÿ˜', '♪', '►', '•', '͡°', 'âž\x9d', '¤', '〝']
    stop = stopwords.words('english')
    def tokenize_doc(self, doc):
        text = pprocess_twitter.preprocess(doc).replace('<allcaps>', '').replace('<elong>', '')
        text = text.replace('\u2060', '').replace('\u2066', '').replace('¤', '').replace('ª', '')
        text = text.replace('<hastag>', '<hashtag>').replace('\u2069', '').replace('…', '').replace('˜', '')
        text = text.replace('<', '').replace('>', '').replace('¿', '').replace('—', '')
        text = text.replace('¡', '').replace('.user', 'user')       
        
        tokens = normalize.preprocess(text)
        if self.stemming:
            porter = PorterStemmer()
            tokens = [porter.stem(t) for t in tokens]
        tokens = [w for w in tokens if w not in string.punctuation and w not in self.trash 
                  and w not in self.stop and w not in self.emojis_list
                  and not self.emoji_pattern.search(w)]
        return tokens
    
    def fit(self, docs, y=None):
        return self
    
    def transform(self, docs):
        return [self.tokenize_doc(doc) for doc in docs]        

In [10]:
from collections import Counter
textrf = gsitkTransformer()
tokens = textrf.transform(df['text'].values)
tokens = [word for doc in tokens for word in doc]

TypeError: __init__() missing 1 required positional argument: 'stemming'

In [11]:
contador = Counter(tokens)
comunes = contador.most_common()

In [167]:
comunes

[('hashtag', 6350),
 ('user', 6063),
 ('url', 4721),
 ('number', 1653),
 ('repeat', 1274),
 ('bitch', 1064),
 ("'s", 1032),
 ("n't", 881),
 ('refugees', 878),
 ('women', 852),
 ('immigrant', 683),
 ('like', 676),
 ('immigration', 660),
 ('illegal', 582),
 ('men', 523),
 ('migrants', 512),
 ('get', 498),
 ('people', 493),
 ('woman', 466),
 ('rape', 448),
 ('cunt', 442),
 ('trump', 432),
 ('go', 414),
 ('whore', 396),
 ("'re", 378),
 ('immigrants', 374),
 ('buildthatwall', 371),
 ('amp', 366),
 ('us', 363),
 ('one', 358),
 ('want', 355),
 ('fuck', 350),
 ('time', 335),
 ('``', 322),
 ('know', 316),
 ('ass', 306),
 ('refugee', 305),
 ('country', 303),
 ('u', 295),
 ('would', 291),
 ('stop', 290),
 ("''", 288),
 ('via', 287),
 ('back', 287),
 ('fucking', 287),
 ('hysterical', 276),
 ('hoe', 272),
 ('make', 268),
 ('home', 265),
 ('need', 259),
 ('girl', 257),
 ('maga', 255),
 ('stupid', 254),
 ('new', 235),
 ('children', 234),
 ('take', 234),
 ('say', 230),
 ('dick', 230),
 ('right', 220),

In [155]:
"ive" in set(embedding_model.vocab.keys())

True

## Features

### Word Embeddings

In [10]:
from gsitk.features.word2vec import Word2VecFeatures
w2v_extractor = Word2VecFeatures(w2v_model_path='../../../../NOBACKUP/crawl-300d-2M.vec', w2v_format='google_txt', convolution=[1,0,0])

### Simon Features

A bad lexicon

In [11]:
from lectura import lexicon_generation
lexicon = lexicon_generation(df)

A better lexicon

In [475]:
from collections import Counter
# lexicon mejorado
def generate_lexicon(df):
    textrf = gsitkTransformer(False)

    # HS
    hs = df['text'][df['HS'] == 1].values
    hs_tokens = textrf.transform(hs)
    hs_tokens = [word for doc in hs_tokens for word in doc]
    hs_lexicon = []

    contador = Counter(hs_tokens)
    comunes = contador.most_common()

    for tupla in comunes:
        hs_lexicon.append(tupla[0])

    # NO HS
    no_hs = df['text'][df['HS'] == 0].values
    no_hs_tokens = textrf.transform(no_hs)
    no_hs_tokens = [word for doc in no_hs_tokens for word in doc]
    no_hs_lexicon = []

    contador = Counter(no_hs_tokens)
    comunes = contador.most_common()

    for tupla in comunes:
        no_hs_lexicon.append(tupla[0])
    no_hs_lexicon = list(set(no_hs_lexicon) - set(hs_lexicon))
    return [hs_lexicon, no_hs_lexicon]

#### Hatebase lexicon

Inmigrants lexicon

In [115]:
import requests
import json
inmigrants_lexicon = []
url = "https://api.hatebase.org/4-0/get_vocabulary"

for i in range(1,14):
    params = {'token': "wZrgLqUdivhbduafPKwzaAVXrmLQjVhQ", 'language': "ENG", 'page': i}
    s = requests.post(url, data=params).content.decode('utf-8')
    s = json.loads(s)
    for result in s['result']:
        if result['is_about_ethnicity']:
            inmigrants_lexicon.append(result['term'])
#inmigrants_lexicon = [word.lower() for word in inmigrants_lexicon if len(word.split(' ')) < 2]
inmigrants_lexicon = [word.lower() for word in inmigrants_lexicon]

Gender lexicon

In [116]:
gender_lexicon = []
for i in range(1,14):
    params = {'token': "wZrgLqUdivhbduafPKwzaAVXrmLQjVhQ", 'language': "ENG", 'page': i}
    s = requests.post(url, data=params).content.decode('utf-8')
    s = json.loads(s)
    for result in s['result']:
        if result['is_about_gender']:
            gender_lexicon.append(result['term'])
#gender_lexicon = [word.lower() for word in gender_lexicon if len(word.split(' ')) < 2]
gender_lexicon = [word.lower() for word in gender_lexicon]

In [117]:
inmigrants_lexicon = list(set(inmigrants_lexicon) - set(gender_lexicon))
hatebase = inmigrants_lexicon + gender_lexicon
len(hatebase)

994

In [118]:
bound = int(len(hatebase)/2)
hatebase_lexicon = [hatebase[:bound], hatebase[bound:]]

In [119]:
hatebase_vocabulary = [word for sublexicon in hatebase_lexicon for word in sublexicon]

In [546]:
from gsitk.features import simon
from gensim.models.keyedvectors import KeyedVectors

#embedding_model = KeyedVectors.load_word2vec_format('../../../../NOBACKUP/crawl-300d-2M.vec', binary=False)
simon_model = simon.Simon(lexicon=dev_lexicon, embedding=embedding_model, n_lexicon_words=100)
similarity_model = simon.simon_pipeline(simon_transformer=simon_model, percentile=100)

#### Sentiment lexicons

nltk

In [21]:
from nltk.corpus import opinion_lexicon
postive_words = opinion_lexicon.positive()
negative_words = opinion_lexicon.negative()
negative_words = list(set(negative_words) - set(postive_words))
sentiment_lexicon = [postive_words, negative_words]

In [22]:
sentiment_vocabulary = [word for sublexicon in sentiment_lexicon for word in sublexicon]

Afinn

In [23]:
from lectura import read_afinn
afinn = read_afinn('../../data/AFINN-111.txt')
afinn_lexicon = [list(afinn.index[afinn['value'] > 0].values), list(afinn.index[afinn['value'] < 0].values)]

In [24]:
afinn_vocabulary = [word for sublexicon in afinn_lexicon for word in sublexicon]

ANEW

In [25]:
from lectura import read_anew
anew = read_anew('../../data/all.csv')
anew_lexicon = [list(anew.Description[anew['value'] > 0].values), list(anew.Description[anew['value'] < 0].values)]

In [26]:
anew_vocabulary = [word for sublexicon in anew_lexicon for word in sublexicon]

Sentiwordnet

In [27]:
from lectura import read_swn
sentiwn = read_swn()
swn_lexicon = [list(sentiwn.index[sentiwn['value'] > 0].values), list(sentiwn.index[sentiwn['value'] < 0].values)]

In [28]:
swn_vocabulary = [word for sublexicon in swn_lexicon for word in sublexicon]

In [246]:
sentiment_model = simon.Simon(lexicon=sentiment_lexicon, embedding=embedding_model, n_lexicon_words=250)
senti_model = simon.simon_pipeline(simon_transformer=sentiment_model, percentile=25)

#### Subjectivity lexicons

In [30]:
from lectura import subjectivity_lexicon_generation
subjectivity_lexicon = subjectivity_lexicon_generation()
subjectivity_model = simon.Simon(lexicon=subjectivity_lexicon, embedding=embedding_model, n_lexicon_words=100)
subj_model = simon.simon_pipeline(simon_transformer=subjectivity_model, percentile=100)

In [31]:
subjectivity_vocabulary = [word for sublexicon in subjectivity_lexicon for word in sublexicon]

#### Hashtag lexicon

In [32]:
from lectura import hashtag_lexicon_generation
hashtag_lexicon = hashtag_lexicon_generation(df)

### Surface Features

In [33]:
from lectura import english_tokenizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
gsitk_tokenizer = gsitkTransformer(True)

ngrams_featurizer = Pipeline([
  ('count_vectorizer',  CountVectorizer(ngram_range = (2, 3), encoding = 'ISO-8859-1', 
                                        tokenizer=gsitk_tokenizer.tokenize_doc)),
  ('tfidf_transformer', TfidfTransformer())
])

## Classification

The estimators and hyperparams to be used will be:

In [375]:
from sklearn import svm, linear_model, tree, ensemble, naive_bayes, neighbors, gaussian_process, multiclass
import numpy as np
from sklearn.neural_network import MLPClassifier
less_than_1 = [.0001,.001,.01,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
classifiers = {
    'logistic_regression': 
                   {'model': linear_model.LogisticRegression(), 
                    'params': 
                    {'clf__penalty': ['l1','l2'], 'clf__tol': [0.0001,0.001,0.01,0.1], 
                     'clf__C': np.append(less_than_1, np.arange(1,16))}}, 
               'linear_SVC': 
                   {'model': svm.LinearSVC(dual=False), 
                    'params': 
                    {'clf__penalty': ['l1','l2'], 'clf__tol': [0.0001,0.001,0.01,0.1], 
                     'clf__C': np.append(less_than_1, np.arange(1,16))}}, 
               'random_forest': 
                   {'model': ensemble.RandomForestClassifier(), 
                    'params': 
                    {'clf__n_estimators': np.arange(10, 110, 10), 'clf__max_depth': np.append(np.arange(1, 11), None)}},
}

Now that we have all the features and the estimators it's time to split the dataset into training and test

In [35]:
# Training and test spliting
from sklearn import preprocessing

# Evaluation
from sklearn import metrics
from sklearn.model_selection import cross_val_score, KFold, StratifiedKFold, train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [14]:
# vocab_set = set(embedding_model.vocab.keys())
# print(df.shape)
# df = df[df['text'].str.split(' ').apply(lambda toks: len(set(toks) & vocab_set)) > 0]
#print(df.shape)
# Execute this if you are going to use Word embeddings or simon
# Tweets with problems
df.drop([616], inplace=True)
df.drop([681], inplace=True)
df.drop([891], inplace=True)
df.drop([1018], inplace=True)
df.drop([1063], inplace=True)
df.drop([1321], inplace=True)
df.drop([1649], inplace=True)
df.drop([1658+7], inplace=True)
df.drop([1886+8], inplace=True)
df.drop([1890+9], inplace=True)
df.drop([2031+10], inplace=True)
df.drop([2042+11], inplace=True)
df.drop([2484+12], inplace=True)
df.drop([2660+13], inplace=True)
df.drop([3236+14], inplace=True)
df.drop([3405+15], inplace=True)
df.drop([3408+16], inplace=True)
df.drop([3496+17], inplace=True)
df.drop([3529+18], inplace=True)
df.drop([3841+19], inplace=True)
df.drop([4215+20], inplace=True)
df.drop([4856+21], inplace=True)
df.drop([6820+22], inplace=True)
df.drop([6835+23], inplace=True)
df.drop([6844+24], inplace=True)
df.drop([6914+25], inplace=True)
df.drop([7147+26], inplace=True)
df.drop([7185+27], inplace=True)
df.drop([7186+28], inplace=True)
df.drop([7363+29], inplace=True)
df.drop([7413+30], inplace=True)
df.drop([7427+31], inplace=True)
df.drop([7458+32], inplace=True)
df.drop([7464+33], inplace=True)
df.drop([8534+34], inplace=True)

In [36]:
# gsitk problematic tweets
df.drop([2149], inplace=True)
df.drop([7122], inplace=True)

In [37]:
df.groupby('HS').size()/df.shape[0]

HS
0    0.579684
1    0.420316
dtype: float64

Load test data and exploration in the same way than train data.

In [38]:
df_test = load_data('../../data/dev_en_A.tsv')
df_test

Unnamed: 0,id,text,HS,TR,AG
0,18201,I swear I’m getting to places just in the nick...,0,0,0
1,18202,I’m an immigrant — and Trump is right on immig...,0,0,0
2,18203,#IllegalImmigrants #IllegalAliens #ElectoralSy...,1,0,1
3,18204,@DRUDGE_REPORT We have our own invasion issues...,1,0,1
4,18205,Worker Charged With Sexually Molesting Eight C...,0,0,0
5,18206,"UN picks refugees for UK, picks mostly Syrian ...",0,0,0
6,18207,@htTweets Delhi will be much better under Prez...,0,0,0
7,18208,Global Compact on Migration moves forward http...,0,0,0
8,18209,"Violence toward tourists in Barcelona, Spain -...",0,0,0
9,18210,Immigration decline costing UK economy billion...,0,0,0


In [39]:
df_test.shape

(1000, 5)

In [40]:
# Train Test splitting
X_train = df['text'].values
y_train = df['HS'].values
X_test = df_test['text'].values
y_test = df_test['HS'].values

In [548]:
df_evaluation = load_data('../../data/test_en.tsv')
df_evaluation.shape
df_evaluation.drop([1706], inplace=True)

In [549]:
X_evaluation = df_evaluation['text'].values

In [467]:
import pandas as pd
df_dev = pd.concat([df, df_test])
df_dev.shape

(9998, 5)

In [476]:
dev_lexicon = generate_lexicon(df_dev)
dev_hashtags = hashtag_lexicon_generation(df_dev)

Feature extraction

In [547]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn import metrics
from sklearn.metrics import make_scorer, accuracy_score
from lectura import TextTransformer, LexicalStats, english_tokenizer, PosStats, LowerTransformer, SubjectivityStats
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from lectura import TwitterStats, DenseTransformer
from nltk.tokenize import TweetTokenizer
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_classif, chi2
from sklearn.decomposition import PCA

tweet_tokenizer = TweetTokenizer(reduce_len=True, strip_handles=True)
gsitk_tokenizer = gsitkTransformer(True)

textFeatures = Pipeline([
    ('features', FeatureUnion([
        ('lexical_stats', Pipeline([
            ('stats', LexicalStats()),
            ('vectors', DictVectorizer()),
            ('scaler', MaxAbsScaler())
        ])),
        #('hatebase', Pipeline([
        #    ('lowecase_transformation', LowerTransformer()),
        #    ('hatebase_vectorizer', TfidfVectorizer(vocabulary=hatebase_vocabulary, tokenizer=tweet_tokenizer.tokenize))
        #])),
        #('pos_stats', Pipeline([
        #    ('pos_stats', PosStats()),
        #    ('vectors', DictVectorizer())
        #])),
        #('word_embeddings', Pipeline([
        #    ('text_to_tokens', gsitkTransformer(False)),
        #    ('embedding_features', w2v_extractor),
        #])),
        ('simon_features', Pipeline([
            ('text_to_tokens', gsitkTransformer(False)),
            ('similarity', similarity_model),
        ])),
        #('sentiments_simon', Pipeline([
        #    ('text_to_tokens', gsitkTransformer(False)),
        #    ('sentiment', senti_model),
        #])),
        ('subjectivity_simon', Pipeline([
            ('text_to_tokens', gsitkTransformer(False)),
            ('subjectivity_model', subj_model),
        ])),
        #('subjectivity', TfidfVectorizer(vocabulary=subjectivity_vocabulary, tokenizer=tweet_tokenizer.tokenize)),
        #('words', TfidfVectorizer(tokenizer=gsitk_tokenizer.tokenize_doc)),
        ('ngrams', ngrams_featurizer),
        ('character_ngram', TfidfVectorizer(analyzer='char', ngram_range=(2,7))),
        #('sentiments', TfidfVectorizer(vocabulary=sentiment_vocabulary, tokenizer=tweet_tokenizer.tokenize)),
        ('emojis', TfidfVectorizer(token_pattern=emoji.get_emoji_regexp(), tokenizer=tweet_tokenizer.tokenize)),
        ('hashtags', Pipeline([
            ('lowercase_transformation', LowerTransformer()),
            ('hashtag_vectorizer', TfidfVectorizer(vocabulary=dev_hashtags, tokenizer=tweet_tokenizer.tokenize))            
        ])), 
        #('twitter_stats', Pipeline([
        #    ('stats', TwitterStats()),
        #    ('vectors', DictVectorizer()),
        #    ('scaler', MaxAbsScaler())
        #])),
        ('subjectivity_stats', Pipeline([
            ('stats', SubjectivityStats()),
            ('vectors', DictVectorizer()),
        ])),
        #('lda', Pipeline([
        #    ('count', CountVectorizer(tokenizer=english_tokenizer)),
        #    ('lda', LatentDirichletAllocation(n_topics=5, max_iter=5, learning_method='online', learning_offset=50,
        #                                     random_state=0))
        #]))
    ])),
    ('KBest', SelectKBest(f_classif, k=17500))
    #('Percentile', SelectPercentile(f_classif, percentile=0.75))
    #('to_dense', DenseTransformer()), 
    #('PCA', PCA(n_components=0.90, svd_solver='full'))
])


In [543]:
from scipy.sparse import hstack, vstack, coo_matrix

features_train = textFeatures.fit_transform(X_train, y_train)
features_train.shape

(8998, 17500)

In [544]:
features_test = textFeatures.transform(X_test)
features_test.shape

(1000, 17500)

In [550]:
X_dev = df_dev['text'].values
y_dev = df_dev['HS'].values

features_dev = textFeatures.fit_transform(X_dev, y_dev)
features_dev.shape

(9998, 17500)

In [551]:
features_evaluation = textFeatures.transform(X_evaluation)
features_evaluation.shape

(2999, 17500)

In [485]:
% pdb

Automatic pdb calling has been turned OFF


In [486]:
X_evaluation[1706]

'THIS.#IllegalAliens'

### Evaluation

In [44]:
import multiprocessing
n_jobs = int(multiprocessing.cpu_count() * 0.8)

In [545]:
import warnings 
warnings.filterwarnings("ignore")
from evaluation import evaluate
results = evaluate(features_train, y_train, features_test, y_test, classifiers, n_jobs=n_jobs)
results




  0%|          | 0/3 [00:00<?, ?it/s][A[A[A

logistic_regression
Best score:  0.772154451054
Best params:  {'clf__C': 2.0, 'clf__penalty': 'l2', 'clf__tol': 0.0001}







 33%|███▎      | 1/3 [07:34<15:09, 454.58s/it][A[A[A

linear_SVC
Best score:  0.77223619711
Best params:  {'clf__C': 0.29999999999999999, 'clf__penalty': 'l2', 'clf__tol': 0.001}







 67%|██████▋   | 2/3 [20:53<09:17, 557.92s/it][A[A[A

random_forest
Best score:  0.76431433326
Best params:  {'clf__max_depth': None, 'clf__n_estimators': 80}







100%|██████████| 3/3 [22:26<00:00, 418.26s/it][A[A[A


[A[A[A

{'logistic_regression': {'params': {'memory': None,
   'steps': [('clf',
     LogisticRegression(C=2.0, class_weight=None, dual=False, fit_intercept=True,
               intercept_scaling=1, max_iter=100, multi_class='warn',
               n_jobs=None, penalty='l2', random_state=None, solver='warn',
               tol=0.0001, verbose=0, warm_start=False))],
   'clf': LogisticRegression(C=2.0, class_weight=None, dual=False, fit_intercept=True,
             intercept_scaling=1, max_iter=100, multi_class='warn',
             n_jobs=None, penalty='l2', random_state=None, solver='warn',
             tol=0.0001, verbose=0, warm_start=False),
   'clf__C': 2.0,
   'clf__class_weight': None,
   'clf__dual': False,
   'clf__fit_intercept': True,
   'clf__intercept_scaling': 1,
   'clf__max_iter': 100,
   'clf__multi_class': 'warn',
   'clf__n_jobs': None,
   'clf__penalty': 'l2',
   'clf__random_state': None,
   'clf__solver': 'warn',
   'clf__tol': 0.0001,
   'clf__verbose': 0,
   'clf__warm_st

### Outputting the results

In [552]:
from evaluation import createModel
classifier = linear_model.LogisticRegression(C=2, penalty='l2', tol=0.0001)
model = createModel(classifier)

In [553]:
model.fit(features_dev, y_dev)

predicted = model.predict(features_evaluation)

In [554]:
df_results = df_evaluation.copy()
df_results.drop(columns=['text'], inplace=True)
df_results.insert(1, 'prediction', predicted)

In [555]:
ID = np.int64(32278)
label = 1
tweet_raro = {'id': ID, 'prediction': label}

In [556]:
df_results = df_results.append(tweet_raro, ignore_index=True)

In [557]:
df_results

Unnamed: 0,id,prediction
0,31568,1
1,30734,1
2,31227,1
3,31224,1
4,31139,1
5,34076,1
6,32479,1
7,32985,1
8,30284,1
9,30072,1


In [558]:
df_results.to_csv('en_a.tsv', index=False, sep="\t", header=None)

In [559]:
df_results.groupby('prediction').size()

prediction
0     278
1    2722
dtype: int64