# Thesis 2020-2021: BERT

In this notebook, we will create a BERT model.

In [24]:
import pandas as pd
import numpy as np
import math

import matplotlib
import matplotlib.pyplot as plt

In [34]:
import re
from pattern.text.en import singularize
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords

tweet_tokenizer = TweetTokenizer()
stop_words = set(stopwords.words('english'))

# Create a function to clean the tweets
def cleanTxt(text):
    text = text.lower() # Convert everything to lower case
    text = re.sub(r'@[a-zA-Z0-9]+', '', text) # Remove @mentions
    text = re.sub(r'rt[\s]+', '', text) # Remove RT (retweet symbol)
    text = re.sub(r'&amp;', 'and', text) # Replace '&amp;' by 'and'
    text = re.sub(r'https?:\/\/\S+', '', text) # Remove hyper link  
    #text = re.sub(r'\d+', '0', text) # Replace all numbers by a zero
    text = " ".join([singularize(word) for word in tweet_tokenizer.tokenize(text) if word not in stop_words]) # Remove stopwords
    #text = " ".join([singularize(word) for word in text])
    #text = re.sub(r'[^\w\s#]', ' ', text) # Remove all non-alphanumeric symbols (excluding whitespace and # characters)
    text = re.sub(r'\s+', ' ', text) # Replace multiple whitespaces by a single whitespace
    text = text.strip() # Remove whitespaces at the beginning and at the end
    
    return text

In [35]:
import csv
    
df_train = pd.read_csv('data/hateval2019_en_train.csv')
df_dev = pd.read_csv('data/hateval2019_en_dev.csv')

df_train_dev = df_train.append(df_dev, ignore_index=True)
df_train_dev = df_train_dev.drop(['TR', 'AG'], axis=1)

df_test = pd.read_csv('data/hateval2019_en_test.csv')
df_test = df_test.drop(['TR', 'AG'], axis=1)

# Clean the data

df_train_dev['text_cleaned'] = df_train_dev['text'].apply(cleanTxt)
df_test['text_cleaned'] = df_test['text'].apply(cleanTxt)
df_train_dev

Unnamed: 0,id,text,HS,text_cleaned
0,201,"Hurray, saving us $$$ in so many ways @potus @...",1,"hurray , saving u $ $ $ many way #lockthemup #..."
1,202,Why would young fighting age men be the vast m...,1,would young fighting age man vast majority one...
2,203,@KamalaHarris Illegals Dump their Kids at the ...,1,illegal dump kid border like road kill refuse ...
3,204,NY Times: 'Nearly All White' States Pose 'an A...,0,ny time : 's nearly white 's state pose 's arr...
4,205,Orban in Brussels: European leaders are ignori...,0,orban brussel : european leader ignoring perso...
...,...,...,...,...
9995,19196,@SamEnvers you unfollowed me? Fuck you pussy,0,unfollowed ? fuck pussy
9996,19197,@DanReynolds STFU BITCH! AND YOU GO MAKE SOME ...,1,stfu bitch ! go make satanic music u illuminat...
9997,19198,"@2beornotbeing Honey, as a fellow white chick,...",0,"honey , fellow white chick , let tell need . s..."
9998,19199,I hate bitches who talk about niggaz with kids...,1,"hate bitch talk niggaz kid , everybody cant fi..."


In [5]:
!pip install transformers



In [36]:
import torch
from transformers import BertTokenizer, BertModel

# OPTIONAL: if you want to have more information on what's happening, activate the logger as follows
import logging
logging.basicConfig(level=logging.INFO)

import matplotlib.pyplot as plt
%matplotlib inline

# Load pre-trained model tokenizer (vocabulary)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [37]:
def preprocess(text):
    return "[CLS] " + text + " [SEP]"

df_train_dev['text_bert'] = df_train_dev['text'].apply(preprocess)
df_train_dev['text_cleaned_bert'] = df_train_dev['text_cleaned'].apply(preprocess)
df_test['text_bert'] = df_test['text'].apply(preprocess)
df_test['text_cleaned_bert'] = df_test['text_cleaned'].apply(preprocess)

# Tokenize our sentence with the BERT tokenizer.
df_train_dev['text_bert'] = df_train_dev['text_bert'].apply(lambda x: tokenizer.tokenize(x))
df_train_dev['text_cleaned_bert'] = df_train_dev['text_cleaned_bert'].apply(lambda x: tokenizer.tokenize(x))
df_test['text_bert'] = df_test['text_bert'].apply(lambda x: tokenizer.tokenize(x))
df_test['text_cleaned_bert'] = df_test['text_cleaned_bert'].apply(lambda x: tokenizer.tokenize(x))

#tokenized_text = tokenizer.tokenize(marked_text)

# Print out the tokens.
df_train_dev

Unnamed: 0,id,text,HS,text_cleaned,text_bert,text_cleaned_bert
0,201,"Hurray, saving us $$$ in so many ways @potus @...",1,"hurray , saving u $ $ $ many way #lockthemup #...","[[CLS], hu, ##rra, ##y, ,, saving, us, $, $, $...","[[CLS], hu, ##rra, ##y, ,, saving, u, $, $, $,..."
1,202,Why would young fighting age men be the vast m...,1,would young fighting age man vast majority one...,"[[CLS], why, would, young, fighting, age, men,...","[[CLS], would, young, fighting, age, man, vast..."
2,203,@KamalaHarris Illegals Dump their Kids at the ...,1,illegal dump kid border like road kill refuse ...,"[[CLS], @, kamal, ##aha, ##rri, ##s, illegal, ...","[[CLS], illegal, dump, kid, border, like, road..."
3,204,NY Times: 'Nearly All White' States Pose 'an A...,0,ny time : 's nearly white 's state pose 's arr...,"[[CLS], ny, times, :, ', nearly, all, white, '...","[[CLS], ny, time, :, ', s, nearly, white, ', s..."
4,205,Orban in Brussels: European leaders are ignori...,0,orban brussel : european leader ignoring perso...,"[[CLS], orb, ##an, in, brussels, :, european, ...","[[CLS], orb, ##an, br, ##uss, ##el, :, europea..."
...,...,...,...,...,...,...
9995,19196,@SamEnvers you unfollowed me? Fuck you pussy,0,unfollowed ? fuck pussy,"[[CLS], @, same, ##n, ##vers, you, un, ##fo, #...","[[CLS], un, ##fo, ##llo, ##wed, ?, fuck, pussy..."
9996,19197,@DanReynolds STFU BITCH! AND YOU GO MAKE SOME ...,1,stfu bitch ! go make satanic music u illuminat...,"[[CLS], @, dan, ##rey, ##no, ##ld, ##s, st, ##...","[[CLS], st, ##fu, bitch, !, go, make, satan, #..."
9997,19198,"@2beornotbeing Honey, as a fellow white chick,...",0,"honey , fellow white chick , let tell need . s...","[[CLS], @, 2, ##be, ##orno, ##t, ##bei, ##ng, ...","[[CLS], honey, ,, fellow, white, chick, ,, let..."
9998,19199,I hate bitches who talk about niggaz with kids...,1,"hate bitch talk niggaz kid , everybody cant fi...","[[CLS], i, hate, bitch, ##es, who, talk, about...","[[CLS], hate, bitch, talk, ni, ##gga, ##z, kid..."


In [41]:
# Map the token strings to their vocabulary indeces.
df_train_dev['text_bert'] = df_train_dev['text_bert'].apply(lambda x: tokenizer.convert_tokens_to_ids(x))
df_train_dev['text_cleaned_bert'] = df_train_dev['text_cleaned_bert'].apply(lambda x: tokenizer.convert_tokens_to_ids(x))
df_test['text_bert'] = df_test['text_bert'].apply(lambda x: tokenizer.convert_tokens_to_ids(x))
df_test['text_cleaned_bert'] = df_test['text_cleaned_bert'].apply(lambda x: tokenizer.convert_tokens_to_ids(x))

df_train_dev

Unnamed: 0,id,text,HS,text_cleaned,text_bert,text_cleaned_bert
0,201,"Hurray, saving us $$$ in so many ways @potus @...",1,"hurray , saving u $ $ $ many way #lockthemup #...","[101, 15876, 11335, 2100, 1010, 7494, 2149, 10...","[101, 15876, 11335, 2100, 1010, 7494, 1057, 10..."
1,202,Why would young fighting age men be the vast m...,1,would young fighting age man vast majority one...,"[101, 2339, 2052, 2402, 3554, 2287, 2273, 2022...","[101, 2052, 2402, 3554, 2287, 2158, 6565, 3484..."
2,203,@KamalaHarris Illegals Dump their Kids at the ...,1,illegal dump kid border like road kill refuse ...,"[101, 1030, 21911, 23278, 18752, 2015, 6206, 2...","[101, 6206, 15653, 4845, 3675, 2066, 2346, 310..."
3,204,NY Times: 'Nearly All White' States Pose 'an A...,0,ny time : 's nearly white 's state pose 's arr...,"[101, 6396, 2335, 1024, 1005, 3053, 2035, 2317...","[101, 6396, 2051, 1024, 1005, 1055, 3053, 2317..."
4,205,Orban in Brussels: European leaders are ignori...,0,orban brussel : european leader ignoring perso...,"[101, 19607, 2319, 1999, 9371, 1024, 2647, 417...","[101, 19607, 2319, 7987, 17854, 2884, 1024, 26..."
...,...,...,...,...,...,...
9995,19196,@SamEnvers you unfollowed me? Fuck you pussy,0,unfollowed ? fuck pussy,"[101, 1030, 2168, 2078, 14028, 2017, 4895, 148...","[101, 4895, 14876, 7174, 15557, 1029, 6616, 22..."
9996,19197,@DanReynolds STFU BITCH! AND YOU GO MAKE SOME ...,1,stfu bitch ! go make satanic music u illuminat...,"[101, 1030, 4907, 15202, 3630, 6392, 2015, 235...","[101, 2358, 11263, 7743, 999, 2175, 2191, 1679..."
9997,19198,"@2beornotbeing Honey, as a fellow white chick,...",0,"honey , fellow white chick , let tell need . s...","[101, 1030, 1016, 4783, 26295, 2102, 19205, 30...","[101, 6861, 1010, 3507, 2317, 14556, 1010, 229..."
9998,19199,I hate bitches who talk about niggaz with kids...,1,"hate bitch talk niggaz kid , everybody cant fi...","[101, 1045, 5223, 7743, 2229, 2040, 2831, 2055...","[101, 5223, 7743, 2831, 9152, 23033, 2480, 484..."


In [44]:
# Load pre-trained model (weights)
model = BertModel.from_pretrained('bert-base-uncased',
                                  output_hidden_states = True, # Whether the model returns all hidden-states.
                                  )

# Put the model in "evaluation" mode, meaning feed-forward operation.
model.eval()


INFO:filelock:Lock 2453532433136 acquired on C:\Users\Admin/.cache\huggingface\transformers\3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e.lock


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=570.0), HTML(value='')))

INFO:filelock:Lock 2453532433136 released on C:\Users\Admin/.cache\huggingface\transformers\3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e.lock





INFO:filelock:Lock 2453517282752 acquired on C:\Users\Admin/.cache\huggingface\transformers\a8041bf617d7f94ea26d15e218abd04afc2004805632abc0ed2066aa16d50d04.faf6ea826ae9c5867d12b22257f9877e6b8367890837bd60f7c54a29633f7f2f.lock


HBox(children=(HTML(value='Downloading'), FloatProgress(value=0.0, max=440473133.0), HTML(value='')))

INFO:filelock:Lock 2453517282752 released on C:\Users\Admin/.cache\huggingface\transformers\a8041bf617d7f94ea26d15e218abd04afc2004805632abc0ed2066aa16d50d04.faf6ea826ae9c5867d12b22257f9877e6b8367890837bd60f7c54a29633f7f2f.lock





Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [50]:
df_train_dev.text_bert

0       [101, 15876, 11335, 2100, 1010, 7494, 2149, 10...
1       [101, 2339, 2052, 2402, 3554, 2287, 2273, 2022...
2       [101, 1030, 21911, 23278, 18752, 2015, 6206, 2...
3       [101, 6396, 2335, 1024, 1005, 3053, 2035, 2317...
4       [101, 19607, 2319, 1999, 9371, 1024, 2647, 417...
                              ...                        
9995    [101, 1030, 2168, 2078, 14028, 2017, 4895, 148...
9996    [101, 1030, 4907, 15202, 3630, 6392, 2015, 235...
9997    [101, 1030, 1016, 4783, 26295, 2102, 19205, 30...
9998    [101, 1045, 5223, 7743, 2229, 2040, 2831, 2055...
9999    [101, 1030, 5754, 3597, 11314, 2121, 1030, 622...
Name: text_bert, Length: 10000, dtype: object

In [71]:
# Mark each of the 22 tokens as belonging to sentence "1".
segments_ids = [1] * len(df_train_dev.text_bert[0])
segments_ids_1 = [1] * len(df_train_dev.text_bert[1])

# Convert inputs to PyTorch tensors
tokens_tensor = torch.tensor([df_train_dev.text_bert[0]])
segments_tensors = torch.tensor([segments_ids])

ValueError: expected sequence of length 45 at dim 1 (got 76)

In [78]:
train_vectors = []
test_vectors = []
counter = 0
# Run the text through BERT, and collect all of the hidden states produced
# from all 12 layers. 
with torch.no_grad():
    for text in df_train_dev.text_bert:
        if counter % 100 == 0:
            print("PROCESS")
        counter += 1
        # Convert inputs to PyTorch tensors
        tokens_tensor = torch.tensor([text])
        segments_tensors = torch.tensor([[1] * len(text)])
        
        outputs = model(tokens_tensor, segments_tensors)
        hidden_states = outputs[2]
        train_vectors.append(torch.mean(hidden_states[-2][0], dim=0).tolist())

    
    
    
    
    # Evaluating the model will return a different number of objects based on 
    # how it's  configured in the `from_pretrained` call earlier. In this case, 
    # becase we set `output_hidden_states = True`, the third item will be the 
    # hidden states from all layers. See the documentation for more details:
    # https://huggingface.co/transformers/model_doc/bert.html#bertmodel
    #hidden_states = outputs[2]

PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS
PROCESS


In [79]:
test_vectors = []
counter = 0
# Run the text through BERT, and collect all of the hidden states produced
# from all 12 layers. 
with torch.no_grad():
    for text in df_test.text_bert:
        if counter % 200 == 0:
            print("PROCESS:", counter)
        counter += 1
        # Convert inputs to PyTorch tensors
        tokens_tensor = torch.tensor([text])
        segments_tensors = torch.tensor([[1] * len(text)])
        
        outputs = model(tokens_tensor, segments_tensors)
        hidden_states = outputs[2]
        test_vectors.append(torch.mean(hidden_states[-2][0], dim=0).tolist())

PROCESS: 0
PROCESS: 200
PROCESS: 400
PROCESS: 600
PROCESS: 800
PROCESS: 1000
PROCESS: 1200
PROCESS: 1400
PROCESS: 1600
PROCESS: 1800
PROCESS: 2000
PROCESS: 2200
PROCESS: 2400
PROCESS: 2600
PROCESS: 2800


In [80]:
train_vectors_cleaned = []
counter = 0
# Run the text through BERT, and collect all of the hidden states produced
# from all 12 layers. 
with torch.no_grad():
    for text in df_train_dev.text_cleaned_bert:
        if counter % 200 == 0:
            print("PROCESS:", counter)
        counter += 1
        # Convert inputs to PyTorch tensors
        tokens_tensor = torch.tensor([text])
        segments_tensors = torch.tensor([[1] * len(text)])
        
        outputs = model(tokens_tensor, segments_tensors)
        hidden_states = outputs[2]
        train_vectors_cleaned.append(torch.mean(hidden_states[-2][0], dim=0).tolist())

PROCESS: 0
PROCESS: 200
PROCESS: 400
PROCESS: 600
PROCESS: 800
PROCESS: 1000
PROCESS: 1200
PROCESS: 1400
PROCESS: 1600
PROCESS: 1800
PROCESS: 2000
PROCESS: 2200
PROCESS: 2400
PROCESS: 2600
PROCESS: 2800
PROCESS: 3000
PROCESS: 3200
PROCESS: 3400
PROCESS: 3600
PROCESS: 3800
PROCESS: 4000
PROCESS: 4200
PROCESS: 4400
PROCESS: 4600
PROCESS: 4800
PROCESS: 5000
PROCESS: 5200
PROCESS: 5400
PROCESS: 5600
PROCESS: 5800
PROCESS: 6000
PROCESS: 6200
PROCESS: 6400
PROCESS: 6600
PROCESS: 6800
PROCESS: 7000
PROCESS: 7200
PROCESS: 7400
PROCESS: 7600
PROCESS: 7800
PROCESS: 8000
PROCESS: 8200
PROCESS: 8400
PROCESS: 8600
PROCESS: 8800
PROCESS: 9000
PROCESS: 9200
PROCESS: 9400
PROCESS: 9600
PROCESS: 9800


In [81]:
test_vectors_cleaned = []
counter = 0
# Run the text through BERT, and collect all of the hidden states produced
# from all 12 layers. 
with torch.no_grad():
    for text in df_test.text_cleaned_bert:
        if counter % 200 == 0:
            print("PROCESS:", counter)
        counter += 1
        # Convert inputs to PyTorch tensors
        tokens_tensor = torch.tensor([text])
        segments_tensors = torch.tensor([[1] * len(text)])
        
        outputs = model(tokens_tensor, segments_tensors)
        hidden_states = outputs[2]
        test_vectors_cleaned.append(torch.mean(hidden_states[-2][0], dim=0).tolist())

PROCESS: 0
PROCESS: 200
PROCESS: 400
PROCESS: 600
PROCESS: 800
PROCESS: 1000
PROCESS: 1200
PROCESS: 1400
PROCESS: 1600
PROCESS: 1800
PROCESS: 2000
PROCESS: 2200
PROCESS: 2400
PROCESS: 2600
PROCESS: 2800


In [82]:
y_train = df_train_dev.HS
y_test = df_test.HS

In [84]:
import import_ipynb
import evaluate # here we import the local evaluate.ipynb jupyter notebook
from sklearn.linear_model import LogisticRegression

# Evaluate using Logistic Regression as the classifier (without normalized data)

logreg = LogisticRegression().fit(train_vectors, y_train)
y_predict = logreg.predict(test_vectors)

# Create new test dataframe
df_test_bert = df_test.copy()
df_test_bert['HS'] = y_predict

# Create prediction file for the bert
df_test_bert[['id', 'HS']].to_csv('predictions/bert.tsv', sep='\t', index=False, header=False)
df_test_bert[['id', 'HS']].to_csv('input/res/en_a.tsv', sep='\t', index=False, header=False)

# Evaluate the result of the bert
evaluate.write_eval("scores_bert")

importing Jupyter notebook from evaluate.ipynb
taskA_fscore: 0.5462134785788648
taskA_precision: 0.6515250130962755
taskA_recall: 0.6093732895457034
taskA_accuracy: 0.563


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [86]:
import import_ipynb
import evaluate # here we import the local evaluate.ipynb jupyter notebook
from sklearn.linear_model import LogisticRegression

# Evaluate using Logistic Regression as the classifier (without normalized data)

logreg = LogisticRegression().fit(train_vectors_cleaned, y_train)
y_predict_cleaned = logreg.predict(test_vectors_cleaned)

# Create new test dataframe
df_test_bert_cleaned = df_test.copy()
df_test_bert_cleaned['HS'] = y_predict_cleaned

# Create prediction file for the bert_cleaned
df_test_bert_cleaned[['id', 'HS']].to_csv('predictions/bert_cleaned.tsv', sep='\t', index=False, header=False)
df_test_bert_cleaned[['id', 'HS']].to_csv('input/res/en_a.tsv', sep='\t', index=False, header=False)

# Evaluate the result of the bert_cleaned
evaluate.write_eval("scores_bert_cleaned")

taskA_fscore: 0.5007588161507743
taskA_precision: 0.6206242320015312
taskA_recall: 0.5773262178434593
taskA_accuracy: 0.5263333333333333


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [88]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn import preprocessing

# Normalize the data via StandardScaler

scaler = preprocessing.StandardScaler().fit(train_vectors)
train_vectors_scaled = scaler.transform(train_vectors)
test_vectors_scaled = scaler.transform(test_vectors)

scaler_cleaned = preprocessing.StandardScaler().fit(train_vectors_cleaned)
train_vectors_scaled_cleaned = scaler_cleaned.transform(train_vectors_cleaned)
test_vectors_scaled_cleaned = scaler_cleaned.transform(test_vectors_cleaned)

In [89]:
# Evaluate using Logistic Regression as the classifier (without normalized data)

logreg = LogisticRegression().fit(train_vectors_scaled, y_train)
y_predict = logreg.predict(test_vectors_scaled)

# Create new test dataframe
df_test_bert_scaled = df_test.copy()
df_test_bert_scaled['HS'] = y_predict

# Create prediction file for the bert_scaled
df_test_bert_scaled[['id', 'HS']].to_csv('predictions/bert_scaled.tsv', sep='\t', index=False, header=False)
df_test_bert_scaled[['id', 'HS']].to_csv('input/res/en_a.tsv', sep='\t', index=False, header=False)

# Evaluate the result of the bert_scaled
evaluate.write_eval("scores_bert_scaled")

taskA_fscore: 0.5475686130042365
taskA_precision: 0.6481366459627329
taskA_recall: 0.608784893267652
taskA_accuracy: 0.5633333333333334


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [90]:
# Evaluate using Logistic Regression as the classifier (without normalized data)

logreg = LogisticRegression().fit(train_vectors_scaled_cleaned, y_train)
y_predict = logreg.predict(test_vectors_scaled_cleaned)

# Create new test dataframe
df_test_bert_scaled_cleaned = df_test.copy()
df_test_bert_scaled_cleaned['HS'] = y_predict

# Create prediction file for the bert_scaled_cleaned
df_test_bert_scaled_cleaned[['id', 'HS']].to_csv('predictions/bert_scaled_cleaned.tsv', sep='\t', index=False, header=False)
df_test_bert_scaled_cleaned[['id', 'HS']].to_csv('input/res/en_a.tsv', sep='\t', index=False, header=False)

# Evaluate the result of the bert_scaled_cleaned
evaluate.write_eval("scores_bert_scaled_cleaned")

taskA_fscore: 0.5004151512670885
taskA_precision: 0.6168858569519657
taskA_recall: 0.5758073344280241
taskA_accuracy: 0.5253333333333333


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [67]:
hidden_states[0].size()

torch.Size([1, 45, 768])

In [76]:
torch.mean(hidden_states[-2][0], dim=0).tolist()

[0.6069624423980713,
 0.2419789433479309,
 0.7163507342338562,
 0.06172146275639534,
 0.4696476459503174,
 -0.149113729596138,
 0.05396163836121559,
 0.2692813575267792,
 -0.5679747462272644,
 -0.37784314155578613,
 0.07539860159158707,
 -0.3541121780872345,
 0.03370343893766403,
 0.5155255794525146,
 0.054346948862075806,
 0.9002467393875122,
 -0.23749478161334991,
 0.4720906615257263,
 -0.33111461997032166,
 0.3972581624984741,
 0.7518365383148193,
 0.26511290669441223,
 -0.14997993409633636,
 0.016306163743138313,
 0.38244205713272095,
 -0.09090139716863632,
 -0.0763014554977417,
 -0.6351617574691772,
 -0.7715164422988892,
 0.12208463996648788,
 0.2259512096643448,
 0.2197011262178421,
 0.6285077333450317,
 -0.2671724855899811,
 -0.29905569553375244,
 -0.6454766392707825,
 -0.3351753354072571,
 -0.2363114058971405,
 0.6731697916984558,
 0.3784375786781311,
 -0.1486649364233017,
 -0.23532456159591675,
 0.22471709549427032,
 -0.005852035712450743,
 -0.10157843679189682,
 0.23649147152