In [None]:
import pandas as pd
import numpy as np

from nltk.tokenize import RegexpTokenizer
tokenizerreg = RegexpTokenizer(r'\w+')

from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

from sklearn.preprocessing import StandardScaler

### Load data

In [None]:
# Read the three corpora: OLID train, HASOC train and OLID test (in that order)
olid_train, hasoc_train, olid_test = (
    pd.read_csv(csv_path, delimiter=',')
    for csv_path in (
        'data/olid-train-all.csv',
        'data/hasoc-train-all.csv',
        'data/olid-test.csv',
    )
)

# Keep the frames together so later cells can loop over all of them
all_datasets = [olid_train, hasoc_train, olid_test]

In [None]:
# Load the hate speech lexicon.
# Forward slashes keep the path portable (and consistent with the dataset
# paths); the previous backslash form only resolved on Windows.
hatepath = 'data/hatebase_dict_vua_format.csv'
hate_df = pd.read_csv(hatepath, delimiter=';')
# The lexicon terms are the values of the 'Entry' column
hate_lex = list(hate_df['Entry'])

In [None]:
# Load the GloVe word-embedding model.
# GloVe text file to convert -- set this path correctly
path = 'glove.twitter.27B.100d.txt'
# File written by the conversion and then read back by gensim
word2vec_path = "gensim_glove_vectors.txt"
glove2word2vec(glove_input_file=path, word2vec_output_file=word2vec_path)
model = KeyedVectors.load_word2vec_format(word2vec_path, binary=False)

### Extract features

In [None]:
# set all functions to retrieve the features
def chars(val):
    """Return the length of ``val`` (the number of characters for a string)."""
    n_chars = len(val)
    return n_chars

def word_count(val):
    """Return how many tokens ``tokenizerreg`` extracts from ``val``."""
    tokens = tokenizerreg.tokenize(val)
    return len(tokens)

def av_wordlen(val):
    """Return the mean length of the tokens ``tokenizerreg`` finds in ``val``.

    Returns 0 when the text yields no tokens (for example an empty string),
    so the result is always defined.
    """
    words = tokenizerreg.tokenize(val)
    # Explicit guard instead of a bare ``except``: only the "no tokens" case
    # maps to 0, nothing else is silently swallowed.
    if not words:
        return 0
    return sum(len(word) for word in words) / len(words)

def caps_ratio(val):
    """Return the fraction of characters in ``val`` that are uppercase.

    Returns 0 for an empty string instead of raising ZeroDivisionError,
    matching how the other ratio features treat text with nothing to count.
    """
    if not val:
        return 0
    return sum(map(str.isupper, val)) / len(val)

def excl_ratio(val):
    """Return the fraction of characters in ``val`` that are exclamation marks.

    Returns 0 for an empty string instead of raising ZeroDivisionError.
    """
    if not val:
        return 0
    return val.count('!') / len(val)

def hate_ratio(val):
    """Return the fraction of tokens in ``val`` that appear in ``hate_lex``.

    Returns 0 when the text yields no tokens instead of raising
    ZeroDivisionError.
    """
    words = tokenizerreg.tokenize(val)
    if not words:
        return 0
    # NOTE(review): tokens are compared as-is (no lowercasing) -- confirm the
    # casing of the lexicon entries matches the casing of the texts.
    hates = sum(1 for token in words if token in hate_lex)
    return hates / len(words)

def pronoun_ratio(val):
    """Return the fraction of tokens in ``val`` that are first-person pronouns.

    Returns 0 when the text yields no tokens.
    """
    first_person = ('us', 'we', 'i', 'me', 'mine', 'ours', 'myself', 'ourselves')
    words = tokenizerreg.tokenize(val)
    # Explicit guard instead of a bare ``except`` around the division
    if not words:
        return 0
    # NOTE(review): matching is case-sensitive, so a capitalised "I" or "We"
    # is not counted -- confirm this is intended.
    pronouns = sum(words.count(pronoun) for pronoun in first_person)
    return pronouns / len(words)

def unknown_ratio(val):
    """Return the fraction of tokens in ``val`` that are not in ``model``.

    Membership is tested with ``token in model``. Returns 0 when the text
    yields no tokens instead of raising ZeroDivisionError.
    """
    tokens = tokenizerreg.tokenize(val)
    if not tokens:
        return 0
    unknown = sum(1 for token in tokens if token not in model)
    return unknown / len(tokens)

In [None]:
# Add features to each dataset.
# The column names are the ones the scaling cells select ('text_length',
# '#words', ...); the earlier descriptive names were never read again and left
# the scaling cells without the columns they ask for.
feature_extractors = {
    'text_length': chars,
    '#words': word_count,
    'av_wordlen': av_wordlen,
    'Caps': caps_ratio,
    'Excl': excl_ratio,
    'Hate': hate_ratio,
    'pronouns': pronoun_ratio,
    'unknownwords': unknown_ratio,
}

for data in all_datasets:
    for column, extractor in feature_extractors.items():
        # .map calls the extractor once per text value
        data[column] = data.text.map(extractor)

### Scale all features
and store new files with the scaled features and the input IDs

In [None]:
# Feature columns to scale; this order is also the column order of the output
feature_cols = ['text_length','#words','av_wordlen','Caps','Excl','pronouns','unknownwords','Hate']

# separate scaler for each training set
olid_scaler = StandardScaler()
hasoc_scaler = StandardScaler()

# OLID: fit the scaler on the training features and scale them.
# The scaled frame reuses the source index so the concat with the ids lines up
# row by row even when the index is not 0..n-1.
temp_df = olid_train[['id']]
olid_scaled = pd.DataFrame(
    olid_scaler.fit_transform(olid_train[feature_cols].to_numpy()),
    columns=feature_cols,
    index=olid_train.index,
)
full_olid_train_scaled = pd.concat([temp_df, olid_scaled], axis=1)

# same for HASOC data, with its own scaler
temp_df = hasoc_train[['id']]
hasoc_scaled = pd.DataFrame(
    hasoc_scaler.fit_transform(hasoc_train[feature_cols].to_numpy()),
    columns=feature_cols,
    index=hasoc_train.index,
)
full_hasoc_train_scaled = pd.concat([temp_df, hasoc_scaled], axis=1)

In [None]:
# Test data is scaled based on both scalers, depending on model.
# For cross-domain, data is scaled with hasoc_scaler, in-domain with olid_scaler
feature_cols = ['text_length','#words','av_wordlen','Caps','Excl','pronouns','unknownwords','Hate']
temp_df = olid_test[['id']]
# Raw test features, extracted once and passed to both (already fitted) scalers
test_features = olid_test[feature_cols].to_numpy()

# In-domain: OLID test scaled with the scaler fitted on OLID train.
# The scaled frames reuse the test index so the concat with the ids lines up
# row by row even when the index is not 0..n-1.
test_olid_scaled = pd.DataFrame(
    olid_scaler.transform(test_features),
    columns=feature_cols,
    index=olid_test.index,
)
full_olid_test_olid_scaled = pd.concat([temp_df, test_olid_scaled], axis=1)

# Cross-domain: OLID test scaled with the scaler fitted on HASOC train
test_hasoc_scaled = pd.DataFrame(
    hasoc_scaler.transform(test_features),
    columns=feature_cols,
    index=olid_test.index,
)
full_olid_test_hasoc_scaled = pd.concat([temp_df, test_hasoc_scaled], axis=1)

In [None]:
# Store the scaled feature frames as ';'-separated CSV files.
# Forward slashes replace the old 'data\...' literals, which relied on invalid
# escape sequences ('\o', '\h') and only formed a directory path on Windows.
full_olid_train_scaled.to_csv('data/olid_train_scaled_features.csv', sep=';')
full_hasoc_train_scaled.to_csv('data/hasoc_train_scaled_features.csv', sep=';')

full_olid_test_olid_scaled.to_csv('data/olid_test_olid_scaled_features.csv', sep=';')
full_olid_test_hasoc_scaled.to_csv('data/olid_test_hasoc_scaled_features.csv', sep=';')