# SVM Approach

In [77]:
import json

In [3]:
data = dict()
datasets = ['ArMIS','MD-Agreement','ConvAbuse', 'HS-Brexit'] 
splits = ['train','dev']
for current_dataset in datasets:
    data[current_dataset] = {}
    for current_split in splits:
        current_file = './data/data_practice/' + current_dataset + '_dataset/' + current_dataset + '_' + current_split + '.json' 
        data[current_dataset][current_split] = json.load(open(current_file, 'r', encoding = 'UTF-8'))                                   
    


### Simple SVM classifier
For a simple start just using word count and inverse document frequency to create simple vectors.

In [23]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.svm import SVC, SVR

In [5]:
def myf1_score(conf_m):
    precision = conf_m[0][0] /  (conf_m[0][0] + conf_m[1][0])
    recall =  conf_m[0][0] / (conf_m[0][0] + conf_m[0][1])

    return 2 * (precision * recall) / (precision + recall)

In [6]:
def simpleSVM(df, stop_words_lang="english"):
    # 1. Split into X and y
    X = df['text']
    y = df['hard_label']

    # 2. Use CountVectorizer and remove stop words
    vectorizer = CountVectorizer(stop_words=stop_words_lang)
    X_vec = vectorizer.fit_transform(X).todense()

    # 3. tdf idf transformation
    tfidf = TfidfTransformer()
    X_tfidf = tfidf.fit_transform(np.asarray(X_vec)).todense()

    # 4. train test split
    X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size = 0.3, random_state = 0)
    
    # 5. Fit SVM
    classifier = SVC(kernel='linear')
    classifier.fit(np.asarray(X_train), y_train)
    y_pred = classifier.predict(np.asarray(X_test))
    
    # Evaluation
    conf_m = confusion_matrix(y_test, y_pred)
    print(f"Accuracy: {(conf_m[0][0] + conf_m[1][1]) / conf_m.sum()}")
    print(f"F1_score: {myf1_score(conf_m)}")
    
    return conf_m, y_pred
    

In [7]:
def transform_data(data):
    data = data["train"]
    df = pd.DataFrame(data).transpose()
    df = df.astype({"hard_label": int}, errors='raise') 
    return df

#### Brexit

In [8]:
df_brexit = transform_data(data['HS-Brexit'])
df_brexit.head()

Unnamed: 0,text,annotation task,number of annotations,annotations,annotators,lang,hard_label,soft_label,split,other_info
1,<user> <user> I'm so glad about #Brexit.. My a...,hate speech detection,6,0,"Ann1,Ann2,Ann3,Ann4,Ann5,Ann6",en,0,"{'0': 1.0, '1': 0.0}",train,{'other annotations': {'aggressive language de...
2,RT <user>: There was more to #Brexit than immi...,hate speech detection,6,0,"Ann1,Ann2,Ann3,Ann4,Ann5,Ann6",en,0,"{'0': 1.0, '1': 0.0}",train,{'other annotations': {'aggressive language de...
3,"At the end of the day, the leave campaign won ...",hate speech detection,6,0,"Ann1,Ann2,Ann3,Ann4,Ann5,Ann6",en,0,"{'0': 1.0, '1': 0.0}",train,{'other annotations': {'aggressive language de...
4,So the reducing migration thing wasn't quite w...,hate speech detection,6,0,"Ann1,Ann2,Ann3,Ann4,Ann5,Ann6",en,0,"{'0': 1.0, '1': 0.0}",train,{'other annotations': {'aggressive language de...
5,A Brit Immigrant Asks Britain to Become India’...,hate speech detection,6,0,"Ann1,Ann2,Ann3,Ann4,Ann5,Ann6",en,0,"{'0': 1.0, '1': 0.0}",train,{'other annotations': {'aggressive language de...


In [14]:
conv_m , y_pred = simpleSVM(df_brexit)
conv_m

Accuracy: 0.9110169491525424
F1_score: 0.953229398663697


array([[214,   0],
       [ 21,   1]], dtype=int64)

#### ConvAbuse

In [91]:
df_conv = transform_data(data["ConvAbuse"])
df_conv.head()

Unnamed: 0,text,annotation task,number of annotations,annotations,annotators,lang,hard_label,soft_label,split,other_info
1,"{""prev_agent"": ""Please go on."", ""prev_user"": ""...",abusivness detection,3,111,"Ann2,Ann3,Ann7",en,0,"{'0': 1.0, '1': 0.0}",train,"{'bot': 'E.L.I.Z.A.', 'other_annotations': {'a..."
2,"{""prev_agent"": ""How long have you been on the ...",abusivness detection,3,111,"Ann3,Ann7,Ann8",en,0,"{'0': 1.0, '1': 0.0}",train,"{'bot': 'E.L.I.Z.A.', 'other_annotations': {'a..."
3,"{""prev_agent"": ""You are being a bit negative.""...",abusivness detection,2,-11,"Ann1,Ann7",en,0,"{'0': 0.5, '1': 0.5}",train,"{'bot': 'E.L.I.Z.A.', 'other_annotations': {'a..."
4,"{""prev_agent"": ""Will you be travelling in Econ...",abusivness detection,2,11,"Ann3,Ann6",en,0,"{'0': 1.0, '1': 0.0}",train,"{'bot': 'CarbonBot', 'other_annotations': {'ab..."
5,"{""prev_agent"": ""Will you be travelling in Econ...",abusivness detection,3,111,"Ann1,Ann2,Ann3",en,0,"{'0': 1.0, '1': 0.0}",train,"{'bot': 'CarbonBot', 'other_annotations': {'ab..."


Different text format to process

#### MD-Agreement

In [74]:
df_md = transform_data(data["MD-Agreement"])
df_md.head()

Unnamed: 0,text,annotation task,number of annotations,annotations,annotators,lang,hard_label,soft_label,split,other_info
1,<user> <user> No way Jose!!,offensiveness detection,5,0,"Ann418,Ann266,Ann149,Ann730,Ann345",en,0,"{'0': 1.0, '1': 0.0}",train,{'domain': 'Elections2020'}
2,"Good god, what is the matter with people ?",offensiveness detection,5,0,"Ann733,Ann422,Ann779,Ann514,Ann777",en,0,"{'0': 1.0, '1': 0.0}",train,{'domain': 'Covid-19'}
3,<user> <user> <user> <user> Um the Kurds are h...,offensiveness detection,5,1,"Ann425,Ann511,Ann779,Ann420,Ann721",en,0,"{'0': 0.8, '1': 0.2}",train,{'domain': 'Elections2020'}
4,What is WRONG with these people?,offensiveness detection,5,0,"Ann632,Ann179,Ann701,Ann201,Ann661",en,0,"{'0': 1.0, '1': 0.0}",train,{'domain': 'BLM'}
5,<user> This earpiece too plus a wire on his sl...,offensiveness detection,5,10000,"Ann266,Ann168,Ann149,Ann381,Ann774",en,0,"{'0': 0.8, '1': 0.2}",train,{'domain': 'Elections2020'}


In [102]:
# conv_m , y_pred = simpleSVM(df_md)
conv_m

Accuracy: 0.7750252780586451
F1_score: 0.45532435740514077


array([[1347,   66],
       [ 379,  186]], dtype=int64)

In [76]:
# Takes a while

#### ArMIS

In [None]:
df_ar = transform_data(data["ArMIS"])
df_ar.head()

In [None]:
# simpleSVM(df_ar, stop_words_lang=None)

## Brexit SVM per Group

Train a Regression Model per Annotator Group to have models fitted to the perceptions which then can be compared.

Ann1,2,3 are target (muslim) group Ann4,5,6 are control group

In [23]:
df_brexit["other_info"][0]

{'other annotations': {'aggressive language detection': '0,0,0,0,0,0',
  'offensive language detection': '1,1,1,0,0,1'},
 'annotators group': 'group1,group1,group1,group2,group2,group2',
 'group1': 'target group',
 'group2': 'control group'}

In [33]:
df_brexit["annotators"].value_counts()

Ann1,Ann2,Ann3,Ann4,Ann5,Ann6    784
Name: annotators, dtype: int64

In [20]:
# Split labels into control and target groups
y_t = []
y_c = []
for i in df_brexit["annotations"]:
    labels = list(map(int, i.split(",")))
    y_t.append(np.mean(labels[:3]))
    y_c.append(np.mean(labels[3:]))
    

In [26]:
X = df_brexit['text']

# 2. Use CountVectorizer and remove stop words
vectorizer = CountVectorizer(stop_words="english")
X_vec = vectorizer.fit_transform(X).todense()

# 3. tdf idf transformation
tfidf = TfidfTransformer()
X_tfidf = tfidf.fit_transform(np.asarray(X_vec)).todense()

# 4. train test split
X_train, X_test, y_t_train, y_t_test = train_test_split(X_tfidf, y_t, test_size = 0.3, random_state = 0)
_, _, y_c_train, y_c_test = train_test_split(X_tfidf, y_c, test_size = 0.3, random_state = 0)

Use Regression to get a soft evaluation on how high the consent per group is

In [27]:
reg_c = SVR(kernel='linear')
reg_c.fit(np.asarray(X_train), y_c_train)
y_c_pred = reg_c.predict(np.asarray(X_test))

In [28]:
reg_t = SVR(kernel='linear')
reg_t.fit(np.asarray(X_train), y_t_train)
y_t_pred = reg_t.predict(np.asarray(X_test))

In [44]:
## given code snippet
def cross_entropy(targets_soft, predictions_soft, epsilon = 1e-12):                                
    predictions = np.clip(predictions_soft, epsilon, 1. - epsilon)                                      
    N = predictions.shape[0]
    ce = -np.sum(targets_soft*np.log(predictions+1e-9))/N
    return ce


In [54]:
y_target = (np.array(y_c_test) + np.array(y_t_test)) / 2
y_pred = (y_c_pred + y_t_pred) / 2

target_soft = []
for i in y_target:
    target_soft.append([i, 1-i])
    
y_soft_pred = []
for i in y_pred:
    y_soft_pred.append([i, 1-i])

In [60]:
ce = cross_entropy(target_soft, y_soft_pred)
ce

0.3294894054239409

This cross entropy error seems ok

In [73]:
# Control previous result by doing regression within one model
y = []
for i in df_brexit["soft_label"]:
    y.append(int(i["1"]))

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size = 0.3, random_state = 0)
reg = SVR(kernel='linear')
reg.fit(np.asarray(X_train), y_train)
y_pred_whole = reg_c.predict(np.asarray(X_test))
y_soft_pred_whole = []
for i in y_pred_whole:
    y_soft_pred_whole.append([i, 1-i])
target_soft_whole = []
for i in y_test:
    target_soft_whole.append([i, 1-i])

ce = cross_entropy(target_soft_whole, y_soft_pred_whole)
ce

0.32702354685923984

The Cross entropy does not change much, just a little bit better. So the combination of both models work as well as using just one.

My idea was that we now have two regression parameter sets as well as estimates per group.
Therefore we could check, f.e:
- where do the weights of the linear regression differ the most between the two groups. So, check which words imply a different perception regarding the offensivness
- For which texts do the predicted labels differ the most. Is it possible to extract topics where "most" disagreement occurs

But I do not feel like this is really useful, now that I've done it.