In [1]:
import nltk
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras.layers import Dense
from sentence_transformers import SentenceTransformer

sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [2]:
def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

In [3]:
# You have to adjust the
df_arguments = pd.read_csv("kpm_data/arguments_train.csv")
df_keypoints = pd.read_csv("kpm_data/key_points_train.csv")
df_labels = pd.read_csv("kpm_data/labels_train.csv")

In [4]:
df_arguments

Unnamed: 0,arg_id,argument,topic,stance
0,arg_0_0,`people reach their limit when it comes to the...,Assisted suicide should be a criminal offence,-1
1,arg_0_1,A patient should be able to decide when they h...,Assisted suicide should be a criminal offence,-1
2,arg_0_2,a person has the right to end their suffering ...,Assisted suicide should be a criminal offence,-1
3,arg_0_3,a person should have the dignity to choose how...,Assisted suicide should be a criminal offence,-1
4,arg_0_4,a person should have the right to be able to c...,Assisted suicide should be a criminal offence,-1
...,...,...,...,...
5578,arg_27_218,we should subsidize vocational education to en...,We should subsidize vocational education,1
5579,arg_27_219,We should subsidize vocational education to su...,We should subsidize vocational education,1
5580,arg_27_220,While many who graduate from universities stru...,We should subsidize vocational education,1
5581,arg_27_221,with the rising cost of college tuition vocati...,We should subsidize vocational education,1


In [5]:
df_keypoints

Unnamed: 0,key_point_id,key_point,topic,stance
0,kp_0_0,Assisted suicide gives dignity to the person t...,Assisted suicide should be a criminal offence,-1
1,kp_0_1,Assisted suicide reduces suffering,Assisted suicide should be a criminal offence,-1
2,kp_0_2,People should have the freedom to choose to en...,Assisted suicide should be a criminal offence,-1
3,kp_0_3,The terminally ill would benefit from assisted...,Assisted suicide should be a criminal offence,-1
4,kp_0_4,Assisted suicide allows people to solicit some...,Assisted suicide should be a criminal offence,1
...,...,...,...,...
202,kp_27_3,subsidizing vocational education is expensive,We should subsidize vocational education,-1
203,kp_27_4,subsidizing vocational education promotes thos...,We should subsidize vocational education,1
204,kp_27_5,vocational education is a good career choice,We should subsidize vocational education,1
205,kp_27_6,vocational education better fits many students,We should subsidize vocational education,1


In [6]:
df_labels

Unnamed: 0,arg_id,key_point_id,label
0,arg_0_0,kp_0_0,0
1,arg_0_121,kp_0_4,0
2,arg_0_121,kp_0_5,0
3,arg_0_121,kp_0_6,1
4,arg_0_121,kp_0_7,0
...,...,...,...
20630,arg_27_221,kp_27_6,0
20631,arg_27_221,kp_27_7,0
20632,arg_27_222,kp_27_4,0
20633,arg_27_222,kp_27_5,1


In [7]:
df_labels_ones = df_labels[df_labels["label"] == 1]
df_labels_ones


Unnamed: 0,arg_id,key_point_id,label
3,arg_0_121,kp_0_6,1
9,arg_0_1,kp_0_2,1
11,arg_0_2,kp_0_1,1
14,arg_0_3,kp_0_2,1
18,arg_0_4,kp_0_2,1
...,...,...,...
20604,arg_27_89,kp_27_1,1
20605,arg_27_90,kp_27_2,1
20614,arg_27_92,kp_27_3,1
20623,arg_27_220,kp_27_5,1


In [8]:
df_labels_zeros = df_labels[df_labels["label"] == 0]
df_labels_zeros

Unnamed: 0,arg_id,key_point_id,label
0,arg_0_0,kp_0_0,0
1,arg_0_121,kp_0_4,0
2,arg_0_121,kp_0_5,0
4,arg_0_121,kp_0_7,0
5,arg_0_121,kp_0_8,0
...,...,...,...
20629,arg_27_221,kp_27_4,0
20630,arg_27_221,kp_27_6,0
20631,arg_27_221,kp_27_7,0
20632,arg_27_222,kp_27_4,0


In [9]:
df_labels_zeros_reduced = df_labels_zeros.sample(n=4260, random_state=1)
df_labels_zeros_reduced

Unnamed: 0,arg_id,key_point_id,label
1310,arg_1_157,kp_1_8,0
2953,arg_2_83,kp_2_5,0
3714,arg_3_58,kp_3_4,0
13003,arg_19_159,kp_19_4,0
11129,arg_14_109,kp_14_2,0
...,...,...,...
1474,arg_1_177,kp_1_6,0
1703,arg_1_76,kp_1_2,0
4804,arg_5_198,kp_5_5,0
11417,arg_16_44,kp_16_1,0


In [10]:
df_labels_merged = np.vstack((df_labels_ones, df_labels_zeros_reduced))


In [11]:
np.random.shuffle(df_labels_merged)
df_labels_merged = pd.DataFrame(df_labels_merged)
df_labels_merged.columns = ["arg_id", "key_point_id", "label"]
df_labels_merged
df_labels_merged_labels = df_labels_merged["label"]

In [12]:
merged_dataset=df_labels_merged.merge(df_arguments, left_on="arg_id", right_on="arg_id")
# df_arguments.loc[df_arguments['arg_id'] == "arg_4_121"]
full_dataset=merged_dataset.merge(df_keypoints, left_on="key_point_id", right_on="key_point_id")
full_dataset

Unnamed: 0,arg_id,key_point_id,label,argument,topic_x,stance_x,key_point,topic_y,stance_y
0,arg_25_212,kp_25_6,0,We should subsidize journalism because informa...,We should subsidize journalism,1,A subsidy is important to make journalism viable,We should subsidize journalism,1
1,arg_25_213,kp_25_6,0,we should subsidize journalism because it can ...,We should subsidize journalism,1,A subsidy is important to make journalism viable,We should subsidize journalism,1
2,arg_25_124,kp_25_6,0,journalism and free press are essential for tr...,We should subsidize journalism,1,A subsidy is important to make journalism viable,We should subsidize journalism,1
3,arg_25_159,kp_25_6,0,journalism should be subsidize to promote impa...,We should subsidize journalism,1,A subsidy is important to make journalism viable,We should subsidize journalism,1
4,arg_25_157,kp_25_6,0,journalism should be funded by the government ...,We should subsidize journalism,1,A subsidy is important to make journalism viable,We should subsidize journalism,1
...,...,...,...,...,...,...,...,...,...
8515,arg_9_123,kp_9_3,0,adopting atheism will help to end senseless wa...,We should adopt atheism,1,Atheism is beneficial to the person that adopt...,We should adopt atheism,1
8516,arg_9_202,kp_9_3,0,there is no true god therefore atheism is the ...,We should adopt atheism,1,Atheism is beneficial to the person that adopt...,We should adopt atheism,1
8517,arg_9_147,kp_9_3,0,atheism is the way to go away from religion co...,We should adopt atheism,1,Atheism is beneficial to the person that adopt...,We should adopt atheism,1
8518,arg_9_137,kp_9_3,0,atheism is a concept to adopt because no relig...,We should adopt atheism,1,Atheism is beneficial to the person that adopt...,We should adopt atheism,1


In [15]:
altdata_arguments = sbert_model.encode(full_dataset["argument"])
# altdata_arguments
altdata_keypoints = sbert_model.encode(full_dataset["key_point"])
# altdata_keypoints
training = np.array(np.hstack((altdata_arguments, altdata_keypoints)))
training

array([[-0.2670554 ,  0.23807876,  0.7331662 , ..., -0.63051814,
        -1.3877122 , -0.12489822],
       [-0.13160288,  0.17808898,  1.1157659 , ..., -0.63051814,
        -1.3877122 , -0.12489822],
       [-0.02032149,  0.12400538,  0.47286725, ..., -0.63051814,
        -1.3877122 , -0.12489822],
       ...,
       [-0.10121359,  0.70776415,  1.1980507 , ...,  0.2339278 ,
        -0.7205468 ,  0.07368473],
       [ 0.66733336,  1.1395385 ,  1.1271716 , ...,  0.2339278 ,
        -0.7205468 ,  0.07368473],
       [-0.05617074,  0.6022745 ,  0.2703439 , ...,  0.23392768,
        -0.7205468 ,  0.07368479]], dtype=float32)

In [24]:
labels = np.array(full_dataset["label"])
labels.dtype
labels=np.asarray(labels).astype(int)
labels
# training.dtype
# training.dtypes

array([0, 0, 0, ..., 0, 0, 0])

In [58]:
model = keras.Sequential()
model.add(Dense(768, activation='softmax', input_dim=(1536)))
model.add(Dense(1, activation='linear'))
model.compile(optimizer='sgd', loss='mse')
model.fit(training, labels, epochs=5, verbose=1, batch_size = 1)
model.summary()

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 768)               1180416   
                                                                 
 dense_13 (Dense)            (None, 1)                 769       
                                                                 
Total params: 1,181,185
Trainable params: 1,181,185
Non-trainable params: 0
_________________________________________________________________


In [38]:
model.save('RegressionNNtest.h5')

In [59]:
pred = model.predict(training)
pred

array([[0.5074525],
       [0.5074525],
       [0.5074525],
       ...,
       [0.5074525],
       [0.5074525],
       [0.5074525]], dtype=float32)

In [60]:
labels.shape

(8520,)

In [61]:
pred = pred.flatten()

res = np.vstack((pred, labels))
df_res = pd.DataFrame(res)
df_res = df_res.transpose()
df_res.columns = ["predicted", "true"]
df_res['predicted_label'] = np.where(df_res['predicted'] >= 0.5, 1, 0)
df_res['match'] = np.where(df_res['true'] == df_res['predicted_label'], 1, 0)

df_res['match'].sum()
accuracy = df_res['match'].sum() / df_res.shape[0]
accuracy

0.5

In [29]:
df_res.describe()

Unnamed: 0,predicted,true,predicted_label,match
count,8520.0,8520.0,8520.0,8520.0
mean,0.538504,0.5,0.46338,0.771362
std,0.288646,0.500029,0.498686,0.419981
min,-0.038068,0.0,0.0,0.0
25%,0.315202,0.0,0.0,1.0
50%,0.453401,0.5,0.0,1.0
75%,0.84396,1.0,1.0,1.0
max,0.993901,1.0,1.0,1.0


In [30]:
df_one = df_res[df_res["true"] == 1]
df_one.describe()

Unnamed: 0,predicted,true,predicted_label,match
count,4260.0,4260.0,4260.0,4260.0
mean,0.71282,1.0,0.734742,0.734742
std,0.246643,0.0,0.441522,0.441522
min,0.039602,1.0,0.0,0.0
25%,0.478523,1.0,0.0,0.0
50%,0.809449,1.0,1.0,1.0
75%,0.928725,1.0,1.0,1.0
max,0.993763,1.0,1.0,1.0


In [None]:
df_zeros = df_res[df_res["true"] == 0]
df_zeros.describe()

In [31]:
df_arguments_test = pd.read_csv("kpm_data/arguments_test.csv")
df_keypoints_test = pd.read_csv("kpm_data/key_points_test.csv")
df_labels_test = pd.read_csv("kpm_data/labels_test.csv")

In [32]:
df_labels_test

Unnamed: 0,arg_id,key_point_id,label
0,arg_0_112,kp_0_4,0
1,arg_0_112,kp_0_5,0
2,arg_0_112,kp_0_6,0
3,arg_0_112,kp_0_7,1
4,arg_0_0,kp_0_0,0
...,...,...,...
3421,arg_1_232,kp_1_9,0
3422,arg_0_111,kp_0_3,0
3423,arg_0_111,kp_0_0,1
3424,arg_0_111,kp_0_1,0


In [33]:
merged_dataset_test=df_labels.merge(df_arguments_test, left_on="arg_id", right_on="arg_id")
# df_arguments.loc[df_arguments['arg_id'] == "arg_4_121"]
full_dataset_test=merged_dataset.merge(df_keypoints_test, left_on="key_point_id", right_on="key_point_id")
full_dataset_test

Unnamed: 0,arg_id,key_point_id,label,argument,topic_x,stance_x,key_point,topic_y,stance_y
0,arg_0_59,kp_0_1,1,assited suicide allows those with a painful an...,Assisted suicide should be a criminal offence,-1,Mandatory vaccination contradicts basic rights,Routine child vaccinations should be mandatory,-1
1,arg_0_34,kp_0_1,1,assisted suicide should not be a criminal defe...,Assisted suicide should be a criminal offence,-1,Mandatory vaccination contradicts basic rights,Routine child vaccinations should be mandatory,-1
2,arg_0_24,kp_0_1,1,assisted suicide is necessary in our modern wo...,Assisted suicide should be a criminal offence,-1,Mandatory vaccination contradicts basic rights,Routine child vaccinations should be mandatory,-1
3,arg_0_100,kp_0_1,1,people who are terminally ill and suffering gr...,Assisted suicide should be a criminal offence,-1,Mandatory vaccination contradicts basic rights,Routine child vaccinations should be mandatory,-1
4,arg_0_57,kp_0_1,1,assisted suicides are less painful and are an ...,Assisted suicide should be a criminal offence,-1,Mandatory vaccination contradicts basic rights,Routine child vaccinations should be mandatory,-1
...,...,...,...,...,...,...,...,...,...
1260,arg_2_26,kp_2_2,0,it is a proper choice of each person who choos...,The vow of celibacy should be abandoned,-1,The US has high taxation/high costs of living,The USA is a good country to live in,-1
1261,arg_2_13,kp_2_2,1,celibacy is a state of purity the church shoul...,The vow of celibacy should be abandoned,-1,The US has high taxation/high costs of living,The USA is a good country to live in,-1
1262,arg_2_11,kp_2_2,0,celibacy is a personal choice. it is up to the...,The vow of celibacy should be abandoned,-1,The US has high taxation/high costs of living,The USA is a good country to live in,-1
1263,arg_2_53,kp_2_2,0,the vow of celibacy is a safe-guard that a per...,The vow of celibacy should be abandoned,-1,The US has high taxation/high costs of living,The USA is a good country to live in,-1


In [34]:
altdata_arguments_test = sbert_model.encode(full_dataset_test["argument"])
# altdata_arguments
altdata_keypoints_test = sbert_model.encode(full_dataset_test["key_point"])
# altdata_keypoints
training_test = np.hstack((altdata_arguments_test, altdata_keypoints_test))
training_test

array([[-0.24850523,  0.19954897,  1.124284  , ..., -0.17297818,
        -0.91488683,  0.5066472 ],
       [-0.06659681,  0.65394026,  0.5140067 , ..., -0.17297818,
        -0.91488683,  0.5066472 ],
       [-0.0396717 ,  0.6117953 ,  0.7496689 , ..., -0.17297818,
        -0.91488683,  0.5066472 ],
       ...,
       [-0.33892244,  0.2477396 ,  0.75089955, ..., -0.5594947 ,
        -0.32103527, -0.7785235 ],
       [-0.12383575,  0.8471346 ,  1.028576  , ..., -0.5594947 ,
        -0.32103527, -0.7785235 ],
       [ 0.00511938,  0.58187443,  0.69142115, ..., -0.5594947 ,
        -0.32103527, -0.7785235 ]], dtype=float32)

In [44]:
labels_test = np.array(full_dataset_test["label"])

In [56]:
pred = model.predict(training_test)
pred

array([[0.4293753 ],
       [0.92215025],
       [0.74049145],
       ...,
       [0.02054828],
       [0.02099776],
       [0.05051783]], dtype=float32)

In [57]:
pred = pred.flatten()

res = np.vstack((pred, labels_test))
df_res = pd.DataFrame(res)
df_res = df_res.transpose()
df_res.columns = ["predicted", "true"]
df_res['predicted_label'] = np.where(df_res['predicted'] >= 0.5, 1, 0)
df_res['match'] = np.where(df_res['true'] == df_res['predicted_label'], 1, 0)

df_res['match'].sum()
accuracy = df_res['match'].sum() / df_res.shape[0]
accuracy

0.4972332015810277

In [None]:
import sklearn as skl

In [None]:
df_res["true"]

In [None]:
skl.metrics.f1_score(np.asarray(df_res["true"]).astype(float), np.asarray(df_res["predicted_label"]).astype(float))

In [None]:
df_res

In [None]:
df_res.describe()


In [None]:
df_one = df_res[df_res["true"] == 1]
df_one.describe()

In [None]:
df_zeros = df_res[df_res["true"] == 0]
df_zeros.describe()