In [60]:
import nltk
import numpy as np
import pandas as pd
from tensorflow import keras
from tensorflow.keras.layers import Dense
from sentence_transformers import SentenceTransformer

sbert_model = SentenceTransformer('bert-base-nli-mean-tokens')

In [61]:
def cosine(u, v):
    return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))

In [62]:
# You have to adjust the
df_arguments = pd.read_csv("kpm_data/arguments_train.csv")
df_keypoints = pd.read_csv("kpm_data/key_points_train.csv")
df_labels = pd.read_csv("kpm_data/labels_train.csv")

In [63]:
df_arguments

Unnamed: 0,arg_id,argument,topic,stance
0,arg_0_0,`people reach their limit when it comes to the...,Assisted suicide should be a criminal offence,-1
1,arg_0_1,A patient should be able to decide when they h...,Assisted suicide should be a criminal offence,-1
2,arg_0_2,a person has the right to end their suffering ...,Assisted suicide should be a criminal offence,-1
3,arg_0_3,a person should have the dignity to choose how...,Assisted suicide should be a criminal offence,-1
4,arg_0_4,a person should have the right to be able to c...,Assisted suicide should be a criminal offence,-1
...,...,...,...,...
5578,arg_27_218,we should subsidize vocational education to en...,We should subsidize vocational education,1
5579,arg_27_219,We should subsidize vocational education to su...,We should subsidize vocational education,1
5580,arg_27_220,While many who graduate from universities stru...,We should subsidize vocational education,1
5581,arg_27_221,with the rising cost of college tuition vocati...,We should subsidize vocational education,1


In [64]:
df_keypoints

Unnamed: 0,key_point_id,key_point,topic,stance
0,kp_0_0,Assisted suicide gives dignity to the person t...,Assisted suicide should be a criminal offence,-1
1,kp_0_1,Assisted suicide reduces suffering,Assisted suicide should be a criminal offence,-1
2,kp_0_2,People should have the freedom to choose to en...,Assisted suicide should be a criminal offence,-1
3,kp_0_3,The terminally ill would benefit from assisted...,Assisted suicide should be a criminal offence,-1
4,kp_0_4,Assisted suicide allows people to solicit some...,Assisted suicide should be a criminal offence,1
...,...,...,...,...
202,kp_27_3,subsidizing vocational education is expensive,We should subsidize vocational education,-1
203,kp_27_4,subsidizing vocational education promotes thos...,We should subsidize vocational education,1
204,kp_27_5,vocational education is a good career choice,We should subsidize vocational education,1
205,kp_27_6,vocational education better fits many students,We should subsidize vocational education,1


In [136]:
df_labels

Unnamed: 0,arg_id,key_point_id,label
0,arg_0_0,kp_0_0,0
1,arg_0_121,kp_0_4,0
2,arg_0_121,kp_0_5,0
3,arg_0_121,kp_0_6,1
4,arg_0_121,kp_0_7,0
...,...,...,...
20630,arg_27_221,kp_27_6,0
20631,arg_27_221,kp_27_7,0
20632,arg_27_222,kp_27_4,0
20633,arg_27_222,kp_27_5,1


In [135]:
df_labels_ones = df_labels[df_labels["label"] == 1]
df_labels_ones


Unnamed: 0,arg_id,key_point_id,label
3,arg_0_121,kp_0_6,1
9,arg_0_1,kp_0_2,1
11,arg_0_2,kp_0_1,1
14,arg_0_3,kp_0_2,1
18,arg_0_4,kp_0_2,1
...,...,...,...
20604,arg_27_89,kp_27_1,1
20605,arg_27_90,kp_27_2,1
20614,arg_27_92,kp_27_3,1
20623,arg_27_220,kp_27_5,1


In [137]:
df_labels_zeros = df_labels[df_labels["label"] == 0]
df_labels_zeros

Unnamed: 0,arg_id,key_point_id,label
0,arg_0_0,kp_0_0,0
1,arg_0_121,kp_0_4,0
2,arg_0_121,kp_0_5,0
4,arg_0_121,kp_0_7,0
5,arg_0_121,kp_0_8,0
...,...,...,...
20629,arg_27_221,kp_27_4,0
20630,arg_27_221,kp_27_6,0
20631,arg_27_221,kp_27_7,0
20632,arg_27_222,kp_27_4,0


In [147]:
df_labels_zeros_reduced = df_labels_zeros.sample(n=4260, random_state=1)
df_labels_zeros_reduced

Unnamed: 0,arg_id,key_point_id,label
1310,arg_1_157,kp_1_8,0
2953,arg_2_83,kp_2_5,0
3714,arg_3_58,kp_3_4,0
13003,arg_19_159,kp_19_4,0
11129,arg_14_109,kp_14_2,0
...,...,...,...
1474,arg_1_177,kp_1_6,0
1703,arg_1_76,kp_1_2,0
4804,arg_5_198,kp_5_5,0
11417,arg_16_44,kp_16_1,0


In [157]:
df_labels_merged = np.vstack((df_labels_ones, df_labels_zeros_reduced))


In [158]:
np.random.shuffle(df_labels_merged)
df_labels_merged = pd.DataFrame(df_labels_merged)
df_labels_merged.columns = ["arg_id", "key_point_id", "label"]
df_labels_merged
df_labels_merged_labels = df_labels_merged[]

Unnamed: 0,arg_id,key_point_id,label
0,arg_10_36,kp_10_1,1
1,arg_12_61,kp_12_2,1
2,arg_22_113,kp_22_0,1
3,arg_0_208,kp_0_9,0
4,arg_14_48,kp_14_0,1
...,...,...,...
8515,arg_20_7,kp_20_3,0
8516,arg_21_213,kp_21_4,0
8517,arg_16_37,kp_16_0,1
8518,arg_2_139,kp_2_6,1


In [159]:
merged_dataset=df_labels_merged.merge(df_arguments, left_on="arg_id", right_on="arg_id")
# df_arguments.loc[df_arguments['arg_id'] == "arg_4_121"]
full_dataset=merged_dataset.merge(df_keypoints, left_on="key_point_id", right_on="key_point_id")
full_dataset

Unnamed: 0,arg_id,key_point_id,label,argument,topic_x,stance_x,key_point,topic_y,stance_y
0,arg_10_36,kp_10_1,1,libertarianism leaves the poor and disadvantag...,We should adopt libertarianism,-1,Libertarianism harms minorities/those who are ...,We should adopt libertarianism,-1
1,arg_10_106,kp_10_1,0,we shouldn't because some people need interven...,We should adopt libertarianism,-1,Libertarianism harms minorities/those who are ...,We should adopt libertarianism,-1
2,arg_10_64,kp_10_1,1,more government intervention is needed to ensu...,We should adopt libertarianism,-1,Libertarianism harms minorities/those who are ...,We should adopt libertarianism,-1
3,arg_10_79,kp_10_1,1,"under libertarianism, there would be no social...",We should adopt libertarianism,-1,Libertarianism harms minorities/those who are ...,We should adopt libertarianism,-1
4,arg_10_61,kp_10_1,1,libertarianism would stop people from being ab...,We should adopt libertarianism,-1,Libertarianism harms minorities/those who are ...,We should adopt libertarianism,-1
...,...,...,...,...,...,...,...,...,...
8515,arg_25_85,kp_25_2,0,there are too many news outlets already so we ...,We should subsidize journalism,-1,Journalism is like every other business and if...,We should subsidize journalism,-1
8516,arg_25_48,kp_25_2,0,State subsidies for journalism will make journ...,We should subsidize journalism,-1,Journalism is like every other business and if...,We should subsidize journalism,-1
8517,arg_25_36,kp_25_2,0,journalists are not often regulated so anyone ...,We should subsidize journalism,-1,Journalism is like every other business and if...,We should subsidize journalism,-1
8518,arg_25_15,kp_25_2,1,journalism can make enough income to support i...,We should subsidize journalism,-1,Journalism is like every other business and if...,We should subsidize journalism,-1


In [170]:
altdata_arguments = sbert_model.encode(full_dataset["argument"])
# altdata_arguments
altdata_keypoints = sbert_model.encode(full_dataset["key_point"])
# altdata_keypoints
training = np.array(np.hstack((altdata_arguments, altdata_keypoints)))
training

array([[-0.30637115,  0.83035797,  0.2932304 , ..., -0.96883845,
        -0.51116055, -0.11837567],
       [ 0.5019359 , -0.40482762,  1.4168869 , ..., -0.96883845,
        -0.51116055, -0.11837567],
       [-0.41587275, -0.02706556,  0.22030368, ..., -0.96883845,
        -0.51116055, -0.11837567],
       ...,
       [ 0.17865013,  0.32641155,  0.5654122 , ..., -0.62401634,
        -0.02820914,  0.07711162],
       [ 0.2697417 ,  0.22020559,  0.53038335, ..., -0.62401634,
        -0.02820914,  0.07711162],
       [-0.29194778,  0.3105157 ,  0.577287  , ..., -0.62401634,
        -0.02820914,  0.07711162]], dtype=float32)

In [178]:
labels = np.array(full_dataset["label"])
labels.dtype
labels=np.asarray(labels).astype(np.int)
# training.dtype
# training.dtypes

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  labels=np.asarray(labels).astype(np.int)


In [179]:
model = keras.Sequential()
model.add(Dense(768, activation='softmax', input_dim=(1536)))
model.add(Dense(1, activation='linear'))
model.compile(optimizer='sgd', loss='mse')
model.fit(training, labels, epochs=20, verbose=1, batch_size = 1)
model.summary()

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_14 (Dense)            (None, 768)               1180416   
                                                                 
 dense_15 (Dense)            (None, 1)                 769       
                                                                 
Total params: 1,181,185
Trainable params: 1,181,185
Non-trainable params: 0
_________________________________________________________________


In [163]:
model.save('RegressionNNFullData.h5')

In [181]:
pred = model.predict(training)
pred

AttributeError: 'numpy.ndarray' object has no attribute 'describe'

In [87]:
labels.shape

(20635,)

In [89]:
pred = pred.flatten()

res = np.vstack((pred, labels))
df_res = pd.DataFrame(res)
df_res = df_res.transpose()
df_res.columns = ["predicted", "true"]
df_res['predicted_label'] = np.where(df_res['predicted'] >= 0.5, 1, 0)
df_res['match'] = np.where(df_res['true'] == df_res['predicted_label'], 1, 0)

df_res['match'].sum()
accuracy = df_res['match'].sum() / df_res.shape[0]
accuracy

0.8734189483886601

In [90]:
df_res.describe()

Unnamed: 0,predicted,true,predicted_label,match
count,20635.0,20635.0,20635.0,20635.0
mean,0.166928,0.206445,0.107293,0.873419
std,0.248231,0.404764,0.309493,0.332511
min,-0.03876,0.0,0.0,0.0
25%,0.009544,0.0,0.0,1.0
50%,0.075664,0.0,0.0,1.0
75%,0.193458,0.0,0.0,1.0
max,0.983394,1.0,1.0,1.0


In [91]:
df_one = df_res[df_res["true"] == 1]
df_one.describe()

Unnamed: 0,predicted,true,predicted_label,match
count,4260.0,4260.0,4260.0,4260.0
mean,0.482367,1.0,0.453286,0.453286
std,0.337139,0.0,0.497871,0.497871
min,-0.030209,1.0,0.0,0.0
25%,0.173825,1.0,0.0,0.0
50%,0.416813,1.0,0.0,0.0
75%,0.84528,1.0,1.0,1.0
max,0.983394,1.0,1.0,1.0


In [92]:
df_zeros = df_res[df_res["true"] == 0]
df_zeros.describe()

Unnamed: 0,predicted,true,predicted_label,match
count,16375.0,16375.0,16375.0,16375.0
mean,0.084866,0.0,0.017282,0.982718
std,0.124352,0.0,0.130326,0.130326
min,-0.03876,0.0,0.0,0.0
25%,0.003338,0.0,0.0,1.0
50%,0.044073,0.0,0.0,1.0
75%,0.132588,0.0,0.0,1.0
max,0.972649,0.0,1.0,1.0


In [182]:
df_arguments_test = pd.read_csv("kpm_data/arguments_test.csv")
df_keypoints_test = pd.read_csv("kpm_data/key_points_test.csv")
df_labels_test = pd.read_csv("kpm_data/labels_test.csv")

In [183]:
df_labels_test

Unnamed: 0,arg_id,key_point_id,label
0,arg_0_112,kp_0_4,0
1,arg_0_112,kp_0_5,0
2,arg_0_112,kp_0_6,0
3,arg_0_112,kp_0_7,1
4,arg_0_0,kp_0_0,0
...,...,...,...
3421,arg_1_232,kp_1_9,0
3422,arg_0_111,kp_0_3,0
3423,arg_0_111,kp_0_0,1
3424,arg_0_111,kp_0_1,0


In [184]:
merged_dataset_test=df_labels.merge(df_arguments_test, left_on="arg_id", right_on="arg_id")
# df_arguments.loc[df_arguments['arg_id'] == "arg_4_121"]
full_dataset_test=merged_dataset.merge(df_keypoints_test, left_on="key_point_id", right_on="key_point_id")
full_dataset_test

Unnamed: 0,arg_id,key_point_id,label,argument,topic_x,stance_x,key_point,topic_y,stance_y
0,arg_0_208,kp_0_7,0,killing another person is wrong and should be ...,Assisted suicide should be a criminal offence,1,Routine child vaccinations should be mandatory...,Routine child vaccinations should be mandatory,1
1,arg_0_183,kp_0_7,0,assisted suicide should be a criminal offense ...,Assisted suicide should be a criminal offence,1,Routine child vaccinations should be mandatory...,Routine child vaccinations should be mandatory,1
2,arg_0_227,kp_0_7,0,suicide has never been legal. it is still kill...,Assisted suicide should be a criminal offence,1,Routine child vaccinations should be mandatory...,Routine child vaccinations should be mandatory,1
3,arg_0_210,kp_0_7,0,many people do not have the mental capacity to...,Assisted suicide should be a criminal offence,1,Routine child vaccinations should be mandatory...,Routine child vaccinations should be mandatory,1
4,arg_0_243,kp_0_7,0,this would protect someone who's judgement may...,Assisted suicide should be a criminal offence,1,Routine child vaccinations should be mandatory...,Routine child vaccinations should be mandatory,1
...,...,...,...,...,...,...,...,...,...
1260,arg_2_71,kp_2_2,0,the vow of celibacy should be a personal choic...,The vow of celibacy should be abandoned,-1,The US has high taxation/high costs of living,The USA is a good country to live in,-1
1261,arg_2_26,kp_2_2,0,it is a proper choice of each person who choos...,The vow of celibacy should be abandoned,-1,The US has high taxation/high costs of living,The USA is a good country to live in,-1
1262,arg_2_104,kp_2_2,1,traditionally some religions feel like the pra...,The vow of celibacy should be abandoned,-1,The US has high taxation/high costs of living,The USA is a good country to live in,-1
1263,arg_2_84,kp_2_2,0,the vow of celibacy should not be abandoned be...,The vow of celibacy should be abandoned,-1,The US has high taxation/high costs of living,The USA is a good country to live in,-1


In [185]:
altdata_arguments_test = sbert_model.encode(full_dataset_test["argument"])
# altdata_arguments
altdata_keypoints_test = sbert_model.encode(full_dataset_test["key_point"])
# altdata_keypoints
training_test = np.hstack((altdata_arguments_test, altdata_keypoints_test))
training_test

array([[ 0.11411386,  0.17604612,  0.5231851 , ..., -0.16148242,
        -1.4278636 ,  0.53885084],
       [-0.21698153,  0.3410269 , -0.05026847, ..., -0.16148242,
        -1.4278636 ,  0.53885084],
       [ 0.77071553,  0.5753816 ,  0.8424418 , ..., -0.16148242,
        -1.4278636 ,  0.53885084],
       ...,
       [ 0.15942207,  0.15487501, -0.1556832 , ..., -0.5594947 ,
        -0.32103527, -0.7785235 ],
       [ 0.25615075,  0.5213407 ,  0.8761257 , ..., -0.5594947 ,
        -0.32103527, -0.7785235 ],
       [-0.15758045,  0.42435682,  0.85875595, ..., -0.5594947 ,
        -0.32103527, -0.7785235 ]], dtype=float32)

In [186]:
labels_test = np.array(full_dataset_test["label"])

In [187]:
pred = model.predict(training_test)
pred

array([[0.9458338 ],
       [0.9417888 ],
       [0.9292275 ],
       ...,
       [0.00483871],
       [0.00484586],
       [0.00481105]], dtype=float32)

In [188]:
pred = pred.flatten()

res = np.vstack((pred, labels_test))
df_res = pd.DataFrame(res)
df_res = df_res.transpose()
df_res.columns = ["predicted", "true"]
df_res['predicted_label'] = np.where(df_res['predicted'] >= 0.5, 1, 0)
df_res['match'] = np.where(df_res['true'] == df_res['predicted_label'], 1, 0)

df_res['match'].sum()
accuracy = df_res['match'].sum() / df_res.shape[0]
accuracy

0.4798418972332016

In [196]:
import sklearn as skl

In [198]:
df_res["true"]

dtype('O')

In [200]:
skl.metrics.f1_score(np.asarray(df_res["true"]).astype(float), np.asarray(df_res["predicted_label"]).astype(float))

0.4166666666666667

In [195]:
df_res

Unnamed: 0,predicted,true,predicted_label,match
0,0.945834,0,1,0
1,0.941789,0,1,0
2,0.929227,0,1,0
3,0.876114,0,1,0
4,0.934168,0,1,0
...,...,...,...,...
1260,0.005585,0,0,1
1261,0.004865,0,0,1
1262,0.004839,1,0,0
1263,0.004846,0,0,1


In [189]:
df_res.describe()


Unnamed: 0,predicted_label,match
count,1265.0,1265.0
mean,0.396047,0.479842
std,0.489268,0.499791
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,1.0,1.0
max,1.0,1.0


In [101]:
df_one = df_res[df_res["true"] == 1]
df_one.describe()

Unnamed: 0,predicted,true,predicted_label,match
count,627.0,627.0,627.0,627.0
mean,0.137474,1.0,0.100478,0.100478
std,0.225883,0.0,0.300877,0.300877
min,-0.034367,1.0,0.0,0.0
25%,0.007819,1.0,0.0,0.0
50%,0.040368,1.0,0.0,0.0
75%,0.148088,1.0,0.0,0.0
max,0.973667,1.0,1.0,1.0


In [102]:
df_zeros = df_res[df_res["true"] == 0]
df_zeros.describe()

Unnamed: 0,predicted,true,predicted_label,match
count,2416.0,2416.0,2416.0,2416.0
mean,0.126496,0.0,0.082368,0.917632
std,0.211251,0.0,0.274981,0.274981
min,-0.038495,0.0,0.0,0.0
25%,-0.001271,0.0,0.0,1.0
50%,0.031157,0.0,0.0,1.0
75%,0.163892,0.0,0.0,1.0
max,0.976439,0.0,1.0,1.0
