# Imports / Data

In [1]:
import json
import pandas as pd
from collections import Counter
from nltk.corpus import stopwords
from itertools import filterfalse as ifilterfalse
from sklearn.model_selection import train_test_split
import re
from time import time
from tqdm import tqdm
tqdm.pandas()

In [2]:
import json
import pandas as pd
import numpy as np
from time import time

In [6]:
label2id = {
    'NEITHER': 0,
    'EVIDENCE': 1,
    'CLAIM': 2,
    'NONE': 0
}

def load_corpus(path, label_mapping=None):
    with open(path) as fp:
        corpus = json.load(fp)

    documents, texts, labels = [], [], []
    for abstract in corpus:
        documents.append(abstract)
        texts.append(corpus[abstract]['sentences'])
        if isinstance(label_mapping, dict):
            labels.append(
                [label_mapping[str(l).upper()]
                    for l in corpus[abstract]['labels']])
        else:
            labels.append([str(l).upper() for l in corpus[abstract]['labels']])

    assert len(texts) == len(labels)
    data = pd.DataFrame(
        zip(documents, texts, labels),
        columns=['document', 'sentences', 'labels'])

    return data

* Read our 2 datasets and merge them in 1 dataframe

In [7]:
data1 = load_corpus('dataset_aueb_argument_v3.json' , label_mapping=label2id)
print(f'Dataset 1 length: {len(data1)} abstracts')

data2 = load_corpus('dataset.json' , label_mapping=label2id)
print(f'Dataset 2 length: {len(data2)} abstracts')

data = data1.append(data2)
print(f'Dataset length: {len(data)} abstracts')
data.head(3)

Dataset 1 length: 1017 abstracts
Dataset 2 length: 1669 abstracts
Dataset length: 2686 abstracts


Unnamed: 0,document,sentences,labels
0,doi: 10.1001/jamaneurol.2017.2814,[Concordance Between Different Amyloid Immunoa...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, ..."
1,doi: 10.1001/jamaneurol.2017.4913,[Association of Changes in Plasma Neurofilamen...,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2]"
2,doi: 10.1002/2015gl067056,[Dynamically triggered slip leading to sustain...,"[0, 0, 0, 1, 1, 2]"


* Create a datafrme with the 'doc_id' & 'sentences' and a dataframe with the 'doc_id' & 'labels' for each unique sentence in our dataset.

In [8]:
#@title Split to sentences
sentences = data['sentences'].explode().reset_index().rename(
    columns={'index': 'doc_id', 'sentences': 'sentence'})
sentences.sentence = sentences.sentence.astype("string")
sentences.sentence = sentences.sentence.str.strip()

#@title and the corresponding labels
labels = pd.DataFrame(data['labels'].explode()).reset_index(drop = True).rename(
    columns={'labels': 'label'})

* Create a dataframe with the splitted sentences and labels

In [9]:
merged_data = pd.concat([sentences,labels['label']], axis = 1)
print(merged_data.shape)
merged_data.sample(5)

(32004, 3)


Unnamed: 0,doc_id,sentence,label
26076,1229,Only the FACT-G social/family well-being subsc...,1
22784,985,"Results At older ages, migrants in Europe were...",1
6593,687,These results show that the CSF Aβ1–42/Aβ1–40 ...,2
18730,606,An open question is what promise the integrati...,0
16420,428,Marital quality remained positive over time.,0


* We observe that we have some sentences that are one word, so let's remove them.

In [10]:
for i in range(len(merged_data)):
    if (len(merged_data['sentence'][i].split()) < 2):
        merged_data = merged_data.drop(i)
        
merged_data.reset_index(inplace = True, drop = True)
merged_data.shape

(31093, 3)

# Exploratory

* 0 means no argument
* 1 means Evidence
* 2 means Claim

In [11]:
merged_data.label.value_counts()

0    21471
1     6203
2     3419
Name: label, dtype: int64

In [12]:
round(merged_data.label.value_counts() / len(merged_data) * 100, 2)

0    69.05
1    19.95
2    11.00
Name: label, dtype: float64

* We observe that we almost 70% of our sentences have no label so it is going to make our classifier predict it with higher probability since we have imbalanced dataset


* Now let's check for missing values

In [13]:
merged_data.isna().sum()

doc_id      0
sentence    0
label       0
dtype: int64

* we have no Na's in our dataset so we can proceed.


* Now let's see if we have any duplicate rows.

In [14]:
merged_data['sentence'].duplicated().any()

True

* We have duplicates so let's remove them.

In [15]:
print('Shape before removing duplicates: ', merged_data.shape)
a = merged_data.shape[0]
merged_data.drop_duplicates(subset=['sentence'], inplace=True)
print('Shape after removing duplicates: ', merged_data.shape)
print('Rows Removed: ', a - merged_data.shape[0])

merged_data.reset_index(drop=True, inplace = True)

Shape before removing duplicates:  (31093, 3)
Shape after removing duplicates:  (30862, 3)
Rows Removed:  231


# Data preprocessing

* Let's assign our X and y

In [16]:
X = merged_data['sentence']
y = merged_data['label']

* Create a function to clean our dataset

In [17]:
import re

def clean_text(text):
    """
    Pre process and convert texts to a list of words
    :param text:
    :return:
    """

    text = str(text)
    text = text.lower()

    # Clean the text
    text = re.sub(r"[^A-Za-z0-9^,!.\/'+-=]", " ", text)
    text = re.sub(r"what's", "what is ", text)
    text = re.sub(r"\'s", " ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", "cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"i'm", "i am ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r"!", " ", text)
    text = re.sub(r"\/", " ", text)
    text = re.sub(r"\^", " ", text)
    text = re.sub(r"\+", " ", text)
    text = re.sub(r"\-", " ", text)
    text = re.sub(r"\=", " ", text)
    text = re.sub(r"'", " ", text)
    text = re.sub(r"(\d+)(k)", r"\g<1>000", text)
    text = re.sub(r":", " ", text)
    text = re.sub(r" e g ", " eg ", text)
    text = re.sub(r" b g ", " bg ", text)
    text = re.sub(r" u s ", " american ", text)
    text = re.sub(r"\0s", "0", text)
    text = re.sub(r" 9 11 ", "911", text)
    text = re.sub(r"e - mail", "email", text)
    text = re.sub(r"j k", "jk", text)
    text = re.sub(r"\s{2,}", " ", text)

    return text

* Let's have a look at a sentence before cleaning

In [18]:
X[107]

'This result is achieved through novel cross-link agents made by boron- and fluorine-containing heterocycles that can react between themselves upon UV- and white-light exposure.'

* Apply the clean function we created to clean our text

In [19]:
X = pd.DataFrame(X)
X['sentence']=X['sentence'].apply(clean_text)

* Let's have a look again at our X before proceeding.

In [20]:
X['sentence'][107]

'this result is achieved through novel cross link agents made by boron and fluorine containing heterocycles that can react between themselves upon uv and white light exposure '

### Splitting Dataset Procedure


* We split the dataset to 80% train and 20% test.
* We use a random state in order to split the same every time we run the code.
* We use stratification in order to have equal representation of all labels in both datasets.

In [21]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X['sentence'],
                                                    y,
                                                    test_size=0.20,
                                                    random_state=42,
                                                    stratify=y)

In [22]:
print(round(y_train.value_counts() / len(y_train) * 100, 2))
round(y_test.value_counts() / len(y_test) * 100, 2)

0    68.97
1    20.00
2    11.03
Name: label, dtype: float64


0    68.98
1    19.99
2    11.03
Name: label, dtype: float64

* We observe that the stratification was successful so let's proceed.


* Let's create a dataframe with the senteces and their corresponding labels of our train dataset.

In [25]:
df = pd.DataFrame(x_train)
df.columns = ['sentence']
df["label"] = y_train
df['sentence'] = df.sentence.str.strip()
df.head(3)

Unnamed: 0,sentence,label
30013,patients who failed previous monotherapy remai...,1
14079,introduction pediatric road traffic injuries r...,0
1548,here we report the rationalization of the high...,0


* Create one dataframe for the sentences that are evidence (label = 1) and one for the claims (label = 2) from the train dataset.

In [26]:
evidences = df[df["label"] == 1]
claims = df[df["label"] == 2]

# Baseline

* Remove stopwords, remove words with length < 4 and count word occurences for the dataframe with the sentences that are evidence

In [77]:
stops = stopwords.words('english')
corpora_1 = " ".join(evidences['sentence'])
words_1 = corpora_1.split()

#Removing Stopwords
filtered_sentence = []
for w in words_1:
    if w not in stops:
        filtered_sentence.append(w)

#remove words with length < 4
filtered_sentence_v2 = []
Counter(filtered_sentence).most_common()
for w in filtered_sentence:
    if len(w) > 3:
        filtered_sentence_v2.append(w)      

#Count how many times each unique word is present in all our sentences
top_words_1 = Counter(filtered_sentence_v2).most_common()
top_words_1 = top_words_1[:15]

* Remove stopwords, remove words with length < 4 and count word occurences for the dataframe with the sentences that are claim

In [78]:
# Remove stopwords and remove words with length < 3 
stops = stopwords.words('english')
corpora_2 = " ".join(claims['sentence'])
words_2 = corpora_2.split()

filtered_sentence = []
for w in words_2:
    if w not in stops:
        filtered_sentence.append(w)
        
filtered_sentence_v2 = []
Counter(filtered_sentence).most_common()
for w in filtered_sentence:
    if len(w) > 3:
        filtered_sentence_v2.append(w)      

top_words_2 = Counter(filtered_sentence_v2).most_common()
top_words_2 = top_words_2[:15]

In [79]:
top_words_1

[('group', 893),
 ('patients', 802),
 ('significant', 479),
 ('months', 478),
 ('significantly', 451),
 ('results', 387),
 ('compared', 356),
 ('higher', 341),
 ('treatment', 339),
 ('mean', 335),
 ('groups', 316),
 ('respectively', 307),
 ('survival', 279),
 ('health', 270),
 ('associated', 260)]

In [80]:
top_words_2

[('patients', 415),
 ('treatment', 255),
 ('results', 225),
 ('health', 203),
 ('study', 203),
 ('cancer', 200),
 ('conclusions', 196),
 ('associated', 167),
 ('well', 155),
 ('women', 154),
 ('conclusion', 154),
 ('climate', 153),
 ('effective', 133),
 ('findings', 125),
 ('risk', 119)]

* Remove common words that are present in the top words for both evidences and claims.

In [81]:
for i in range(len(top_words_1)):
    if i >= len(top_words_1):
        break
    for j in range(len(top_words_2)):
        if j >= len(top_words_2):
            break
        if top_words_1[i][0] == top_words_2[j][0]: #See if they contain the same word
            top_words_2.pop(j)
            top_words_1.pop(i)
            i = 0
            break
            
print(len(top_words_1))
len(top_words_2)

* Keep only the words of each list without their occurences

In [84]:
top_words_1_only = []
for i in range(len(top_words_1)):
    top_words_1_only.append(top_words_1[i][0])

top_words_2_only = []
for i in range(len(top_words_2)):
    top_words_2_only.append(top_words_2[i][0])
    
top_words_1_only[:5]

['group', 'significant', 'months', 'significantly', 'compared']

* Create a dataframe with the sentences and labels of the test dataset.

In [85]:
df_test = pd.DataFrame(x_test)
df_test["label"] = y_test
df_test.reset_index(drop = True, inplace = True)
#df_test['sentence'] = df_test.sentence.str.strip()
df_test

Unnamed: 0,sentence,label
0,spring summer and autumn verification scores c...,0
1,despite of the great number of studies on the ...,0
2,to investigate whether uterine artery emboliza...,0
3,anemia highly common among cancer patients is ...,2
4,methods nationally representative surveys of a...,0
...,...,...
6168,there was no difference in bowel function betw...,1
6169,the municipality of arnhem is one of dutch com...,0
6170,the magnetic field noise referred to the pick ...,0
6171,there is an old definition pointing to the fir...,0


* Add a new column with the predicted value, which has the value of 0 at start, and changes to 1 if it contains a word from top word list from evidences and 2 if it contains a word from top word list from claims.

In [86]:
df_test['predicted'] = 0
for i in range(len(df_test)):
    for word in top_words_1_only:
        if (word in df_test.sentence[i]):
            df_test['predicted'][i] = 1
    for word_2 in top_words_2_only:
        if (word_2 in df_test.sentence[i]):
            df_test['predicted'][i] = 2

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['predicted'][i] = 2
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test['predicted'][i] = 1


In [87]:
df_test.label.value_counts()

0    4258
1    1234
2     681
Name: label, dtype: int64

In [88]:
df_test.predicted.value_counts()

0    3483
2    1780
1     910
Name: predicted, dtype: int64

In [89]:
success_rate = (len(df_test[df_test['label'] == df_test['predicted']]) / len(df_test)) * 100
success_rate

55.029969220800254