In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import torch
import warnings
warnings.filterwarnings('ignore')

In [2]:
dataframe = pd.read_excel('dataset 19964.xlsx')

In [3]:
dataframe.head(3)

Unnamed: 0.1,Unnamed: 0,Record ID,Title,Subject,Institution,Journal,Publisher,Country,Author,URLS,...,RetractionPubMedID,OriginalPaperDate,OriginalPaperDOI,OriginalPaperPubMedID,RetractionNature,Reason,Paywalled,Notes,Error,Abstract
0,0,41915,A novel fully automated MRI-based deep-learnin...,(B/T) Computer Science;(BLS) Biology - Cancer;...,"Department of Radiology, Mayo Clinic, Rocheste...",Neuro-Oncology,Oxford Academic,United States,Timothy J Kaufmann;Bradley J Erickson,,...,36623552,2020-01-06,10.1093/neuonc/noaa002,31905237.0,Retraction,+Cites Retracted Work;,No,,0,No abstract available
1,1,41914,A novel fully automated MRI-based deep-learnin...,(B/T) Computer Science;(BLS) Biology - Cancer;...,"Department of Radiology, University of Texas S...",Neuro-Oncology,Oxford Academic,United States,Chandan Ganesh Bangalore Yogananda;Bhavya R Sh...,,...,36623552,2019-10-22,10.1093/neuonc/noz199,31637430.0,Retraction,+Error in Analyses;+Error in Methods;+Error in...,No,,0,Abstract\n \n \n\n\n\n Ba...
2,2,42459,Vaginal misoprostol before copper IUD insertio...,(HSC) Medicine - Obstetrics/Gynecology;(HSC) M...,"Department of Obstetrics and Gynaecology, Facu...",European Journal of Contraception & Reproducti...,Taylor and Francis,Egypt,Radwa Rasheedy;Tarek Fathi Tamara;Ihab Serag A...,,...,36744398,2019-05-21,10.1080/13625187.2019.1610871,31112079.0,Retraction,+Concerns/Issues About Data;+Concerns/Issues A...,No,,0,Abstract\n \n \n\n\nObjective: The...


In [4]:
error_targets = dataframe['Error']
np.unique(error_targets)

array([0, 1, 2], dtype=int64)

In [5]:
# Count the number of examples in each category 
n_example_per_category = dataframe['Error'].value_counts()

# Printing the number of examples in each category
print('before removing the NaN valued(in abstract) rows: ')
print(n_example_per_category)

#after dropping the nan values.
dataframe = dataframe.dropna(subset=['Abstract'])

before removing the NaN valued(in abstract) rows: 
Error
1    11472
0     7822
2      670
Name: count, dtype: int64


In [6]:
print('After removal :')
dataframe['Error'].value_counts()

After removal :


Error
1    9648
0    4536
2     570
Name: count, dtype: int64

In [7]:
Xd = dataframe.iloc[:, range(22,23)]
X_names = Xd.columns
X = Xd.values

Td = dataframe.iloc[:, range(21,22)]
T_names = Td.columns
T = Td.values

X.shape, X_names, T.shape, T_names


((14754, 1),
 Index(['Abstract'], dtype='object'),
 (14754, 1),
 Index(['Error'], dtype='object'))

In [8]:
#I am ignoring the rows with no clear determination - error class 2
rows_to_delete = np.where(T == 2)[0]
X = np.delete(X, rows_to_delete, axis=0)
T = np.delete(T, rows_to_delete, axis=0)

X.shape, X_names, T.shape, T_names

((14184, 1),
 Index(['Abstract'], dtype='object'),
 (14184, 1),
 Index(['Error'], dtype='object'))

In [9]:
#labels are created based on reasons for retraction. However, abstracts are not available for some papers.
rows_to_delete = np.where(X == 'No abstract available')[0]
X = np.delete(X, rows_to_delete, axis=0)
T = np.delete(T, rows_to_delete, axis=0)

X.shape, X_names, T.shape, T_names

((13283, 1),
 Index(['Abstract'], dtype='object'),
 (13283, 1),
 Index(['Error'], dtype='object'))

In [10]:
from transformers import AutoTokenizer, ReformerModel

tokenizer = AutoTokenizer.from_pretrained("google/reformer-crime-and-punishment")
model = ReformerModel.from_pretrained("google/reformer-crime-and-punishment")

In [11]:
X = X.flatten()
tokenized = []
for abstract in X:
        tokenized.append(tokenizer.encode(abstract, add_special_tokens=True))

In [12]:
max_len = 0
for i in tokenized:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized])

In [13]:
padded.shape

(13283, 6500)

In [14]:
attention_mask = np.where(padded != 0, 1, 0)
attention_mask.shape

(13283, 6500)

In [15]:
input_ids = torch.tensor(padded)  
attention_masks = torch.tensor(attention_mask)

In [16]:
last_hidden_states = []
with torch.no_grad():
        last_hidden_states.append(model(input_ids[0:2],attention_mask = attention_masks[0:2]))
features = last_hidden_states[0][0][:,0,:].numpy()

In [17]:
for i in range(2, len(input_ids),2):
    last_hidden_states = []
    if (i%500 == 2):
        print(i)
    with torch.no_grad():
        last_hidden_states.append(model(input_ids[i:i+2],attention_mask = attention_masks[i:i+2]))
    features = np.vstack((features, last_hidden_states[0][0][:,0,:].numpy()))

2
502
1002
1502
2002
2502
3002
3502
4002
4502
5002
5502
6002
6502
7002
7502
8002
8502
9002
9502
10002
10502
11002
11502
12002
12502
13002


In [18]:
features.shape    

(13283, 512)

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

In [20]:
train_features, test_features, train_labels, test_labels = train_test_split(features, T)
train_features.shape, test_features.shape, train_labels.shape, test_labels.shape

((9962, 512), (3321, 512), (9962, 1), (3321, 1))

In [21]:
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

In [22]:
lr_clf.score(test_features, test_labels)

0.6772056609454984

In [23]:
predicted_labels = lr_clf.predict(test_features)
f1 = f1_score(test_labels, predicted_labels, average='macro')
print(f1)

0.40377019748653503
