In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
import torch
from sklearn.metrics import precision_recall_fscore_support
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the raw dataset from the Excel workbook in the working directory.
dataframe = pd.read_excel(io='dataset 19964.xlsx')

In [3]:
# Preview the first three records to check the columns loaded as expected.
dataframe.head(n=3)

Unnamed: 0.1,Unnamed: 0,Record ID,Title,Subject,Institution,Journal,Publisher,Country,Author,URLS,...,RetractionPubMedID,OriginalPaperDate,OriginalPaperDOI,OriginalPaperPubMedID,RetractionNature,Reason,Paywalled,Notes,Error,Abstract
0,0,41915,A novel fully automated MRI-based deep-learnin...,(B/T) Computer Science;(BLS) Biology - Cancer;...,"Department of Radiology, Mayo Clinic, Rocheste...",Neuro-Oncology,Oxford Academic,United States,Timothy J Kaufmann;Bradley J Erickson,,...,36623552,2020-01-06,10.1093/neuonc/noaa002,31905237.0,Retraction,+Cites Retracted Work;,No,,0,No abstract available
1,1,41914,A novel fully automated MRI-based deep-learnin...,(B/T) Computer Science;(BLS) Biology - Cancer;...,"Department of Radiology, University of Texas S...",Neuro-Oncology,Oxford Academic,United States,Chandan Ganesh Bangalore Yogananda;Bhavya R Sh...,,...,36623552,2019-10-22,10.1093/neuonc/noz199,31637430.0,Retraction,+Error in Analyses;+Error in Methods;+Error in...,No,,0,Abstract\n \n \n\n\n\n Ba...
2,2,42459,Vaginal misoprostol before copper IUD insertio...,(HSC) Medicine - Obstetrics/Gynecology;(HSC) M...,"Department of Obstetrics and Gynaecology, Facu...",European Journal of Contraception & Reproducti...,Taylor and Francis,Egypt,Radwa Rasheedy;Tarek Fathi Tamara;Ihab Serag A...,,...,36744398,2019-05-21,10.1080/13625187.2019.1610871,31112079.0,Retraction,+Concerns/Issues About Data;+Concerns/Issues A...,No,,0,Abstract\n \n \n\n\nObjective: The...


In [4]:
# List the distinct codes that occur in the 'Error' label column.
error_targets = dataframe.loc[:, 'Error']
np.unique(error_targets.to_numpy())

array([0, 1, 2], dtype=int64)

In [5]:
# Class distribution of the 'Error' label before any filtering.
n_example_per_category = dataframe['Error'].value_counts()
print('before removing the NaN valued(in abstract) rows: ', n_example_per_category, sep='\n')

# Keep only the rows whose 'Abstract' cell is not NaN.
dataframe = dataframe[dataframe['Abstract'].notna()]

before removing the NaN valued(in abstract) rows: 
Error
1    11472
0     7822
2      670
Name: count, dtype: int64


In [6]:
# Class distribution again, now that rows with a NaN abstract are gone.
print('After removal :')
dataframe.loc[:, 'Error'].value_counts()

After removal :


Error
1    9648
0    4536
2     570
Name: count, dtype: int64

In [7]:
# Select the input and target columns by NAME rather than by position, so a
# change in the spreadsheet's column order cannot silently pick the wrong data.
# Double brackets keep each selection 2-D, i.e. X and T have shape (n_rows, 1).
Xd = dataframe[['Abstract']]
X_names = Xd.columns
X = Xd.values

Td = dataframe[['Error']]
T_names = Td.columns
T = Td.values

X.shape, X_names, T.shape, T_names


((14754, 1),
 Index(['Abstract'], dtype='object'),
 (14754, 1),
 Index(['Error'], dtype='object'))

In [8]:
# Rows labelled 2 have no clear determination, so they are excluded from both
# the inputs and the targets.
rows_to_delete = np.nonzero(T == 2)[0]
X, T = (np.delete(arr, rows_to_delete, axis=0) for arr in (X, T))

X.shape, X_names, T.shape, T_names

((14184, 1),
 Index(['Abstract'], dtype='object'),
 (14184, 1),
 Index(['Error'], dtype='object'))

In [9]:
# Labels are derived from the retraction reasons, but some papers only carry
# the placeholder text instead of a real abstract; drop those rows from X and T.
rows_to_delete = np.nonzero(X == 'No abstract available')[0]
X, T = (np.delete(arr, rows_to_delete, axis=0) for arr in (X, T))

X.shape, X_names, T.shape, T_names

((13283, 1),
 Index(['Abstract'], dtype='object'),
 (13283, 1),
 Index(['Error'], dtype='object'))

In [10]:
from transformers import AutoTokenizer, BigBirdModel

# Load the tokenizer and the encoder from the same pretrained checkpoint.
checkpoint = "google/bigbird-roberta-base"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = BigBirdModel.from_pretrained(checkpoint)

In [11]:
# Collapse the (n, 1) column of abstracts to 1-D, then encode every abstract
# to a list of token ids (special tokens included, no truncation).
X = X.flatten()
tokenized = [tokenizer.encode(abstract, add_special_tokens=True) for abstract in X]

Token indices sequence length is longer than the specified maximum sequence length for this model (4988 > 4096). Running this sequence through the model will result in indexing errors


In [12]:
# Collect the positions of sequences longer than 4096 tokens (the maximum the
# tokenizer warning reports for this model), printing each offending length.
long_indices = []
for position, token_ids in enumerate(tokenized):
    n_tokens = len(token_ids)
    if n_tokens <= 4096:
        continue
    print(n_tokens)
    long_indices.append(position)

print(long_indices)

4988
7587
6880
5087
[5791, 5907, 5910, 8110]


In [13]:
# Remove the over-length sequences in place. `long_indices` is ascending, and
# every earlier removal shifts the later positions left by one, hence the shift.
for shift, index in enumerate(long_indices):
    del tokenized[index - shift]

In [14]:
# llong_indices = []
# for i in range(len(tokenized)):
#     if len(tokenized[i]) > 4096:
#         print(len(tokenized[i]))
#         llong_indices.append(i)

# print(llong_indices)

In [15]:
# Drop the labels of the removed sequences so T stays aligned with `tokenized`;
# the two printed counts should match.
T = np.delete(T, long_indices, 0)

print(len(tokenized), len(T))

13279 13279


In [12]:
# fraud - 1, author error - 0
# Indices of the non-zero labels; with class 2 already removed, the number of
# row indices is the number of class-1 examples.
sss = T.nonzero()
sss[0].shape

(9135,)

In [16]:
# Length of the longest remaining token sequence (0 if there are none).
max_len = max(map(len, tokenized), default=0)
print(max_len)

# Right-pad every sequence with id 0 so they all share length `max_len`.
padded = np.array([ids + [0] * (max_len - len(ids)) for ids in tokenized])

3532


In [17]:
# Sanity check: (number of sequences, padded sequence length).
np.shape(padded)

(13279, 3532)

In [18]:
# 0 where the id is 0 (the value used for padding), 1 everywhere else.
attention_mask = np.where(padded == 0, 0, 1)
attention_mask.shape

(13279, 3532)

In [19]:
# Copy the padded ids and the mask into torch tensors for the model.
input_ids, attention_masks = torch.tensor(padded), torch.tensor(attention_mask)

In [20]:
# Run the first five sequences through the model without tracking gradients;
# the raw model output is kept as the single element of `last_hidden_states`.
with torch.no_grad():
    last_hidden_states = [model(input_ids[:5], attention_mask=attention_masks[:5])]

In [21]:
# From the first element of the model output, keep the vector at sequence
# position 0 of every example as that example's feature vector.
features = last_hidden_states[0][0][:, 0].numpy()
features.shape

(5, 768)

In [25]:
# Embed the remaining sequences in mini-batches of 5, continuing after the
# first batch (rows 0-4) that the earlier cell already stored in `features`.
#
# Batches are collected in a list and stacked once at the end: calling
# np.vstack inside the loop re-copies the whole growing array on every
# iteration. Starting from `features[:5]` also makes the cell safe to re-run,
# because it rebuilds from the first batch instead of appending again.
feature_batches = [features[:5]]
for i in range(5, len(input_ids), 5):
    # Progress marker every 100 sequences.
    if (i%100 == 5):
        print(i)
    with torch.no_grad():
        last_hidden_states = [model(input_ids[i:i+5],attention_mask = attention_masks[i:i+5])]
    # Position-0 vector of each sequence in this batch.
    feature_batches.append(last_hidden_states[0][0][:,0,:].numpy())

features = np.vstack(feature_batches)

5
105
205
305
405
505
605
705
805
905
1005
1105
1205
1305
1405
1505
1605
1705
1805
1905
2005
2105
2205
2305
2405
2505
2605
2705
2805
2905
3005
3105
3205
3305
3405
3505
3605
3705
3805
3905
4005
4105
4205
4305
4405
4505
4605
4705
4805
4905
5005
5105
5205
5305
5405
5505
5605
5705
5805
5905
6005
6105
6205
6305
6405
6505
6605
6705
6805
6905
7005
7105
7205
7305
7405
7505
7605
7705
7805
7905
8005
8105
8205
8305
8405
8505
8605
8705
8805
8905
9005
9105
9205
9305
9405
9505
9605
9705
9805
9905
10005
10105
10205
10305
10405
10505
10605
10705
10805
10905
11005
11105
11205
11305
11405
11505
11605
11705
11805
11905
12005
12105
12205
12305
12405
12505
12605
12705
12805
12905
13005
13105
13205


In [26]:
# Sanity check: one feature row per sequence.
np.shape(features)

(13279, 768)

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score

In [108]:
# Hold out a test set (default 75/25 split). The seed is fixed so the split,
# and therefore every metric reported below, is reproducible across runs.
train_features, test_features, train_labels, test_labels = train_test_split(
    features, T, random_state=42
)
train_features.shape, test_features.shape, train_labels.shape, test_labels.shape

((9959, 768), (3320, 768), (9959, 1), (3320, 1))

In [88]:
# Fit a logistic-regression classifier with default settings on the training
# features, then display the fitted estimator.
lr_clf = LogisticRegression().fit(train_features, train_labels)
lr_clf

In [89]:
# Accuracy of the logistic-regression classifier on the held-out test set.
lr_clf.score(X=test_features, y=test_labels)

0.7337349397590361

In [111]:
# Per-class precision, recall and F1 for the logistic-regression classifier.
# Each array follows the order of `labels`: index 0 is class 1, index 1 is class 0.
predicted_labels = lr_clf.predict(test_features)
# The F1 array is deliberately NOT unpacked into a variable called `f1_score`:
# that would overwrite the imported sklearn.metrics.f1_score function and break
# the macro-F1 cell below when the notebook is run top to bottom.
precision, recall, f1_per_class, support = precision_recall_fscore_support(test_labels, predicted_labels,labels=[1, 0])

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1_per_class)

Precision: [0.76827976 0.69734151]
Recall: [0.93628928 0.34202608]
F1-score: [0.84400466 0.4589502 ]


In [91]:
# Macro-averaged F1 of the logistic-regression predictions on the test set.
predicted_labels = lr_clf.predict(test_features)
f1 = f1_score(y_true=test_labels, y_pred=predicted_labels, average='macro')
print(f1)

0.618620104623215


In [92]:
# Small fully connected classifier: 768 -> 20 -> 10 -> 2, with tanh between
# the linear layers; the last layer outputs one score per class.
nnet = torch.nn.Sequential(*[
    torch.nn.Linear(768, 20),
    torch.nn.Tanh(),
    torch.nn.Linear(20, 10),
    torch.nn.Tanh(),
    torch.nn.Linear(10, 2),
])

In [49]:
# Wrap the numpy splits as torch tensors; labels are flattened to 1-D first.
new_x_train, new_x_test = torch.from_numpy(train_features), torch.from_numpy(test_features)
new_t_train, new_t_test = (
    torch.from_numpy(train_labels.flatten()),
    torch.from_numpy(test_labels.flatten()),
)

In [61]:
def percent_correct(Yclasses, T):
    """Return the percentage (0-100) of entries in `Yclasses` that equal `T`.

    Both arguments are torch tensors compared element-wise; the result is a
    plain Python float.
    """
    matches = (Yclasses == T).to(torch.float32)
    fraction = matches.mean().item()
    return fraction * 100

def use(nnet, X):
    """Apply `nnet` to `X` and return, for each row, the index of the largest output.

    The argmax is taken along dimension 1 of the network output.
    """
    return nnet(X).argmax(dim=1)

def train(nnet, X, T, n_epochs, learning_rate, X_test=None, T_test=None):
    """Train `nnet` with full-batch Adam and cross-entropy, then print accuracy.

    Parameters
    ----------
    nnet : torch.nn.Module
        Network whose output has one score per class (fed to CrossEntropyLoss).
    X : torch.Tensor
        Training inputs, passed to `nnet` in a single batch each epoch.
    T : torch.Tensor
        Training targets, in the form CrossEntropyLoss accepts (class indices).
    n_epochs : int
        Number of full-batch optimisation steps. 0 is allowed and only reports
        the accuracy of the untrained network.
    learning_rate : float
        Learning rate for the Adam optimizer.
    X_test, T_test : torch.Tensor, optional
        Held-out data used for the reported test accuracy. When either is
        omitted, the notebook-level tensors `new_x_test` / `new_t_test` are
        used, which keeps existing five-argument calls working unchanged.

    Returns
    -------
    None. Prints one summary line with the final train and test accuracy.
    """
    optimizer = torch.optim.Adam(nnet.parameters(), lr=learning_rate)
    loss_func = torch.nn.CrossEntropyLoss()

    for _ in range(n_epochs):
        # Clear gradients first so stale ones from earlier backward passes
        # can never leak into this step.
        optimizer.zero_grad()
        loss = loss_func(nnet(X), T)
        loss.backward()
        optimizer.step()

    if X_test is None or T_test is None:
        # Backward compatibility with calls that rely on the notebook globals.
        X_test, T_test = new_x_test, new_t_test

    # Only the final accuracy is printed, so evaluate once after training
    # instead of after every epoch. Train accuracy is measured on the data
    # actually passed in (X, T), not on whatever the globals happen to hold.
    with torch.no_grad():
        pc_train = percent_correct(use(nnet, X), T)
        pc_test = percent_correct(use(nnet, X_test), T_test)

    print(f'Epoch {n_epochs} %correct: Train {pc_train:.1f} Test {pc_test:.1f}')

In [93]:
# Train the classifier for 500 full-batch epochs with Adam (lr = 0.001).
train(nnet, new_x_train, new_t_train, n_epochs=500, learning_rate=0.001)

Epoch 500 %correct: Train 79.1 Test 72.2


# Baseline: compare performance with a dummy classifier

In [104]:
from sklearn.dummy import DummyClassifier

# Baseline with default settings: fit on the training split and report its
# accuracy on the held-out test split for comparison with the models above.
clf = DummyClassifier().fit(train_features, train_labels)
clf.score(test_features, test_labels)

0.6927710843373494

In [112]:
# Per-class precision, recall and F1 for the dummy baseline.
# Each array follows the order of `labels`: index 0 is class 1, index 1 is class 0.
predicted_labels = clf.predict(test_features)
# The F1 array is deliberately NOT unpacked into a variable called `f1_score`:
# that would overwrite the imported sklearn.metrics.f1_score function for any
# cell that calls it afterwards.
precision, recall, f1_per_class, support = precision_recall_fscore_support(test_labels, predicted_labels, labels=[1, 0])

print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1_per_class)

Precision: [0.6996988 0.       ]
Recall: [1. 0.]
F1-score: [0.82332093 0.        ]


In [117]:
# from sklearn.metrics import precision_score

# precision_ = precision_score(test_labels, predicted_labels)
# print("Precision:", precision_)