In [1]:
import numpy as np
import pandas as pd
import nltk

In [2]:
import os
os.chdir('/Users/dimo/Downloads')

In [3]:
train = pd.read_json(r'train.json')
test = pd.read_json(r'test.json')
train.head()

Unnamed: 0,paperId,title,authorId,authorName,abstract,year,venue
0,0b341b6938308a6d5f47edf490f6e46eae3835fa,Detecting linguistic idiosyncratic interests i...,3188285,Masoud Rouhizadeh,Children with autism spectrum disorder often e...,2014,CLPsych@ACL
1,c682727ee058aadbe9dbf838dcb036322818f588,Bigrams and BiLSTMs Two Neural Networks for Se...,2782720,Yuri Bizzoni,We present and compare two alternative deep ne...,2018,Fig-Lang@NAACL-HLT
2,0f9b5b32229a7245e43754430c0c88f8e7f0d8af,In Factuality: Efficient Integration of Releva...,144748442,Peter Vickers,Visual Question Answering (VQA) methods aim at...,2021,ACL
3,7e8b4cfdc03b59ece2d6b33a217f0abd47f708d9,Variational Graph Autoencoding as Cheap Superv...,46331602,Irene Li,Coreference resolution over semantic graphs li...,2022,ACL
4,07588dd5d0252c7abc99b3834a81bf23741ead4b,LIMIT-BERT : Linguistics Informed Multi-Task BERT,30887404,Junru Zhou,"In this paper, we present Linguistics Informed...",2019,FINDINGS


In [4]:
##CLEANING
import string

In [5]:
##Removing punctuation (found on stackoverflow): https://stackoverflow.com/questions/39782418/remove-punctuations-in-pandas
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text = text.replace(punctuation, '')
    return text

In [6]:
##CLEANING

In [7]:
train["abstract"] = train['abstract'].apply(remove_punctuations)

test["abstract"] = test['abstract'].apply(remove_punctuations)

In [8]:
##Abstract to lowercase 
train["abstract"] = train['abstract'].str.lower()

test["abstract"] = test['abstract'].str.lower()

In [9]:
# TOKENIZING (https://www.analyticsvidhya.com/blog/2021/06/text-preprocessing-in-nlp-with-python-codes/)
import re
def tokenization(text):
    tokens = re.split('\s+',text)
    return tokens

In [10]:
#applying function to the abstract column
train["abstract"] = train['abstract'].apply(lambda x: tokenization(x))

test["abstract"] = test['abstract'].apply(lambda x: tokenization(x))

In [11]:
##STOPWORD REMOVAL

In [12]:
#importing nlp library
import nltk
#Stop words present in the library
stopwords = nltk.corpus.stopwords.words('english')

In [13]:
# defining the function for stopword removal (https://www.analyticsvidhya.com/blog/2021/06/text-preprocessing-in-nlp-with-python-codes/)
def remove_stopwords(text):
    output= [i for i in text if i not in stopwords]
    return output

In [14]:
train["abstract"] = train['abstract'].apply(lambda x:remove_stopwords(x))

test["abstract"] = test['abstract'].apply(lambda x:remove_stopwords(x))

In [15]:
##LEMMATIZE

In [16]:
from nltk.stem import WordNetLemmatizer
#defining the object for Lemmatization
wordnet_lemmatizer = WordNetLemmatizer()

In [17]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/dimo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [18]:
#defining the function for lemmatization (https://www.analyticsvidhya.com/blog/2021/06/text-preprocessing-in-nlp-with-python-codes/)
def lemmatizer(text):
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text


In [19]:
train['abstract']=train['abstract'].apply(lambda x:lemmatizer(x))

test['abstract']=test['abstract'].apply(lambda x:lemmatizer(x))

In [20]:
train.head()

Unnamed: 0,paperId,title,authorId,authorName,abstract,year,venue
0,0b341b6938308a6d5f47edf490f6e46eae3835fa,Detecting linguistic idiosyncratic interests i...,3188285,Masoud Rouhizadeh,"[child, autism, spectrum, disorder, often, exh...",2014,CLPsych@ACL
1,c682727ee058aadbe9dbf838dcb036322818f588,Bigrams and BiLSTMs Two Neural Networks for Se...,2782720,Yuri Bizzoni,"[present, compare, two, alternative, deep, neu...",2018,Fig-Lang@NAACL-HLT
2,0f9b5b32229a7245e43754430c0c88f8e7f0d8af,In Factuality: Efficient Integration of Releva...,144748442,Peter Vickers,"[visual, question, answering, vqa, method, aim...",2021,ACL
3,7e8b4cfdc03b59ece2d6b33a217f0abd47f708d9,Variational Graph Autoencoding as Cheap Superv...,46331602,Irene Li,"[coreference, resolution, semantic, graph, lik...",2022,ACL
4,07588dd5d0252c7abc99b3834a81bf23741ead4b,LIMIT-BERT : Linguistics Informed Multi-Task BERT,30887404,Junru Zhou,"[paper, present, linguistics, informed, multit...",2019,FINDINGS


In [21]:
train_abstract = train['abstract']
test_abstract = test['abstract']

In [22]:
train_abstract = pd.Series(train_abstract, dtype = "string")
test_abstract = pd.Series(test_abstract, dtype = "string")

In [23]:
train_abstract

0        ['child', 'autism', 'spectrum', 'disorder', 'o...
1        ['present', 'compare', 'two', 'alternative', '...
2        ['visual', 'question', 'answering', 'vqa', 'me...
3        ['coreference', 'resolution', 'semantic', 'gra...
4        ['paper', 'present', 'linguistics', 'informed'...
                               ...                        
12124    ['defacto', 'standard', 'decoding', 'method', ...
12125    ['report', 'method', 'used', 'result', 'obtain...
12126    ['describe', 'second', 'iwpt', 'task', 'endtoe...
12127    ['abstract', 'paper', 'investigates', 'ability...
12128    ['framenet', 'best', 'currently', 'operational...
Name: abstract, Length: 12129, dtype: string

In [24]:
# CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer

In [25]:
vectorizer = TfidfVectorizer(ngram_range=(1,2), min_df=3)

In [26]:
ngrams_train = vectorizer.fit_transform(train_abstract)

In [27]:
ngrams_train

<12129x60647 sparse matrix of type '<class 'numpy.float64'>'
	with 1108974 stored elements in Compressed Sparse Row format>

In [28]:
ngrams_test = vectorizer.transform(test_abstract)

In [29]:
ngrams_test = pd.DataFrame(ngrams_test.toarray(), columns = vectorizer.get_feature_names())



In [30]:
#Splitting the training set to training and validation
from sklearn.model_selection import train_test_split

X = ngrams_train
y = train['authorId']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.1)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

(10916, 60647) (1213, 60647) (10916,) (1213,)


## Complement Naive Bayes

In [35]:
# Importing necessary libraries
from sklearn.naive_bayes import ComplementNB
# instantiating the model with Complement Naive Bayes..
model = ComplementNB()
# training the model...
model = model.fit(X_train, y_train)

In [36]:
print('Complement Naive Bayes accuracy on validation set:')
print(round((model.score(X_val, y_val)*100) ,2) , '%')

Complement Naive Bayes accuracy on validation set:
8.74 %


## SGD classifier

In [37]:
from sklearn.linear_model import SGDClassifier
sgdc = SGDClassifier(loss = "log")
sgd_model = sgdc.fit(X_train, y_train)



In [38]:
print('Stochastic Gradient Descent Classifier accuracy on validation set:')
print(round((sgd_model.score(X_val, y_val)*100) ,2) , '%')

Stochastic Gradient Descent Classifier accuracy on validation set:
6.02 %


## KNeighbors classifier

In [39]:
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=1)
neigh = neigh.fit(X_train, y_train)

In [40]:
print('KNeighbors Classifier accuracy on validation set:')
print(round((neigh.score(X_val, y_val)*100) ,2) , '%')

KNeighbors Classifier accuracy on validation set:
0.99 %


## Multinomial Naive Bayes

In [41]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb = mnb.fit(X_train, y_train)

In [42]:
print('Multinomial Naive Bayes accuracy on validation set:')
print(round((mnb.score(X_val, y_val)*100) ,2) , '%')

Multinomial Naive Bayes accuracy on validation set:
0.49 %


## Linear SVC

In [43]:
from sklearn.svm import LinearSVC
svc = LinearSVC()
svc = svc.fit(X_train, y_train)

In [44]:
print('Linear SVC accuracy on validation set:')
print(round((svc.score(X_val, y_val)*100) ,2) , '%')

Linear SVC accuracy on validation set:
11.13 %


In [45]:
from sklearn.svm import LinearSVC
svc_largeC = LinearSVC(C = 7)
svc_largeC = svc_largeC.fit(X_train, y_train)

In [46]:
print('Linear SVC with higher regularization parameter value accuracy on validation set:')
print(round((svc_largeC.score(X_val, y_val)*100) ,2) , '%')

# Higher accuracy.

Linear SVC with higher regularization parameter value accuracy on validation set:
11.21 %


## Training the best fitting model on the whole training dataset

In [31]:
from sklearn.svm import LinearSVC
svc = LinearSVC(C = 7)
svc = svc.fit(X, y)

In [32]:
# Predicting on test
y_pred = svc.predict(ngrams_test)
print(y_pred[:10])



[1785372925 2064493724    2011442    1900163 2056582888    1703046
   23181472    2790926  144130537    9120873]


In [33]:
# Changing type for the dictionary
y_pred = y_pred.astype(str)

## Creating a dictionary, matching the required format

In [34]:
# Getting paperId and authorId for ngrams_test
ngrams_test['authorId'] = y_pred.tolist()
ngrams_test['paperId'] = test['paperId']
predictions = ngrams_test[['paperId', 'authorId']]

# Creating a list of dictionaries for each row
predictions = predictions.to_dict('records')


In [35]:
# Saving predictions as a JSON file (https://stackabuse.com/saving-text-json-and-csv-to-a-file-in-python/)
import json

with open('predictions.json', 'w', encoding='utf-8') as json_file:
    json.dump(predictions, json_file, indent=4)