# <center> Topic modeling of scientific research papers

# Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import os
import re
import matplotlib.pyplot as plt
import nltk
import gensim

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.tokenize import casual_tokenize
from nltk.corpus import stopwords
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from tqdm import tqdm
from gensim.models import Word2Vec

nltk.download("punkt")
nltk.download('omw-1.4')
nltk.download("stopwords")
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Azus\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Azus\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Azus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Azus\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# EDA

In [2]:
df=pd.read_csv("data/train.csv", index_col="ID")
df.head(3)

Unnamed: 0_level_0,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0


In [3]:
df.shape

(20972, 8)

We have 20972 instances in our training set

Now let's seperate the targets from the inputs

In [4]:
# We'll only use the Abstract since it generally contains more information than the title 

X=pd.DataFrame(df["ABSTRACT"])  # So it still remains a dataframe

y=df.drop(["TITLE", "ABSTRACT"], axis=1)

### Cleaning (stopwords ..etc)

In [5]:
def cleaned_text(text):
    clean = re.sub("\n"," ",text)
    clean=clean.lower()
    clean=re.sub(r"[~.,%/:;?_&+*=!-]"," ",clean)
    clean=re.sub("[^a-z]"," ",clean)
    clean=clean.lstrip()
    clean=re.sub("\s{2,}"," ",clean)
    return clean

X["cleaned_abstract"]=X["ABSTRACT"].apply(cleaned_text)

In [6]:
X["cleaned_abstract"] = X["cleaned_abstract"].apply(lambda x: ' '.join([word for word in x.split() if len(word)>3]))

Now we will do the next step cleaning process which is stop word removal and then we will lemmatize the words.


In [7]:
stop=stopwords.words('english')
stop.append("also")
X["stop_removed_abstract"]=X["cleaned_abstract"].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

Now to tokenize and lemmatize

In [8]:
X["tokenized"]=X["stop_removed_abstract"].apply(lambda x: nltk.word_tokenize(x))

In [9]:
def word_lemmatizer(text):
    lem_text = [WordNetLemmatizer().lemmatize(i,pos='v') for i in text]
    return lem_text

X["lemmatized"]=X["tokenized"].apply(lambda x: word_lemmatizer(x))
X["lemmatize_joined"]=X["lemmatized"].apply(lambda x: ' '.join(x))

Now we'll tokenize and create our BOW vector, however because of the large vocabulary in our dataset we'll only restrict our BOW to 3000 features (words)

### BOW

In [10]:
counter = CountVectorizer(tokenizer=casual_tokenize, max_features=3000)
bow_docs = pd.DataFrame(counter.fit_transform(raw_documents=X.lemmatize_joined).toarray(), columns=counter.get_feature_names())



In [11]:
bow_docs.head(3)

Unnamed: 0,abelian,ability,able,absence,absolute,absorb,absorption,abstract,abstraction,abundance,...,worst,would,write,year,years,yield,young,zero,zeta,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Now we'll train a Word2Vec model, using these BOW vectors

## Word2Vec

In [12]:
tokens = X["lemmatize_joined"].apply(lambda x: nltk.word_tokenize(x))

In [13]:
w2v_model= Word2Vec(tokens, min_count=60,vector_size=300,
                     window=10,
                     alpha=0.03, 
                     min_alpha=0.0007,
                     workers = 4,
                     seed = 42)

In [14]:
w2v = w2v_model.wv[bow_docs.columns]
w2v = pd.DataFrame(w2v, index=bow_docs.columns)
w2v.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
abelian,0.071531,-0.384007,-0.202899,-0.326325,0.111853,0.344678,0.414825,0.386427,-0.87954,0.736188,...,-0.814025,0.145155,0.694547,-0.350781,-0.938837,0.108248,-0.011665,-0.03307,-0.012399,0.454002
ability,-0.006661,0.807935,0.129141,-0.836222,-0.522488,-0.199422,0.475836,-0.122656,0.065119,0.015068,...,-0.413749,0.112707,-0.764179,-0.55489,0.103974,0.307676,-1.342367,-0.247328,0.759758,0.188017
able,-0.69643,0.612448,0.17776,-0.369662,-0.144763,0.177897,-0.121177,0.084033,-0.000316,-0.214076,...,0.060553,0.349389,-0.575268,-0.31355,1.074376,0.20398,-0.264571,0.502461,0.512547,-0.079062


In [15]:
corpus = {}
docs=[]
for i in X.lemmatized.index:
    doc_vec=0
    for j in range(0,len(X.lemmatized[i])):
        if X.lemmatized[i][j] in w2v.index:
            doc_vec=doc_vec+w2v.loc[X.lemmatized[i][j]].values
            doc_vec = doc_vec.tolist()
            
    corpus['sent{}'.format(i)] =  doc_vec

In [16]:
doc_vec = pd.DataFrame(corpus).T
doc_vec.head(3)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
sent1,-3.062022,42.014184,-13.941606,18.0819,-41.600676,-8.73626,9.655307,-37.089094,-49.5946,-22.86671,...,-9.243809,2.117328,-22.454697,-18.458828,30.52143,14.686189,28.393453,20.410135,-3.8013,18.5686
sent2,6.600168,10.394864,7.854636,8.534307,-9.937889,-7.5845,14.686002,0.440937,6.30374,3.194464,...,-15.172449,6.711353,-4.736738,-6.6079,-6.457617,2.23651,-1.530686,1.243183,-11.060441,7.263927
sent3,4.507501,-0.835507,-0.655553,-16.620234,-12.343077,2.279687,7.447342,22.037397,-32.356278,14.367546,...,-4.200364,-0.330609,5.144783,-5.258108,-14.352036,-10.11513,-0.953852,-1.778798,-15.19259,9.623202


In [17]:
doc_vec.shape

(20972, 300)

Now that we have our document vectors, we can go ahead and use the features to train the classifiers

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_test, y_train , y_test = train_test_split(doc_vec, y.values, test_size=0.2)

# Classification

Now, because we have a **multi-label** classification problem, we'll use the **BinaryRelavance** function of the scikit-learn multi package

In [19]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.metrics import accuracy_score, f1_score

model_performance = pd.DataFrame(columns=['Accuracy','F-1 Score (micro)','F1-Score (weighted)'])

### Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression


lgr=BinaryRelevance(LogisticRegression())
lgr.fit(X_train, y_train)

# predict
predictions = lgr.predict(X_test)

acc=accuracy_score(y_test,predictions)
f1_micro=f1_score(y_test, predictions, average="micro")
f1_weighted=f1_score(y_test, predictions, average="weighted")

model_performance.loc['LogisticRegression'] = [acc, f1_micro, f1_weighted]

print('Accuracy = ', acc)
print('F1 score (micro) is ',f1_micro)
print('F1 score (weighted) is ',f1_weighted)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Accuracy =  0.6481525625744935
F1 score (micro) is  0.8036556603773585
F1 score (weighted) is  0.8007981822974667


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Random  Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier


rf=BinaryRelevance(RandomForestClassifier())
rf.fit(X_train, y_train)

# predict
predictions = rf.predict(X_test)

acc=accuracy_score(y_test,predictions)
f1_micro=f1_score(y_test, predictions, average="micro")
f1_weighted=f1_score(y_test, predictions, average="weighted")

model_performance.loc['RandomForestClassifier'] = [acc, f1_micro, f1_weighted]

print('Accuracy = ', acc)
print('F1 score (micro) is ',f1_micro)
print('F1 score (weighted) is ',f1_weighted)

Accuracy =  0.6617401668653159
F1 score (micro) is  0.8052990766760337
F1 score (weighted) is  0.7964995411195676


### XGBoost

In [23]:
from xgboost import XGBClassifier

rf=BinaryRelevance(XGBClassifier())
rf.fit(X_train, y_train)

# predict
predictions = rf.predict(X_test)

acc=accuracy_score(y_test,predictions)
f1_micro=f1_score(y_test, predictions, average="micro")
f1_weighted=f1_score(y_test, predictions, average="weighted")

model_performance.loc['XGBClassifier'] = [acc, f1_micro, f1_weighted]

print('Accuracy = ', acc)
print('F1 score (micro) is ',f1_micro)
print('F1 score (weighted) is ',f1_weighted)

Accuracy =  0.6624553039332539
F1 score (micro) is  0.81234183375511
F1 score (weighted) is  0.8090999527813589


### Results

In [24]:
model_performance.style.background_gradient(cmap='coolwarm').format({'Accuracy': '{:.2%}',
                                                                     'F1-Score (micro)': '{:.2%}',
                                                                     'F1-Score (weighted)': '{:.2%}',
                                                                     })

Unnamed: 0,Accuracy,F-1 Score (micro),F1-Score (weighted)
LogisticRegression,64.82%,0.803656,80.08%
RandomForestClassifier,66.17%,0.805299,79.65%
XGBClassifier,66.25%,0.812342,80.91%
