<a href="https://colab.research.google.com/github/greyson-newton/youtube_ai/blob/master/arXiv_clustering.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Greyson Newton - Scientific Paper Clustering


---
Content:


1.   Load data
2.   Data cleaning / feature engineering
3.   NLP data preprocessing
4.   Vectorization & dimensionality reduction with PCA
5.   Clustering
6.   t-SNE vs. UMAP


# Preamble

In [None]:
# Mount Google Drive at /content/drive so files stored on Drive are reachable
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

# Use the absolute path of the Drive mount (/content/drive, see drive.mount)
# so this cell is idempotent: the relative path only resolved from /content,
# which meant re-running the cell after the first chdir raised
# FileNotFoundError.
os.chdir("/content/drive/My Drive/projects-deep_learning/ArXiv-NLP")

# Imports

In [None]:
!python -m pip install 'fsspec>=0.3.3'
!pip install umap-learn




In [None]:
# data processing
import logging
logger = logging.getLogger("spacy")
logger.setLevel(logging.ERROR)

import dask.bag as db
import json
import pandas as pd
import string
from tqdm import tqdm




In [None]:
# There are spaCy models built specifically for preprocessing scientific texts.
# The spaCy docs list one for this purpose: https://spacy.io/universe/project/scispacy
# Download the en_core_sci_lg model to preprocess the abstracts.
from IPython.utils import io
# NOTE(review): the install below is commented out, so the
# `import en_core_sci_lg` further down presumably only succeeds when the model
# is already installed in the runtime — confirm.
# with io.capture_output() as captured:
#     !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_lg-0.4.0.tar.gz


# NLP Processing and Vectorization
# Import the NLP libraries and the spaCy package used to preprocess the abstract text
import spacy
from spacy.lang.en.stop_words import STOP_WORDS # import the common list of stop words
import en_core_sci_lg  # import the downloaded model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

In [None]:
# Clustering
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from scipy.spatial.distance import cdist
from sklearn import metrics

from umap import UMAP
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

import seaborn as sns

# Load Data

In [None]:
# Read the metadata file line by line into a dask bag and parse every line as JSON.
docs = db.read_text('data/arxiv_data.json').map(json.loads)
# Total number of documents (the run recorded in this notebook reported 1963596)
docs.count().compute()

1963596

In [None]:
# Look at one document to see which metadata fields each record carries:
docs.take(1)

({'abstract': '  A fully differential calculation in perturbative quantum chromodynamics is\npresented for the production of massive photon pairs at hadron colliders. All\nnext-to-leading order perturbative contributions from quark-antiquark,\ngluon-(anti)quark, and gluon-gluon subprocesses are included, as well as\nall-orders resummation of initial-state gluon radiation valid at\nnext-to-next-to-leading logarithmic accuracy. The region of phase space is\nspecified in which the calculation is most reliable. Good agreement is\ndemonstrated with data from the Fermilab Tevatron, and predictions are made for\nmore detailed tests with CDF and DO data. Predictions are shown for\ndistributions of diphoton pairs produced at the energy of the Large Hadron\nCollider (LHC). Distributions of the diphoton pairs from the decay of a Higgs\nboson are contrasted with those produced from QCD processes at the LHC, showing\nthat enhanced sensitivity to the signal can be obtained with judicious\nselection 

# Process Data

In [None]:
# The full dataset is very large, so prototyping starts with a subset that is
# easier to handle. This procedure was recommended in the arXiv dataset itself.

def get_latest_version(x):
    """Return the 'created' string of the last entry in ``x['versions']``."""
    return x['versions'][-1]['created']


def trim(x):
    """Keep only the metadata fields used later; 'categories' is split on spaces."""
    return {
        'id': x['id'],
        'authors': x['authors'],
        'title': x['title'],
        'doi': x['doi'],
        'category': x['categories'].split(' '),
        'abstract': x['abstract'],
    }


def _latest_version_after_2018(x):
    """True when the 4th space-separated token of the latest 'created' string, as an int, is > 2018."""
    return int(get_latest_version(x).split(' ')[3]) > 2018


columns = ['id','category','abstract']  # not referenced again in this cell

# keep the papers whose latest version was created in 2019 or later, then trim each record
docs_df = docs.filter(_latest_version_after_2018).map(trim).compute()

# convert to pandas
docs_df = pd.DataFrame(docs_df)

# save the trimmed dataset so the trimming step can be skipped later
docs_df.to_csv("trimmed_arxiv_docs.csv", index=False)

In [None]:
# Let's have a look at the first 5 rows of the trimmed frame:


docs_df.head()

Unnamed: 0,id,authors,title,doi,category,abstract
0,704.0479,T.Geisser,The affine part of the Picard scheme,,"[math.AG, math.KT]",We describe the maximal torus and maximal un...
1,704.1445,Yasha Gindikin and Vladimir A. Sablikov,Deformed Wigner crystal in a one-dimensional q...,10.1103/PhysRevB.76.045122,"[cond-mat.str-el, cond-mat.mes-hall]",The spatial Fourier spectrum of the electron...
2,705.0033,"Nikos Frantzikinakis, Randall McCutcheon",Ergodic Theory: Recurrence,,[math.DS],We survey the impact of the Poincar\'e recur...
3,705.0344,J. P. Pridham,Unifying derived deformation theories,,[math.AG],We develop a framework for derived deformati...
4,705.0825,Ram Gopal Vishwakarma (Zacatecas University),Einstein's Theory of Gravity in the Presence o...,10.1007/s10509-009-0016-8,"[gr-qc, astro-ph, hep-th]",The mysterious `dark energy' needed to expla...


# NLProcess Data

The trimmed and feature-engineered data is saved to 'data/clustered_processed_papers.csv'.

In [None]:
# Read the arXiv ids as strings. Many ids look like numbers (e.g. "0704.0479"),
# so pandas' type inference turns them into floats, which drops leading and
# trailing zeros; others are not numeric at all, which presumably is what
# triggers the mixed-type DtypeWarning on this read.
df = pd.read_csv("./trimmed_arxiv_docs.csv", dtype={"id": str})

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Column dtypes, non-null counts and memory usage of the reloaded frame
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 526647 entries, 0 to 526646
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   id        526647 non-null  object
 1   authors   526647 non-null  object
 2   title     526647 non-null  object
 3   doi       181827 non-null  object
 4   category  526647 non-null  object
 5   abstract  526647 non-null  object
dtypes: object(6)
memory usage: 24.1+ MB


In [None]:
# (number of rows, number of columns)
df.shape

(526647, 6)

In [None]:
# Adding word counts of each abstract could be a useful feature
# Function to count the words in a given text that carry a given part-of-speech tag
def check_pos_tag(x, flag):
    """Count the words in ``x`` whose part-of-speech tag belongs to family ``flag``.

    Parameters
    ----------
    x : str
        Text to tag. A non-string value (e.g. NaN for a missing abstract)
        yields a count of 0.
    flag : str
        Tag family to count: 'noun', 'pron', 'verb', 'adj' or 'adv'.

    Returns
    -------
    int
        Number of (word, tag) pairs produced by TextBlob whose tag is in the
        requested family.

    Raises
    ------
    ValueError
        If ``flag`` is not one of the known families.
    ImportError
        If the textblob package is not installed.
    """
    pos_family = {
        'noun': ['NN', 'NNS', 'NNP', 'NNPS'],
        'pron': ['PRP', 'PRP$', 'WP', 'WP$'],
        'verb': ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'],
        'adj': ['JJ', 'JJR', 'JJS'],
        'adv': ['RB', 'RBR', 'RBS', 'WRB'],
    }
    if flag not in pos_family:
        raise ValueError(
            "unknown part-of-speech family %r; expected one of %s"
            % (flag, sorted(pos_family))
        )
    # Best-effort for bad data only: a missing abstract simply has no words.
    if not isinstance(x, str):
        return 0

    # textblob is not imported by any import cell of the notebook. Previously a
    # bare ``except: pass`` swallowed the resulting NameError, so every count
    # silently came back as 0. Import it here and let real failures surface.
    import textblob

    wanted = set(pos_family[flag])
    return sum(1 for _word, tag in textblob.TextBlob(x).tags if tag in wanted)

def feature_engineering(df):
    """Add simple text-statistics columns computed from ``df['abstract']``.

    The columns are written onto ``df`` in place; ``df`` is also returned so
    the call can be chained.

    Added columns
    -------------
    char_count, word_count : length in characters / whitespace-separated words
    word_density : char_count / (word_count + 1)
    punctuation_count : characters that are in ``string.punctuation``
    title_word_count : title-cased words in the abstract
    upper_case_word_count : fully upper-case words in the abstract
    noun_count, verb_count, adj_count, adv_count, pron_count :
        counts returned by ``check_pos_tag`` for each tag family

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'abstract' column of strings.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the columns above added.
    """
    # Read from the frame that was passed in. The function used to read the
    # global ``docs_df`` instead, which fails with NameError on a fresh kernel
    # where only the trimmed CSV is reloaded, and silently computes features
    # from the wrong rows for any other frame.
    abstracts = df['abstract']
    punctuation = set(string.punctuation)

    df['char_count'] = abstracts.apply(len)
    df['word_count'] = abstracts.apply(lambda x: len(x.split()))
    df['word_density'] = df['char_count'] / (df['word_count'] + 1)
    df['punctuation_count'] = abstracts.apply(
        lambda x: sum(1 for ch in x if ch in punctuation))
    df['title_word_count'] = abstracts.apply(
        lambda x: sum(1 for wrd in x.split() if wrd.istitle()))
    df['upper_case_word_count'] = abstracts.apply(
        lambda x: sum(1 for wrd in x.split() if wrd.isupper()))

    # Same column order as before: noun, verb, adj, adv, pron.
    for family in ('noun', 'verb', 'adj', 'adv', 'pron'):
        df[family + '_count'] = abstracts.apply(
            lambda x, fam=family: check_pos_tag(x, fam))
    return df
# Add the text-statistics columns to df (feature_engineering assigns them onto the frame it is given).
feature_engineering(df)  
# Summary of the raw abstracts. NOTE: `include` is ignored when describe() is called on a Series.
df['abstract'].describe(include='all')


count                                                526647
unique                                               526544
top         arXiv admin note: This submission has been w...
freq                                                      3
Name: abstract, dtype: object

In [None]:
# punctuations = string.punctuation #list of punctuation to remove from text
# stopwords = list(STOP_WORDS)
# # Parser
# parser = en_core_sci_lg.load()
# parser.max_length = 7000000 #Limit the size of the parser
# def spacy_tokenizer(sentence):
#     ''' Function to preprocess text of scientific papers 
#         (e.g Removing Stopword and puntuations)'''
#     mytokens = parser(sentence)
#     mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ] # transform to lowercase and then split the scentence
#     mytokens = [ word for word in mytokens if word not in stopwords and word not in punctuations ] #remove stopsword an punctuation
#     mytokens = " ".join([i for i in mytokens]) 
#     return mytokens

In [None]:
# tqdm.pandas()
# df["processed_text"] = df["abstract"].progress_apply(spacy_tokenizer)

100%|██████████| 526647/526647 [6:10:33<00:00, 23.69it/s]


# Vectorization

In [None]:
# Import the trimmed and feature-engineered data. The file was written with:
# df.to_csv('data/clustered_processed_papers.csv')
# Read the arXiv ids as strings so that number-like ids are not inferred as
# floats (this is presumably the column behind the mixed-type DtypeWarning
# on this read — confirm).
df = pd.read_csv('data/clustered_processed_papers.csv', dtype={'id': str})
# Drop the first column: to_csv was called without index=False, so it holds the old index.
df = df.iloc[: , 1:]
def vectorize(text, maxx_features):
    """Return the TF-IDF matrix of ``text``.

    A fresh ``TfidfVectorizer`` limited to ``maxx_features`` features is fitted
    on ``text`` and the transformed documents are returned; the fitted
    vectorizer itself is discarded.
    """
    return TfidfVectorizer(max_features=maxx_features).fit_transform(text)

  interactivity=interactivity, compiler=compiler, result=result)


In [None]:
# Vectorize every processed abstract with TF-IDF.
# The feature cap is arbitrary -> a hyperparameter that could be optimised (?)
max_tfidf_features = 2 ** 12
text = df['processed_text'].values
X = vectorize(text, max_tfidf_features)
X.shape

(526647, 4096)

In [None]:
from sklearn.decomposition import IncrementalPCA

# Project the TF-IDF matrix onto its first 2 principal components
# (n_components=2 is a fixed component count, not a variance target).
# IncrementalPCA works through the rows in mini-batches of `batch_size` and
# accepts the sparse matrix directly, densifying one batch at a time. Passing
# X as-is avoids X.toarray(), which would first materialise the whole
# 526647 x 4096 matrix as a dense array (~17 GB as float64).
pca = IncrementalPCA(n_components=2, batch_size=100)
X_reduced = pca.fit_transform(X)
X_reduced.shape

# Clusterization

In [None]:
r_seed = 24
k_values = range(1, 50)
# inertia_ of the fitted KMeans for every k in k_values, in the same order
cluster_errors = []

for n_clusters in k_values:
    # `n_jobs` is no longer passed: it was deprecated in scikit-learn 0.23 and
    # removed in 1.0 (where it raises TypeError); n_jobs=1 was the default
    # behaviour anyway. The single-step Pipeline and the discarded predict()
    # call were dropped because only inertia_ is used.
    model = KMeans(n_clusters=n_clusters, random_state=r_seed, verbose=0)
    model.fit(X_reduced)
    cluster_errors.append(model.inertia_)

In [None]:
plt.clf()
# cluster_errors[0] belongs to k=1 (the scan runs over range(1, 50)), so plot
# against 1..len(cluster_errors). Without explicit x values the axis started at
# 0 and every k was read off one too low.
plt.plot(range(1, len(cluster_errors) + 1), cluster_errors, "o-")
plt.xlabel("k_clusters")
plt.ylabel("sum sq distances from mean")
plt.savefig('results-clusterization/figs/k_cluster_error.png')
plt.show()

In [None]:
# Final clustering with the k chosen from the elbow plot.
k = 20
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(X_reduced)
y_pred = kmeans.labels_  # cluster index assigned to every row of X_reduced
df['kmean_clusters'] = y_pred

# t-SNE vs. UMAP

In [None]:
# UMAP definition. random_state pins the otherwise stochastic embedding so a
# re-run reproduces the same layout (the t-SNE embedding is seeded with 42 too).
umap_embeddings = UMAP(n_neighbors=100, min_dist=0.3, n_components=2, random_state=42)
X_umap = umap_embeddings.fit_transform(X_reduced)

In [None]:
# t-SNE embedding of the TF-IDF matrix; random_state fixes the seed.
# NOTE(review): X.toarray() materialises the whole (526647, 4096) matrix as a
# dense array before t-SNE runs, and this cell embeds the full TF-IDF matrix X
# while the UMAP cell embeds X_reduced — confirm that both are intended.
tsne = TSNE(verbose=1, perplexity=100, random_state=42)
X_embedded = tsne.fit_transform(X.toarray())

In [None]:
# sns settings
sns.set(rc={'figure.figsize':(15,15)})

# colors
palette = sns.color_palette("bright", 1)

# plot
# seaborn applies `palette` only to the hue mapping; with no `hue` it was
# ignored, so the single colour is passed through `color` instead.
sns.scatterplot(x=X_embedded[:,0], y=X_embedded[:,1], color=palette[0])
plt.title('t-SNE without Labels')
plt.savefig("results-clusterization/figs/t-sne_arxvid.png")
plt.show()

In [None]:
# sns settings
sns.set(rc={'figure.figsize':(15,15)})

# colors
palette = sns.color_palette("bright", 1)

# plot
# seaborn applies `palette` only to the hue mapping; with no `hue` it was
# ignored, so the single colour is passed through `color` instead.
sns.scatterplot(x=X_umap[:,0], y=X_umap[:,1], color=palette[0])
plt.title('umap without Labels')
plt.savefig("results-clusterization/figs/umap_arxvid.png")
plt.show()

In [None]:
%matplotlib inline

# sns settings
sns.set(rc={'figure.figsize':(15,15)})

# colors
palette = sns.hls_palette(20, l=.4, s=.9)

# plot
sns.scatterplot(x=X_embedded[:,0], y=X_embedded[:,1], hue=y_pred, legend='full', palette=palette)
plt.title('t-SNE with Kmeans Labels')
plt.savefig("results-clusterization/figs/cluster_tsne.png")
plt.show()