In [None]:
!pip install transformers==3.1.0
import os, json, gc, re, random



In [None]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
#from sklearn.preprocessing import MultiLabelBinarizer
#from sklearn.model_selection import train_test_split
#from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt
%matplotlib inline
import plotly.express as px
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

import logging
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [None]:
from transformers import pipeline

In [None]:
data_file = '/content/drive/MyDrive/Colab Notebooks/arxiv-metadata-oai-snapshot.json'

""" Using `yield` to load the JSON file in a loop to prevent Python memory issues if JSON is loaded directly"""

def get_metadata(path=None):
    """Lazily yield the raw lines of a JSON-lines metadata file.

    The file is read one line at a time so the whole snapshot never has to be
    held in memory. Each yielded string is one un-parsed record; callers are
    expected to pass it to ``json.loads``.

    Args:
        path: Optional file to read. When omitted, the module-level
            ``data_file`` is used, which keeps existing ``get_metadata()``
            calls working unchanged.

    Yields:
        str: one non-blank line of the file, including its trailing newline.
    """
    source = data_file if path is None else path
    # Explicit encoding so the result does not depend on the platform locale.
    with open(source, 'r', encoding='utf-8') as f:
        for line in f:
            # A blank line would make json.loads fail in every consumer.
            if line.strip():
                yield line

In [None]:
# Build the zero-shot classification pipeline. Device 0 is the first CUDA GPU;
# fall back to -1 (CPU) so the cell still runs on a runtime without a GPU.
import torch

classifier = pipeline("zero-shot-classification", device=0 if torch.cuda.is_available() else -1)

Some weights of the model checkpoint at facebook/bart-large-mnli were not used when initializing BartForSequenceClassification: ['model.encoder.version', 'model.decoder.version']
- This IS expected if you are initializing BartForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BartForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Peek at the first record to see which fields are available.
# A throwaway generator is used for the peek so that `metadata` (consumed by
# later cells) still starts at the first paper instead of the second.
peek = get_metadata()
try:
    first_paper = next(peek, None)
finally:
    peek.close()  # closes the underlying file handle right away

if first_paper is not None:
    for k, v in json.loads(first_paper).items():
        print(f'{k}: {v} \n')

metadata = get_metadata()

id: 0704.0001 

submitter: Pavel Nadolsky 

authors: C. Bal\'azs, E. L. Berger, P. M. Nadolsky, C.-P. Yuan 

title: Calculation of prompt diphoton production cross sections at Tevatron and
  LHC energies 

comments: 37 pages, 15 figures; published version 

journal-ref: Phys.Rev.D76:013009,2007 

doi: 10.1103/PhysRevD.76.013009 

report-no: ANL-HEP-PR-07-12 

categories: hep-ph 

license: None 

abstract:   A fully differential calculation in perturbative quantum chromodynamics is
presented for the production of massive photon pairs at hadron colliders. All
next-to-leading order perturbative contributions from quark-antiquark,
gluon-(anti)quark, and gluon-gluon subprocesses are included, as well as
all-orders resummation of initial-state gluon radiation valid at
next-to-next-to-leading logarithmic accuracy. The region of phase space is
specified in which the calculation is most reliable. Good agreement is
demonstrated with data from the Fermilab Tevatron, and predictions are made for
m

In [None]:
# Lookup table from arXiv category code (the space-separated tokens found in a
# record's `categories` field) to a human-readable name. The names are what the
# classification cells pass to the zero-shot classifier as candidate labels.
# NOTE(review): five display names occur twice (cs.IT/math.IT, cs.NA/math.NA,
# cs.LG/stat.ML, math.MP/math-ph, math.ST/stat.TH), so
# list(category_map.values()) contains duplicates.
# NOTE(review): any code that appears in the data but not here makes
# `category_map[code]` raise KeyError -- this map is presumably not exhaustive;
# verify against the dataset.
category_map = {'astro-ph': 'Astrophysics',
                'astro-ph.CO': 'Cosmology and Nongalactic Astrophysics',
                'astro-ph.EP': 'Earth and Planetary Astrophysics',
                'astro-ph.GA': 'Astrophysics of Galaxies',
                'astro-ph.HE': 'High Energy Astrophysical Phenomena',
                'astro-ph.IM': 'Instrumentation and Methods for Astrophysics',
                'astro-ph.SR': 'Solar and Stellar Astrophysics',
                'cond-mat.dis-nn': 'Disordered Systems and Neural Networks',
                'cond-mat.mes-hall': 'Mesoscale and Nanoscale Physics',
                'cond-mat.mtrl-sci': 'Materials Science',
                'cond-mat.other': 'Other Condensed Matter',
                'cond-mat.quant-gas': 'Quantum Gases',
                'cond-mat.soft': 'Soft Condensed Matter',
                'cond-mat.stat-mech': 'Statistical Mechanics',
                'cond-mat.str-el': 'Strongly Correlated Electrons',
                'cond-mat.supr-con': 'Superconductivity',
                'cs.AI': 'Artificial Intelligence',
                'cs.AR': 'Hardware Architecture',
                'cs.CC': 'Computational Complexity',
                'cs.CE': 'Computational Engineering, Finance, and Science',
                'cs.CG': 'Computational Geometry',
                'cs.CL': 'Computation and Language',
                'cs.CR': 'Cryptography and Security',
                'cs.CV': 'Computer Vision and Pattern Recognition',
                'cs.CY': 'Computers and Society',
                'cs.DB': 'Databases',
                'cs.DC': 'Distributed, Parallel, and Cluster Computing',
                'cs.DL': 'Digital Libraries',
                'cs.DM': 'Discrete Mathematics',
                'cs.DS': 'Data Structures and Algorithms',
                'cs.ET': 'Emerging Technologies',
                'cs.FL': 'Formal Languages and Automata Theory',
                'cs.GL': 'General Literature',
                'cs.GR': 'Graphics',
                'cs.GT': 'Computer Science and Game Theory',
                'cs.HC': 'Human-Computer Interaction',
                'cs.IR': 'Information Retrieval',
                'cs.IT': 'Information Theory',
                'cs.LG': 'Machine Learning',
                'cs.LO': 'Logic in Computer Science',
                'cs.MA': 'Multiagent Systems',
                'cs.MM': 'Multimedia',
                'cs.MS': 'Mathematical Software',
                'cs.NA': 'Numerical Analysis',
                'cs.NE': 'Neural and Evolutionary Computing',
                'cs.NI': 'Networking and Internet Architecture',
                'cs.OH': 'Other Computer Science',
                'cs.OS': 'Operating Systems',
                'cs.PF': 'Performance',
                'cs.PL': 'Programming Languages',
                'cs.RO': 'Robotics',
                'cs.SC': 'Symbolic Computation',
                'cs.SD': 'Sound',
                'cs.SE': 'Software Engineering',
                'cs.SI': 'Social and Information Networks',
                'cs.SY': 'Systems and Control',
                'econ.EM': 'Econometrics',
                'eess.AS': 'Audio and Speech Processing',
                'eess.IV': 'Image and Video Processing',
                'eess.SP': 'Signal Processing',
                'gr-qc': 'General Relativity and Quantum Cosmology',
                'hep-ex': 'High Energy Physics - Experiment',
                'hep-lat': 'High Energy Physics - Lattice',
                'hep-ph': 'High Energy Physics - Phenomenology',
                'hep-th': 'High Energy Physics - Theory',
                'math.AC': 'Commutative Algebra',
                'math.AG': 'Algebraic Geometry',
                'math.AP': 'Analysis of PDEs',
                'math.AT': 'Algebraic Topology',
                'math.CA': 'Classical Analysis and ODEs',
                'math.CO': 'Combinatorics',
                'math.CT': 'Category Theory',
                'math.CV': 'Complex Variables',
                'math.DG': 'Differential Geometry',
                'math.DS': 'Dynamical Systems',
                'math.FA': 'Functional Analysis',
                'math.GM': 'General Mathematics',
                'math.GN': 'General Topology',
                'math.GR': 'Group Theory',
                'math.GT': 'Geometric Topology',
                'math.HO': 'History and Overview',
                'math.IT': 'Information Theory',
                'math.KT': 'K-Theory and Homology',
                'math.LO': 'Logic',
                'math.MG': 'Metric Geometry',
                'math.MP': 'Mathematical Physics',
                'math.NA': 'Numerical Analysis',
                'math.NT': 'Number Theory',
                'math.OA': 'Operator Algebras',
                'math.OC': 'Optimization and Control',
                'math.PR': 'Probability',
                'math.QA': 'Quantum Algebra',
                'math.RA': 'Rings and Algebras',
                'math.RT': 'Representation Theory',
                'math.SG': 'Symplectic Geometry',
                'math.SP': 'Spectral Theory',
                'math.ST': 'Statistics Theory',
                'math-ph': 'Mathematical Physics',
                'nlin.AO': 'Adaptation and Self-Organizing Systems',
                'nlin.CD': 'Chaotic Dynamics',
                'nlin.CG': 'Cellular Automata and Lattice Gases',
                'nlin.PS': 'Pattern Formation and Solitons',
                'nlin.SI': 'Exactly Solvable and Integrable Systems',
                'nucl-ex': 'Nuclear Experiment',
                'nucl-th': 'Nuclear Theory',
                'physics.acc-ph': 'Accelerator Physics',
                'physics.ao-ph': 'Atmospheric and Oceanic Physics',
                'physics.app-ph': 'Applied Physics',
                'physics.atm-clus': 'Atomic and Molecular Clusters',
                'physics.atom-ph': 'Atomic Physics',
                'physics.bio-ph': 'Biological Physics',
                'physics.chem-ph': 'Chemical Physics',
                'physics.class-ph': 'Classical Physics',
                'physics.comp-ph': 'Computational Physics',
                'physics.data-an': 'Data Analysis, Statistics and Probability',
                'physics.ed-ph': 'Physics Education',
                'physics.flu-dyn': 'Fluid Dynamics',
                'physics.gen-ph': 'General Physics',
                'physics.geo-ph': 'Geophysics',
                'physics.hist-ph': 'History and Philosophy of Physics',
                'physics.ins-det': 'Instrumentation and Detectors',
                'physics.med-ph': 'Medical Physics',
                'physics.optics': 'Optics',
                'physics.plasm-ph': 'Plasma Physics',
                'physics.pop-ph': 'Popular Physics',
                'physics.soc-ph': 'Physics and Society',
                'physics.space-ph': 'Space Physics',
                'q-bio.BM': 'Biomolecules',
                'q-bio.CB': 'Cell Behavior',
                'q-bio.GN': 'Genomics',
                'q-bio.MN': 'Molecular Networks',
                'q-bio.NC': 'Neurons and Cognition',
                'q-bio.OT': 'Other Quantitative Biology',
                'q-bio.PE': 'Populations and Evolution',
                'q-bio.QM': 'Quantitative Methods',
                'q-bio.SC': 'Subcellular Processes',
                'q-bio.TO': 'Tissues and Organs',
                'q-fin.CP': 'Computational Finance',
                'q-fin.EC': 'Economics',
                'q-fin.GN': 'General Finance',
                'q-fin.MF': 'Mathematical Finance',
                'q-fin.PM': 'Portfolio Management',
                'q-fin.PR': 'Pricing of Securities',
                'q-fin.RM': 'Risk Management',
                'q-fin.ST': 'Statistical Finance',
                'q-fin.TR': 'Trading and Market Microstructure',
                'quant-ph': 'Quantum Physics',
                'stat.AP': 'Applications',
                'stat.CO': 'Computation',
                'stat.ME': 'Methodology',
                'stat.ML': 'Machine Learning',
                'stat.OT': 'Other Statistics',
                'stat.TH': 'Statistics Theory'}
len(category_map)

153

In [None]:
# Collect every abstract together with its category string.
# Iterate a fresh generator rather than the global `metadata`: that one may
# already be partly or fully consumed by an earlier cell (or by a previous run
# of this cell), which silently drops papers or leaves both lists empty.
text_tags_dict = {"abstract": [], "categories": []}
for paper in get_metadata():
    parsed = json.loads(paper)
    text_tags_dict["abstract"].append(parsed['abstract'])
    text_tags_dict["categories"].append(parsed['categories'])


In [None]:
# One row per paper, built from the two parallel lists collected above.
df = pd.DataFrame.from_records(text_tags_dict, columns=["abstract", "categories"])


In [None]:
# Abstract text of the second row.
df['abstract'].iloc[1]

IndexError: ignored

In [None]:
# Category string of the second row.
df['categories'].iloc[1]

IndexError: ignored

In [None]:
# Zero-shot classify the first 100 abstracts, keeping the 3 best-scoring labels
# for each paper next to its true categories.
# Several arXiv codes share one display name (e.g. cs.LG and stat.ML), so the
# label list is de-duplicated (order preserved) to avoid scoring the same label twice.
candidate_labels = list(dict.fromkeys(category_map.values()))
predictedCategories = []
trueCategories = []
sample = df.iloc[:100]  # slicing tolerates a frame with fewer than 100 rows
for text, cat in zip(sample['abstract'], sample['categories']):
    res = classifier(text, candidate_labels, multi_class=True)  # several labels may apply to one paper
    # Rank explicitly by score instead of relying on the order the pipeline returns.
    ranked = sorted(zip(res['labels'], res['scores']), key=lambda pair: pair[1], reverse=True)
    predictedCategories.append([label for label, _ in ranked[:3]])
    # Keep the raw code for categories missing from category_map rather than raising KeyError.
    trueCategories.append([category_map.get(code, code) for code in cat.split()])

In [None]:
# Show truth vs. prediction for at most the first three classified papers.
separator = '#' * 50
for sample_no in range(min(3, len(trueCategories), len(predictedCategories))):
    print(f'True Categories {trueCategories[sample_no]}')
    print(f'Predicted Categories {predictedCategories[sample_no]}')
    print(separator)

True Categories ['Combinatorics', 'Computational Geometry']
Predicted Categories ['Computation', 'Performance', 'Systems and Control']
##################################################
True Categories ['General Physics']
Predicted Categories ['Physics and Society', 'Complex Variables', 'Graphics']
##################################################
True Categories ['Combinatorics']
Predicted Categories ['Systems and Control', 'Sound', 'Methodology']
##################################################


In [None]:
# Classify only the second paper (row 1) and store the result as one-element lists.
# The label list is rebuilt here so the cell does not depend on a variable
# leaked from another cell; duplicates are removed, order preserved.
candidate_labels = list(dict.fromkeys(category_map.values()))
text = df.iloc[1]['abstract']
cat = df.iloc[1]['categories'].split()
res = classifier(text, candidate_labels, multi_class=True)  # several labels may apply to one paper
# Rank explicitly by score and keep the best three labels.
ranked = sorted(zip(res['labels'], res['scores']), key=lambda pair: pair[1], reverse=True)
predictedCategories = [[label for label, _ in ranked[:3]]]
# Keep the raw code for categories missing from category_map rather than raising KeyError.
trueCategories = [[category_map.get(code, code) for code in cat]]

IndexError: ignored

In [None]:

# Print every stored truth/prediction pair, separated by a rule of '#'.
separator = '#' * 50
for sample_no in range(min(len(trueCategories), len(predictedCategories))):
    print(f'True Categories {trueCategories[sample_no]}')
    print(f'Predicted Categories {predictedCategories[sample_no]}')
    print(separator)

True Categories ['General Physics']
Predicted Categories ['Physics and Society', 'Complex Variables', 'Graphics']
##################################################


In [None]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,abstract,categories
0,"We describe a new algorithm, the $(k,\ell)$-...",math.CO cs.CG
1,The evolution of Earth-Moon system is descri...,physics.gen-ph
2,We show that a determinant of Stirling cycle...,math.CO
3,In this paper we show how to compute the $\L...,math.CA math.FA
4,We study the two-particle wave function of p...,cond-mat.mes-hall


In [None]:
from transformers import BertTokenizer

In [None]:
import nltk

# Fetch the corpora needed by the lemmatizer and the stop-word list.
for corpus_name in ('wordnet', 'stopwords'):
    nltk.download(corpus_name)

from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Module-level helpers used by word_cleaner.
lemmatizer = WordNetLemmatizer()
stop_words = {word for word in stopwords.words('english')}

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
def word_cleaner(txt, *, stopword_set=None, lemmatize=None):
  """Drop stop words from ``txt`` and lemmatize the remaining tokens.

  Tokens are split on whitespace. A token is discarded when its lower-cased
  form is in the stop-word set, so capitalised stop words such as "The" or
  "We" at the start of a sentence are removed too. Surviving tokens are
  lemmatized in their original casing and joined with single spaces, with no
  leading or trailing space.

  Args:
    txt: Text to clean.
    stopword_set: Optional collection of lower-case stop words. Defaults to
      the module-level ``stop_words``.
    lemmatize: Optional callable mapping one token to its lemma. Defaults to
      ``lemmatizer.lemmatize`` from the module-level ``lemmatizer``.

  Returns:
    The cleaned text; an empty string when nothing survives.
  """
  if stopword_set is None:
    stopword_set = stop_words
  if lemmatize is None:
    lemmatize = lemmatizer.lemmatize
  # join() avoids the quadratic cost of repeated string concatenation.
  return " ".join(lemmatize(word) for word in txt.split()
                  if word.lower() not in stopword_set)

In [None]:
# Add a `processed` column filled with a placeholder string; a later cell
# overwrites the leading rows with the output of word_cleaner.
df["processed"] = 'not yet processed'

In [None]:
# Clean the first 100000 abstracts (fewer if the frame is shorter) and store
# them in `processed`. A single .loc assignment replaces the per-row chained
# assignment df['processed'][i] = ..., which triggers SettingWithCopyWarning,
# is not guaranteed to write through to df, and fails past the end of a short frame.
rows_to_clean = df.index[:100000]
df.loc[rows_to_clean, 'processed'] = df.loc[rows_to_clean, 'abstract'].map(word_cleaner)

In [None]:
# Keep only the leading 100000 rows.
new_df = df.iloc[:100000]

In [None]:
# Write the subset to the working directory, without the index column.
new_df.to_csv(path_or_buf="processed.csv", index=False)

In [None]:
# Zero-shot classify the first 10 cleaned abstracts, keeping the top 3 labels each.
# The result lists are created once, before the loop: creating them inside the
# loop discarded everything except the last paper.
# Duplicate display names are removed from the label list (order preserved).
candidate_labels = list(dict.fromkeys(category_map.values()))
predictedCategories = []
trueCategories = []
sample = new_df.iloc[:10]  # slicing tolerates a frame with fewer than 10 rows
for text, cat in zip(sample['processed'], sample['categories']):
  res = classifier(text, candidate_labels, multi_class=True)  # several labels may apply to one paper
  # Rank explicitly by score and keep the best three labels.
  ranked = sorted(zip(res['labels'], res['scores']), key=lambda pair: pair[1], reverse=True)
  predictedCategories.append([label for label, _ in ranked[:3]])
  # Keep the raw code for categories missing from category_map rather than raising KeyError.
  trueCategories.append([category_map.get(code, code) for code in cat.split()])

In [None]:
# Print every stored truth/prediction pair, separated by a rule of '#'.
separator = '#' * 50
for sample_no in range(min(len(trueCategories), len(predictedCategories))):
    print(f'True Categories {trueCategories[sample_no]}')
    print(f'Predicted Categories {predictedCategories[sample_no]}')
    print(separator)

True Categories ['Number Theory', 'Algebraic Geometry']
Predicted Categories ['Computation', 'Sound', 'Number Theory']
##################################################


In [None]:
# Zero-shot classify the first 10 raw abstracts, keeping the top 3 labels each.
# The result lists are created once, before the loop: creating them inside the
# loop discarded everything except the last paper.
# Duplicate display names are removed from the label list (order preserved).
candidate_labels = list(dict.fromkeys(category_map.values()))
predictedCategories = []
trueCategories = []
sample = new_df.iloc[:10]  # slicing tolerates a frame with fewer than 10 rows
for text, cat in zip(sample['abstract'], sample['categories']):
  res = classifier(text, candidate_labels, multi_class=True)  # several labels may apply to one paper
  # Rank explicitly by score and keep the best three labels.
  ranked = sorted(zip(res['labels'], res['scores']), key=lambda pair: pair[1], reverse=True)
  predictedCategories.append([label for label, _ in ranked[:3]])
  # Keep the raw code for categories missing from category_map rather than raising KeyError.
  trueCategories.append([category_map.get(code, code) for code in cat.split()])

In [None]:
# Print every stored truth/prediction pair.
for sample_no in range(min(len(trueCategories), len(predictedCategories))):
    print(f'True Categories {trueCategories[sample_no]}')
    print(f'Predicted Categories {predictedCategories[sample_no]}')
   

NameError: ignored

In [None]:
# Load the cleaned abstracts. The save cell writes "processed.csv" to the
# working directory, while this cell historically read a copy from Drive;
# prefer the Drive copy when it exists and otherwise use the freshly written
# local file, so a clean run does not fail with FileNotFoundError.
drive_csv = "/content/drive/MyDrive/processed.csv"
local_csv = "processed.csv"
processed_df = pd.read_csv(drive_csv if os.path.exists(drive_csv) else local_csv)

In [None]:
# Zero-shot classify the first 50 raw abstracts, keeping the top 3 labels each.
# Several arXiv codes share one display name (e.g. cs.LG and stat.ML), so the
# label list is de-duplicated (order preserved) to avoid scoring the same label twice.
candidate_labels = list(dict.fromkeys(category_map.values()))
predictedCategories = []
trueCategories = []
sample = processed_df.iloc[:50]  # slicing tolerates a frame with fewer than 50 rows
for text, cat in zip(sample['abstract'], sample['categories']):
  res = classifier(text, candidate_labels, multi_class=True)  # several labels may apply to one paper
  # Rank explicitly by score and keep the best three labels.
  ranked = sorted(zip(res['labels'], res['scores']), key=lambda pair: pair[1], reverse=True)
  predictedCategories.append([label for label, _ in ranked[:3]])
  # Keep the raw code for categories missing from category_map rather than raising KeyError.
  trueCategories.append([category_map.get(code, code) for code in cat.split()])

In [None]:
# Zero-shot classify the first 50 cleaned abstracts, keeping the top 3 labels each.
# The label list is rebuilt here so the cell does not depend on a variable
# leaked from another cell; duplicates are removed, order preserved.
candidate_labels = list(dict.fromkeys(category_map.values()))
proc_predictedCategories = []
proc_trueCategories = []
sample = processed_df.iloc[:50]  # slicing tolerates a frame with fewer than 50 rows
for text, cat in zip(sample['processed'], sample['categories']):
  res = classifier(text, candidate_labels, multi_class=True)  # several labels may apply to one paper
  # Rank explicitly by score and keep the best three labels.
  ranked = sorted(zip(res['labels'], res['scores']), key=lambda pair: pair[1], reverse=True)
  proc_predictedCategories.append([label for label, _ in ranked[:3]])
  # Keep the raw code for categories missing from category_map rather than raising KeyError.
  proc_trueCategories.append([category_map.get(code, code) for code in cat.split()])

In [None]:
# Count predicted labels that also appear among the paper's true categories.
cor = sum(
  1
  for pred, true in zip(predictedCategories, trueCategories)
  for element in pred
  if element in true
)


In [None]:
# Number of predicted labels (raw abstracts) found among the true categories.
cor

8

In [None]:
# Count predicted labels (cleaned abstracts) that also appear among the true categories.
cor = sum(
  1
  for pred, true in zip(proc_predictedCategories, proc_trueCategories)
  for element in pred
  if element in true
)


In [None]:
# Number of predicted labels (cleaned abstracts) found among the true categories.
cor

11

In [None]:
# Zero-shot classify the first 10 raw abstracts from the reloaded CSV, keeping the top 3 labels each.
# Several arXiv codes share one display name (e.g. cs.LG and stat.ML), so the
# label list is de-duplicated (order preserved) to avoid scoring the same label twice.
candidate_labels = list(dict.fromkeys(category_map.values()))
predictedCategories = []
trueCategories = []
sample = processed_df.iloc[:10]  # slicing tolerates a frame with fewer than 10 rows
for text, cat in zip(sample['abstract'], sample['categories']):
  res = classifier(text, candidate_labels, multi_class=True)  # several labels may apply to one paper
  # Rank explicitly by score and keep the best three labels.
  ranked = sorted(zip(res['labels'], res['scores']), key=lambda pair: pair[1], reverse=True)
  predictedCategories.append([label for label, _ in ranked[:3]])
  # Keep the raw code for categories missing from category_map rather than raising KeyError.
  trueCategories.append([category_map.get(code, code) for code in cat.split()])

NameError: ignored

In [None]:
# Zero-shot classify the first 10 cleaned abstracts from the reloaded CSV, keeping the top 3 labels each.
# The label list is rebuilt here so the cell does not depend on a variable
# leaked from another cell; duplicates are removed, order preserved.
candidate_labels = list(dict.fromkeys(category_map.values()))
proc_predictedCategories = []
proc_trueCategories = []
sample = processed_df.iloc[:10]  # slicing tolerates a frame with fewer than 10 rows
for text, cat in zip(sample['processed'], sample['categories']):
  res = classifier(text, candidate_labels, multi_class=True)  # several labels may apply to one paper
  # Rank explicitly by score and keep the best three labels.
  ranked = sorted(zip(res['labels'], res['scores']), key=lambda pair: pair[1], reverse=True)
  proc_predictedCategories.append([label for label, _ in ranked[:3]])
  # Keep the raw code for categories missing from category_map rather than raising KeyError.
  proc_trueCategories.append([category_map.get(code, code) for code in cat.split()])

NameError: ignored