<a href="https://colab.research.google.com/github/danielscabar/arxiv_text_classification/blob/main/Arxiv_Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preparation

In [1]:
import pandas as pd
import numpy as np

In [17]:
!pip install arxiv
import arxiv



In [3]:
# arXiv astro-ph category codes mapped to their human-readable names.
# The keys are what the fetch helpers put after 'cat:' in the search query.
category_map_astro = {
    'astro-ph': 'Astrophysics',
    'astro-ph.CO': 'Cosmology and Nongalactic Astrophysics',
    'astro-ph.EP': 'Earth and Planetary Astrophysics',
    'astro-ph.GA': 'Astrophysics of Galaxies',
    'astro-ph.HE': 'High Energy Astrophysical Phenomena',
    'astro-ph.IM': 'Instrumentation and Methods for Astrophysics',
    'astro-ph.SR': 'Solar and Stellar Astrophysics',
}

In [21]:
def data_transf(searches,client):
  """Run every search through ``client`` and tabulate the returned papers.

  Args:
    searches: Mapping whose values are handed one at a time to
      ``client.results``; the keys do not appear in the output.
    client: Object exposing ``results(search)`` that returns an iterable
      of result objects.

  Returns:
    pandas.DataFrame with one row per returned object and one column per
    instance attribute of that object (taken from ``vars``).
  """
  # Request every result stream first, then drain them in mapping order.
  streams = [client.results(query) for query in searches.values()]
  papers = [paper for stream in streams for paper in stream]

  return pd.DataFrame(list(map(vars, papers)))

In [22]:
def get_arxiv_data_relevance(category_map, num_papers):
  """Fetch papers for each category, ordered by arXiv relevance.

  Args:
    category_map: Mapping of arXiv category code -> category name. The code
      is appended to 'cat:' to form the query; the name keys the search.
    num_papers: Value passed as ``max_results`` to every per-category search.

  Returns:
    The DataFrame produced by ``data_transf`` for all the searches.
  """
  client = arxiv.Client()
  searches = {
    label: arxiv.Search(
      query = 'cat:' + code,
      max_results = num_papers,
      sort_by = arxiv.SortCriterion.Relevance,
    )
    for code, label in category_map.items()
  }
  return data_transf(searches, client)

In [23]:
def get_arxiv_data_recent(category_map, num_papers):
  """Fetch papers for each category, ordered by submission date.

  Args:
    category_map: Mapping of arXiv category code -> category name. The code
      is appended to 'cat:' to form the query; the name keys the search.
    num_papers: Value passed as ``max_results`` to every per-category search.

  Returns:
    The DataFrame produced by ``data_transf`` for all the searches.
  """
  client = arxiv.Client()
  searches = {
    label: arxiv.Search(
      query = 'cat:' + code,
      max_results = num_papers,
      sort_by = arxiv.SortCriterion.SubmittedDate,
    )
    for code, label in category_map.items()
  }
  return data_transf(searches, client)

In [37]:
# Query the arXiv API for up to 200 relevance-sorted papers in each of the
# astro-ph categories and preview the resulting frame.
# NOTE(review): the recorded output of this cell is a KeyboardInterrupt, and `df`
# is rebound from a CSV in the next cell; presumably this live fetch is optional.
df = get_arxiv_data_relevance(category_map_astro, 200)
df.head()

KeyboardInterrupt: 

In [38]:
# Mount Google Drive so files under /content/drive are reachable (Colab only).
from google.colab import drive
drive.mount('/content/drive')

# Export of the fetched frame to Drive, left disabled.
#df.to_csv('/content/drive/My Drive/arxiv_data_astro200.csv', index=False)

# Rebind `df` to the copy stored on Drive.
# NOTE(review): object-valued columns (authors, categories, links, _raw) presumably
# come back from the CSV as plain strings rather than Python objects; confirm
# before relying on them.
df = pd.read_csv('/content/drive/My Drive/arxiv_data_astro200.csv')
df.head()

Mounted at /content/drive


Unnamed: 0,entry_id,updated,published,title,authors,summary,comment,journal_ref,doi,primary_category,categories,links,pdf_url,_raw
0,http://arxiv.org/abs/astro-ph/9204001v1,1992-04-13 18:20:01+00:00,1992-04-13 18:20:01+00:00,Gamma-Ray Bursts as the Death Throes of Massiv...,"[arxiv.Result.Author('Ramesh Narayan'), arxiv....",It is proposed that gamma-ray bursts are creat...,14 pages,Astrophys.J. 395 (1992) L83-L86,10.1086/186493,astro-ph,['astro-ph'],[arxiv.Result.Link('http://dx.doi.org/10.1086/...,http://arxiv.org/pdf/astro-ph/9204001v1,{'id': 'http://arxiv.org/abs/astro-ph/9204001v...
1,http://arxiv.org/abs/astro-ph/9204002v1,1992-04-26 17:54:00+00:00,1992-04-26 17:54:00+00:00,Gravitational Lensing and the Variability of G,"[arxiv.Result.Author('Lawrence Krauss'), arxiv...",The four observables associated with gravitati...,13 pages plus figures (not included),"Astrophys.J.397:357,1992",10.1086/171792,astro-ph,['astro-ph'],[arxiv.Result.Link('http://dx.doi.org/10.1086/...,http://arxiv.org/pdf/astro-ph/9204002v1,{'id': 'http://arxiv.org/abs/astro-ph/9204002v...
2,http://arxiv.org/abs/astro-ph/9204003v2,1992-04-30 20:39:38+00:00,1992-04-29 16:36:30+00:00,The Ptolemaic Gamma-Ray Burst Universe,[arxiv.Result.Author('J. I. Katz')],The BATSE experiment on GRO has demonstrated t...,10 pages (Replaced to provide omitted line.),,10.1007/BF00645080,astro-ph,['astro-ph'],[arxiv.Result.Link('http://dx.doi.org/10.1007/...,http://arxiv.org/pdf/astro-ph/9204003v2,{'id': 'http://arxiv.org/abs/astro-ph/9204003v...
3,http://arxiv.org/abs/astro-ph/9204004v1,1992-04-30 19:20:04+00:00,1992-04-30 19:20:04+00:00,Expanding Photospheres of Type II Supernovae a...,"[arxiv.Result.Author('B P Schmidt'), arxiv.Res...",We use the Expanding Photosphere Method to det...,21 pages,Astrophys.J. 395 (1992) 366-386,10.1086/171659,astro-ph,['astro-ph'],[arxiv.Result.Link('http://dx.doi.org/10.1086/...,http://arxiv.org/pdf/astro-ph/9204004v1,{'id': 'http://arxiv.org/abs/astro-ph/9204004v...
4,http://arxiv.org/abs/astro-ph/9204005v1,1992-04-30 19:18:05+00:00,1992-04-30 19:18:05+00:00,Radiation Transfer in Gamma-Ray Bursts,"[arxiv.Result.Author('B. J. Carrigan'), arxiv....",We have calculated gamma-ray radiative transpo...,24 pages,Astrophys.J. 399 (1992) 100-107,10.1086/171906,astro-ph,['astro-ph'],[arxiv.Result.Link('http://dx.doi.org/10.1086/...,http://arxiv.org/pdf/astro-ph/9204005v1,{'id': 'http://arxiv.org/abs/astro-ph/9204005v...


# EDA

In [40]:
# (rows, columns) of the loaded dataset.
df.shape

(1400, 14)

In [41]:
# Cap displayed rows at 7 (a global pandas display option that stays in effect
# for later cells), then count papers per primary category to check class balance.
pd.set_option('display.max_rows', 7)
df['primary_category'].value_counts()

Unnamed: 0_level_0,count
primary_category,Unnamed: 1_level_1
astro-ph,200
astro-ph.CO,200
astro-ph.EP,200
astro-ph.GA,200
astro-ph.HE,200
astro-ph.IM,200
astro-ph.SR,200


In [42]:
# Keep only the label (primary_category) and the abstract text (summary).
# `.copy()` makes df_short an independent frame, so adding columns to it later
# does not raise pandas' SettingWithCopyWarning.
df_short = df[['primary_category', 'summary']].copy()
df_short.head()

Unnamed: 0,primary_category,summary
0,astro-ph,It is proposed that gamma-ray bursts are creat...
1,astro-ph,The four observables associated with gravitati...
2,astro-ph,The BATSE experiment on GRO has demonstrated t...
3,astro-ph,We use the Expanding Photosphere Method to det...
4,astro-ph,We have calculated gamma-ray radiative transpo...


In [43]:
# Count missing values per column of the label/abstract frame.
print(df_short.isnull().sum())

primary_category    0
summary             0
dtype: int64


# Data Cleaning

In [44]:
import re
import string
import nltk

# Download the NLTK corpora needed below: the stop-word lists and WordNet.
nltk.download('stopwords')
nltk.download('wordnet')

# Module-level resources read by clean_text: the English stop words and a
# WordNet lemmatizer instance.
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = nltk.WordNetLemmatizer()

def clean_text(text):
    """Turn a raw document string into a list of lemmatised tokens.

    Processing order:
      1. Drop every character listed in ``string.punctuation`` and lower-case
         the rest (so ``gamma-ray`` becomes ``gammaray``).
      2. Split on runs of non-word characters.
      3. Discard empty tokens and tokens found in the module-level
         ``stopwords`` collection.
      4. Lemmatise the remaining tokens with the module-level ``lemmatizer``.

    Args:
        text: Raw document string.

    Returns:
        list[str]: Lemmatised tokens in document order. Empty when the input
        holds nothing but punctuation, separators or stop words.
    """
    # Iterating a str yields single characters, hence the name `ch`.
    stripped = "".join(ch.lower() for ch in text if ch not in string.punctuation)
    # Raw string: '\W' in a plain literal is an invalid escape sequence and
    # triggers a warning on current Python versions.
    tokens = re.split(r'\W+', stripped)
    # re.split yields '' when the text is empty or starts/ends with a separator;
    # skip those so the empty string never becomes a vocabulary term.
    return [lemmatizer.lemmatize(tok) for tok in tokens if tok and tok not in stopwords]

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [16]:
# Tokenise every abstract with clean_text and store the token lists in a new column.
df_short['summary_clean'] = df_short['summary'].apply(clean_text)
df_short.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_short['summary_clean'] = df_short['summary'].apply(lambda x: clean_text(x))


Unnamed: 0,primary_category,summary,summary_clean
0,astro-ph,It is proposed that gamma-ray bursts are creat...,"[proposed, gammaray, burst, created, merger, d..."
1,astro-ph,The four observables associated with gravitati...,"[four, observables, associated, gravitational,..."
2,astro-ph,The BATSE experiment on GRO has demonstrated t...,"[batse, experiment, gro, demonstrated, isotrop..."
3,astro-ph,We use the Expanding Photosphere Method to det...,"[use, expanding, photosphere, method, determin..."
4,astro-ph,We have calculated gamma-ray radiative transpo...,"[calculated, gammaray, radiative, transport, r..."


# Feature Engineering

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Build TF-IDF features from the raw abstracts, with clean_text supplied as the
# analyzer so it produces the tokens for each document.
# NOTE(review): the vectorizer is fitted on every abstract before the train/test
# split performed later, so the vocabulary and IDF weights also see the rows
# that end up in the test set.
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_matrix = tfidf_vect.fit_transform(df_short['summary'])

# Convert the matrix to a dense array wrapped in a DataFrame (one column per
# vocabulary term, default integer column labels).
tfidf_df = pd.DataFrame(tfidf_matrix.toarray())

In [46]:
tfidf_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14854,14855,14856,14857,14858,14859,14860,14861,14862,14863
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Model Training and Evaluation

In [15]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors
from sklearn.neural_network import MLPClassifier


from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Candidate estimators and the hyper-parameter grid searched for each one.
model_params = {
    'RandomForestClassifier': {
        # Seeded like the MLP below so repeated runs build the same forest.
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 5, 10, 15]
        }
    },
    'KNeighborsClassifier': {
        'model': neighbors.KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3,4,5,6,7,8,9,10],
            'metric': ['euclidean','manhattan']
        }
    },
    'MLPClassifier': {
        'model': MLPClassifier(max_iter=500,random_state=42),
        'params': {
            'hidden_layer_sizes': [(100,), (50, 50), (10, 10, 10)],
            'activation': ['relu', 'tanh']
        }
    }
}

# model name -> (best estimator found by the grid search, accuracy on the test split)
best_models = {}

X = tfidf_df
y = df_short['primary_category']

# Stratified 80/20 split so every category keeps the same share in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

for model_name, mp in model_params.items():
    print(f'Model {model_name}...')
    # 5-fold cross-validated grid search on the training split only.
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, scoring='accuracy')
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    best_models[model_name] = (clf.best_estimator_, accuracy)

# NOTE(review): the winner is chosen by accuracy on the test split, so the
# accuracy printed below is an optimistic estimate for the selected model.
best_model_name = max(best_models, key=lambda k: best_models[k][1])
best_model, best_accuracy = best_models[best_model_name]

# After the loop `y_pred` still holds the predictions of whichever model was
# fitted last, which need not be the winner. Recompute them with the selected
# estimator so the report below matches the model and accuracy being printed.
y_pred = best_model.predict(X_test)

print(f"Melhor modelo: {best_model_name}")
print(f"Melhor acurácia: {best_accuracy}")
print(f"Melhor modelo encontrado: {best_model}")
print(f"Classification Report: \n{classification_report(y_test, y_pred)}")

Model RandomForestClassifier...
Model KNeighborsClassifier...
Model MLPClassifier...
Melhor modelo: MLPClassifier
Melhor acurácia: 0.7
Melhor modelo encontrado: MLPClassifier(activation='tanh', max_iter=500, random_state=42)
Classification Report: 
              precision    recall  f1-score   support

    astro-ph       0.58      0.45      0.51        40
 astro-ph.CO       0.61      0.70      0.65        40
 astro-ph.EP       0.92      0.82      0.87        40
 astro-ph.GA       0.59      0.55      0.57        40
 astro-ph.HE       0.70      0.82      0.76        40
 astro-ph.IM       0.78      0.90      0.84        40
 astro-ph.SR       0.70      0.65      0.68        40

    accuracy                           0.70       280
   macro avg       0.70      0.70      0.70       280
weighted avg       0.70      0.70      0.70       280



In [47]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Same features, labels and split arguments as the grid-search cell.
X = tfidf_df
y = df_short['primary_category']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# Train a single MLP; only max_iter and random_state are set explicitly.
# NOTE(review): the grid search output above reports activation='tanh' as the
# best MLP setting, which is not passed here. Confirm this is intentional.
model = MLPClassifier(max_iter=500,random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Acurácia: {accuracy}")
print(f"Classification Report: \n{classification_report(y_test, y_pred)}")

Acurácia: 0.7178571428571429
Classification Report: 
              precision    recall  f1-score   support

    astro-ph       0.65      0.55      0.59        40
 astro-ph.CO       0.67      0.70      0.68        40
 astro-ph.EP       0.92      0.82      0.87        40
 astro-ph.GA       0.59      0.55      0.57        40
 astro-ph.HE       0.67      0.85      0.75        40
 astro-ph.IM       0.82      0.90      0.86        40
 astro-ph.SR       0.72      0.65      0.68        40

    accuracy                           0.72       280
   macro avg       0.72      0.72      0.72       280
weighted avg       0.72      0.72      0.72       280



In [48]:
# Fit the model again on the train and test splits combined, so all of the
# labelled data is used before checking it against newly fetched papers.
model.fit(pd.concat([X_train, X_test]),pd.concat([y_train, y_test]))

# Model Validation

In [49]:
# Fetch up to 10 papers per category, sorted by submission date, as a separate
# validation set.
df_test = get_arxiv_data_recent(category_map_astro, 10)
df_test.head()

Unnamed: 0,entry_id,updated,published,title,authors,summary,comment,journal_ref,doi,primary_category,categories,links,pdf_url,_raw
0,http://arxiv.org/abs/1912.12092v1,2019-12-27 13:35:35+00:00,2019-12-27 13:35:35+00:00,Bremsstrahlung emission from nuclear reactions...,"[Sergei P. Maydanyuk, Kostiantyn A. Shaulskyi]",Bremsstrahlung emission of photons during nucl...,"16 pages, 7 captured figures, 1 Appendix",,,nucl-th,"[nucl-th, astro-ph, astro-ph.HE, astro-ph.SR, ...","[http://arxiv.org/abs/1912.12092v1, http://arx...",http://arxiv.org/pdf/1912.12092v1,"{'id': 'http://arxiv.org/abs/1912.12092v1', 'g..."
1,http://arxiv.org/abs/0812.4649v3,2009-08-31 14:52:45+00:00,2008-12-30 20:59:03+00:00,Towards a warped inflationary brane scanning,"[Heng-Yu Chen, Jinn-Ouk Gong]",We present a detailed systematics for comparin...,"(v1) 11 pages, 2 figures, 2 tables; (v2) more ...","Phys.Rev.D80:063507,2009",10.1103/PhysRevD.80.063507,hep-th,"[hep-th, astro-ph, gr-qc, hep-ph]","[http://dx.doi.org/10.1103/PhysRevD.80.063507,...",http://arxiv.org/pdf/0812.4649v3,"{'id': 'http://arxiv.org/abs/0812.4649v3', 'gu..."
2,http://arxiv.org/abs/0812.5113v1,2008-12-30 20:38:39+00:00,2008-12-30 20:38:39+00:00,Ultraviolet Spectra of Local Galaxies and thei...,[Claus Leitherer],The new generation of 8 to 10m class telescope...,"8 pages, invited talk given at the conference ...",,10.1063/1.3141539,astro-ph,[astro-ph],"[http://dx.doi.org/10.1063/1.3141539, http://a...",http://arxiv.org/pdf/0812.5113v1,"{'id': 'http://arxiv.org/abs/0812.5113v1', 'gu..."
3,http://arxiv.org/abs/0812.4582v1,2008-12-30 20:20:48+00:00,2008-12-30 20:20:48+00:00,Correlated variability in the blazar 3C 454.3,"[E. W. Bonning, C. Bailyn, C. M. Urry, M. Buxt...",The blazar 3C 454.3 was revealed by the Fermi ...,"7 pages, 3 figures, submitted to ApJ Letters","Astrophys.J.697:L81-L85,2009",10.1088/0004-637X/697/2/L81,astro-ph,[astro-ph],[http://dx.doi.org/10.1088/0004-637X/697/2/L81...,http://arxiv.org/pdf/0812.4582v1,"{'id': 'http://arxiv.org/abs/0812.4582v1', 'gu..."
4,http://arxiv.org/abs/0812.5111v1,2008-12-30 20:20:43+00:00,2008-12-30 20:20:43+00:00,Biases and Uncertainties in Physical Parameter...,"[Seong-Kook Lee, Rafal Idzi, Henry C. Ferguson...",We investigate the biases and uncertainties in...,"85 pages, 34 figures, submittted to ApJS","Astrophys.J.Suppl.184:100-132,2009",10.1088/0067-0049/184/1/100,astro-ph,[astro-ph],[http://dx.doi.org/10.1088/0067-0049/184/1/100...,http://arxiv.org/pdf/0812.5111v1,"{'id': 'http://arxiv.org/abs/0812.5111v1', 'gu..."


In [54]:
# A 'cat:' query can return papers whose primary category lies outside astro-ph
# (e.g. nucl-th or hep-th in the preview above). Keep only rows whose primary
# category is one of the category_map_astro keys, i.e. a label the model was
# trained on.
df_test_filtered = df_test[df_test['primary_category'].isin(category_map_astro)]
df_test_filtered.head()

Unnamed: 0,entry_id,updated,published,title,authors,summary,comment,journal_ref,doi,primary_category,categories,links,pdf_url,_raw
2,http://arxiv.org/abs/0812.5113v1,2008-12-30 20:38:39+00:00,2008-12-30 20:38:39+00:00,Ultraviolet Spectra of Local Galaxies and thei...,[Claus Leitherer],The new generation of 8 to 10m class telescope...,"8 pages, invited talk given at the conference ...",,10.1063/1.3141539,astro-ph,[astro-ph],"[http://dx.doi.org/10.1063/1.3141539, http://a...",http://arxiv.org/pdf/0812.5113v1,"{'id': 'http://arxiv.org/abs/0812.5113v1', 'gu..."
3,http://arxiv.org/abs/0812.4582v1,2008-12-30 20:20:48+00:00,2008-12-30 20:20:48+00:00,Correlated variability in the blazar 3C 454.3,"[E. W. Bonning, C. Bailyn, C. M. Urry, M. Buxt...",The blazar 3C 454.3 was revealed by the Fermi ...,"7 pages, 3 figures, submitted to ApJ Letters","Astrophys.J.697:L81-L85,2009",10.1088/0004-637X/697/2/L81,astro-ph,[astro-ph],[http://dx.doi.org/10.1088/0004-637X/697/2/L81...,http://arxiv.org/pdf/0812.4582v1,"{'id': 'http://arxiv.org/abs/0812.4582v1', 'gu..."
4,http://arxiv.org/abs/0812.5111v1,2008-12-30 20:20:43+00:00,2008-12-30 20:20:43+00:00,Biases and Uncertainties in Physical Parameter...,"[Seong-Kook Lee, Rafal Idzi, Henry C. Ferguson...",We investigate the biases and uncertainties in...,"85 pages, 34 figures, submittted to ApJS","Astrophys.J.Suppl.184:100-132,2009",10.1088/0067-0049/184/1/100,astro-ph,[astro-ph],[http://dx.doi.org/10.1088/0067-0049/184/1/100...,http://arxiv.org/pdf/0812.5111v1,"{'id': 'http://arxiv.org/abs/0812.5111v1', 'gu..."
5,http://arxiv.org/abs/0812.4574v2,2009-09-20 17:40:01+00:00,2008-12-30 20:13:32+00:00,Dynamics of a Spherical Accretion Shock with N...,"[Rodrigo Fernández, Christopher Thompson]",We investigate the effects of neutrino heating...,Published in ApJ. This version includes some p...,"Astrophys.J.703:1464-1485,2009",10.1088/0004-637X/703/2/1464,astro-ph,[astro-ph],[http://dx.doi.org/10.1088/0004-637X/703/2/146...,http://arxiv.org/pdf/0812.4574v2,"{'id': 'http://arxiv.org/abs/0812.4574v2', 'gu..."
6,http://arxiv.org/abs/0812.5108v3,2009-08-02 13:35:02+00:00,2008-12-30 19:58:20+00:00,Asymptotically FRW black holes,"[J. T. Firouzjaee, Reza Mansouri]",Application of concepts like black hole and ev...,"9 pages, 6 figures","Gen.Rel.Grav.42:2431-2452,2010",10.1007/s10714-010-0991-7,astro-ph,"[astro-ph, gr-qc]","[http://dx.doi.org/10.1007/s10714-010-0991-7, ...",http://arxiv.org/pdf/0812.5108v3,"{'id': 'http://arxiv.org/abs/0812.5108v3', 'gu..."


In [59]:
# Reuse the already-fitted vectorizer (transform only, no refit) so the
# validation features share the vocabulary and column order of the training matrix.
tfidf_matrix_test = tfidf_vect.transform(df_test_filtered['summary'])

# Dense DataFrame with default integer column labels, like tfidf_df.
tfidf_df_test = pd.DataFrame(tfidf_matrix_test.toarray())

In [60]:
# Score the refitted model on the filtered validation papers.
# `accuracy` is overwritten here; it no longer holds the test-split value.
y_pred_test = model.predict(tfidf_df_test)
accuracy = accuracy_score(df_test_filtered['primary_category'], y_pred_test)
print(f"Acurácia: {accuracy}")
print(f"Classification Report: \n{classification_report(df_test_filtered['primary_category'], y_pred_test)}")

Acurácia: 0.5423728813559322
Classification Report: 
              precision    recall  f1-score   support

    astro-ph       0.50      0.14      0.22         7
 astro-ph.CO       0.33      1.00      0.50         5
 astro-ph.EP       1.00      0.73      0.84        11
 astro-ph.GA       0.57      0.27      0.36        15
 astro-ph.HE       0.50      0.60      0.55        10
 astro-ph.IM       1.00      0.75      0.86         4
 astro-ph.SR       0.42      0.71      0.53         7

    accuracy                           0.54        59
   macro avg       0.62      0.60      0.55        59
weighted avg       0.62      0.54      0.53        59



# Creating a Streamlit App

# References

https://medium.com/@JyotsnaPyarasani/building-a-text-classification-system-for-news-articles-a-comprehensive-guide-10a99e8e862d

https://lukasschwab.me/arxiv.py/arxiv.html

https://www.kaggle.com/code/honggiangtrnh/topic-model-lda/input

https://colab.research.google.com/github/EPS-Libraries-Berkeley/volt/blob/main/Search/arxiv_api.ipynb#scrollTo=uznzjWmSBoLF

https://pypi.org/project/arxiv/

https://arxiv.org/category_taxonomy