<a href="https://colab.research.google.com/github/danielscabar/arxiv_text_classification/blob/main/Arxiv_Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preparation

In [1]:
import pandas as pd
import numpy as np

In [2]:
!pip install arxiv



In [4]:
category_map_astro = {'astro-ph': 'Astrophysics',
'astro-ph.CO': 'Cosmology and Nongalactic Astrophysics',
'astro-ph.EP': 'Earth and Planetary Astrophysics',
'astro-ph.GA': 'Astrophysics of Galaxies',
'astro-ph.HE': 'High Energy Astrophysical Phenomena',
'astro-ph.IM': 'Instrumentation and Methods for Astrophysics',
'astro-ph.SR': 'Solar and Stellar Astrophysics'}

In [5]:
len(category_map_astro)

7

In [6]:
import arxiv

# Construct the default API client.
client = arxiv.Client()

# Search for the 200 most relevent articles for each category mapped of astronomy
searches = {}

for category in category_map_astro:
  searches[category_map_astro[category]] = arxiv.Search(
    query = 'cat:'+category,
    max_results = 200,
    sort_by = arxiv.SortCriterion.Relevance
  )

results = {}

for search in searches:
  results[search] = client.results(searches[search])

In [7]:
ls = []
for result in results:
  for paper in results[result]:
    ls.append(paper)
ls

[arxiv.Result(entry_id='http://arxiv.org/abs/astro-ph/9204001v1', updated=datetime.datetime(1992, 4, 13, 18, 20, 1, tzinfo=datetime.timezone.utc), published=datetime.datetime(1992, 4, 13, 18, 20, 1, tzinfo=datetime.timezone.utc), title='Gamma-Ray Bursts as the Death Throes of Massive Binary Stars', authors=[arxiv.Result.Author('Ramesh Narayan'), arxiv.Result.Author('Bohdan Paczyński'), arxiv.Result.Author('Tsvi Piran')], summary='It is proposed that gamma-ray bursts are created in the mergers of double\nneutron star binaries and black hole neutron star binaries at cosmological\ndistances. Bursts with complex profiles and relatively long durations are the\nresult of magnetic flares generated by the Parker instability in a post-merger\ndifferentially-rotating disk. Some bursts may also be produced through\nneutrino-antineutrino annihilation into electrons and positrons. In both cases,\nan optically thick fireball of size $\\sles\\ 100$ km is initially created, which\nexpands ultrarelativ

In [8]:
df = pd.DataFrame([vars(paper) for paper in ls])

In [9]:
df.head()

Unnamed: 0,entry_id,updated,published,title,authors,summary,comment,journal_ref,doi,primary_category,categories,links,pdf_url,_raw
0,http://arxiv.org/abs/astro-ph/9204001v1,1992-04-13 18:20:01+00:00,1992-04-13 18:20:01+00:00,Gamma-Ray Bursts as the Death Throes of Massiv...,"[Ramesh Narayan, Bohdan Paczyński, Tsvi Piran]",It is proposed that gamma-ray bursts are creat...,14 pages,Astrophys.J. 395 (1992) L83-L86,10.1086/186493,astro-ph,[astro-ph],"[http://dx.doi.org/10.1086/186493, http://arxi...",http://arxiv.org/pdf/astro-ph/9204001v1,{'id': 'http://arxiv.org/abs/astro-ph/9204001v...
1,http://arxiv.org/abs/astro-ph/9204002v1,1992-04-26 17:54:00+00:00,1992-04-26 17:54:00+00:00,Gravitational Lensing and the Variability of G,"[Lawrence Krauss, Martin White]",The four observables associated with gravitati...,13 pages plus figures (not included),"Astrophys.J.397:357,1992",10.1086/171792,astro-ph,[astro-ph],"[http://dx.doi.org/10.1086/171792, http://arxi...",http://arxiv.org/pdf/astro-ph/9204002v1,{'id': 'http://arxiv.org/abs/astro-ph/9204002v...
2,http://arxiv.org/abs/astro-ph/9204003v2,1992-04-30 20:39:38+00:00,1992-04-29 16:36:30+00:00,The Ptolemaic Gamma-Ray Burst Universe,[J. I. Katz],The BATSE experiment on GRO has demonstrated t...,10 pages (Replaced to provide omitted line.),,10.1007/BF00645080,astro-ph,[astro-ph],"[http://dx.doi.org/10.1007/BF00645080, http://...",http://arxiv.org/pdf/astro-ph/9204003v2,{'id': 'http://arxiv.org/abs/astro-ph/9204003v...
3,http://arxiv.org/abs/astro-ph/9204004v1,1992-04-30 19:20:04+00:00,1992-04-30 19:20:04+00:00,Expanding Photospheres of Type II Supernovae a...,"[B P Schmidt, R P Kirshner, R G Eastman]",We use the Expanding Photosphere Method to det...,21 pages,Astrophys.J. 395 (1992) 366-386,10.1086/171659,astro-ph,[astro-ph],"[http://dx.doi.org/10.1086/171659, http://arxi...",http://arxiv.org/pdf/astro-ph/9204004v1,{'id': 'http://arxiv.org/abs/astro-ph/9204004v...
4,http://arxiv.org/abs/astro-ph/9204005v1,1992-04-30 19:18:05+00:00,1992-04-30 19:18:05+00:00,Radiation Transfer in Gamma-Ray Bursts,"[B. J. Carrigan, J. I. Katz]",We have calculated gamma-ray radiative transpo...,24 pages,Astrophys.J. 399 (1992) 100-107,10.1086/171906,astro-ph,[astro-ph],"[http://dx.doi.org/10.1086/171906, http://arxi...",http://arxiv.org/pdf/astro-ph/9204005v1,{'id': 'http://arxiv.org/abs/astro-ph/9204005v...


In [2]:
from google.colab import drive
drive.mount('drive')

#df.to_csv('/content/drive/My Drive/arxiv_data_astro200.csv', index=False)

df = pd.read_csv('/content/drive/MyDrive/arxiv_data_astro200.csv')

Mounted at drive


# EDA

In [3]:
df.shape

(1400, 14)

In [4]:
pd.set_option('display.max_rows', 7)
df['primary_category'].value_counts()

Unnamed: 0_level_0,count
primary_category,Unnamed: 1_level_1
astro-ph,200
astro-ph.CO,200
astro-ph.EP,200
astro-ph.GA,200
astro-ph.HE,200
astro-ph.IM,200
astro-ph.SR,200


In [33]:
#rows_to_drop = [x for x in df['primary_category'].value_counts().index if df['primary_category'].value_counts()[x] < 1000]
#df = df[~df.primary_category.isin(rows_to_drop)]

In [5]:
df_short = df[['primary_category', 'summary']]
df_short.head()

Unnamed: 0,primary_category,summary
0,astro-ph,It is proposed that gamma-ray bursts are creat...
1,astro-ph,The four observables associated with gravitati...
2,astro-ph,The BATSE experiment on GRO has demonstrated t...
3,astro-ph,We use the Expanding Photosphere Method to det...
4,astro-ph,We have calculated gamma-ray radiative transpo...


In [6]:
print(df_short.isnull().sum())

primary_category    0
summary             0
dtype: int64


# Data Cleaning

In [7]:
import re
import string
import nltk

nltk.download('stopwords')
nltk.download('wordnet')

stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = nltk.WordNetLemmatizer()

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords]
    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [8]:
df_short['summary_clean'] = df_short['summary'].apply(lambda x: clean_text(x))
df_short.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_short['summary_clean'] = df_short['summary'].apply(lambda x: clean_text(x))


Unnamed: 0,primary_category,summary,summary_clean
0,astro-ph,It is proposed that gamma-ray bursts are creat...,"[proposed, gammaray, burst, created, merger, d..."
1,astro-ph,The four observables associated with gravitati...,"[four, observables, associated, gravitational,..."
2,astro-ph,The BATSE experiment on GRO has demonstrated t...,"[batse, experiment, gro, demonstrated, isotrop..."
3,astro-ph,We use the Expanding Photosphere Method to det...,"[use, expanding, photosphere, method, determin..."
4,astro-ph,We have calculated gamma-ray radiative transpo...,"[calculated, gammaray, radiative, transport, r..."


# Feature Engineering

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_matrix = tfidf_vect.fit_transform(df_short['summary'])

tfidf_df = pd.DataFrame(tfidf_matrix.toarray())

In [9]:
tfidf_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14854,14855,14856,14857,14858,14859,14860,14861,14862,14863
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Model Training and Evaluation

In [11]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier


from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

model_params = {
    'RandomForestClassifier': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 5, 10, 15]
        }
    }
}

best_models = {}

X = tfidf_df
y = df_short['primary_category']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)


for model_name, mp in model_params.items():
    print(f'Model {model_name}...')
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, scoring='accuracy')
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    best_models[model_name] = (clf.best_estimator_, accuracy)

best_model_name = max(best_models, key=lambda k: best_models[k][1])
best_model, best_accuracy = best_models[best_model_name]

print(f"Melhor modelo: {best_model_name}")
print(f"Melhor acurácia: {best_accuracy}")
print(f"Melhor modelo encontrado: {best_model}")
print(f"Classification Report: \n{classification_report(y_test, y_pred)}")

Model RandomForestClassifier...
Melhor modelo: RandomForestClassifier
Melhor acurácia: 0.6678571428571428
Melhor modelo encontrado: RandomForestClassifier(n_estimators=300)
Classification Report: 
              precision    recall  f1-score   support

    astro-ph       0.65      0.28      0.39        40
 astro-ph.CO       0.54      0.78      0.64        40
 astro-ph.EP       0.88      0.88      0.88        40
 astro-ph.GA       0.56      0.47      0.51        40
 astro-ph.HE       0.62      0.78      0.69        40
 astro-ph.IM       0.80      0.88      0.83        40
 astro-ph.SR       0.66      0.62      0.64        40

    accuracy                           0.67       280
   macro avg       0.67      0.67      0.65       280
weighted avg       0.67      0.67      0.65       280



In [12]:
best_model.fit(pd.concat([X_train, X_test]),pd.concat([y_train, y_test]))
y_pred = best_model.predict(pd.concat([X_train, X_test]))
accuracy = accuracy_score(pd.concat([y_train, y_test]), y_pred)
print(f"Acurácia (just check...): {accuracy}")

Acurácia (just check...): 1.0


# Creating a Streamlit App

# Referências

https://medium.com/@JyotsnaPyarasani/building-a-text-classification-system-for-news-articles-a-comprehensive-guide-10a99e8e862d

https://lukasschwab.me/arxiv.py/arxiv.html

https://www.kaggle.com/code/honggiangtrnh/topic-model-lda/input

https://colab.research.google.com/github/EPS-Libraries-Berkeley/volt/blob/main/Search/arxiv_api.ipynb#scrollTo=uznzjWmSBoLF

https://pypi.org/project/arxiv/

https://arxiv.org/category_taxonomy