<a href="https://colab.research.google.com/github/danielscabar/arxiv_text_classification/blob/main/Arxiv_Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preparation

In [1]:
import pandas as pd
import numpy as np

In [2]:
!pip install arxiv
import arxiv



In [3]:
# arXiv astrophysics category codes mapped to their human-readable names.
category_map_astro = {
    'astro-ph': 'Astrophysics',
    'astro-ph.CO': 'Cosmology and Nongalactic Astrophysics',
    'astro-ph.EP': 'Earth and Planetary Astrophysics',
    'astro-ph.GA': 'Astrophysics of Galaxies',
    'astro-ph.HE': 'High Energy Astrophysical Phenomena',
    'astro-ph.IM': 'Instrumentation and Methods for Astrophysics',
    'astro-ph.SR': 'Solar and Stellar Astrophysics',
}

In [4]:
def data_transf(searches,client):
  """Run every search through ``client`` and tabulate the returned papers.

  Args:
    searches: Mapping whose values are search objects accepted by
      ``client.results``. The keys are not carried into the output.
    client: Object exposing ``results(search)`` that returns an iterable of
      paper objects.

  Returns:
    A ``pandas.DataFrame`` with one row per paper, built from ``vars(paper)``,
    in the iteration order of ``searches``.
  """
  # Create one result iterable per search first, then drain them in mapping order.
  pending = [client.results(query) for query in searches.values()]
  rows = [vars(paper) for batch in pending for paper in batch]
  return pd.DataFrame(rows)

In [5]:
def get_arxiv_data_relevance(category_map, num_papers):
  """Fetch papers for each category, sorted by ``SortCriterion.Relevance``.

  Args:
    category_map: Mapping of arXiv category code (e.g. 'astro-ph.CO') to a
      display name; one search is built per code.
    num_papers: Value passed as ``max_results`` to every search.

  Returns:
    The DataFrame produced by ``data_transf`` for all searches combined.
  """
  client = arxiv.Client()
  # Searches are keyed by display name; the query itself uses the category code.
  searches = {
    label: arxiv.Search(
      query = 'cat:' + code,
      max_results = num_papers,
      sort_by = arxiv.SortCriterion.Relevance
    )
    for code, label in category_map.items()
  }
  return data_transf(searches, client)

In [6]:
def get_arxiv_data_recent(category_map, num_papers):
  """Fetch papers for each category, sorted by ``SortCriterion.SubmittedDate``.

  Args:
    category_map: Mapping of arXiv category code (e.g. 'astro-ph.CO') to a
      display name; one search is built per code.
    num_papers: Value passed as ``max_results`` to every search.

  Returns:
    The DataFrame produced by ``data_transf`` for all searches combined.
  """
  client = arxiv.Client()
  # Searches are keyed by display name; the query itself uses the category code.
  searches = {
    label: arxiv.Search(
      query = 'cat:' + code,
      max_results = num_papers,
      sort_by = arxiv.SortCriterion.SubmittedDate
    )
    for code, label in category_map.items()
  }
  return data_transf(searches, client)

In [7]:
# Query the arXiv API: up to 200 papers for each category in category_map_astro,
# sorted by relevance.
# NOTE(review): the next cell reassigns `df` from a CSV file, so when both cells
# run, the rest of the notebook works on the CSV data rather than on this fetch.
df = get_arxiv_data_relevance(category_map_astro, 200)
df.head()

Unnamed: 0,entry_id,updated,published,title,authors,summary,comment,journal_ref,doi,primary_category,categories,links,pdf_url,_raw
0,http://arxiv.org/abs/astro-ph/9204001v1,1992-04-13 18:20:01+00:00,1992-04-13 18:20:01+00:00,Gamma-Ray Bursts as the Death Throes of Massiv...,"[Ramesh Narayan, Bohdan Paczyński, Tsvi Piran]",It is proposed that gamma-ray bursts are creat...,14 pages,Astrophys.J. 395 (1992) L83-L86,10.1086/186493,astro-ph,[astro-ph],"[http://dx.doi.org/10.1086/186493, http://arxi...",http://arxiv.org/pdf/astro-ph/9204001v1,{'id': 'http://arxiv.org/abs/astro-ph/9204001v...
1,http://arxiv.org/abs/astro-ph/9204002v1,1992-04-26 17:54:00+00:00,1992-04-26 17:54:00+00:00,Gravitational Lensing and the Variability of G,"[Lawrence Krauss, Martin White]",The four observables associated with gravitati...,13 pages plus figures (not included),"Astrophys.J.397:357,1992",10.1086/171792,astro-ph,[astro-ph],"[http://dx.doi.org/10.1086/171792, http://arxi...",http://arxiv.org/pdf/astro-ph/9204002v1,{'id': 'http://arxiv.org/abs/astro-ph/9204002v...
2,http://arxiv.org/abs/astro-ph/9204003v2,1992-04-30 20:39:38+00:00,1992-04-29 16:36:30+00:00,The Ptolemaic Gamma-Ray Burst Universe,[J. I. Katz],The BATSE experiment on GRO has demonstrated t...,10 pages (Replaced to provide omitted line.),,10.1007/BF00645080,astro-ph,[astro-ph],"[http://dx.doi.org/10.1007/BF00645080, http://...",http://arxiv.org/pdf/astro-ph/9204003v2,{'id': 'http://arxiv.org/abs/astro-ph/9204003v...
3,http://arxiv.org/abs/astro-ph/9204004v1,1992-04-30 19:20:04+00:00,1992-04-30 19:20:04+00:00,Expanding Photospheres of Type II Supernovae a...,"[B P Schmidt, R P Kirshner, R G Eastman]",We use the Expanding Photosphere Method to det...,21 pages,Astrophys.J. 395 (1992) 366-386,10.1086/171659,astro-ph,[astro-ph],"[http://dx.doi.org/10.1086/171659, http://arxi...",http://arxiv.org/pdf/astro-ph/9204004v1,{'id': 'http://arxiv.org/abs/astro-ph/9204004v...
4,http://arxiv.org/abs/astro-ph/9204005v1,1992-04-30 19:18:05+00:00,1992-04-30 19:18:05+00:00,Radiation Transfer in Gamma-Ray Bursts,"[B. J. Carrigan, J. I. Katz]",We have calculated gamma-ray radiative transpo...,24 pages,Astrophys.J. 399 (1992) 100-107,10.1086/171906,astro-ph,[astro-ph],"[http://dx.doi.org/10.1086/171906, http://arxi...",http://arxiv.org/pdf/astro-ph/9204005v1,{'id': 'http://arxiv.org/abs/astro-ph/9204005v...


In [7]:
# Load the dataset from the CSV hosted in the project's GitHub repository
# (presumably a saved snapshot of the API query — TODO confirm).
df = pd.read_csv('https://raw.githubusercontent.com/danielscabar/arxiv_text_classification/refs/heads/main/arxiv_data_astro200.csv')

# Exploratory Data Analysis (EDA)

In [8]:
df.shape

(1400, 14)

In [9]:
# Cap pandas output at 7 rows. set_option is global, so it also applies to every
# later output in this session (7 presumably matches the number of categories).
pd.set_option('display.max_rows', 7)
# Class balance: number of papers per primary category.
df['primary_category'].value_counts()

Unnamed: 0_level_0,count
primary_category,Unnamed: 1_level_1
astro-ph,200
astro-ph.CO,200
astro-ph.EP,200
astro-ph.GA,200
astro-ph.HE,200
astro-ph.IM,200
astro-ph.SR,200


In [10]:
# Keep only the label (primary_category) and the text used as model input (summary).
df_short = df[['primary_category', 'summary']]
df_short.head()

Unnamed: 0,primary_category,summary
0,astro-ph,It is proposed that gamma-ray bursts are creat...
1,astro-ph,The four observables associated with gravitati...
2,astro-ph,The BATSE experiment on GRO has demonstrated t...
3,astro-ph,We use the Expanding Photosphere Method to det...
4,astro-ph,We have calculated gamma-ray radiative transpo...


In [11]:
# Count missing values per column before building features.
print(df_short.isnull().sum())

primary_category    0
summary             0
dtype: int64


# Data Cleaning

In [12]:
import re
import string
import nltk

# Download the NLTK resources used below: the stop-word lists and WordNet.
nltk.download('stopwords')
nltk.download('wordnet')

# Module-level helpers read by clean_text: the English stop words (a list)
# and the WordNet lemmatizer.
stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = nltk.WordNetLemmatizer()

def clean_text(text):
    """Turn a raw text string into a list of lemmatized tokens.

    Steps: delete every character found in ``string.punctuation`` and
    lowercase the rest, split on runs of non-word characters, drop tokens
    present in the module-level ``stopwords`` list, and lemmatize the
    remainder with the module-level ``lemmatizer``.

    Args:
        text: The string to tokenize.

    Returns:
        list: Lemmatized tokens in their original order.
    """
    # Punctuation is deleted rather than replaced by a space, so a hyphenated
    # word such as "gamma-ray" becomes the single token "gammaray".
    text = "".join([char.lower() for char in text if char not in string.punctuation])
    # Raw string: '\W' inside a plain literal is an invalid escape sequence and
    # raises a warning on recent Python versions. The pattern itself is unchanged.
    tokens = re.split(r'\W+', text)
    # NOTE(review): re.split yields '' when the text is empty or starts/ends with
    # a separator, and '' is not a stop word, so empty tokens are kept — confirm
    # this is intended.
    text = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords]
    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Feature Engineering

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

# clean_text is passed as the analyzer, so it is what turns each raw summary
# into the list of tokens that TF-IDF is computed over.
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
# NOTE(review): the vocabulary and IDF weights are fitted on every summary, before
# the train/test split performed later, so test documents influence the features.
tfidf_matrix = tfidf_vect.fit_transform(df_short['summary'])

# Convert the TF-IDF matrix to a dense DataFrame; columns are integer feature indices.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray())

In [14]:
tfidf_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14854,14855,14856,14857,14858,14859,14860,14861,14862,14863
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Model Training and Evaluation

In [15]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors
from sklearn.neural_network import MLPClassifier


from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Candidate estimators and the hyper-parameter grid searched for each of them.
model_params = {
    'RandomForestClassifier': {
        # Seeded like the split and the MLP so that reruns build the same forest.
        'model': RandomForestClassifier(random_state=42),
        'params': {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 5, 10, 15]
        }
    },
    'KNeighborsClassifier': {
        'model': neighbors.KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3,4,5,6,7,8,9,10],
            'metric': ['euclidean','manhattan']
        }
    },
    'MLPClassifier': {
        'model': MLPClassifier(max_iter=500,random_state=42),
        'params': {
            'hidden_layer_sizes': [(100,), (50, 50), (10, 10, 10)],
            'activation': ['relu', 'tanh']
        }
    }
}

# model name -> (best estimator found by the grid search, accuracy on the test split)
best_models = {}
# model name -> predictions of that model's best estimator on X_test
test_predictions = {}

X = tfidf_df
y = df_short['primary_category']

# Stratified 80/20 split so every category keeps the same share in both parts.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)


for model_name, mp in model_params.items():
    print(f'Model {model_name}...')
    # 5-fold cross-validated grid search on the training split only.
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, scoring='accuracy')
    clf.fit(X_train, y_train)
    # clf.predict delegates to the best estimator found by the search.
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    best_models[model_name] = (clf.best_estimator_, accuracy)
    test_predictions[model_name] = y_pred

# NOTE(review): the winner is chosen by test accuracy and that same number is
# reported below, so the reported figure is optimistic; selecting on the
# cross-validation score would keep the test split untouched.
best_model_name = max(best_models, key=lambda k: best_models[k][1])
best_model, best_accuracy = best_models[best_model_name]

# Report on the selected model's predictions. `y_pred` still holds the output of
# whichever model ran last in the loop, which is not necessarily the best one.
best_y_pred = test_predictions[best_model_name]

print(f"Melhor modelo: {best_model_name}")
print(f"Melhor acurácia: {best_accuracy}")
print(f"Melhor modelo encontrado: {best_model}")
print(f"Classification Report: \n{classification_report(y_test, best_y_pred)}")

Model RandomForestClassifier...
Model KNeighborsClassifier...
Model MLPClassifier...
Melhor modelo: MLPClassifier
Melhor acurácia: 0.7
Melhor modelo encontrado: MLPClassifier(activation='tanh', max_iter=500, random_state=42)
Classification Report: 
              precision    recall  f1-score   support

    astro-ph       0.58      0.45      0.51        40
 astro-ph.CO       0.61      0.70      0.65        40
 astro-ph.EP       0.92      0.82      0.87        40
 astro-ph.GA       0.59      0.55      0.57        40
 astro-ph.HE       0.70      0.82      0.76        40
 astro-ph.IM       0.78      0.90      0.84        40
 astro-ph.SR       0.70      0.65      0.68        40

    accuracy                           0.70       280
   macro avg       0.70      0.70      0.70       280
weighted avg       0.70      0.70      0.70       280



In [16]:
# Refit the selected estimator on the train and test splits combined.
# After this call the accuracy reported above no longer describes data the
# model has not seen.
best_model.fit(pd.concat([X_train, X_test]),pd.concat([y_train, y_test]))

# Model Deployment

In [47]:
import pickle

# Persist the fitted vectorizer and the selected model to disk.
# The vectorizer was built with analyzer=clean_text; pickle stores functions by
# reference (module and name), so whatever loads vectorizer.pkl must define a
# compatible clean_text itself.
for artifact_path, artifact in (("vectorizer.pkl", tfidf_vect), ("model.pkl", best_model)):
    with open(artifact_path, "wb") as artifact_file:
        pickle.dump(artifact, artifact_file)

# Creating a Streamlit App

In [25]:
!pip install streamlit pyngrok

Collecting streamlit
  Downloading streamlit-1.40.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.1-py3-none-any.whl.metadata (8.3 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Downloading streamlit-1.40.1-py2.py3-none-any.whl (8.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m49.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.2.1-py3-none-any.whl (22 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m64.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64

In [33]:
import os
from getpass import getpass

# SECURITY: never hardcode the ngrok token in the notebook — anything committed
# here is readable by everyone with access to the repository. The token that was
# previously embedded in this cell should be treated as leaked and revoked in the
# ngrok dashboard.
# Reuse a token already present in the environment; otherwise prompt for it
# without echoing (only needed once per session).
if not os.environ.get("NGROK_AUTH_TOKEN"):
    os.environ["NGROK_AUTH_TOKEN"] = getpass("ngrok auth token: ")

In [34]:
# Authenticate ngrok using the environment variable
!ngrok config add-authtoken $NGROK_AUTH_TOKEN

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [35]:
from pyngrok import ngrok

# Open an ngrok tunnel that exposes http://localhost:8501 under a public URL
# (8501 is presumably the port the Streamlit app listens on — TODO confirm).
# The app itself is launched in a later cell, so the URL has nothing to serve
# until that cell has run.
public_url = ngrok.connect("http://localhost:8501")
print(f"Streamlit app URL: {public_url}")


Streamlit app URL: NgrokTunnel: "https://b30f-34-105-47-9.ngrok-free.app" -> "http://localhost:8501"


In [53]:
%%writefile app.py
import streamlit as st
import pickle
import pandas as pd
import re
import string
import nltk

@st.cache_resource
def _load_nltk_resources():
    """Download the NLTK data once and build the helpers used by clean_text.

    Streamlit re-executes this script on every widget interaction;
    ``st.cache_resource`` makes the downloads and object construction happen
    once per server process instead of on every rerun.

    Returns:
        tuple: ``(stopwords, lemmatizer)`` — the English stop-word list and a
        ``WordNetLemmatizer`` instance.
    """
    nltk.download('stopwords')
    nltk.download('wordnet')
    return nltk.corpus.stopwords.words('english'), nltk.WordNetLemmatizer()

stopwords, lemmatizer = _load_nltk_resources()

def clean_text(text):
    """Turn a raw text string into a list of lemmatized tokens.

    Steps: delete every character found in ``string.punctuation`` and
    lowercase the rest, split on runs of non-word characters, drop tokens
    present in the module-level ``stopwords`` list, and lemmatize the
    remainder with the module-level ``lemmatizer``.

    Args:
        text: The string to tokenize.

    Returns:
        list: Lemmatized tokens in their original order.
    """
    # Punctuation is deleted rather than replaced by a space, so a hyphenated
    # word such as "gamma-ray" becomes the single token "gammaray".
    text = "".join([char.lower() for char in text if char not in string.punctuation])
    # Raw string: '\W' inside a plain literal is an invalid escape sequence and
    # raises a warning on recent Python versions. The pattern itself is unchanged.
    tokens = re.split(r'\W+', text)
    # NOTE(review): re.split yields '' when the text is empty or starts/ends with
    # a separator, and '' is not a stop word, so empty tokens are kept — confirm
    # this is intended.
    text = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords]
    return text

# arXiv category code -> human-readable name. The value returned by
# model.predict is looked up in this mapping to build the message shown to the user.
category_map_astro = {
    'astro-ph': 'Astrophysics',
    'astro-ph.CO': 'Cosmology and Nongalactic Astrophysics',
    'astro-ph.EP': 'Earth and Planetary Astrophysics',
    'astro-ph.GA': 'Astrophysics of Galaxies',
    'astro-ph.HE': 'High Energy Astrophysical Phenomena',
    'astro-ph.IM': 'Instrumentation and Methods for Astrophysics',
    'astro-ph.SR': 'Solar and Stellar Astrophysics'
}

# Load vectorizer and model
@st.cache_resource
def _load_artifacts():
    """Unpickle the fitted vectorizer and classifier once per server process.

    Streamlit re-executes this script on every widget interaction;
    ``st.cache_resource`` keeps the loaded objects in memory so the pickle
    files are not read again on each rerun. Restart the server (or clear the
    cache) after replacing the .pkl files.

    NOTE: unpickling can execute arbitrary code. Only load vectorizer.pkl and
    model.pkl files that you produced yourself.

    Returns:
        tuple: ``(vectorizer, model)``.
    """
    with open("vectorizer.pkl", "rb") as vec_file:
        loaded_vectorizer = pickle.load(vec_file)

    with open("model.pkl", "rb") as model_file:
        loaded_model = pickle.load(model_file)

    return loaded_vectorizer, loaded_model

vectorizer, model = _load_artifacts()

# Title and text input
st.title("🕵🏿Classificador de artigos científicos de Astrofísica💻")
input_sms = st.text_area("Enter the message")

if st.button('Predict'):
    if not input_sms.strip():
        # Nothing to classify: ask for input instead of predicting on blank text.
        st.warning("Please enter some text before predicting.")
    else:
        # Vectorization
        vector_input = vectorizer.transform([input_sms])

        # Prediction: a single text was passed in, so take the first (only) label
        result = model.predict(vector_input)[0]

        # Display result, translating the predicted code into its readable name
        category = category_map_astro.get(result, "Unknown category")
        st.header(f"Predicted category: {category}")


Overwriting app.py


In [54]:
# Launch the Streamlit server in the background. Its output is redirected to
# /dev/null, so startup errors will not show up in the notebook.
!streamlit run app.py &>/dev/null &

# References

https://medium.com/@JyotsnaPyarasani/building-a-text-classification-system-for-news-articles-a-comprehensive-guide-10a99e8e862d

https://lukasschwab.me/arxiv.py/arxiv.html

https://www.kaggle.com/code/honggiangtrnh/topic-model-lda/input

https://colab.research.google.com/github/EPS-Libraries-Berkeley/volt/blob/main/Search/arxiv_api.ipynb#scrollTo=uznzjWmSBoLF

https://pypi.org/project/arxiv/

https://arxiv.org/category_taxonomy