<a href="https://colab.research.google.com/github/danielscabar/arxiv_text_classification/blob/main/Arxiv_Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Preparation

In [1]:
import pandas as pd
import numpy as np

In [17]:
!pip install arxiv
import arxiv



In [3]:
category_map_astro = {'astro-ph': 'Astrophysics',
'astro-ph.CO': 'Cosmology and Nongalactic Astrophysics',
'astro-ph.EP': 'Earth and Planetary Astrophysics',
'astro-ph.GA': 'Astrophysics of Galaxies',
'astro-ph.HE': 'High Energy Astrophysical Phenomena',
'astro-ph.IM': 'Instrumentation and Methods for Astrophysics',
'astro-ph.SR': 'Solar and Stellar Astrophysics'}

In [21]:
def data_transf(searches,client):
  results = {}

  for search in searches:
    results[search] = client.results(searches[search])

  ls = []
  for result in results:
    for paper in results[result]:
      ls.append(paper)

  df = pd.DataFrame([vars(paper) for paper in ls])
  return df

In [22]:
def get_arxiv_data_relevance(category_map, num_papers):
  client = arxiv.Client()
  searches = {}
  for category in category_map:
    searches[category_map[category]] = arxiv.Search(
      query = 'cat:'+category,
      max_results = num_papers,
      sort_by = arxiv.SortCriterion.Relevance
    )
  return data_transf(searches, client)

In [23]:
def get_arxiv_data_recent(category_map, num_papers):
  client = arxiv.Client()
  searches = {}
  for category in category_map:
    searches[category_map[category]] = arxiv.Search(
      query = 'cat:'+category,
      max_results = num_papers,
      sort_by = arxiv.SortCriterion.SubmittedDate
    )
  return data_transf(searches, client)

In [35]:
#df['primary_category'][~df.primary_category.isin(category_map_astro) == False]

Unnamed: 0,primary_category
1,astro-ph.GA
2,astro-ph.EP
3,astro-ph.GA
6,astro-ph.SR


In [36]:
df = get_arxiv_data_relevance(category_map_astro, 200)
df.head()

KeyboardInterrupt: 

# EDA

In [None]:
pd.shape

In [12]:
pd.set_option('display.max_rows', 7)
df['primary_category'].value_counts()

Unnamed: 0_level_0,count
primary_category,Unnamed: 1_level_1
astro-ph,200
astro-ph.CO,200
astro-ph.EP,200
astro-ph.GA,200
astro-ph.HE,200
astro-ph.IM,200
astro-ph.SR,200


In [None]:
#rows_to_drop = [x for x in df['primary_category'].value_counts().index if df['primary_category'].value_counts()[x] < 1000]
#df = df[~df.primary_category.isin(rows_to_drop)]

In [13]:
df_short = df[['primary_category', 'summary']]
df_short.head()

Unnamed: 0,primary_category,summary
0,astro-ph,It is proposed that gamma-ray bursts are creat...
1,astro-ph,The four observables associated with gravitati...
2,astro-ph,The BATSE experiment on GRO has demonstrated t...
3,astro-ph,We use the Expanding Photosphere Method to det...
4,astro-ph,We have calculated gamma-ray radiative transpo...


In [14]:
print(df_short.isnull().sum())

primary_category    0
summary             0
dtype: int64


# Data Cleaning

In [15]:
import re
import string
import nltk

nltk.download('stopwords')
nltk.download('wordnet')

stopwords = nltk.corpus.stopwords.words('english')
lemmatizer = nltk.WordNetLemmatizer()

def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.split('\W+', text)
    text = [lemmatizer.lemmatize(word) for word in tokens if word not in stopwords]
    return text

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [16]:
df_short['summary_clean'] = df_short['summary'].apply(lambda x: clean_text(x))
df_short.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_short['summary_clean'] = df_short['summary'].apply(lambda x: clean_text(x))


Unnamed: 0,primary_category,summary,summary_clean
0,astro-ph,It is proposed that gamma-ray bursts are creat...,"[proposed, gammaray, burst, created, merger, d..."
1,astro-ph,The four observables associated with gravitati...,"[four, observables, associated, gravitational,..."
2,astro-ph,The BATSE experiment on GRO has demonstrated t...,"[batse, experiment, gro, demonstrated, isotrop..."
3,astro-ph,We use the Expanding Photosphere Method to det...,"[use, expanding, photosphere, method, determin..."
4,astro-ph,We have calculated gamma-ray radiative transpo...,"[calculated, gammaray, radiative, transport, r..."


# Feature Engineering

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(analyzer=clean_text)
tfidf_matrix = tfidf_vect.fit_transform(df_short['summary'])

tfidf_df = pd.DataFrame(tfidf_matrix.toarray())

In [18]:
tfidf_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,14854,14855,14856,14857,14858,14859,14860,14861,14862,14863
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Model Training and Evaluation

In [15]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import neighbors
from sklearn.neural_network import MLPClassifier


from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

model_params = {
    'RandomForestClassifier': {
        'model': RandomForestClassifier(),
        'params': {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 5, 10, 15]
        }
    },
    'KNeighborsClassifier': {
        'model': neighbors.KNeighborsClassifier(),
        'params': {
            'n_neighbors': [3,4,5,6,7,8,9,10],
            'metric': ['euclidean','manhattan']
        }
    },
    'MLPClassifier': {
        'model': MLPClassifier(max_iter=500,random_state=42),
        'params': {
            'hidden_layer_sizes': [(100,), (50, 50), (10, 10, 10)],
            'activation': ['relu', 'tanh']
        }
    }
}

best_models = {}

X = tfidf_df
y = df_short['primary_category']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)


for model_name, mp in model_params.items():
    print(f'Model {model_name}...')
    clf = GridSearchCV(mp['model'], mp['params'], cv=5, scoring='accuracy')
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    best_models[model_name] = (clf.best_estimator_, accuracy)

best_model_name = max(best_models, key=lambda k: best_models[k][1])
best_model, best_accuracy = best_models[best_model_name]

print(f"Melhor modelo: {best_model_name}")
print(f"Melhor acurácia: {best_accuracy}")
print(f"Melhor modelo encontrado: {best_model}")
print(f"Classification Report: \n{classification_report(y_test, y_pred)}")

Model RandomForestClassifier...
Model KNeighborsClassifier...
Model MLPClassifier...
Melhor modelo: MLPClassifier
Melhor acurácia: 0.7
Melhor modelo encontrado: MLPClassifier(activation='tanh', max_iter=500, random_state=42)
Classification Report: 
              precision    recall  f1-score   support

    astro-ph       0.58      0.45      0.51        40
 astro-ph.CO       0.61      0.70      0.65        40
 astro-ph.EP       0.92      0.82      0.87        40
 astro-ph.GA       0.59      0.55      0.57        40
 astro-ph.HE       0.70      0.82      0.76        40
 astro-ph.IM       0.78      0.90      0.84        40
 astro-ph.SR       0.70      0.65      0.68        40

    accuracy                           0.70       280
   macro avg       0.70      0.70      0.70       280
weighted avg       0.70      0.70      0.70       280



In [21]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

X = tfidf_df
y = df_short['primary_category']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

model = MLPClassifier(max_iter=500,random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Acurácia: {accuracy}")
print(f"Classification Report: \n{classification_report(y_test, y_pred)}")

Acurácia: 0.7178571428571429
Classification Report: 
              precision    recall  f1-score   support

    astro-ph       0.65      0.55      0.59        40
 astro-ph.CO       0.67      0.70      0.68        40
 astro-ph.EP       0.92      0.82      0.87        40
 astro-ph.GA       0.59      0.55      0.57        40
 astro-ph.HE       0.67      0.85      0.75        40
 astro-ph.IM       0.82      0.90      0.86        40
 astro-ph.SR       0.72      0.65      0.68        40

    accuracy                           0.72       280
   macro avg       0.72      0.72      0.72       280
weighted avg       0.72      0.72      0.72       280



In [22]:
model.fit(pd.concat([X_train, X_test]),pd.concat([y_train, y_test]))

# Creating a Streamlit App

# Referências

https://medium.com/@JyotsnaPyarasani/building-a-text-classification-system-for-news-articles-a-comprehensive-guide-10a99e8e862d

https://lukasschwab.me/arxiv.py/arxiv.html

https://www.kaggle.com/code/honggiangtrnh/topic-model-lda/input

https://colab.research.google.com/github/EPS-Libraries-Berkeley/volt/blob/main/Search/arxiv_api.ipynb#scrollTo=uznzjWmSBoLF

https://pypi.org/project/arxiv/

https://arxiv.org/category_taxonomy