Examen Bimestral

In [17]:
# Read data
import json
import pandas as pd
import os

In [18]:
# Leer el archivo JSON
url = '../data/arxiv_examen.json'
data = pd.read_json(url, encoding='utf-8', lines=True)

In [19]:
import nltk
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Descargar recursos necesarios de nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /home/murder/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/murder/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to /home/murder/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [20]:
print(data)

             id                                              title  \
0      704.0001  Calculation of prompt diphoton production cros...   
1      704.0002           Sparsity-certifying Graph Decompositions   
2      704.0003  The evolution of the Earth-Moon system based o...   
3      704.0004  A determinant of Stirling cycle numbers counts...   
4      704.0005  From dyadic $\Lambda_{\alpha}$ to $\Lambda_{\a...   
...         ...                                                ...   
16995  707.3825  Emergence of noncollinear magnetic ordering in...   
16996  707.3826                      More hilltop inflation models   
16997  707.3827  Engineering Silicon Nanocrystals: Theoretical ...   
16998  707.3828  Structure, bonding and magnetism in cobalt clu...   
16999  707.3829  Occupation Statistics of Critical Branching Ra...   

                                                abstract  
0        A fully differential calculation in perturba...  
1        We describe a new algorithm, the

In [21]:
# Definir stopwords en inglés
stop_words = set(stopwords.words('english'))

# Función para preprocesar texto
def preprocess_text(text):
    # Convertir a minúsculas
    text = text.lower()
    # Tokenizar palabras
    tokens = word_tokenize(text)
    # Eliminar stopwords y signos de puntuación
    tokens = [word for word in tokens if word not in stop_words and word not in string.punctuation]
    return tokens

# Aplicar función al título y resumen
data['title_tokens'] = data['title'].apply(preprocess_text)
data['abstract_tokens'] = data['abstract'].apply(preprocess_text)

# Mostrar resultado
print(data[['title_tokens', 'abstract_tokens']])

                                            title_tokens  \
0      [calculation, prompt, diphoton, production, cr...   
1           [sparsity-certifying, graph, decompositions]   
2      [evolution, earth-moon, system, based, dark, m...   
3      [determinant, stirling, cycle, numbers, counts...   
4           [dyadic, \lambda_, \alpha, \lambda_, \alpha]   
...                                                  ...   
16995  [emergence, noncollinear, magnetic, ordering, ...   
16996                       [hilltop, inflation, models]   
16997  [engineering, silicon, nanocrystals, theoretic...   
16998  [structure, bonding, magnetism, cobalt, clusters]   
16999  [occupation, statistics, critical, branching, ...   

                                         abstract_tokens  
0      [fully, differential, calculation, perturbativ...  
1      [describe, new, algorithm, k, \ell, -pebble, g...  
2      [evolution, earth-moon, system, described, dar...  
3      [show, determinant, stirling, cycle,

In [22]:
# Vectorizar
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
# Fit on the combined text from title_tokens and abstract_tokens
vectorizer.fit(data['title_tokens'].astype(str) + ' ' + data['abstract_tokens'].astype(str))
corpus_vect = vectorizer.transform(data['title_tokens'].astype(str) + ' ' + data['abstract_tokens'].astype(str)) # Transformación
print(corpus_vect.shape)

(17000, 44149)


In [23]:
from sklearn.metrics.pairwise import cosine_similarity

In [37]:
# Función para buscar usando TF-IDF
def search_tfidf(query, vectorizer, corpus_vect, data,  top_k=10):
    # Vectorize the query
    query_vect = vectorizer.transform([query])

    # Calculate cosine similarities between the query and documents
    cosine_similarities = cosine_similarity(query_vect, corpus_vect).flatten()

    # Create a DataFrame with the results
    df_results = pd.DataFrame({'Documento': data['abstract'], 'Similitud coseno': cosine_similarities})

    # Sort by similarity and get the top results
    df_results = df_results.sort_values(by='Similitud coseno', ascending=False)
    return df_results.head(top_k)

In [25]:
query = "diphoton production cross sections"

In [31]:
search_tfidf(query, vectorizer, corpus_vect, data)

Unnamed: 0,Documento,Similitud coseno
0,A fully differential calculation in perturba...,0.393993
15464,We have performed a search for new particles...,0.362845
9537,The increasing size of the data samples reco...,0.332994
8315,We measured fragmentation cross sections pro...,0.306359
11499,"Using the CLEO III detector, we measure abso...",0.285362
10803,We prove that twist-3 soft-gluon-pole (SGP) ...,0.264567
4351,The results of measurements of the productio...,0.257428
11379,Microscopic optical model potential results ...,0.252194
8251,Using the measured fragmentation cross secti...,0.251028
5053,The charged-current quasi-elastic scattering...,0.245088


In [38]:
# BM25 Search
from rank_bm25 import BM25Okapi

def search_bm25(query,  top_k=10):
  # Tokenize the corpus for BM25
  tokenized_corpus = data['abstract_tokens'].tolist()

  bm25_doc = BM25Okapi(tokenized_corpus)
  scores = bm25_doc.get_scores(query)

  # Create a DataFrame with the data
  df = pd.DataFrame({'Documento': data['abstract'], 'Score BM25': scores})
  df = df.sort_values(by='Score BM25', ascending=False)
  bm25_results = df.head(top_k)
  bm25_results
  return bm25_results

In [39]:
search_bm25(query)

Unnamed: 0,Documento,Score BM25
14426,We study the critical set C of the nonlinear...,54.005663
13534,"Let p:C-->Y be a covering of smooth, project...",49.654692
14278,We prove that if x^m + c*x^n permutes the pr...,46.431785
4327,Let ccl(G) denote the order of the largest c...,43.879518
12788,"For two graph H and G, the Ramsey number r(H...",43.550299
16486,We prove a Morita reduction theorem for the ...,43.228933
7751,"We show that if A is a subset of {1, ..., n}...",42.580936
14272,Suppose x^m + c*x^n is a permutation polynom...,41.792553
15782,Consider a finite morphism f:X -> Y of smoot...,39.846724
16842,We consider the recursive equation ``x(n+1)=...,39.735954


In [None]:
# Faiss
from sentence_transformers import SentenceTransformer
import faiss
import os
from dotenv import load_dotenv
from openai import OpenAI