In [1]:
# Librerías

import numpy as np
import pandas as pd
import nltk

from datetime import datetime

import re
import matplotlib.pyplot as plt

import unicodedata
from contractions import CONTRACTION_MAP

from nltk.stem import WordNetLemmatizer
from nltk.tokenize.toktok import ToktokTokenizer
import pdfplumber

from importlib import reload

# Show up to 200 characters per cell so long abstracts stay readable in DataFrame previews.
pd.options.display.max_colwidth = 200
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [2]:
# Se debe tener el archivo utils.py en la misma carpeta que el notebook
import utils as utils 

## Obtención de corpus

- Link: 


In [3]:
# Load the labelled corpus; the path is relative to the working directory
ruta_datos = "Train.csv"
df = pd.read_csv(ruta_datos)
df.head()

Unnamed: 0,id,ABSTRACT,Computer Science,Mathematics,Physics,Statistics,Analysis of PDEs,Applications,Artificial Intelligence,Astrophysics of Galaxies,...,Methodology,Number Theory,Optimization and Control,Representation Theory,Robotics,Social and Information Networks,Statistics Theory,Strongly Correlated Electrons,Superconductivity,Systems and Control
0,1824,"a ever-growing datasets inside observational astronomy have challenged scientists inside many aspects, including an efficient and interactive data exploration and visualization. many tools have be...",0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,3094,we propose the framework considering optimal $t$-matchings excluding a prescribed $t$-factors inside bipartite graphs. a proposed framework was the generalization of a nonbipartite matching proble...,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,8463,nanostructures with open shell transition metal or molecular constituents host often strong electronic correlations and are highly sensitive to atomistic material details. this tutorial review dis...,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,2082,"stars are self-gravitating fluids inside which pressure, buoyancy, rotation and magnetic fields provide a restoring forces considering global modes of oscillation. pressure and buoyancy energetica...",0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8687,deep neural perception and control networks are likely to be the key component of self-driving vehicles. these models need to be explainable - they should provide easy-to-interpret rationales cons...,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Number of documents (rows) in the corpus
n_documentos = len(df)
print("El número de documentos en el corpus es: ", n_documentos)

El número de documentos en el corpus es:  14004


In [5]:
# The documents are the English abstracts of scientific articles; they are pulled out
# of the ABSTRACT column as a NumPy array, which is what the preprocessing step receives.
corpus = df["ABSTRACT"].to_numpy()
corpus[:2]

array(['a ever-growing datasets inside observational astronomy have challenged scientists inside many aspects, including an efficient and interactive data exploration and visualization. many tools have been developed to confront this challenge. however, they usually focus on displaying a actual images or focus on visualizing patterns within catalogs inside the predefined way. inside this paper we introduce vizic, the python visualization library that builds a connection between images and catalogs through an interactive map of a sky region. vizic visualizes catalog data over the custom background canvas with the help of a shape, size and orientation of each object inside a catalog. a displayed objects inside a map are highly interactive and customizable comparing to those inside a images. these objects should be filtered by or colored by their properties, such as redshift and magnitude. they also should be sub-selected with the help of the lasso-like tool considering further analysis w

## Preprocesamiento y limpieza del corpus

In [6]:
# Re-import utils so that edits made to utils.py are picked up without restarting the kernel.
utils = reload(utils)

In [7]:
# Corpus preprocessing: every document is normalized with the function defined in the utils module.
# The individual preprocessing steps are described in the accompanying write-up.
norm_corpus = utils.normalize_corpus(corpus)

In [8]:
# Show the same document before and after normalization (leading characters only)
ejemplo_idx, n_caracteres = 1, 200
print("Comparación antes vs despues del preprocesamiento: \n")
print("Texto antes de ser normalizado:\n")
print(corpus[ejemplo_idx][:n_caracteres] + "\n\n")
print("Texto después de ser normalizado:\n")
print(norm_corpus[ejemplo_idx][:n_caracteres])

Comparación antes vs despues del preprocesamiento: 

Texto antes de ser normalizado:

we propose the framework considering optimal $t$-matchings excluding a prescribed $t$-factors inside bipartite graphs. a proposed framework was the generalization of a nonbipartite matching problem an


Texto después de ser normalizado:

propose framework considering optimal matchings excluding prescribed factors inside bipartite graph proposed framework wa generalization nonbipartite matching problem includes several problem triangle


## Representación vectorizada del corpus

### Representación utilizando el modelo TF-IDF

In [9]:
# Build the TF-IDF matrix of the normalized corpus
from sklearn.feature_extraction.text import TfidfVectorizer

# min_df=0. / max_df=1. keep every term regardless of its document frequency;
# rows are L2-normalized and the IDF weights are smoothed.
tv = TfidfVectorizer(min_df=0., max_df=1., norm='l2',
                     use_idf=True, smooth_idf=True)
tv_matrix = tv.fit_transform(norm_corpus)
# NOTE(review): densifying allocates documents x vocabulary floats, which is
# memory-hungry for a large corpus. It is kept because later cells wrap tv_matrix
# in a DataFrame; consider keeping the sparse matrix if memory becomes a problem.
tv_matrix = tv_matrix.toarray()

# All unique terms in the corpus, in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() and only fall back to the old method on older releases.
# list() keeps vocab a plain Python list, as it was with the old method.
if hasattr(tv, "get_feature_names_out"):
    vocab = list(tv.get_feature_names_out())
else:
    vocab = tv.get_feature_names()

In [10]:
# Vocabulary size (number of distinct terms)
n_terminos = len(vocab)
print("La longitud del vocabulario es de: ", n_terminos, " palabras distintas")

La longitud del vocabulario es de:  49831  palabras distintas


In [11]:
# TF-IDF matrix preview: first five documents, columns labelled with the vocabulary terms.
# Only the displayed rows are wrapped in a DataFrame; the result equals head() of the full frame.
pd.DataFrame(tv_matrix[:5], columns=vocab)

Unnamed: 0,aa,aachen,aae,aaes,aalenjohansen,aam,aams,aan,aapm,aaronson,...,zw,zwcl,zwick,zwicky,zwise,zwitterion,zwittterion,zywina,zz,zzsun
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Obtención de matriz de similitudes

In [12]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity between every pair of documents, computed on their TF-IDF rows.
# Entry (i, j) is the similarity between document i and document j.
similarity_matrix = cosine_similarity(tv_matrix)
similarity_df = pd.DataFrame(data=similarity_matrix)
similarity_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13994,13995,13996,13997,13998,13999,14000,14001,14002,14003
0,1.0,0.010763,0.029082,0.018118,0.026351,0.030871,0.07817,0.041312,0.062039,0.021232,...,0.04246,0.014586,0.008463,0.008993,0.031695,0.016148,0.021624,0.018459,0.042962,0.018181
1,0.010763,1.0,0.023855,0.016535,0.013672,0.014149,0.072063,0.050292,0.035116,0.01019,...,0.018148,0.023672,0.024691,0.004338,0.061405,0.016113,0.077672,0.017015,0.032609,0.015714
2,0.029082,0.023855,1.0,0.069737,0.011784,0.020395,0.047014,0.046352,0.014508,0.024256,...,0.019132,0.091212,0.018352,0.010379,0.038782,0.024296,0.039047,0.079402,0.053395,0.048188
3,0.018118,0.016535,0.069737,1.0,0.016685,0.008965,0.040151,0.046311,0.010489,0.018847,...,0.02052,0.059137,0.007433,0.027037,0.011068,0.014252,0.017036,0.12551,0.014151,0.037039
4,0.026351,0.013672,0.011784,0.016685,1.0,0.028597,0.028811,0.045849,0.006288,0.004475,...,0.04619,0.002155,0.017448,0.008267,0.026234,0.00287,0.018917,0.012645,0.003036,0.012916


In [13]:
# Pairwise document similarity matrix (first rows); same preview as the cell that builds it.
similarity_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13994,13995,13996,13997,13998,13999,14000,14001,14002,14003
0,1.0,0.010763,0.029082,0.018118,0.026351,0.030871,0.07817,0.041312,0.062039,0.021232,...,0.04246,0.014586,0.008463,0.008993,0.031695,0.016148,0.021624,0.018459,0.042962,0.018181
1,0.010763,1.0,0.023855,0.016535,0.013672,0.014149,0.072063,0.050292,0.035116,0.01019,...,0.018148,0.023672,0.024691,0.004338,0.061405,0.016113,0.077672,0.017015,0.032609,0.015714
2,0.029082,0.023855,1.0,0.069737,0.011784,0.020395,0.047014,0.046352,0.014508,0.024256,...,0.019132,0.091212,0.018352,0.010379,0.038782,0.024296,0.039047,0.079402,0.053395,0.048188
3,0.018118,0.016535,0.069737,1.0,0.016685,0.008965,0.040151,0.046311,0.010489,0.018847,...,0.02052,0.059137,0.007433,0.027037,0.011068,0.014252,0.017036,0.12551,0.014151,0.037039
4,0.026351,0.013672,0.011784,0.016685,1.0,0.028597,0.028811,0.045849,0.006288,0.004475,...,0.04619,0.002155,0.017448,0.008267,0.026234,0.00287,0.018917,0.012645,0.003036,0.012916


## Obtención de artículos similares a cada uno

In [14]:

# Rank window [from_n, to_n) taken from each document's similarity ordering.
# Starting at 1 skips rank 0, which is expected to be the document itself.
from_n, to_n = 1, 21
n_similares = to_n - from_n
print("Número de documentos similares a obtener: ", n_similares)

Número de documentos similares a obtener:  20


In [15]:

# For every document, keep the indices of the documents ranked from_n..to_n-1 by similarity.
# Rank 0 is reserved for the document itself. Do not rely on argsort placing it first:
# exact duplicates (or floating-point ties around 1.0) can push the document to a later
# position, which would let it show up as its own neighbour. Pinning it to rank 0
# explicitly keeps the slice semantics and removes that dependency on tie order.
matrix_idx = []
for doc_idx, similitudes in enumerate(similarity_df.values):
    # Indices ordered by similarity to the current document, highest first
    orden = np.argsort(-similitudes)
    # Force the document itself to rank 0, wherever argsort placed it
    ranking = np.concatenate(([doc_idx], orden[orden != doc_idx]))
    # Keep only the requested window of ranks
    matrix_idx.append(ranking[from_n:to_n])

In [16]:
# Matrix holding, for each document (row), the indices of its n most similar documents
matrix_idx = np.array(matrix_idx)


## Comparación de los resultados con las etiquetas del corpus

In [17]:
# Topic label columns: everything after the first two columns of df
columnas_temas = df.columns[2:]
temas = df[columnas_temas]
temas.head()

Unnamed: 0,Computer Science,Mathematics,Physics,Statistics,Analysis of PDEs,Applications,Artificial Intelligence,Astrophysics of Galaxies,Computation and Language,Computer Vision and Pattern Recognition,...,Methodology,Number Theory,Optimization and Control,Representation Theory,Robotics,Social and Information Networks,Statistics Theory,Strongly Correlated Electrons,Superconductivity,Systems and Control
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# For each document, count the label matches (see the write-up) between it and the
# documents in the selected similarity window. coincidencias[k] is the count for
# document k, i.e. for row k of matrix_idx.
# NOTE(review): the saved output lists one number per document although nothing is
# printed here — presumably utils.num_of_coincidances prints internally; confirm and
# silence it there to avoid flooding the notebook output.
coincidencias = []
for i, indices in enumerate(matrix_idx):
    coincidencias.append(
        utils.num_of_coincidances(indices, original_paper_idx=i, topics=temas)
    )

13
14
20
16
20
17
15
18
19
15
15
19
15
19
20
19
17
12
20
17
20
20
17
19
16
20
20
19
20
16
19
10
20
13
14
15
14
20
5
13
15
19
20
14
20
19
14
19
18
20
16
20
18
19
20
19
8
15
18
8
16
17
19
18
20
19
16
3
20
19
20
20
19
20
17
17
20
20
18
16
19
15
20
19
12
14
11
18
19
20
18
19
19
18
12
20
5
13
18
20
15
20
18
13
18
20
17
17
18
18
18
20
14
18
18
20
19
13
19
13
19
17
18
16
14
16
18
20
6
14
9
12
16
14
12
12
19
16
13
20
20
6
16
18
20
20
17
11
16
19
18
7
10
10
20
14
20
14
20
12
18
13
20
17
18
20
15
13
20
16
16
13
19
20
20
18
20
15
20
20
19
19
20
11
19
19
12
15
19
19
15
19
19
20
14
20
20
17
13
19
20
20
18
17
19
17
9
20
16
17
15
19
10
20
15
14
19
7
19
16
20
11
17
15
18
16
16
14
10
20
17
19
20
20
9
18
14
20
20
9
18
20
19
20
20
19
20
19
20
14
18
20
12
18
19
19
17
20
19
9
12
20
20
20
15
15
16
15
5
10
17
17
16
20
19
10
11
7
8
13
20
20
18
19
19
20
20
14
20
20
16
19
9
20
20
19
19
19
20
14
20
18
6
20
20
20
17
16
20
20
20
19
20
18
20
5
20
20
19
16
16
20
16
19
17
14
6
13
14
20
18
17
20
15
11
20
19
13
18
8
16

20
20
20
15
20
17
19
9
15
20
13
14
17
11
14
19
8
13
20
11
20
18
15
19
18
14
18
18
5
20
11
15
20
15
12
16
14
20
18
20
0
20
3
17
20
17
20
13
8
16
16
19
18
15
11
20
15
20
20
18
19
19
20
4
17
7
18
10
4
20
8
19
19
18
17
20
18
16
6
10
6
10
20
17
17
17
18
16
18
14
18
18
20
16
20
20
19
12
20
16
20
20
8
20
10
20
20
19
12
15
18
20
18
18
10
20
18
8
18
17
19
20
20
6
19
20
9
0
20
10
20
16
19
8
3
20
20
18
19
19
19
17
20
17
20
15
11
20
14
18
15
17
10
19
18
20
10
19
15
20
19
17
16
20
10
10
17
20
17
20
16
17
17
19
15
19
17
20
17
20
14
5
7
17
17
14
20
15
20
15
18
19
18
19
16
20
20
18
18
14
15
16
17
20
14
1
16
14
18
20
2
13
19
8
11
20
19
20
20
10
20
11
17
15
18
19
15
20
8
8
20
18
17
20
19
17
20
20
17
10
20
20
12
15
17
20
19
16
20
19
20
5
11
15
20
16
18
1
13
20
16
13
18
13
20
13
20
20
19
17
17
16
20
13
9
19
20
20
13
19
19
17
16
14
16
19
19
17
17
19
20
17
16
14
18
5
19
20
20
18
20
19
20
12
14
14
20
15
19
14
12
20
7
20
20
18
19
20
9
17
20
20
9
18
20
16
4
20
14
9
18
16
1
13
20
20
18
20
15
20
20
20
19
16
16
2

19
20
20
20
20
20
20
18
16
20
18
13
17
5
20
16
18
20
19
12
15
17
18
20
16
20
20
20
12
18
18
18
19
20
18
20
19
20
17
19
19
10
19
11
8
20
15
17
16
20
12
20
20
13
16
14
16
20
20
19
18
14
7
14
7
18
13
19
15
14
18
20
16
19
10
19
19
19
20
16
8
18
11
17
19
19
19
8
19
14
20
8
18
14
20
20
20
13
19
10
19
13
18
20
14
8
15
19
18
18
14
20
11
19
16
17
17
14
19
14
12
20
18
19
20
20
20
14
15
16
13
14
19
19
20
15
17
17
19
9
17
16
20
20
8
20
19
20
19
19
20
20
6
20
18
19
18
18
15
16
20
11
19
16
19
19
18
20
15
18
18
5
12
17
15
17
18
13
18
18
17
14
15
17
15
20
20
12
16
16
10
18
19
20
20
18
17
14
16
19
18
20
18
8
15
13
16
10
15
20
15
11
20
19
14
17
6
13
19
16
19
19
20
15
15
20
16
18
14
18
17
20
20
18
20
20
17
11
13
4
20
16
15
13
18
20
5
20
20
20
13
20
8
19
12
20
14
20
17
9
6
10
20
19
20
4
20
16
15
19
19
13
20
19
18
13
19
18
12
18
20
9
19
17
18
14
10
1
2
12
20
16
15
20
12
13
16
20
15
12
19
14
18
15
12
19
16
18
13
19
20
20
17
20
19
18
16
19
17
20
20
6
19
13
12
15
20
18
17
18
18
20
12
14
17
20
16
18
19
20
20
1

17
20
20
17
16
16
16
18
13
19
10
2
17
20
20
14
17
17
18
15
16
12
15
15
13
11
20
13
19
20
20
20
19
20
20
10
6
17
14
14
20
20
20
16
18
20
2
14
1
20
20
12
15
19
14
17
20
18
8
17
19
20
17
16
18
15
20
20
19
20
19
20
20
19
19
19
16
13
20
11
16
17
11
3
17
17
19
19
11
14
10
18
20
18
15
13
13
9
19
16
18
3
17
13
19
18
13
15
20
20
13
19
4
17
20
17
4
17
14
19
20
20
17
14
18
16
19
18
15
20
17
17
20
20
17
14
20
5
18
20
19
5
17
20
20
12
17
18
19
16
17
13
18
15
18
14
19
14
19
16
19
20
17
18
19
17
6
14
16
20
9
20
8
10
16
18
18
20
19
20
14
6
20
18
18
18
16
18
17
17
20
17
19
17
16
20
11
20
14
16
9
11
20
8
13
20
17
20
12
20
18
16
20
3
14
18
17
15
13
20
20
19
14
19
19
14
15
16
18
20
20
13
15
20
20
19
16
16
20
13
5
11
14
16
15
17
15
20
18
19
20
9
13
17
6
20
10
19
15
16
20
17
8
1
20
17
17
18
19
10
19
20
14
20
20
17
19
20
18
19
17
18
15
20
5
18
18
16
16
12
20
19
16
13
10
19
20
19
11
19
14
20
20
18
20
20
8
19
20
7
20
20
4
17
20
19
15
19
18
10
15
20
15
18
20
16
19
17
15
16
18
20
16
18
19
17
12
19
20
10
5
20
20


17
7
7
20
18
20
20
20
20
18
20
15
20
16
16
17
20
20
15
17
17
19
1
19
20
19
5
18
20
18
19
12
19
9
20
9
16
20
17
18
19
20
18
19
12
14
14
18
4
17
20
14
18
15
20
8
20
10
20
18
20
19
20
7
12
12
15
14
16
19
18
17
19
20
18
11
18
15
10
18
20
5
19
20
10
19
7
14
19
20
19
17
19
8
13
17
19
19
19
18
12
20
8
3
20
18
19
20
16
19
13
19
18
20
20
16
20
19
13
20
14
16
14
8
16
15
20
20
20
2
16
11
16
15
15
19
5
15
18
2
18
16
13
18
20
11
15
18
19
17
17
19
13
20
16
18
19
18
18
14
15
6
16
15
20
15
18
20
9
20
11
13
20
20
17
13
18
17
20
3
18
15
19
17
16
15
16
20
19
20
5
20
15
16
18
19
13
12
14
19
20
16
19
9
17
5
19
19
17
19
20
19
17
20
19
17
19
18
18
13
17
20
18
18
19
18
19
18
10
18
19
19
20
20
20
15
12
17
11
20
20
19
15
19
20
20
20
17
19
17
20
17
20
18
20
18
15
17
5
17
15
20
20
13
13
20
20
13
16
20
18
20
17
18
14
20
18
7
19
18
18
15
19
20
13
13
15
12
4
19
16
16
19
17
20
19
20
20
19
18
18
17
18
7
17
11
15
20
19
19
20
17
16
16
20
12
18
19
19
14
20
17
15
20
19
20
20
18
7
15
19
12
14
18
20
18
17
19
19
20
19
16
19


In [19]:
# Number of documents evaluated
n_evaluados = len(coincidencias)
print(n_evaluados)
# Mean number of matches per document, as counted by utils.num_of_coincidances
promedio = sum(coincidencias) / n_evaluados
print(promedio)
# Mean matches divided by the window size (to_n - from_n): the average fraction of
# neighbours counted as matching, used as the accuracy figure for this window
print(promedio / (to_n - from_n))

14004
16.3180519851471
0.815902599257355


In [20]:
# Inspecting the results
# Example 1
# Document A (normalized text of document 10)
norm_corpus[10]

'study problem extracting selective connector considering given set query vertex q \\subseteq v inside graph g v e selective connector wa subgraph g exhibit cohesiveness property contains query vertex doe necessarily connect relaxing connectedness requirement allows connector detect multiple community tolerant outlier achieve introducing new measure network inefficiency instantiating search considering selective connector problem finding minimum inefficiency subgraph show minimum inefficiency subgraph problem wa nphard devise efficient algorithm approximate mean several case study inside variety application domain human brain cancer food network show minimum inefficiency subgraph produce highquality solution exhibiting desired behavior selective connector'

In [21]:
# Example 1: topic labels of document A

utils.get_topics_of_doc(temas,10)

['Computer Science', 'Data Structures and Algorithms']

In [22]:
# Example 1

# Indices of the documents most similar to A
matrix_idx[10]

array([11661,  6984,  8344,  7508, 10415, 10802,  1254,  7474,  9889,
       10947, 13915, 10793, 10879,  8885,  7653,  6669, 13445, 13971,
        7507,  6591], dtype=int64)

In [23]:
# Example 1

# Document most similar to A (first entry of A's neighbour row)
norm_corpus[matrix_idx[10][0]]

'study nphard problem motivated energyefficiently maintaining connectivity symmetric wireless sensor communication network given edgeweighted n vertex graph find connected spanning subgraph minimum cost cost wa determined letting vertex pay expensive edge incident inside subgraph provide algorithm work inside polynomial time one find set obligatory edge yield spanning subgraph \\log n connected component also provide lineartime algorithm reduces input graph consists tree together g additional edge equivalent graph g vertex based obtain polynomialtime algorithm considering g\\in \\log n negative side show \\log n approximating difference optimal solution cost natural lower bound wa nphard presumably exact algorithm running inside ^ n time inside f \\cdot n^ time considering computable function f'

In [24]:
# Example 1: index of the document most similar to A
matrix_idx[10][0]

11661

In [25]:
# Example 1: topic labels of the document most similar to A

utils.get_topics_of_doc(temas,matrix_idx[10][0])

['Computer Science', 'Data Structures and Algorithms']

In [26]:
# Inspecting the results
# Example 2
# Document A (normalized text of document 7000)
norm_corpus[7000]

'continuously rotating halfwave plate crhwp wa promising tool improve sensitivity large angular scale inside cosmic microwave background cmb polarization measurement crhwp single detector measure three stokes parameter q u thereby avoiding set systematic error introduced mismatch inside property orthogonal detector pair focus implementation crhwps inside large aperture telescope e primary mirror wa larger current maximum halfwave plate diameter \\sim crhwp placed primary mirror focal plane inside configuration one need address intensity polarization \\rightarrow p leakage optic becomes source f noise also cause differential gain systematics arise cmb temperature fluctuation inside paper present performance crhwp installed inside polarbear experiment employ gregorian telescope primary illumination pattern crhwp wa placed near prime focus primary secondary mirror find \\rightarrow p leakage wa larger expectation physical property primary mirror resulting inside f knee mhz excess leakage 

In [27]:
# Example 2: topic labels of document A

utils.get_topics_of_doc(temas,7000)

['Physics',
 'Cosmology and Nongalactic Astrophysics',
 'Instrumentation and Methods for Astrophysics']

In [28]:
# Example 2

# Indices of the documents most similar to A
matrix_idx[7000]

array([ 1849,  8269, 11850, 12364, 11622,  9840,  7744,  7620, 11320,
        5589, 12529,   123, 10874,  3804,  5847,   907,  3286, 13328,
        6409, 12786], dtype=int64)

In [29]:
# Example 2

# Document most similar to A (first entry of A's neighbour row)

norm_corpus[matrix_idx[7000][0]]

'future cosmic microwave background cmb satellite mission aim use b mode polarization measure tensortoscalar ratio r sensitivity ^ achieving goal require sufficient detector array sensitivity also unprecedented control systematic error inherent cmb polarization measurement since polarization measurement derive difference observation different time different sensor detector response mismatch introduce leakage intensity polarization thus lead spurious b mode signal expected primordial b mode polarization signal wa dwarfed known unpolarized intensity signal leakage could contribute substantially final error budget considering measuring r help simulation approximate magnitude angular spectrum spurious b mode signal resulting bandpass mismatch different detector wa assumed detector calibrated considering example help cmb dipole sensitivity primordial cmb signal ha perfectly matched consequently mismatch inside frequency bandpass shape detector introduces difference inside relative calibrati

In [30]:
# Example 2: topic labels of the document most similar to A

utils.get_topics_of_doc(temas,matrix_idx[7000][0])

['Physics', 'Cosmology and Nongalactic Astrophysics']

## Comparación con artículos menos similares y el número de coincidencias de estos

In [31]:
# Number of normalized documents; the rank windows used below must stay under this size
len(norm_corpus)

14004

### 2do Rango

In [32]:

# Second rank window [from_n, to_n): documents around rank 1000 in each similarity ordering
from_n, to_n = 1001, 1021
n_similares = to_n - from_n
print("Número de documentos similares a obtener: ", n_similares)

Número de documentos similares a obtener:  20


In [33]:
# Neighbour indices for the current rank window (from_n..to_n-1) of every document.
# Rank 0 is the document itself; it is pinned there explicitly instead of relying on
# argsort putting it first, since exact duplicates or ties can reorder the top of the
# list and would otherwise shift every rank in the window by one.
matrix_idx = []
for doc_idx, similitudes in enumerate(similarity_df.values):
    # Indices ordered by similarity to the current document, highest first
    orden = np.argsort(-similitudes)
    # Force the document itself to rank 0, wherever argsort placed it
    ranking = np.concatenate(([doc_idx], orden[orden != doc_idx]))
    matrix_idx.append(ranking[from_n:to_n])
# One row per document, one column per selected neighbour
matrix_idx = np.array(matrix_idx)

In [34]:
# For each document, count the label matches between it and the documents in the
# current rank window. coincidencias[k] is the count for document k (row k of matrix_idx).
# NOTE(review): the saved output lists one number per document although nothing is
# printed here — presumably utils.num_of_coincidances prints internally; confirm.
coincidencias = []
for i, indices in enumerate(matrix_idx):
    coincidencias.append(
        utils.num_of_coincidances(indices, original_paper_idx=i, topics=temas)
    )

3
12
13
14
19
13
12
18
18
11
10
10
15
14
18
11
5
12
11
11
16
17
11
14
15
19
13
15
12
8
14
7
5
7
14
11
9
18
12
12
3
12
9
8
8
7
19
18
6
7
11
19
5
7
14
12
8
16
6
7
9
10
13
6
6
11
13
4
19
12
13
18
19
19
10
19
13
14
13
14
7
15
13
11
14
8
14
11
5
14
12
14
19
12
14
17
7
17
13
12
9
20
16
9
18
15
14
19
17
4
10
7
12
13
9
12
13
13
16
4
12
6
10
7
11
14
11
9
8
11
3
8
7
18
12
10
4
15
7
8
6
4
10
10
17
20
14
14
13
14
16
12
9
10
15
5
10
10
15
12
15
6
20
10
7
4
9
11
20
13
12
9
7
13
17
12
17
14
19
12
19
11
19
3
17
7
14
15
11
7
11
15
18
13
7
20
11
9
8
7
11
14
14
14
17
6
11
13
13
12
15
10
11
13
10
11
14
10
11
7
16
10
16
12
12
8
14
8
12
11
13
18
8
2
13
9
13
13
16
6
14
17
16
15
17
9
16
14
7
17
14
15
5
8
14
12
16
10
10
8
11
17
9
4
14
16
10
16
11
7
6
9
4
10
18
12
10
9
5
7
20
19
18
13
13
10
9
14
12
17
7
10
10
7
15
19
8
14
5
14
20
13
8
13
18
18
5
10
16
20
15
9
8
7
8
12
9
15
18
17
14
18
9
10
11
14
5
8
13
18
13
3
11
11
12
18
16
9
13
3
11
11
12
17
10
5
19
20
20
6
10
13
10
10
9
7
13
14
15
18
19
15
9
3
7
10
13
19
19


17
8
13
16
12
9
19
20
11
8
11
13
10
16
17
15
15
11
9
7
5
1
8
19
7
10
10
11
8
12
4
6
12
9
14
12
18
14
16
18
16
2
20
11
12
10
11
12
14
14
12
11
12
8
12
17
11
14
18
9
20
16
7
8
12
10
10
7
18
10
13
10
18
7
10
10
12
7
7
10
10
12
7
12
11
10
17
10
3
11
14
11
17
13
13
12
9
16
14
13
16
15
7
13
18
9
14
17
14
8
10
17
18
15
8
20
18
12
6
20
10
19
10
13
9
14
8
18
7
9
10
10
10
4
14
16
11
16
19
10
17
4
19
13
13
6
15
9
16
12
11
10
15
8
7
17
17
16
11
11
6
9
13
9
17
11
15
12
10
9
7
14
16
10
5
16
17
7
11
13
11
7
4
13
19
9
12
14
18
10
12
19
3
17
12
17
14
15
17
6
16
13
8
17
20
19
9
11
13
20
7
20
14
11
8
13
13
19
19
8
13
8
12
5
6
16
19
2
11
19
13
12
14
16
18
7
15
11
17
19
9
13
10
9
11
12
5
15
11
16
8
16
11
9
5
10
13
17
4
12
9
9
15
11
9
15
18
6
18
12
14
15
6
9
18
8
8
4
15
10
13
12
17
3
11
11
9
14
12
18
17
8
6
15
4
17
16
9
11
6
11
19
12
16
12
13
18
18
9
4
4
12
10
12
11
14
16
6
10
6
15
11
16
12
5
19
12
14
15
11
13
14
14
5
6
9
12
6
15
8
9
8
13
8
17
13
11
16
12
13
20
11
9
16
20
8
12
19
13
10
13
13
11
12
11
15
8
8

10
19
8
11
17
10
13
13
15
16
20
4
18
11
7
14
6
11
19
11
5
13
5
14
14
20
15
12
17
13
10
16
15
2
19
10
20
7
20
12
19
18
12
3
8
12
18
16
15
18
8
13
11
6
13
13
9
15
10
16
10
6
12
14
12
15
14
5
15
18
8
8
18
9
14
9
12
10
20
13
11
3
11
12
4
16
13
14
11
10
10
11
11
16
8
19
15
12
14
14
19
19
17
6
9
15
15
15
9
11
16
8
13
11
5
13
12
7
14
11
11
9
12
14
16
14
5
15
10
16
10
15
13
20
12
12
9
11
12
15
15
9
10
20
8
17
5
5
5
12
6
14
15
11
20
7
11
13
7
11
12
12
9
8
6
10
15
10
7
7
14
12
6
8
13
11
14
14
14
13
17
8
10
8
11
12
1
17
15
8
12
8
5
7
13
11
16
13
11
14
15
16
14
12
7
13
18
5
3
9
9
15
14
14
7
6
6
5
10
12
10
13
7
10
14
17
18
17
11
11
7
16
8
16
13
5
18
9
12
12
15
12
8
12
13
2
18
4
17
17
15
20
11
9
12
7
17
11
6
5
11
13
12
7
6
9
11
10
16
9
7
9
14
9
13
20
14
11
15
11
13
14
17
14
13
19
5
14
8
13
12
15
4
19
9
18
19
19
15
8
19
14
11
5
10
9
10
18
7
11
11
13
11
16
8
9
19
11
10
11
9
16
14
12
15
8
20
19
13
7
11
8
15
9
2
7
7
19
12
7
12
13
8
11
10
15
11
5
13
8
7
13
14
9
11
6
9
8
8
8
16
17
8
13
13
6
9
14
8
12
11
1

17
8
4
8
17
10
8
5
6
9
12
18
14
11
11
17
9
14
5
11
11
12
16
19
12
14
11
19
9
4
8
9
10
19
11
7
12
11
14
16
20
19
6
14
11
17
18
18
12
11
13
16
18
11
17
11
11
11
8
12
14
4
14
8
18
12
7
15
12
10
12
7
11
12
12
20
11
17
18
9
13
12
15
16
11
18
11
8
20
19
17
7
8
14
6
19
9
7
7
13
14
14
19
17
10
13
10
18
11
9
7
14
11
12
10
9
15
6
8
10
20
10
8
16
13
14
19
8
11
7
5
10
8
13
9
2
9
4
12
15
5
9
8
15
17
13
11
13
5
19
8
11
10
14
15
8
9
11
20
11
10
3
7
14
6
16
18
12
9
16
18
15
13
9
17
19
17
13
11
8
9
8
9
9
8
10
15
14
15
15
15
4
7
9
12
5
10
8
13
5
12
9
17
11
7
16
15
14
18
9
15
14
17
13
11
7
13
17
16
14
19
11
15
11
19
18
9
15
2
12
20
16
14
13
7
20
7
5
15
6
13
10
19
20
4
9
11
13
13
8
15
12
12
5
12
13
9
15
5
18
10
6
11
20
9
18
12
16
11
13
12
20
15
7
8
8
9
13
9
9
14
12
9
16
15
9
5
13
12
12
14
17
13
19
11
12
20
10
8
16
8
9
10
9
6
13
18
16
3
14
7
17
11
12
9
8
13
5
17
5
9
17
4
14
16
11
11
15
12
7
13
14
12
13
8
11
11
9
7
6
10
5
8
13
14
17
16
14
13
6
11
12
13
6
13
18
15
10
17
18
8
17
14
11
9
18
14
6
18
15
4
17
4
1

10
8
10
18
10
15
9
15
16
11
6
4
13
15
9
15
11
10
8
9
10
12
16
13
11
17
12
15
11
11
10
11
13
15
17
10
11
7
18
10
3
20
6
12
13
5
18
13
15
15
15
19
18
6
14
10
12
9
9
15
11
18
17
11
8
19
17
13
12
9
17
8
4
4
16
15
11
14
16
7
12
17
10
18
13
18
12
12
9
12
14
16
15
10
5
18
12
6
19
5
11
20
12
10
9
8
5
8
12
10
19
17
9
16
9
15
11
14
10
17
19
13
7
6
11
13
11
9
12
7
19
6
11
16
12
11
14
4
8
12
7
7
14
13
17
16
6
14
17
12
3
13
10
8
15
11
6
12
10
13
14
6
12
10
20
17
11
10
9
9
10
5
9
13
9
19
11
15
18
19
10
7
4
13
9
13
14
15
10
9
14
15
13
15
15
10
11
10
19
2
14
13
12
13
10
9
12
9
20
15
10
8
19
11
15
16
12
9
10
13
8
10
12
14
12
13
9
12
15
17
14
13
16
17
9
15
10
10
14
11
9
17
3
10
19
7
17
20
17
11
6
5
11
6
8
8
15
16
11
12
8
20
10
9
6
15
14
5
8
20
13
13
18
13
16
6
12
11
15
10
10
13
8
2
13
9
18
18
11
8
11
16
12
3
17
11
7
9
17
16
19
3
6
12
13
13
12
8
19
12
20
18
10
10
9
9
13
9
15
11
8
8
20
11
20
17
13
11
12
13
10
13
11
12
9
13
15
9
11
11
1
16
11
13
14
4
20
20
8
9
10
12
10
13
17
9
6
13
2
12
11
14
8
13
3
11
12


In [35]:
# Number of documents evaluated
n_evaluados = len(coincidencias)
print(n_evaluados)
# Mean number of matches per document, as counted by utils.num_of_coincidances
promedio = sum(coincidencias) / n_evaluados
print(promedio)
# Mean matches divided by the window size (to_n - from_n): the average fraction of
# neighbours counted as matching, used as the accuracy figure for this window
print(promedio / (to_n - from_n))

14004
11.769851471008284
0.5884925735504142


### 3er Rango

In [36]:
# Third rank window [from_n, to_n): documents around rank 2000 in each similarity ordering
from_n, to_n = 2001, 2021
n_similares = to_n - from_n
print("Número de documentos similares a obtener: ", n_similares)

Número de documentos similares a obtener:  20


In [37]:
# Neighbour indices for the current rank window (from_n..to_n-1) of every document.
# Rank 0 is the document itself; it is pinned there explicitly instead of relying on
# argsort putting it first, since exact duplicates or ties can reorder the top of the
# list and would otherwise shift every rank in the window by one.
matrix_idx = []
for doc_idx, similitudes in enumerate(similarity_df.values):
    # Indices ordered by similarity to the current document, highest first
    orden = np.argsort(-similitudes)
    # Force the document itself to rank 0, wherever argsort placed it
    ranking = np.concatenate(([doc_idx], orden[orden != doc_idx]))
    matrix_idx.append(ranking[from_n:to_n])
# One row per document, one column per selected neighbour
matrix_idx = np.array(matrix_idx)

In [38]:
# For each document, count the label matches between it and the documents in the
# current rank window. coincidencias[k] is the count for document k (row k of matrix_idx).
# NOTE(review): the saved output lists one number per document although nothing is
# printed here — presumably utils.num_of_coincidances prints internally; confirm.
coincidencias = []
for i, indices in enumerate(matrix_idx):
    coincidencias.append(
        utils.num_of_coincidances(indices, original_paper_idx=i, topics=temas)
    )

6
11
7
13
15
9
10
13
19
8
10
5
12
10
16
6
5
6
8
8
12
15
9
14
12
18
13
7
10
14
14
5
3
10
16
12
10
19
6
12
7
8
14
2
6
6
14
17
7
6
6
18
7
10
12
12
12
16
4
5
10
15
10
7
6
9
14
4
18
6
15
19
20
19
10
19
13
12
15
9
9
11
7
7
6
6
12
6
3
7
7
10
17
11
16
14
12
8
10
8
10
19
18
11
18
16
12
18
11
10
8
8
11
8
3
13
19
11
7
8
8
6
13
3
9
13
5
8
3
9
3
8
8
10
9
8
3
9
9
5
4
7
14
8
18
20
11
15
12
11
13
13
7
11
16
6
7
9
18
13
15
7
20
9
5
5
7
5
16
8
5
4
5
10
13
11
15
5
19
10
17
8
20
2
12
6
11
15
9
9
7
17
13
9
7
20
6
4
10
8
3
17
11
16
18
5
8
6
11
8
15
13
10
14
12
11
11
6
3
10
16
8
12
10
10
9
17
8
10
5
13
16
7
4
16
6
10
8
19
3
13
18
12
12
19
10
15
15
7
15
10
12
4
11
11
9
10
8
13
13
12
10
11
7
16
13
9
10
13
7
6
10
6
6
17
15
5
12
7
11
17
19
16
7
11
9
12
15
14
17
5
7
14
4
9
17
8
11
8
11
18
13
7
16
17
17
3
5
14
20
10
7
11
8
2
9
7
8
13
16
13
16
4
8
6
9
7
9
12
16
17
5
8
13
12
16
10
7
10
3
8
14
11
13
15
5
19
20
18
5
9
14
9
5
7
14
10
9
15
20
17
10
12
3
2
10
9
18
17
15
16
16
7
3
6
15
9
11
7
12
9
5
7
2
11
8
18
1
2
11
16


9
9
12
9
12
9
18
10
18
12
16
8
15
20
8
17
15
11
11
19
18
7
2
8
18
7
17
10
11
12
10
13
18
18
9
15
4
8
8
3
14
17
6
9
18
6
14
15
12
14
9
14
14
11
16
10
5
11
4
6
8
6
12
8
9
3
8
13
13
5
9
14
16
7
14
5
5
7
7
3
3
18
4
14
10
13
11
7
7
16
8
9
6
9
13
11
15
10
2
6
8
16
8
12
18
19
7
9
10
7
8
7
8
3
10
17
20
7
12
10
9
15
17
10
3
4
13
8
10
6
6
8
5
9
8
14
5
13
8
3
15
7
10
9
12
10
10
13
2
7
12
8
8
8
4
7
13
12
8
17
13
6
17
9
9
20
5
4
15
19
13
11
16
13
7
11
6
10
7
8
18
3
6
15
7
14
13
19
18
5
11
9
11
14
10
13
8
12
8
17
6
6
12
18
7
9
18
11
15
10
12
5
11
12
11
14
15
13
8
17
5
14
8
15
3
9
4
13
3
11
18
2
7
5
9
9
12
10
6
9
10
15
9
12
11
14
11
6
14
5
12
6
9
10
9
9
10
8
9
11
8
9
8
7
12
7
5
8
8
7
8
16
9
15
16
7
14
4
8
17
9
4
3
13
12
8
11
17
6
5
8
8
12
17
3
7
16
9
11
6
13
4
15
9
11
15
16
8
16
13
6
5
18
17
5
7
14
2
3
10
9
7
8
13
18
3
5
7
9
7
3
13
11
19
5
7
10
13
17
11
10
16
4
12
9
11
4
11
20
15
6
17
6
19
7
5
12
4
17
5
17
4
4
10
14
9
14
9
12
13
12
17
5
12
4
9
9
10
12
17
7
7
3
12
5
13
19
14
13
5
3
13
17
12
4
6
13
15


6
15
16
3
17
8
7
8
10
8
8
9
12
15
12
6
14
15
5
14
5
4
4
16
7
14
13
13
13
10
11
10
3
6
19
10
9
15
9
6
16
9
16
14
10
4
7
10
15
16
11
7
14
5
9
5
16
8
12
13
5
9
16
14
13
8
8
14
16
6
8
9
15
17
10
5
10
3
13
10
10
10
20
4
10
17
6
9
10
9
5
9
18
17
8
15
9
10
4
5
12
7
9
9
12
13
10
9
18
15
5
13
8
11
10
16
12
14
7
7
8
8
3
10
6
14
8
14
7
4
18
12
10
19
5
3
19
19
3
17
17
10
6
8
18
4
10
10
10
8
7
17
19
19
7
6
8
10
3
10
9
6
10
8
16
8
10
10
5
9
16
17
16
7
14
7
20
9
18
14
3
12
3
13
11
13
18
4
5
5
5
6
14
9
12
11
3
9
13
10
16
10
19
3
6
14
18
12
8
4
4
12
19
12
18
5
15
10
5
5
6
9
8
4
3
7
3
17
16
8
5
7
10
13
7
17
10
9
13
10
12
19
12
13
13
15
7
7
12
12
14
2
12
19
10
8
8
4
9
12
13
5
5
12
11
5
16
7
17
14
5
16
10
15
14
18
11
15
9
8
15
17
10
3
10
13
8
4
5
7
11
4
13
13
8
10
13
18
3
4
10
6
6
4
19
3
18
4
10
10
16
10
6
9
13
8
8
13
12
8
11
17
9
10
8
15
5
20
18
7
8
12
13
9
16
6
6
15
5
7
6
7
8
11
15
4
13
10
7
6
14
10
11
6
18
5
15
10
18
14
5
13
8
2
19
6
14
12
20
17
10
4
12
4
11
4
12
12
6
5
20
6
5
14
4
18
1
7
1
8
17
18
7
6

8
6
10
3
14
12
11
5
12
11
11
16
13
8
2
13
17
8
8
11
10
10
13
12
11
9
6
9
3
20
17
16
11
13
14
15
15
16
11
13
5
11
7
10
10
7
8
7
8
8
16
8
8
14
10
7
10
9
12
3
17
6
6
12
12
12
7
14
12
4
2
7
7
14
4
4
15
12
7
15
14
4
9
12
4
5
9
16
8
7
7
14
4
8
12
6
12
8
12
7
7
15
14
13
8
17
14
10
4
3
4
12
7
8
7
11
9
9
6
11
8
12
10
13
6
8
11
12
14
15
10
5
10
13
12
20
4
12
16
11
13
8
1
14
19
11
9
19
2
14
17
13
13
5
15
3
4
9
8
13
7
4
5
17
8
7
7
14
7
11
6
5
11
6
10
8
9
12
11
6
12
8
10
19
9
8
12
17
19
16
10
4
11
5
19
12
18
12
8
4
10
8
7
7
14
12
19
7
9
13
6
13
13
8
7
4
6
10
20
10
14
11
1
7
11
5
18
13
5
6
20
10
11
11
12
18
5
17
6
9
13
12
12
8
12
19
13
16
13
17
11
8
12
4
4
16
6
12
5
1
10
12
9
13
5
6
17
18
8
17
6
19
12
14
19
6
5
1
6
10
17
5
5
9
18
11
12
13
4
8
8
9
12
3
15
13
7
7
10
18
8
13
6
12
8
9
5
7
5
17
9
17
8
12
11
11
13
11
8
16
10
8
11
17
13
14
11
9
17
9
9
12
10
19
11
5
3
10
7
10
14
20
15
4
15
9
8
10
17
7
8
6
12
15
7
7
10
3
14
8
18
14
16
17
16
12
15
13
10
19
5
8
14
13
7
6
12
14
6
11
7
5
9
9
6
10
9
9
7
9
8
12
9


18
4
11
9
14
7
15
11
14
10
11
11
7
10
6
19
13
7
7
20
9
9
9
3
6
14
18
14
5
19
10
14
7
15
14
3
8
11
9
9
9
6
11
9
9
9
12
14
5
16
7
10
9
14
5
4
19
12
5
9
16
7
9
5
4
4
10
10
9
13
10
10
12
5
8
7
11
6
8
8
10
11
8
10
8
8
7
5
7
15
13
14
17
9
5
11
11
15
11
5
20
16
11
9
13
11
17
16
8
8
6
12
12
9
3
6
5
11
9
12
10
5
10
5
14
2
11
10
2
15
11
14
5
4
14
18
6
8
8
15
10
19
10
17
4
12
14
12
6
13
7
14
18
14
10
10
9
14
12
6
13
17
8
7
9
5
11
14
5
7
7
14
8
19
4
8
7
13
16
10
3
6
9
9
5
13
15
12
10
7
15
16
9
14
18
3
6
13
9
10
7
15
17
6
12
16
2
12
6
12
5
4
4
16
6
10
10
10
16
6
15
13
13
10
8
10
13
8
4
16
14
10
8
15
8
13
14
2
14
12
5
9
17
11
11
13
15
7
11
16
6
6
19
10
13
17
7
12
9
12
6
8
3
11
20
13
13
9
3
20
16
18
2
9
3
16
8
13
9
7
17
17
13
20
16
2
5
5
6
6
4
7
12
7
10
7
11
9
11
18
8
9
10
16
19
4
10
15
13
7
18
10
10
3
9
9
11
11
8
7
5
5
5
3
7
8
11
4
10
13
4
15
7
9
5
14
4
3
6
13
13
5
13
13
11
8
4
7
11
5
13
11
3
6
12
11
15
6
8
10
6
19
9
15
10
11
18
9
11
10
13
13
9
7
12
13
13
5
4
7
18
11
16
7
7
8
11
18
2
17
6
16
8
4
3
1

In [39]:
# Summary of the per-document match counts stored in `coincidencias`.
doc_count = len(coincidencias)
print(doc_count)
# Average number of matches per document.
mean_matches = sum(coincidencias) / doc_count
print(mean_matches)
# Same average expressed as a fraction of the window size (to_n - from_n).
print(mean_matches / (to_n - from_n))

14004
10.291559554413025
0.5145779777206513


### 4to Rango

In [40]:
# Fourth rank window: slice bounds applied to each document's descending
# similarity ranking (start inclusive, end exclusive).
from_n, to_n = 3001, 3021
window_size = to_n - from_n
print("Número de documentos similares a obtener: ", window_size)

Número de documentos similares a obtener:  20


In [41]:
# For every row of the similarity matrix, order the column indices from the
# highest to the lowest similarity and keep positions from_n .. to_n-1 of that
# ordering. With from_n > 0 the top-ranked entry (presumably the document
# itself -- confirm) is never part of the slice.
# Result: one row of neighbour indices per document.
matrix_idx = np.array(
    [np.argsort(-row_scores)[from_n:to_n] for row_scores in similarity_df.values]
)

In [42]:
# Score the candidate set of every document; the row position in `matrix_idx`
# is passed as the index of the original paper.
# NOTE(review): presumably the helper counts how many candidates share a topic
# with the original paper -- confirm against utils.num_of_coincidances.
coincidencias = [
    utils.num_of_coincidances(candidate_idxs, original_paper_idx=paper_idx, topics=temas)
    for paper_idx, candidate_idxs in enumerate(matrix_idx)
]

3
14
4
3
14
9
12
18
18
6
10
6
13
6
15
9
3
12
2
6
13
17
7
12
13
16
12
5
8
11
6
5
6
8
13
9
6
15
10
8
3
9
10
7
3
8
17
16
7
6
9
17
6
7
10
13
16
17
4
3
6
13
15
7
8
11
12
1
17
4
9
20
17
16
6
15
13
12
11
5
3
8
10
5
12
11
11
6
1
6
3
5
17
11
13
14
7
5
11
9
8
18
15
12
18
13
10
16
6
3
12
3
14
6
3
13
15
11
5
5
4
6
11
8
10
12
4
5
4
14
9
7
5
13
4
6
3
11
2
3
3
5
11
2
10
17
14
11
11
13
12
6
2
11
17
5
7
6
17
12
9
6
18
15
6
2
8
6
16
6
4
5
4
10
11
8
16
6
19
12
16
9
16
7
17
5
7
14
3
9
10
13
15
10
7
16
8
5
7
8
4
17
15
11
18
4
6
5
15
6
11
12
8
16
12
6
8
2
5
7
15
15
13
13
6
11
16
1
13
6
12
18
7
2
9
10
12
7
16
4
13
17
14
5
18
7
18
15
6
13
13
8
10
5
7
8
8
4
3
8
10
7
4
3
13
14
3
10
9
5
6
12
5
11
19
16
4
11
5
11
19
15
17
7
14
6
7
13
7
17
5
10
10
2
5
17
11
10
2
10
17
10
5
12
14
19
7
11
12
18
5
6
5
7
4
11
5
7
19
14
11
13
11
8
9
10
5
8
8
17
16
10
7
12
9
15
12
6
12
9
8
14
12
12
12
6
12
19
15
4
6
12
5
7
7
4
8
12
15
16
13
11
9
6
4
9
6
16
17
9
17
18
6
8
5
13
9
10
9
13
8
8
5
4
8
4
16
5
4
9
15
6
13
14
5
5
10
8
9
8
6
4
9


4
4
14
6
9
6
6
2
5
10
17
9
8
11
14
7
15
11
3
3
11
3
7
10
5
9
6
6
6
11
8
10
5
5
12
8
5
7
8
11
6
7
2
4
9
7
5
7
4
4
6
5
4
19
6
8
13
7
9
19
10
3
7
16
8
12
16
9
13
12
7
2
6
7
11
2
8
13
3
11
11
15
18
7
8
8
10
12
14
14
4
14
7
18
8
2
7
17
10
9
14
10
8
6
11
6
11
11
12
10
15
11
1
15
6
13
8
17
2
9
8
12
3
7
18
3
8
6
9
4
8
12
10
3
8
6
4
11
7
15
15
4
11
5
10
2
14
11
6
3
8
6
8
5
9
3
9
9
15
7
4
5
7
9
4
11
5
17
17
9
14
5
10
19
10
5
6
7
12
5
12
10
9
3
6
5
2
16
6
5
11
10
14
15
13
4
11
13
4
8
14
7
15
7
5
4
18
14
2
1
16
4
3
6
5
5
9
12
16
1
4
7
4
6
4
7
11
17
3
3
5
13
18
12
11
18
3
14
3
7
4
11
16
14
5
16
4
17
3
4
14
8
12
8
20
8
7
10
10
9
6
4
10
19
10
17
4
12
5
10
7
13
5
18
7
8
5
11
3
14
19
11
8
4
7
7
17
10
7
6
13
20
2
5
18
15
7
19
18
2
2
4
12
2
2
5
13
7
5
15
19
12
6
17
6
15
4
12
4
17
11
4
14
13
8
14
3
14
17
10
9
18
14
10
4
4
9
5
17
6
11
7
10
9
9
9
5
2
19
17
19
13
5
10
17
3
7
10
3
4
7
13
11
16
18
4
9
9
18
6
13
15
8
10
3
8
12
5
16
8
8
5
11
8
17
5
8
7
5
17
10
16
8
12
11
13
12
6
11
10
4
10
11
3
11
11
8
10
8
5
14

11
14
9
6
7
11
11
5
6
6
10
19
13
8
4
6
12
12
7
11
5
7
9
11
13
15
11
12
12
13
4
10
6
7
9
7
14
15
11
5
6
6
6
8
13
6
6
11
9
4
13
5
15
9
3
19
7
11
19
15
6
10
8
8
13
16
9
4
10
12
5
2
4
4
14
5
7
10
9
3
15
16
5
2
8
8
2
5
16
4
13
8
11
10
10
9
11
5
14
5
5
9
12
7
12
20
6
13
7
11
4
15
18
4
9
5
11
14
12
5
8
13
3
7
5
7
6
11
14
4
17
10
3
4
10
10
12
4
17
5
12
11
19
11
2
14
10
2
18
7
8
12
15
10
12
2
10
5
10
5
7
8
8
8
20
11
3
9
3
17
3
3
6
6
14
19
4
7
7
10
15
18
9
6
9
13
4
8
14
18
6
5
3
15
10
5
6
19
6
16
10
18
17
7
14
6
5
12
9
6
7
19
5
6
10
10
7
5
10
14
2
5
10
15
8
3
8
13
9
5
9
3
9
6
10
14
15
14
17
7
3
4
6
7
12
13
10
12
11
14
5
3
10
9
19
12
5
10
13
1
5
10
18
11
13
6
2
3
2
16
18
12
18
7
5
11
13
13
6
7
5
7
12
9
12
17
12
3
7
13
5
15
8
5
9
12
5
3
6
9
13
9
10
11
10
15
9
14
8
8
8
14
18
17
13
8
0
12
13
11
7
13
5
4
6
17
12
3
9
19
8
9
16
7
8
15
8
15
3
5
18
4
7
10
8
10
10
9
7
13
5
12
13
4
11
7
3
7
7
8
8
4
5
7
7
8
14
13
20
16
3
13
3
6
10
9
3
9
7
14
11
14
9
13
11
18
7
17
6
12
12
9
3
11
8
6
10
8
4
9
10
9
12
3
13
18


12
3
5
4
4
15
12
13
17
5
11
9
15
6
9
3
10
5
11
6
6
10
20
11
18
8
11
11
7
9
9
7
20
9
6
5
18
11
12
3
9
15
10
9
16
10
17
14
6
4
8
10
9
13
16
10
5
13
5
3
4
14
9
12
11
5
17
5
8
5
6
11
8
7
10
12
13
15
13
5
14
12
20
3
14
11
8
5
1
9
14
8
13
4
5
12
9
6
10
7
8
7
3
3
13
3
4
8
5
16
3
7
8
9
13
12
3
11
12
6
0
3
12
6
13
5
7
13
9
7
14
10
6
14
7
11
9
8
16
9
14
13
13
17
4
2
5
15
8
9
3
13
2
6
11
7
12
12
6
6
6
5
13
5
12
14
13
11
16
19
8
12
9
5
3
10
5
13
16
8
9
13
10
18
18
8
13
4
8
10
15
9
2
16
12
14
5
5
12
6
17
9
6
9
6
5
7
7
9
10
18
10
6
8
14
9
4
5
12
16
7
4
10
7
4
9
14
7
4
8
12
9
6
8
5
9
7
16
7
6
13
11
11
6
7
14
18
16
13
14
16
8
14
8
9
8
2
14
16
3
13
9
2
5
13
7
7
7
13
12
8
9
17
6
10
5
2
8
10
7
10
3
8
10
14
14
18
10
16
8
6
12
5
12
12
7
12
2
6
12
11
18
3
5
10
18
5
3
14
13
7
18
10
17
19
7
6
11
10
12
4
6
6
15
13
7
14
5
11
10
11
8
11
14
8
6
17
8
7
14
12
6
6
13
18
17
10
7
13
10
9
20
12
13
17
6
7
6
16
7
11
5
11
4
8
7
5
2
6
11
9
9
4
11
12
8
17
14
6
7
11
13
12
8
6
6
7
13
4
16
3
13
15
18
17
6
5
11
5
11
11
13
7
3
5

16
10
10
7
18
12
9
1
7
4
10
6
10
11
6
9
10
7
4
10
7
8
13
8
11
17
5
13
4
5
8
9
10
9
8
13
5
5
11
14
10
5
8
14
2
12
16
5
18
6
5
13
3
6
7
2
8
11
7
7
2
11
12
6
6
6
8
14
4
8
10
12
18
11
10
3
1
4
4
15
7
9
8
10
15
6
10
13
6
10
15
16
5
14
12
9
18
14
8
17
9
7
6
9
9
15
8
8
18
12
10
10
8
5
15
18
2
14
9
8
13
12
10
13
9
8
16
5
16
10
9
8
3
10
12
11
13
12
8
8
9
8
8
9
5
12
15
18
14
17
2
7
5
5
13
10
7
12
10
10
5
2
11
4
13
8
11
11
6
13
7
8
18
17
4
8
7
2
8
4
5
16
14
7
6
4
8
5
12
4
9
8
13
3
16
6
4
16
10
9
9
16
17
7
7
15
8
6
7
14
1
6
16
11
10
9
8
9
8
12
7
16
6
17
16
8
4
17
13
10
5
16
9
11
15
8
4
14
4
11
15
2
8
19
18
9
12
7
8
14
15
5
14
5
19
6
5
5
7
6
4
6
6
13
6
8
8
7
8
18
6
3
7
11
7
16
13
16
4
7
6
6
10
4
13
15
12
14
10
14
4
8
8
16
11
8
16
14
6
11
5
5
5
9
16
13
8
8
12
6
7
10
16
4
9
16
10
5
10
8
9
13
14
14
10
5
10
12
14
10
18
5
2
15
9
9
4
15
5
15
7
5
4
9
6
7
5
4
3
3
13
5
4
9
6
14
10
11
4
15
3
4
11
3
10
16
10
15
6
16
7
13
13
14
10
7
10
3
4
5
14
13
8
5
7
5
8
10
9
6
7
10
4
5
5
3
12
7
5
17
14
4
9
7
11
6
12
6
6
2


In [43]:
# Summary of the per-document match counts stored in `coincidencias`.
doc_count = len(coincidencias)
print(doc_count)
# Average number of matches per document.
mean_matches = sum(coincidencias) / doc_count
print(mean_matches)
# Same average expressed as a fraction of the window size (to_n - from_n).
print(mean_matches / (to_n - from_n))

14004
9.425878320479862
0.4712939160239931


### 5to Rango

In [44]:
# Fifth rank window: slice bounds applied to each document's descending
# similarity ranking (start inclusive, end exclusive).
from_n, to_n = 4001, 4021
window_size = to_n - from_n
print("Número de documentos similares a obtener: ", window_size)

Número de documentos similares a obtener:  20


In [45]:
# For every row of the similarity matrix, order the column indices from the
# highest to the lowest similarity and keep positions from_n .. to_n-1 of that
# ordering. With from_n > 0 the top-ranked entry (presumably the document
# itself -- confirm) is never part of the slice.
# Result: one row of neighbour indices per document.
matrix_idx = np.array(
    [np.argsort(-row_scores)[from_n:to_n] for row_scores in similarity_df.values]
)

In [46]:
# Score the candidate set of every document; the row position in `matrix_idx`
# is passed as the index of the original paper.
# NOTE(review): presumably the helper counts how many candidates share a topic
# with the original paper -- confirm against utils.num_of_coincidances.
coincidencias = [
    utils.num_of_coincidances(candidate_idxs, original_paper_idx=paper_idx, topics=temas)
    for paper_idx, candidate_idxs in enumerate(matrix_idx)
]

6
12
8
6
16
11
11
14
17
5
10
6
5
10
19
6
3
4
2
5
15
15
9
11
12
16
12
4
10
7
8
4
9
8
13
8
3
15
7
16
6
9
9
3
3
0
15
16
8
2
12
17
5
14
9
4
14
20
6
7
8
11
10
9
5
14
14
3
18
5
11
14
14
13
11
14
11
8
12
4
11
11
5
5
11
7
6
4
6
3
6
7
15
9
11
11
5
10
11
6
6
13
14
16
20
12
11
18
10
6
10
6
9
8
6
14
9
11
8
3
4
7
11
4
15
13
6
8
4
13
6
4
7
10
7
9
7
12
4
2
3
5
14
5
11
16
11
11
10
13
14
8
7
12
15
1
5
6
17
7
9
8
16
8
2
4
3
7
14
4
5
7
4
1
10
12
10
9
14
9
17
8
15
6
12
2
8
15
5
5
13
14
12
6
6
17
10
7
10
3
4
16
12
12
16
3
7
3
12
7
16
10
9
15
15
6
7
9
5
8
10
12
12
13
11
7
15
5
13
6
10
13
3
4
10
2
15
8
14
4
6
14
10
6
15
13
15
15
4
10
11
9
2
4
5
5
9
6
3
10
10
7
3
7
7
14
6
7
12
2
7
9
5
5
15
13
6
14
7
11
18
15
17
8
10
5
7
10
9
15
2
3
7
8
4
16
6
9
2
14
17
7
3
7
14
14
9
11
17
20
8
6
7
9
6
7
5
7
10
10
9
15
11
12
5
11
0
7
5
15
14
5
6
12
10
14
13
9
17
3
8
14
7
10
10
5
19
20
17
4
14
10
4
6
6
9
14
14
13
15
14
7
11
1
4
13
6
11
17
7
15
15
3
5
3
13
5
8
10
12
5
10
7
2
9
7
12
6
4
8
17
6
16
17
8
6
5
6
11
10
2
6
7
4
5
9
8
6


6
12
4
5
16
3
8
7
16
14
6
8
4
7
11
7
15
4
14
8
16
9
3
9
15
7
10
19
4
10
8
12
5
15
7
9
10
8
7
5
11
1
8
4
18
3
6
6
11
2
9
17
4
5
4
5
4
6
12
2
5
14
9
12
8
4
12
9
8
8
6
11
4
13
11
7
3
3
6
10
5
7
7
7
7
9
4
1
5
4
5
4
9
6
10
16
8
15
2
13
16
11
3
3
6
12
9
8
7
8
5
8
6
4
17
5
5
11
4
8
9
11
6
9
8
7
4
11
7
12
11
3
3
19
16
4
6
5
6
4
5
2
9
10
13
16
3
5
7
10
2
6
6
4
13
3
7
6
11
18
10
12
16
6
13
4
11
4
6
19
19
4
16
0
17
3
7
12
8
14
8
15
3
13
11
11
3
12
6
6
12
9
16
2
12
4
9
6
8
11
16
7
4
5
13
5
9
16
6
12
5
4
8
18
12
3
4
8
17
6
6
16
11
5
17
17
7
2
6
9
2
7
8
11
6
7
14
20
15
6
9
7
11
7
12
3
16
9
4
12
10
10
14
1
10
8
6
9
17
14
14
5
1
7
5
13
5
8
10
7
5
10
4
5
7
12
16
14
11
2
7
19
9
6
8
4
2
7
13
10
14
14
5
10
11
18
5
5
16
3
15
7
9
9
2
14
6
10
7
9
4
14
5
6
6
4
16
7
9
4
8
12
16
7
8
8
6
9
9
13
1
10
11
4
9
7
5
14
5
6
13
10
6
5
13
7
11
7
12
12
6
2
9
19
7
3
10
3
13
13
12
10
12
5
9
8
7
7
10
10
7
12
10
3
4
5
6
17
9
11
4
4
18
2
8
15
14
3
3
3
3
9
5
7
8
7
12
2
16
11
5
9
17
8
6
4
10
8
18
8
5
8
5
13
12
5
4
4
15
13
18
11


2
9
7
15
7
8
11
4
5
16
7
3
11
6
17
6
4
3
3
12
16
5
9
7
3
13
19
5
5
7
9
3
7
7
16
7
4
4
14
11
12
5
12
7
10
6
16
16
4
14
6
2
13
9
3
3
15
6
4
8
8
7
5
11
19
2
12
9
12
12
3
4
14
8
3
6
4
6
5
5
13
18
13
19
5
1
6
7
4
8
12
7
9
10
9
3
1
15
12
16
12
4
6
9
7
2
11
15
7
13
3
3
9
1
17
18
8
16
9
10
2
16
7
7
4
7
11
12
5
9
16
10
5
2
14
3
15
13
6
11
11
6
4
4
5
10
8
9
9
6
13
8
14
6
10
7
12
12
16
14
10
5
13
5
6
6
13
8
6
6
14
9
2
10
16
9
9
11
5
7
9
8
11
7
7
12
4
11
14
8
10
10
11
6
8
3
12
15
7
11
9
5
2
4
13
8
4
9
4
3
7
13
9
18
15
3
12
5
2
12
9
8
6
11
11
15
12
9
9
9
17
7
18
4
15
10
7
0
11
7
7
7
7
6
9
16
8
11
3
12
15
5
6
13
4
19
6
7
4
17
19
5
7
5
8
4
10
6
4
6
11
13
11
6
8
13
15
1
14
4
8
7
11
13
15
14
6
7
12
3
9
18
9
20
16
14
5
12
9
13
8
5
6
8
15
2
3
11
7
6
9
4
5
13
10
9
17
2
7
5
9
3
4
14
12
6
10
8
5
6
6
14
5
14
5
3
6
8
6
6
12
10
3
5
11
6
16
5
10
7
9
6
5
13
14
4
12
6
8
2
9
11
6
4
8
8
10
10
8
5
8
13
6
3
16
9
4
10
7
11
11
4
4
5
9
5
10
6
15
15
5
10
10
12
7
10
12
11
14
5
3
5
5
16
5
6
11
4
3
8
10
8
7
11
9
9
3
6
8
5
1

7
6
7
16
4
2
5
8
13
13
15
13
3
7
5
3
4
10
5
10
4
12
14
9
12
16
13
16
6
4
13
5
8
8
11
13
5
3
15
9
15
5
6
8
15
4
8
6
10
4
16
12
13
15
9
10
17
8
13
4
7
4
14
11
7
9
5
8
12
12
12
11
9
8
6
16
6
4
14
9
5
9
10
15
17
8
11
14
5
10
19
9
11
17
6
7
2
16
7
9
5
9
3
8
10
5
4
6
7
5
8
7
11
8
6
13
12
5
7
11
13
8
6
5
11
5
14
6
15
1
15
16
15
15
14
5
10
5
11
8
18
7
2
8
7
9
10
10
6
5
7
9
8
14
7
3
11
6
6
5
4
11
7
11
7
7
6
5
8
8
12
5
12
12
14
11
17
11
6
5
12
5
12
13
15
6
4
12
4
8
9
12
11
17
7
18
5
12
10
9
11
5
11
7
8
4
6
2
6
17
14
12
9
2
13
6
12
7
4
5
17
4
3
4
9
14
4
17
15
6
11
16
5
16
9
9
14
8
6
6
12
16
10
3
16
9
4
9
5
13
3
7
5
14
10
3
10
4
10
9
6
10
17
7
14
9
10
8
10
11
11
14
4
14
10
10
14
11
10
10
14
3
9
7
16
7
8
6
5
8
10
7
10
15
4
9
4
6
8
10
11
13
11
19
11
6
12
2
12
8
2
7
5
2
4
4
5
9
7
13
5
7
6
12
14
8
4
4
14
5
6
13
11
11
7
10
14
8
7
2
6
11
11
4
14
15
9
18
10
11
19
5
9
13
5
9
8
17
14
9
12
12
18
11
6
14
2
11
9
6
11
5
10
6
7
6
7
3
9
16
12
4
3
15
6
10
13
15
18
4
4
11
10
9
4
5
7
6
6
3
4
4
13
11
8
10
11
15
1
3


13
7
10
5
15
9
10
5
13
8
12
7
4
3
5
10
5
5
15
5
15
12
7
5
14
1
4
17
7
11
17
14
9
4
9
7
11
10
13
11
11
12
5
6
5
15
12
6
5
6
4
15
12
4
3
3
16
5
3
5
1
12
5
3
11
12
6
10
5
8
1
8
12
5
6
3
4
11
11
10
4
9
15
9
14
3
4
4
16
11
9
8
8
16
16
6
14
12
4
7
12
3
3
8
16
13
13
13
5
16
16
9
12
6
9
5
9
14
11
0
3
10
8
10
10
6
1
15
13
14
14
9
7
3
9
15
4
6
16
1
8
14
5
11
14
7
12
9
6
10
2
13
16
17
11
8
6
9
6
7
10
7
10
5
3
6
17
4
10
5
11
11
15
15
18
19
2
10
6
14
9
17
12
8
4
9
15
9
11
13
8
4
7
18
1
15
8
7
8
11
6
10
10
12
13
8
12
10
11
2
5
6
17
18
15
8
11
4
5
8
5
4
10
7
10
8
10
8
7
6
6
17
8
5
13
6
14
4
12
13
8
7
7
13
2
15
16
4
6
12
2
11
7
8
3
7
7
18
8
7
3
5
2
4
7
6
6
14
2
11
14
15
5
12
16
7
6
17
7
10
9
4
10
8
5
9
15
3
6
12
5
2
3
14
7
9
9
7
5
14
10
8
12
2
9
3
13
9
9
5
11
4


In [47]:
# Summary of the per-document match counts stored in `coincidencias`.
doc_count = len(coincidencias)
print(doc_count)
# Average number of matches per document.
mean_matches = sum(coincidencias) / doc_count
print(mean_matches)
# Same average expressed as a fraction of the window size (to_n - from_n).
print(mean_matches / (to_n - from_n))

14004
8.75249928591831
0.43762496429591546
