In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import pandas as pd
data = pd.read_csv("/content/drive/MyDrive/keywords.csv")
data['keywords']

0       parabolic-hyperbolic_type Gerasimov–Caputo_der...
1       rearrangement_invariant_spaces Hardy_classes g...
2       boundary_value_problem Fourier_series eigenval...
3       Schrödinger_type_equation the_Caputo_derivativ...
4       problem_with_parameter second_order_system_of_...
                              ...                        
1695    Generalised_derivation Matrix_and_operator_equ...
1696    Harmonic_univalent_starlike_functions Dziok-Sr...
1697    Almost_sure_limit_theorems random_allocations ...
1698    L1-convergence Dirichlet_kernel monotone_decre...
1699     Determinant LU-decomposition Recurrence_relation
Name: keywords, Length: 1700, dtype: object

In [6]:
doc = []
for line in data['keywords']:
  for word in line.split(' '):
    if len(word) > 3 and line not in doc:
      doc.append(line)
doc

['parabolic-hyperbolic_type Gerasimov–Caputo_derivatives discontinuous_gluing_condition integral_equations',
 'rearrangement_invariant_spaces Hardy_classes grand-Lebesgue_spaces the_Orlicz_spaces Boyd_indices',
 'boundary_value_problem Fourier_series eigenvalues eigenfunctions uniqueness_of_a_solution existence_of_a_solution',
 'Schrödinger_type_equation the_Caputo_derivatives time-dependent_source_identification_problem',
 'problem_with_parameter second_order_system_of_hyperbolic_equations Goursat_problem solvability algorithm',
 'biharmonic_map triharmonic_map conformal_deformation',
 'trigonometric_series absolute_summability measuring_set coefficient function',
 'unit_graph traversability clique_number independent_number covering_number',
 'two-body_Hamiltonian Schrödinger_operators Fridrix’s_model eigenvalue essential_spectrum asymptotics_of_the_Fredholm_determinant',
 'operator_norm spectral_radius numerical_radius',
 'acoustic_wave liquid_droplet vapor–gas_bubble viscoelastic_sh

In [5]:
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk
nltk.download('stopwords')


def pre_process(text):
  tokenizer = RegexpTokenizer(r'\w+')
  p_stemmer = PorterStemmer()
  stop_words = set(stopwords.words('english'))
  texts = []
  for i in text:
    raw = i.lower()
    tokens = tokenizer.tokenize(raw)
    stopped_tokens = [i for i in tokens if not i in stop_words]
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    texts.append(stopped_tokens)

  return texts
doc = pre_process(data['keywords'])
doc

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


[['parabolic',
  'hyperbolic_type',
  'gerasimov',
  'caputo_derivatives',
  'discontinuous_gluing_condition',
  'integral_equations'],
 ['rearrangement_invariant_spaces',
  'hardy_classes',
  'grand',
  'lebesgue_spaces',
  'the_orlicz_spaces',
  'boyd_indices'],
 ['boundary_value_problem',
  'fourier_series',
  'eigenvalues',
  'eigenfunctions',
  'uniqueness_of_a_solution',
  'existence_of_a_solution'],
 ['schrödinger_type_equation',
  'the_caputo_derivatives',
  'time',
  'dependent_source_identification_problem'],
 ['problem_with_parameter',
  'second_order_system_of_hyperbolic_equations',
  'goursat_problem',
  'solvability',
  'algorithm'],
 ['biharmonic_map', 'triharmonic_map', 'conformal_deformation'],
 ['trigonometric_series',
  'absolute_summability',
  'measuring_set',
  'coefficient',
  'function'],
 ['unit_graph',
  'traversability',
  'clique_number',
  'independent_number',
  'covering_number'],
 ['two',
  'body_hamiltonian',
  'schrödinger_operators',
  'fridrix',
  's

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
vector = TfidfVectorizer()

In [8]:
X =vector.fit_transform(doc)
print(X)

  (0, 2630)	0.3553435377722737
  (0, 1414)	0.46250080227033863
  (0, 649)	0.4382517149455751
  (0, 2185)	0.4077014779196955
  (0, 2466)	0.3967976263589244
  (0, 3868)	0.37959262509703723
  (1, 590)	0.4117671835067655
  (1, 5406)	0.4117671835067655
  (1, 2925)	0.3901780784904868
  (1, 2243)	0.4117671835067655
  (1, 2310)	0.4117671835067655
  (1, 4437)	0.4117671835067655
  (2, 1747)	0.44012605178465763
  (2, 5673)	0.4581106815899699
  (2, 1551)	0.44012605178465763
  (2, 1556)	0.39679350993828993
  (2, 1950)	0.38284354729506187
  (2, 576)	0.3141630934076388
  (3, 1283)	0.5250749911554526
  (3, 5454)	0.41579894298723746
  (3, 5389)	0.5250749911554526
  (3, 4730)	0.5250749911554526
  (4, 186)	0.3843737082957127
  (4, 5010)	0.3843737082957127
  (4, 2216)	0.46721313403432324
  :	:
  (1673, 5921)	0.513109854428187
  (1673, 5671)	0.38764896856638725
  (1674, 4661)	0.4472135954999579
  (1674, 2023)	0.4472135954999579
  (1674, 4260)	0.4472135954999579
  (1674, 3224)	0.4472135954999579
  (1674, 21

In [9]:
from sklearn.decomposition import TruncatedSVD
lsa = TruncatedSVD(n_components=10,n_iter=100)
tfidf_lsa = lsa.fit_transform(X)

In [10]:
lsa.fit(X)

TruncatedSVD(n_components=10, n_iter=100)

In [11]:
terms = vector.get_feature_names()
components = lsa.components_[1]
idx_top_terms = sorted(range(len(components)), key=lambda k: components[k])
print("10 highest-weighted terms in concept 1:")
for t in idx_top_terms[:10]:
    print(" - %s : %0.02f"%(terms[t], t))

10 highest-weighted terms in concept 1:
 - non : 3602.00
 - maxwell : 3245.00
 - s_equations : 4638.00
 - linear_eigenvalue_problem : 3011.00
 - operator : 3773.00
 - polarized_electromagnetic_waves : 4081.00
 - spectrum : 5053.00
 - function : 2029.00
 - commutative_operator_graphs : 837.00
 - quantum_anticliques : 4286.00




In [12]:
for i,comp in enumerate(lsa.components_):
    termsInComp = zip(terms,comp)
    sortedterms = sorted(termsInComp, key=lambda x: x[1],reverse=True)[:10]
    print("Concept %d:" % i)
    for term in sortedterms:
        print(term[0])
    print(" ")

Concept 0:
non
spectrum
function
maxwell
s_equations
linear_eigenvalue_problem
operator
polarized_electromagnetic_waves
inhomogeneous_waveguide
commutative_operator_graphs
 
Concept 1:
hilbert_space
projection
von_neumann_algebra
linear_operator
algebra
idempotent
operator_inequality
trace
commutativity
unitary_operator
 
Concept 2:
eigenvalue
finite_element_method
eigenfunction
eigenvalue_problem
ordinary_differential_equation
nonlinear_eigenvalue_problem
eigenvibration
positive_eigenfunction
boundary_value_problem
beam
 
Concept 3:
boundary_value_problem
uniqueness
differential_equation
existence
integro
small_denominators
series
unique_solvability
initial
inverse_problem
 
Concept 4:
differential_equation
integro
degenerate_kernel
solvability
unique_solvability
characteristic_equation
two
singular_integral_operator
inverse_boundary_value_problem
spectral_parameters
 
Concept 5:
algebra
toeplitz_algebra
function
spectrum
reduced_semigroup_c
inductive_limit
partially_ordered_set
inver

In [14]:
from scipy.spatial import distance

query_idx = 400

query_features = tfidf_lsa[query_idx]
distances = [ distance.cosine(query_features, feat) for feat in tfidf_lsa ]
idx_closest = sorted(range(len(distances)), key=lambda k: distances[k])[1:]

query_doc = doc[query_idx]
return_doc = doc[idx_closest[0]]
print("Document's keywords:\n %s \nMost similar keywords to previous document:\n %s" %(query_doc, return_doc))

Document's keywords:
 saddlepoint_approximations moment_generating_function branching_process cumulative_distribution_function 
Most similar keywords to previous document:
 saddlepoint_approximations moment_generating_function survival_analysis cumulant_generating_function
