In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("arxiv_embeddings.csv")
print(df.columns)

Index(['id', 'title', 'categories', 'title_tokenized', 'categories_tokenized',
       'categories_vectors'],
      dtype='object')


In [3]:
titles = []

for a in df['title']:
    titles.append(a)


In [4]:
import re

import numpy as np

from east.asts import base


def clear_text(text, lowerize=True):

    pat = re.compile(r'[^A-Za-z0-9 \-\n\r.,;!?А-Яа-я]+')
    cleared_text = re.sub(pat, ' ', text)

    if lowerize:
        cleared_text = cleared_text.lower()

    tokens = cleared_text.split()
    return tokens


def make_substrings(tokens, k=4):

    for i in range(max(len(tokens) - k + 1, 1)):
        yield ' '.join(tokens[i:i + k])


def get_relevance_matrix(texts, strings):

    matrix = np.empty((0, len(strings)), float)
    prepared_text_tokens = [clear_text(t) for t in texts]

    prepared_string_tokens = [clear_text(s) for s in strings]
    prepared_strings = [' '.join(t) for t in prepared_string_tokens]

    for text_tokens in prepared_text_tokens:
        ast = base.AST.get_ast(list(make_substrings(text_tokens)))
        row = np.array([ast.score(s) for s in prepared_strings])
        matrix = np.append(matrix, [row], axis=0)

    return matrix


def save_matrix(matrix):
    np.savetxt("relevance_matrix.txt", matrix)

In [6]:
with open("taxonomy_leaves.txt") as f:
    strings = [l.strip() for l in f.readlines()]

In [7]:
relevance_matrix = get_relevance_matrix(titles[:1500], strings)
save_matrix(relevance_matrix)

In [8]:
import numpy as np
import numpy.linalg as LA

ZERO_BOUND = 10 ** (-9)
MIN_CLUSTER_CONTRIBUTION = 5 * 10 ** (-3)
EPSILON = 5 * 10 ** (-2)
# Maximum number of clusters
MAX_NUM_CLUSTERS = 15


def ensure_np_matrix(A):

    if not isinstance(A, np.matrix):
        A = np.matrix(A)
    return A


def faddis(A):
    A = ensure_np_matrix(A)
    
    # minimum cluster's relative contribution to the data scatter
    min_cont = MIN_CLUSTER_CONTRIBUTION
    # minimum relative residual data scatter
    eps = EPSILON
    # maximum number of clusters
    max_clust_num = MAX_NUM_CLUSTERS

    is_positive = True
    matrix_dim, _ = A.shape

    sc = np.power(A, 2)
    # Total data scatter
    scatter = np.sum(sc)

    cluster_got = 0
    membership_matrix = np.empty((matrix_dim, 0))
    contrib = np.array([])
    lat = np.array([])
    intensities = np.empty((0, 2))
    curr_cont = 1
    res_cont = 1

    # 'zero' and 'one' vectors for comparisons
    zeros_vect = np.zeros((matrix_dim, 1))
    ones_vect = np.ones((matrix_dim, 1))

    # ensure matrix is symmetrical
    At = (A + A.T) / 2
    matrix_sequence = [At]

    # Stop condition:
    # is_positive is True: eigen-value of the residual matrix is not positive;
    # OR la cluster intensity  reaches its minimum lam;
    # OR ep relative residual data scatter reaches its minimum eps;
    # OR maximum number of clusters max_clust_num is achieved
    while is_positive and curr_cont > min_cont and res_cont > eps and cluster_got <= max_clust_num:
        # collecting a fuzzy cluster membership uf, with contrib con and intensity la,
        eig_vals, eig_vecs = LA.eig(At)
        # (lt, ii) - (maximum eigen-value, corresponding position)
        eig_vals_diag = np.diag(eig_vals)
        # Only positive eigenvalues
        eig_vals_pos = np.argwhere(eig_vals > ZERO_BOUND).ravel()
        eig_vals_pos_len = eig_vals_pos.size
        cur_intensities = np.zeros((eig_vals_pos_len, 1))
        vm = np.zeros((matrix_dim, eig_vals_pos_len))
        for k in range(eig_vals_pos_len):
            lt = eig_vals_diag[eig_vals_pos[k]]
            vf = eig_vecs[:, eig_vals_pos[k]]

            # Calculate normalized membership vector belonging to [0, 1] by
            # projection on the space. The normalization factor is the
            # Euclidean length of the vector
            bf = np.maximum(zeros_vect, vf)
            uf = np.minimum(bf, ones_vect)
            
            if LA.norm(uf) > 0:
                uf = uf / LA.norm(uf)

            vt = uf.T.dot(At).dot(uf)
            uf = np.squeeze(np.asarray(uf))

            wt = uf.T.dot(uf)
            # Calculates the intensity Lambda (la) of the cluster, which is
            # defined almost as the Rayleigh quotient
            if wt > 0:
                la = vt.item() / (wt **2)
            else:
                la = 0

            # since lt*vf =(-lt)*(-vf), try symmetric version 
            # using -vf:
            vf1 = -vf

            bf1 = np.maximum(zeros_vect, vf1)
            uf1 = np.minimum(bf1, ones_vect)
            uf1 = np.squeeze(np.asarray(uf1))

            if LA.norm(uf1) > 0:
                uf1 = uf1 / LA.norm(uf1)
                
            vt1 = uf1.T.dot(At).dot(uf1)
            wt1 = uf1.T.dot(uf1)
            if wt1 > 0:
                la1 = vt1.item() / (wt1 **2)
            else:
                la1 = 0

            if la > la1:
                cur_intensities[k] = la
                vm[:, k] = uf.ravel()
            else:
                cur_intensities[k] = la1
                vm[:, k] = uf1.ravel()

        contrib_max, contrib_max_index = cur_intensities.max(), cur_intensities.argmax()
        if contrib_max > ZERO_BOUND:
            lat = np.append(lat, eig_vals[eig_vals_pos[contrib_max_index]])
            intensities = np.append(intensities, np.matrix([np.sqrt(contrib_max),
                                                            contrib_max]), axis=0)
            # square root and value of lambda intensity of cluster_got
            # square root shows the value of fuzzyness
            uf = vm[:, contrib_max_index]
            vt = uf.T.dot(At).dot(uf)
            wt = uf.T.dot(uf)

            membership_matrix = np.append(membership_matrix, np.matrix(uf).T, axis=1)
            # calculate residual similarity matrix:
            # remove the present cluster (i.e. itensity* membership) from
            # similarity matrix
            Att = At - contrib_max * np.matrix(uf).T * np.matrix(uf)
            At = (Att + Att.T) / 2
            matrix_sequence.append(At)

            curr_cont = (vt / wt) ** 2
            # Calculate the relative contribution of cluster_got
            curr_cont /= scatter
            contrib = np.append(contrib, curr_cont)
            # Calculate the residual contribution
            res_cont -= curr_cont
            cluster_got += 1
        else:
            is_positive = False

    if not is_positive:
        print('No positive weights at spectral clusters')
    elif curr_cont < min_cont:
        print('Cluster contribution is too small')
    elif res_cont < eps:
        print('Residual is too small')
    elif cluster_got > max_clust_num:
        print('Maximum number of clusters reached')

    return matrix_sequence, membership_matrix, contrib, intensities, lat, cluster_got


if __name__ == '__main__':

    M = np.matrix([[1, .5, .3,  .1],
                   [.5, 1, .98, .4],
                   [.3, .98, 1, .6],
                   [.1, .4, .6, 1 ]])
    #M = np.matrix([[1, 0, 1], [0, 3, 0], [1, 0, 9]])
    M = np.matrix(np.random.rand(500, 500))

    B, member, contrib, intensity, lat, tt = faddis(M)
    print("B")
    print(B)
    print("member")
    print(member)
    print("contrib")
    print(contrib)
    print("intensity")
    print(intensity)
    print("lat")
    print(lat)
    print("tt")
    print(tt)

Cluster contribution is too small
B
[matrix([[0.35243066, 0.36258678, 0.92027008, ..., 0.5435544 , 0.69995649,
         0.60429098],
        [0.36258678, 0.63354169, 0.50249713, ..., 0.57876935, 0.81993271,
         0.36650739],
        [0.92027008, 0.50249713, 0.24504618, ..., 0.41062836, 0.30363743,
         0.45641049],
        ...,
        [0.5435544 , 0.57876935, 0.41062836, ..., 0.51127147, 0.9082341 ,
         0.28337575],
        [0.69995649, 0.81993271, 0.30363743, ..., 0.9082341 , 0.09049915,
         0.53220939],
        [0.60429098, 0.36650739, 0.45641049, ..., 0.28337575, 0.53220939,
         0.71722806]]), matrix([[-0.11573511, -0.12586319,  0.43961036, ...,  0.04855073,
          0.21026136,  0.11986343],
        [-0.12586319,  0.12392867,  0.00101188, ...,  0.06231868,
          0.30902058, -0.13890892],
        [ 0.43961036,  0.00101188, -0.24844092, ..., -0.09758548,
         -0.1991262 , -0.04094498],
        ...,
        [ 0.04855073,  0.06231868, -0.09758548, ..., 

In [9]:
import numpy as np
import numpy.linalg as LA

ZERO_BOUND = 10 ** (-8)
ENTITY_BOUND = 10 ** (-4)


def lapin(A):
    A = (A + A.T) / 2
    a_sums = np.ravel(abs(sum(A)))
    checked = np.array(a_sums > ENTITY_BOUND)
    is_correct = checked.all()

    if not is_correct:
        print('These entities are no good - remove them first!!!')
        print([i for i, j in enumerate(checked, 1) if not j])
        A = A[:, checked][checked, :]
        a_sums = a_sums[checked]

    matrix_dim, _ = A.shape
    C = np.empty((matrix_dim, matrix_dim))
    for i in range(matrix_dim):
        for j in range(matrix_dim):
            C[i, j] = A[i, j] / np.sqrt(a_sums[i] * a_sums[j])

    eig_vals, eig_vecs = LA.eig(np.eye(matrix_dim) - C)
    eig_vals_diag = np.diag(eig_vals)
    nonzero_cond = np.array(eig_vals > ZERO_BOUND)
    nonzero_eig_vals_diag = eig_vals_diag[nonzero_cond, :][:, nonzero_cond]
    nonzero_eig_vecs = eig_vecs[:, nonzero_cond]
    B = nonzero_eig_vecs.dot(LA.inv(nonzero_eig_vals_diag)).dot(nonzero_eig_vecs.T)

    return B


if __name__ == '__main__':

    M = np.matrix([[1, 2, 1], [2, 4, 1], [1, 1, 9]])
    M = np.matrix([[1, 0, 1], [0, 3, 0], [1, 0, 9]])
    M_transformed = lapin(M)
    print(M_transformed)

[[ 1.38888889  0.         -0.62112999]
 [ 0.          0.         -0.        ]
 [-0.62112999 -0.          0.27777778]]


In [3]:
from operator import itemgetter

tc = relevance_matrix.dot(relevance_matrix.T)

In [5]:
from faddis import faddis
from lapin import lapin
from operator import itemgetter

In [6]:
tc_transformed = lapin(tc)

In [7]:
B, member, contrib, intensity, lat, tt = faddis(tc_transformed)

  cur_intensities[k] = la1
  vm[:, k] = uf1.ravel()
  cur_intensities[k] = la
  vm[:, k] = uf.ravel()


Cluster contribution is too small


In [8]:
np.savetxt("clusters.dat", member)

In [9]:
for cluster in member.T:
    print(list(sorted(zip(titles, cluster.flat), key=itemgetter(1), reverse=True))[:3], '\n')

NameError: name 'titles' is not defined

In [11]:
!python3 pargenfs.py Data_Science_taxonomy.csv taxonomy_leaves.txt clusters.dat 0

Number of leaves: 351
All positive weights:
bayesian analysis                                            0.31843
canonical correlation analysis                               0.24021
factor analysis                                              0.22553
mixture modeling                                             0.19191
lifelong machine learning                                    0.18885
modelling                                                    0.18657
fusion of classifiers                                        0.16400
object recognition                                           0.16318
least moduli                                                 0.15447
q-learning                                                   0.15015
sequential monte carlo methods                               0.14405
database query languages                                     0.14215
cross-validation                                             0.12816
support vector machines                     

In [12]:
!python3 pargenfs.py Data_Science_taxonomy.csv taxonomy_leaves.txt clusters.dat 1

Number of leaves: 351
All positive weights:
feature selection                                            0.32607
kernel-based clustering                                      0.32370
semi-supervised learning                                     0.22434
decision diagrams                                            0.19681
approximate dynamic programming methods                      0.19348
unidimensional range search                                  0.16934
bagging                                                      0.16728
value iteration                                              0.16429
ensembling                                                   0.12945
exploratory data analysis                                    0.12915
retrieval efficiency                                         0.12889
surfacing                                                    0.12436
learning to rank                                             0.12318
learning to rank                            

In [13]:
!python3 pargenfs.py Data_Science_taxonomy.csv taxonomy_leaves.txt clusters.dat 2

Number of leaves: 351
All positive weights:
sample complexity and generalization bounds                  0.50953
video search                                                 0.39376
kernel-based clustering                                      0.37435
spatial and physical reasoning                               0.15668
incomplete data                                              0.15181
semi-supervised learning                                     0.15034
decision diagrams                                            0.14855
ensembling                                                   0.14104
mixture models                                               0.13784
statistical relational learning                              0.10894
cladograms                                                   0.09514
hierarchical representations                                 0.09257
query learning                                               0.09079
boolean function learning                   

In [14]:
!python3 pargenfs.py Data_Science_taxonomy.csv taxonomy_leaves.txt clusters.dat 3

Number of leaves: 351
All positive weights:
bayesian analysis                                            0.40910
factor analysis                                              0.35236
modelling                                                    0.26800
temporal difference learning                                 0.26135
max marginal computation                                     0.20840
inverse reinforcement learning                               0.17352
inverse reinforcement learning                               0.17352
online learning settings                                     0.15226
database query languages                                     0.14502
deep web                                                     0.14336
cognitive robotics                                           0.14049
least moduli                                                 0.13993
multiresolution                                              0.12765
natural language generation                 

In [16]:
!python3 pargenfs.py Data_Science_taxonomy.csv taxonomy_leaves.txt clusters.dat 4

Number of leaves: 351
All positive weights:
feature selection                                            0.37080
approximate dynamic programming methods                      0.22638
temporal difference learning                                 0.21215
multiresolution                                              0.18933
bagging                                                      0.17873
unidimensional range search                                  0.17648
semi-supervised learning                                     0.16342
value iteration                                              0.16150
exploratory data analysis                                    0.16143
boosting                                                     0.15685
boosting                                                     0.15685
online learning settings                                     0.15491
factorial hmm                                                0.15088
learning to rank                            