In [1]:
import pandas as pd

In [2]:
# Load the arXiv table and list its columns; the next cell reads the 'title' column.
df = pd.read_csv("arxiv_embeddings.csv")
print(df.columns)

Index(['id', 'title', 'categories', 'title_tokenized', 'categories_tokenized',
       'categories_vectors'],
      dtype='object')


In [3]:
# Plain Python list of every value in the 'title' column, in row order.
titles = list(df['title'])

In [4]:
import re

import numpy as np

from east.asts import base


def clear_text(text, lowerize=True):
    """Strip unsupported characters from `text` and split it into tokens.

    Every run of characters other than Latin letters, digits, space, hyphen,
    newline, carriage return, the punctuation ``. , ; ! ?`` and the Cyrillic
    letters in the ranges А-Я / а-я is replaced by a single space. The result
    is optionally lower-cased and then split on whitespace.

    Note: the Cyrillic ranges used here do not contain Ё/ё, so those two
    letters are treated as separators.

    Parameters
    ----------
    text : str
        Raw input string.
    lowerize : bool, optional
        Lower-case the cleaned text before splitting (default True).

    Returns
    -------
    list of str
        The whitespace-separated tokens; empty if nothing survives cleaning.
    """
    unsupported_run = re.compile(r'[^A-Za-z0-9 \-\n\r.,;!?А-Яа-я]+')
    cleaned = unsupported_run.sub(' ', text)
    return (cleaned.lower() if lowerize else cleaned).split()


def make_substrings(tokens, k=4):
    """Yield each run of `k` consecutive tokens joined by single spaces.

    At least one string is always produced: when `tokens` holds fewer than `k`
    items the only result is all of them joined together (the empty string for
    an empty list).

    Parameters
    ----------
    tokens : list of str
        Tokens to slide the window over.
    k : int, optional
        Window length in tokens (default 4).
    """
    window_count = max(len(tokens) - k + 1, 1)
    yield from (
        ' '.join(tokens[start:start + k]) for start in range(window_count)
    )


def get_relevance_matrix(texts, strings):
    """Score every string against every text and return the scores as a matrix.

    Each text is cleaned with `clear_text`, cut into overlapping token windows
    by `make_substrings`, and turned into a tree with `base.AST.get_ast`
    (presumably an annotated suffix tree from the `east` package - confirm).
    Each string is cleaned the same way, re-joined with single spaces and
    passed to the tree's `score` method.

    Parameters
    ----------
    texts : iterable of str
        Documents; one matrix row per text, in iteration order.
    strings : iterable of str
        Query strings; one matrix column per string, in iteration order.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(number of texts, number of strings)`` where
        entry ``[i, j]`` is the score of string ``j`` in the tree of text ``i``.
    """
    prepared_text_tokens = [clear_text(t) for t in texts]
    prepared_strings = [' '.join(clear_text(s)) for s in strings]

    # Allocate the result once. Growing it with np.append would copy the whole
    # matrix again for every text, which is quadratic in the number of texts.
    matrix = np.empty(
        (len(prepared_text_tokens), len(prepared_strings)), float)

    for row_idx, text_tokens in enumerate(prepared_text_tokens):
        ast = base.AST.get_ast(list(make_substrings(text_tokens)))
        matrix[row_idx] = [ast.score(s) for s in prepared_strings]

    return matrix


def save_matrix(matrix, path="relevance_matrix.txt"):
    """Write `matrix` to a text file with `np.savetxt`.

    Parameters
    ----------
    matrix : array_like
        Data to save; handed to `np.savetxt` unchanged.
    path : str or path-like, optional
        Destination file. Defaults to "relevance_matrix.txt", the name that
        used to be hard-coded, so one-argument calls behave as before.
    """
    np.savetxt(path, matrix)

In [5]:
# One entry per line of the leaves file, with surrounding whitespace removed.
with open("taxonomy_leaves.txt") as f:
    strings = [line.strip() for line in f]

In [6]:
# Only the first 1500 titles are scored; rows are titles, columns are the
# strings read from taxonomy_leaves.txt.
relevance_matrix = get_relevance_matrix(titles[:1500], strings)

In [7]:
# Persist the scores as text; with this one-argument call the file written is
# relevance_matrix.txt.
save_matrix(relevance_matrix)

In [11]:
import numpy as np

In [16]:
import numpy as np

def init(c, data_n):
    """Return a random ``(c, data_n)`` array whose columns each sum to 1.

    Values are drawn from NumPy's global random state with
    `np.random.random` and then divided by their column totals.

    Parameters
    ----------
    c : int
        Number of rows.
    data_n : int
        Number of columns.
    """
    raw = np.random.random(size=(c, data_n))
    column_totals = raw.sum(axis=0)
    normalised = raw / column_totals
    return normalised


def stepfcm(data, cntr, U, T, expo, a, b, nc, ni):
    """Perform one update pass of the clustering loop run by `pfcm`.

    In order: recompute the centres from the current `U` and `T`, measure
    centre-to-point distances with `distfcm`, evaluate the objective value
    (using the *incoming* `ni`), and then recompute `ni`, `U` and `T`.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array with one row per data point.
    cntr : numpy.ndarray
        Accepted for signature symmetry only: it is overwritten with the newly
        computed centres before it is ever read.
    U, T : numpy.ndarray
        Current membership arrays; `pfcm` creates both with shape
        ``(number of clusters, number of data points)``.
    expo : float
        Exponent applied to `U`.
    a, b : float
        Weights of ``U**expo`` and ``T**nc`` in the centre update and in the
        objective; `b` also scales the argument of the `T` update.
    nc : float
        Exponent applied to `T` and ``1 - T``; also appears as ``nc - 1`` in
        the exponents of both membership updates, so it must not equal 1.
    ni : numpy.ndarray
        Penalty weights from the previous pass (all zeros on the first call
        made by `pfcm`).

    Returns
    -------
    tuple
        ``(U, T, cntr, obj_fcn, ni)`` - the updated arrays plus the scalar
        objective value for this pass.
    """
    mf = np.power(U, expo)
    tf = np.power(T, nc)
    tfo = np.power((1-T), nc)
    # New centres: average of the data rows weighted by a*mf + b*tf, one
    # centre per row of U/T.
    cntr = (np.dot(a*mf+b*tf, data).T/np.sum(
        a*mf+b*tf, axis=1).T).T
    dist = distfcm(cntr, data)
    # The second term uses the ni passed in, i.e. the value from the previous
    # pass; ni is only refreshed on the next line.
    obj_fcn = np.sum(np.sum(np.power(dist, 2)*(a*mf+b*tf), axis=0)) + np.sum(
        ni*np.sum(tfo, axis=0))
    # axis=0 sums over clusters, so ni is normalised per data point (column).
    ni = mf*np.power(dist, 2)/(np.sum(mf, axis=0))
    # NOTE(review): this exponent uses nc rather than expo; textbook fuzzy
    # c-means uses the U exponent here - confirm this is intended.
    # A zero distance makes this power infinite.
    tmp = np.power(dist, (-2/(nc-1)))
    # Normalise so the memberships of each data point sum to 1 across clusters.
    U = tmp/(np.sum(tmp, axis=0))
    tmpt = np.power((b/ni)*np.power(dist, 2), (1/(nc-1)))
    T = 1/(1+tmpt)
    return U, T, cntr, obj_fcn, ni


def distfcm(cntr, data):
    """Return the Euclidean distance from every centre to every data point.

    Parameters
    ----------
    cntr : numpy.ndarray
        2-D array with one centre per row.
    data : numpy.ndarray
        2-D array with one data point per row (same number of columns as
        `cntr`).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(cntr.shape[0], data.shape[0])``; entry ``[k, j]`` is
        the distance between centre ``k`` and data point ``j``.
    """
    n_centres, n_points = cntr.shape[0], data.shape[0]
    out = np.zeros(shape=(n_centres, n_points))
    for idx in range(n_centres):
        squared_diff = np.power(data - cntr[idx], 2)
        # Summing the transposed array over axis 0 adds up the coordinates of
        # each data point.
        out[idx] = np.sqrt(np.sum(squared_diff.T, axis=0))
    return out


def pfcm(data, c, expo=2, max_iter=1000, min_impro=0.005, a=1, b=4, nc=3,
         seed=None):
    """Cluster the rows of `data` into `c` groups by iterating `stepfcm`.

    `U` and `T` start as random column-normalised arrays from `init`, and the
    centres start as uniform random draws between the smallest and largest
    value in `data`. `stepfcm` is then applied repeatedly until a stopping
    test fires or `max_iter` passes have run.

    Parameters
    ----------
    data : numpy.ndarray
        2-D array with one data point per row.
    c : int
        Number of clusters.
    expo, a, b, nc : float, optional
        Forwarded unchanged to `stepfcm`.
    max_iter : int, optional
        Maximum number of `stepfcm` passes.
    min_impro : float, optional
        Threshold for both stopping tests: the change in objective value
        between consecutive passes, and the largest absolute change in any
        centre coordinate.
    seed : int or None, optional
        When given, NumPy's global random state is seeded with it before any
        random draw so the run is repeatable. The default None leaves the
        random state untouched, which is the previous behaviour.

    Returns
    -------
    tuple
        ``(cntr, U)`` - centres of shape ``(c, data.shape[1])`` and the
        membership array of shape ``(c, data.shape[0])``.
    """
    if seed is not None:
        # init() and the centre draw below all use NumPy's global RNG, so one
        # seed call here fixes the whole initialisation.
        np.random.seed(seed)

    n_points = data.shape[0]
    obj_fcn = np.zeros(shape=(max_iter, 1))
    ni = np.zeros(shape=(c, n_points))
    # Keep this draw order (U, then T, then centres) so a given seed always
    # maps to the same starting point.
    U = init(c, n_points)
    T = init(c, n_points)
    cntr = np.random.uniform(low=np.min(data), high=np.max(data), size=(
        c, data.shape[1]))

    for i in range(max_iter):
        current_cntr = cntr
        U, T, cntr, obj_fcn[i], ni = stepfcm(
                data, cntr, U, T, expo, a, b, nc, ni)
        # The stopping tests are first evaluated on the third pass (i == 2).
        # The first objective value is computed with ni all zeros, so it is
        # not directly comparable with later ones.
        if i > 1:
            if abs(obj_fcn[i] - obj_fcn[i-1]) < min_impro:
                break
            elif np.max(abs(cntr - current_cntr)) < min_impro:
                break
    return cntr, U

In [17]:
import matplotlib.pyplot as plt

In [19]:
# The matrix is transposed so that each column of relevance_matrix becomes one
# data point (row) for pfcm; 3 is the number of clusters, so U has 3 rows.
cntr, U = pfcm(relevance_matrix.T, 3)

In [20]:
# Save the membership array as text, one line per row of U.
np.savetxt("clusters.dat", U)

In [1]:
# Run the external pargenfs.py script; the trailing 0 presumably selects which
# row of clusters.dat to process - confirm against pargenfs.py.
# Per the log below, this run writes table.csv and taxonomy_tree_lifted.ete;
# copy them elsewhere before invoking the script again if they must be kept.
!python3 pargenfs.py Data_Science_taxonomy.csv taxonomy_leaves.txt clusters.dat 0

Number of leaves: 351
All positive weights:
sample complexity and generalization bounds                  0.58095
unsupervised learning and clustering                         0.57694
boolean function learning                                    0.57415
After transformation:
sample complexity and generalization bounds                  0.58095
unsupervised learning and clustering                         0.57694
boolean function learning                                    0.57415
Setting weights for internal nodes
Membership in root: 1.00000
Pruning tree...
Setting gaps...
Other parameters setting...
ParGenFS main steps...
Done. Saving...
Table saved in the file: table.csv
ete representation saved in the file: taxonomy_tree_lifted.ete
ete representation saved.
Done.


In [2]:
# Run the external pargenfs.py script; the trailing 1 presumably selects which
# row of clusters.dat to process - confirm against pargenfs.py.
# Per the log below, this run writes table.csv and taxonomy_tree_lifted.ete;
# copy them elsewhere before invoking the script again if they must be kept.
!python3 pargenfs.py Data_Science_taxonomy.csv taxonomy_leaves.txt clusters.dat 1

Number of leaves: 351
All positive weights:
sample complexity and generalization bounds                  0.58271
unsupervised learning and clustering                         0.57670
boolean function learning                                    0.57260
After transformation:
sample complexity and generalization bounds                  0.58271
unsupervised learning and clustering                         0.57670
boolean function learning                                    0.57260
Setting weights for internal nodes
Membership in root: 1.00000
Pruning tree...
Setting gaps...
Other parameters setting...
ParGenFS main steps...
Done. Saving...
Table saved in the file: table.csv
ete representation saved in the file: taxonomy_tree_lifted.ete
ete representation saved.
Done.


In [3]:
# Run the external pargenfs.py script; the trailing 2 presumably selects which
# row of clusters.dat to process - confirm against pargenfs.py.
# Per the log below, this run writes table.csv and taxonomy_tree_lifted.ete;
# copy them elsewhere before invoking the script again if they must be kept.
!python3 pargenfs.py Data_Science_taxonomy.csv taxonomy_leaves.txt clusters.dat 2

Number of leaves: 351
All positive weights:
boolean function learning                                    0.58172
unsupervised learning and clustering                         0.57780
sample complexity and generalization bounds                  0.57249
After transformation:
boolean function learning                                    0.58172
unsupervised learning and clustering                         0.57780
sample complexity and generalization bounds                  0.57249
Setting weights for internal nodes
Membership in root: 1.00000
Pruning tree...
Setting gaps...
Other parameters setting...
ParGenFS main steps...
Done. Saving...
Table saved in the file: table.csv
ete representation saved in the file: taxonomy_tree_lifted.ete
ete representation saved.
Done.
