In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
import scattertext as st
from scipy.sparse.linalg import svds
import numpy as np

In [2]:
import os

In [None]:
pdf_dir = "../TEMAC/top-20-papers/"
files = os.listdir(pdf_dir)

In [None]:
files.remove('.DS_Store')

In [None]:
len(files)

In [None]:
if not os.path.exists("./txt/"):
    print('creating ', "./txt/")
    os.makedirs("./txt/")

In [None]:
have = set(os.listdir("./txt"))

In [None]:
for i,f in enumerate(files): 
    txt_basename = f + '.txt'
    if txt_basename in have:
        print('%d/%d skipping %s, already exists.' % (i, len(files), txt_basename, ))
        continue
    pdf_path = os.path.join(pdf_dir, f)
    txt_path = os.path.join("./txt/", txt_basename)
    cmd = "pdftotext %s %s" % (pdf_path, txt_path)
    os.system(cmd)
    print('%d/%d %s' % (i, len(files), cmd))

In [3]:
data = pd.read_csv('documents.csv', sep=";")
data.head(3)

Unnamed: 0,Nome,Arquivo,Ano
0,Pan (2010),10_1109--TKDE_2009_191.pdf.txt,2010
1,Shelhamer (2017),10_1109--TPAMI_2016_2572683.pdf.txt,2017
2,Girshick (2016),10_1109--TPAMI_2015_2437384.pdf.txt,2016


In [4]:
data['Texto']=data["Arquivo"].apply(lambda fname: open("./top20abstracts_txt/"+fname, "r").read().replace('\n', ''))

In [5]:
data['Parsed']= data['Texto'].apply(st.whitespace_nlp_with_sentences)
data['Categoria']= np.where(data['Ano']> 2016, 'FRONTIER', 'NORMAL')

In [6]:
data.tail(3)

Unnamed: 0,Nome,Arquivo,Ano,Texto,Parsed,Categoria
17,Zhang (2016),10_1109--TIP_2016_2516952.pdf.txt,2016,LSDT: Latent Sparse Domain Transfer Learningfo...,LSDT: Latent Sparse Domain Transfer Learningfo...,NORMAL
18,Long (2013),10_1109--ICCV_2013_274.pdf.txt,2013,Transfer Feature Learning with Joint Distribut...,Transfer Feature Learning with Joint Distribut...,NORMAL
19,Fu (2015),10_1109--TPAMI_2015_2408354.pdf.txt,2015,Transductive Multi-view Zero-Shot LearningMost...,Transductive Multi-view Zero-Shot LearningMost...,NORMAL


In [7]:
corpus = (st.CorpusFromParsedDocuments(data.sample(frac=1), category_col='Categoria', parsed_col='Parsed').build().get_stoplisted_unigram_corpus())
corpus = corpus.add_doc_names_as_metadata(corpus.get_df()['Nome'])

In [8]:
embeddings = TfidfTransformer().fit_transform(corpus.get_term_doc_mat())
embeddings

<20x1047 sparse matrix of type '<class 'numpy.float64'>'
	with 1718 stored elements in Compressed Sparse Row format>

In [9]:
u, s, vt = svds(embeddings, k=3, maxiter=20000, which='LM')

In [34]:
corpus.get_term_count_df().sort_values(by=['corpus']).tail(10)

Unnamed: 0_level_0,corpus
term,Unnamed: 1_level_1
distribution,19
convolutional,20
methods,20
different,21
adaptation,25
target,28
data,53
transfer,55
domain,59
learning,85


In [10]:
u.T[0], u.T[1]

(array([-0.3473816 , -0.14022942,  0.42432806,  0.26518075,  0.01838124,
        -0.10537036, -0.01787369, -0.17523474,  0.37128464, -0.18892453,
        -0.11021823, -0.27273756,  0.10467141, -0.10246393, -0.24263888,
        -0.07520039,  0.12405308, -0.06952901, -0.21671091,  0.39187982]),
 array([ 0.08952413, -0.39590432, -0.07686658, -0.05711692, -0.31135918,
        -0.54383051, -0.32990408, -0.07459006, -0.0350106 ,  0.15352596,
        -0.46751766,  0.14085436,  0.07903418, -0.04940756, -0.00272074,
         0.02183445,  0.08674152,  0.0922206 ,  0.17468877,  0.00725442]))

In [39]:
projection = pd.DataFrame({'term': corpus.get_metadata(), 'x': u.T[0], 'y': u.T[1]}).set_index('term')
projection

Unnamed: 0_level_0,x,y
term,Unnamed: 1_level_1,Unnamed: 2_level_1
Li (2014),-0.347382,0.089524
Girshick (2016),-0.140229,-0.395904
Taylor (2009),0.424328,-0.076867
Shao (2015),0.265181,-0.057117
Dong (2015),0.018381,-0.311359
Shelhamer (2017),-0.10537,-0.543831
Kendall (2015),-0.017874,-0.329904
Gao (2014),-0.175235,-0.07459
Lu (2015),0.371285,-0.035011
Bruzzone (2010),-0.188925,0.153526


In [40]:
category = 'NORMAL'
scores = (corpus.get_category_ids() == corpus.get_categories().index(category)).astype(int)
scores

array([1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [41]:
html = st.produce_pca_explorer(corpus,
                               category,
                               category_name='2009~2016',
                               not_category_name='2017~2019',
                               metadata=data['Nome'],
                               width_in_pixels=300,
                               show_axes=False,
                               use_non_text_features=True,
                               use_full_doc=True,
                               projection=projection,
                               scores=scores,
                               show_top_terms=False,
                               save_svg_button=True
                              )

In [42]:
open("Documents4.html", 'wb').write(html.encode('utf-8'))

351127