In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfTransformer
import scattertext as st
from scipy.sparse.linalg import svds
import numpy as np
import os

In [2]:
# Load the semicolon-delimited paper table (title, year, keywords, abstract, ...).
data = pd.read_csv(
    'top20_withAbstracts.csv',
    sep=";",
)
# Preview the first three rows to confirm the columns parsed as expected.
data.head(n=3)

Unnamed: 0,Author - Year,Author,Title,Year,Keywords,Abstract
0,"Pan, SJ (2010)","Pan, SJ",A Survey on Transfer Learning,2010,Transfer learning; survey; machine learning; d...,A major assumption in many machine learning an...
1,"Shelhamer, E (2017)","Shelhamer, E",Fully Convolutional Networks for Semantic Segm...,2017,Semantic Segmentation; Convolutional Networks;...,Convolutional networks are powerful visual mod...
2,"Girshick, R (2016)","Girshick, R",Region-Based Convolutional Networks for Accura...,2016,Object recognition; detection; semantic segmen...,"Object detection performance, as measured on t..."


In [3]:
# Build one free-text field per paper: title, keywords and abstract joined by
# single spaces. Each column is stringified first, so missing values become
# the literal text produced by str() rather than propagating as NaN.
data['Texto'] = data["Title"].map(str).str.cat(
    [data["Keywords"].map(str), data["Abstract"].map(str)],
    sep=" ",
)

In [4]:
# Parse every combined text with scattertext's `whitespace_nlp_with_sentences`
# parser; the parsed objects are what the corpus builder consumes later on.
data['Parsed'] = data['Texto'].map(st.whitespace_nlp_with_sentences)

In [5]:
# Label each paper by publication year: strictly after 2016 -> 'FRONTIER',
# otherwise 'NORMAL'. This column is the category used to split the corpus.
data['Categoria'] = np.where(
    data['Year'].gt(2016),
    'FRONTIER',
    'NORMAL',
)

In [6]:
# Show the last three rows to check the derived Texto/Parsed/Categoria columns.
data.tail(n=3)

Unnamed: 0,Author - Year,Author,Title,Year,Keywords,Abstract,Texto,Parsed,Categoria
17,"Long, MS (2014)","Long, MS",Adaptation Regularization: A General Framework...,2014,Transfer learning; adaptation regularization; ...,"Domain transfer learning, which learns a targe...",Adaptation Regularization: A General Framework...,Adaptation Regularization: A General Framework...,NORMAL
18,"Zhang, L (2016)","Zhang, L",LSDT: Latent Sparse Domain Transfer Learning f...,2016,Transfer learning; domain adaptation; visualca...,We propose a novel reconstruction-based transf...,LSDT: Latent Sparse Domain Transfer Learning f...,LSDT: Latent Sparse Domain Transfer Learning f...,NORMAL
19,"Fu, YW (2015)","Fu, YW",Transductive Multi-View Zero-Shot Learning,2015,Transducitve learning; multi-view Learning; tr...,Most existing zero-shot learning approaches ex...,Transductive Multi-View Zero-Shot Learning Tra...,Transductive Multi-View Zero-Shot Learning Tra...,NORMAL


In [7]:
# Build the scattertext corpus from the parsed documents, split by 'Categoria',
# then reduce it with get_stoplisted_unigram_corpus() (presumably unigrams with
# stop words removed -- confirm against the scattertext docs).
#
# The rows are shuffled before building the corpus. The shuffle is seeded so a
# "Restart Kernel -> Run All" yields the same document order, and therefore
# the same embeddings, projection and HTML output, on every run.
corpus = (
    st.CorpusFromParsedDocuments(
        data.sample(frac=1, random_state=42),
        category_col='Categoria',
        parsed_col='Parsed',
    )
    .build()
    .get_stoplisted_unigram_corpus()
)

In [8]:
# Register each document's "Author - Year" label as corpus metadata. The labels
# are read from the corpus' own data frame so they follow the corpus' document
# order rather than the order of the original `data` frame.
doc_names = corpus.get_df()['Author - Year']
corpus = corpus.add_doc_names_as_metadata(doc_names)

In [9]:
# Re-weight the corpus count matrix with TF-IDF; the result stays sparse and is
# the input to the truncated SVD below.
tfidf = TfidfTransformer()
embeddings = tfidf.fit_transform(corpus.get_term_doc_mat())
embeddings

<20x889 sparse matrix of type '<class 'numpy.float64'>'
	with 1662 stored elements in Compressed Sparse Row format>

In [10]:
# Truncated SVD of the TF-IDF matrix: keep k=3 singular triplets of largest
# magnitude ('LM'). `u` holds one row per row of `embeddings`.
u, s, vt = svds(
    embeddings,
    k=3,
    which='LM',
    maxiter=20000,
)

In [11]:
# Sanity check: the ten most frequent terms in the corpus, sorted ascending so
# the largest count is shown last.
term_counts = corpus.get_term_count_df()
term_counts.sort_values(by='corpus').tail(n=10)

Unnamed: 0_level_0,corpus
term,Unnamed: 1_level_1
different,22
methods,22
source,23
convolutional,23
adaptation,32
target,42
data,57
transfer,68
domain,76
learning,112


In [None]:
# Inspect the first two columns of `u`; these become the x/y plot coordinates.
u[:, 0], u[:, 1]

In [12]:
# 2-D coordinates for the explorer: one row per corpus metadata entry (the
# document names registered earlier), indexed under the name 'term', with the
# first two columns of `u` as x and y.
projection = pd.DataFrame(
    {'x': u[:, 0], 'y': u[:, 1]},
    index=pd.Index(corpus.get_metadata(), name='term'),
)
projection

Unnamed: 0_level_0,x,y
term,Unnamed: 1_level_1,Unnamed: 2_level_1
"Ben-David, S (2010)",0.091006,-0.075597
"Lu, J (2015)",-0.34053,0.066492
"Girshick, R (2016)",0.173054,0.437989
"Duan, LX (2012)",0.17217,-0.178303
"Fu, YW (2015)",0.09583,0.040449
"Taylor, ME (2009)",-0.462783,0.09416
"Pan, SJ (2010)",-0.467088,0.029676
"Donahue, J (2017)",0.14367,0.435661
"Bruzzone, L (2010)",0.138631,-0.146922
"Dong, C (2015)",0.066674,0.301108


In [13]:
# Binary score per document: 1 if the document belongs to `category`, else 0.
category = 'NORMAL'
category_idx = corpus.get_categories().index(category)
scores = (corpus.get_category_ids() == category_idx).astype(int)
scores

array([1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1])

In [19]:
# Render the interactive document-level scatter plot as an HTML string.
#
# The document labels passed as `metadata` must be in the same order as the
# corpus' documents. The corpus was built from a shuffled copy of `data`
# (`data.sample(frac=1)`), so `data['Author - Year']` is in a different order
# and would attach the wrong label to each document. The labels are therefore
# taken from the corpus' own data frame.
html = st.produce_pca_explorer(corpus,
                               category,
                               category_name='2009~2016',
                               not_category_name='2017~2019',
                               metadata=corpus.get_df()['Author - Year'],
                               width_in_pixels=600,
                               show_axes=False,
                               use_non_text_features=True,
                               use_full_doc=True,
                               projection=projection,
                               scores=scores,
                               show_top_terms=False,
                               save_svg_button=True
                              )

In [20]:
# Write the explorer to disk as UTF-8 bytes. The `with` block guarantees the
# file handle is flushed and closed even if the write fails; the original bare
# `open(...).write(...)` left closing to the garbage collector.
with open("Documents5.html", 'wb') as html_file:
    bytes_written = html_file.write(html.encode('utf-8'))
# Display the number of bytes written, as the original cell did.
bytes_written

352070