In [1]:
from os import listdir
from gensim import corpora, models, similarities

In [2]:
listdir('test/full/')

['diario_de_los_ninos_tomo_2_1840.txt',
 'encuentro.txt',
 'hechizo_de_oaxaca.txt',
 'nuevos_misterios_de_mexico.txt',
 'viejo_testamento.txt',
 'viera_da_silva.txt',
 'viollet-le-duc_tome_1.txt',
 'vitral.txt',
 'wifredo_lam.txt',
 'xavier_icaza_trayectoria.txt',
 'zuniga.txt',
 'zurbaran.txt']

In [6]:
import re
def clean_text(d):
    '''d debe ser un string en unicode'''
    d = re.sub(u'[^a-z0-9áéíóúñäëïöü]', ' ', d)
    d = re.sub(' +', ' ', d)
    d = re.sub(' ([^ ]{1,3} )+', ' ', d, )
    d = re.sub(' [^ ]*(.)\\1{2,}[^ ]* ', ' ', d)
    return d

class CorpusCleaner(object):
    def __init__(self, dir, outdir):
        '''dir tiene los documentos sucios y outdir contendrá los limpios'''
        self.dir = dir
        self.outdir = outdir
        self.dir_list = listdir(self.dir)
    
    def __iter__(self):
        for doc in self.dir_list:
            d = open(self.dir + '/' + doc).read().decode('utf-8').lower()
            d = clean_text(d)
            #yield doc + '\n---------\n' + d[0:100]
            yield self.outdir + '/' + doc, d
            
class CorpusIterator(object):
    def __init__(self, dir):
        '''dir debe contener los documentos limpios'''
        self.dir = dir
        self.dir_list = listdir(self.dir)
    
    def __iter__(self):
        for doc in self.dir_list:
            f = open(self.dir + '/' + doc)
            d = f.read().decode('utf-8')
            f.close()
            yield d

            

In [54]:
import codecs
corpus_cleaner = CorpusCleaner('../../data/full-txt/', 'test/clean')
for f, tx in corpus_cleaner:
    print f
    out_file = codecs.open(f, 'w', 'utf-8')
    out_file.write(tx)
    out_file.close()


test/clean/101_masterpieces_of_american_primative_painting.txt
test/clean/1200_years_of_italian_sculpture.txt
test/clean/12_artistas_donde_se_origina_el_arte_en_el_aire.txt
test/clean/12_dibujos_de_jose_maria_velasco.txt
test/clean/20_dibujos_mexicanos_de_maroto.txt
test/clean/25_estudios_de_folklore.txt
test/clean/300_anos_de_fraudes_en_el_comercio_de_antiguedades.txt
test/clean/330_grabados_originales_manuel_manilla.txt
test/clean/45_contemporary_mexican_artists.txt
test/clean/50_anos_de_danza_en_el_palacio_de_bellas_artes_1934_-_1984_vol._2.txt
test/clean/a_cien_anos_del_5_de_mayo_de_1862.txt
test/clean/a_grevin_le_monde_amusant.txt
test/clean/a_guide_to_mexican_art.txt
test/clean/a_la_politica_en_el_arte.txt
test/clean/a_new_history_of_photography.txt
test/clean/a_wall_to_paint_on.txt
test/clean/abraham_angel_y_su_tiempo.txt
test/clean/abstract_and_surrealist.txt
test/clean/acambaro_colonial.txt
test/clean/acapulco.txt
test/clean/accion_de_las_naciones_unidas_en_mexico.txt
test/cle

In [8]:
corpus_iterator = CorpusIterator('test/clean/')
for text in corpus_iterator:
    print text[:20]

 masterpieces americ
 kíssá jgübmj 1200 y
 artistas donde orig
 jose maria velasco 
 veinte dibujos impr
 estudios folklore e
 dónde acaba labor c
 éfli lmtó grabados 
 contemporary charlo
 palacio bellas arte
 piilitifiilsf iiaff
 monde amusant paris
artmrchitecturtl wit
 p0l3t3c eduardo tam
 iriilil1 mihlisli l
 frontispiece ione r
 cculturpo gobierno 
 ipii lililí flpl jg
 a11t1 mfxico docume
mexico biblioteca po
 acción naciones uni
 nacional arte cróna
il utrf attf jlíwljí
 serie volador joaqu
 african interest pr
 aguafoptistas vioda
 sobff uvufíík omcuí
 agustín lazo juan g
 visión histórica es
 util hroalgo obra m
 1990 1992 madrid co
l manuales ciencias 
12 diciembre 1531 di
ubum hempo perdido h
p ísta obra imprimía
 ifífe aviír srjk tr
i gonzalez rbyna gar
 orladu rnlálofío on
n alfredo zalee alfr
 michoacán 1996 2002
 juan farill solares
 ivíí xrfi tijil saf
 autor visiones prov
 almacen universal a
 itéqccx para comerc
 üalvadot lica félan
 liga escritores amé
 revista cont

In [34]:
# Generamos el diccionario
dictionary = corpora.Dictionary(doc.split() for doc in corpus_iterator)
once_ids = [tokenid for tokenid, docfreq in dictionary.dfs.iteritems() if docfreq <= 1]
dictionary.filter_tokens(once_ids) # remove stop words and words that appear only once
dictionary.compactify() # remove gaps in id sequence after words that were removed
dictionary.save('test/out/dictionary.dict')
print(dictionary)

Dictionary(972547 unique tokens: [u'conferuantur', u'nualart', u'bloqueos', u'maderista', u'alible']...)


In [35]:
# Guardamos el corpus vectorizado
corpus_bow = [dictionary.doc2bow(d.split()) for d in corpus_iterator]
corpora.MmCorpus.serialize('test/out/corpus.bow', corpus_bow)

In [36]:
# TF-IDF
dictionary = corpora.Dictionary.load('test/out/dictionary.dict')
corpus = corpora.MmCorpus('test/out/corpus.bow')
tfidf = models.TfidfModel(corpus)
index = similarities.MatrixSimilarity(tfidf[corpus])
index.save('test/out/example.index')



In [10]:
#dictionary = corpora.Dictionary.load('test/out/dictionary.dict')
#corpus = corpora.MmCorpus('test/out/corpus.bow')
#tfidf = models.TfidfModel(corpus)
#index = similarities.MatrixSimilarity.load('test/out/example.index')
doc = u'diego rivera y frida kahlo'
vec_bow = dictionary.doc2bow(doc.lower().split())
vec_tfidf = tfidf[vec_bow]
#vec_tfidf = tfidf[corpus[3]]
sims = sorted(enumerate(index[vec_tfidf]), key = lambda item: item[1], reverse=True)
#sims
[(i, corpus_iterator.dir_list[i], s) for i, s in sims[:10]]

[(525, 'el_pincel_de_la_angustia.txt', 0.79978663),
 (1281, 'teresa_del_conde_frida_kahlo.txt', 0.78909117),
 (1297, 'the_frida_kahlo_photographs.txt', 0.75656635),
 (671, 'frida_kahlo_rauda_jamis.txt', 0.68619716),
 (672, 'frida_kahlo_song_of_herself.txt', 0.63377595),
 (420, 'el_bano_de_frida_kahlo.txt', 0.61040503),
 (669, 'frida_kahlo_and_diego_rivera.txt', 0.52570051),
 (294,
  'configuracion_de_un_modelo_axiologico_para_la_critica_de_arte.txt',
  0.51556849),
 (542, 'el_ropero_de_frida.txt', 0.46197942),
 (670, 'frida_kahlo_das_gesamtwerk.txt', 0.43894216)]

In [81]:
for doc in tfidf[corpus[:3]]:
    print doc[:3]

[(1765, 0.1596380728102906), (2072, 0.0023899285951274616), (2264, 0.029824777462042568)]
[(105, 0.005509653511591907), (225, 0.0011634396236651899), (230, 0.0010732932592533197)]
[(1045, 0.0023174377860215125), (1519, 0.005310752826494951), (1639, 0.009418469091893696)]


In [87]:
sims = []
for doc in tfidf[corpus[:5]]:
    sim = sorted(enumerate(index[doc]), key = lambda item: item[1], reverse=True)
    sims.append(sim[:5])
    print sims[:5]

[[(0, 1.0000004), (49, 0.41182685), (48, 0.37594783), (373, 0.36813477), (325, 0.35876077)]]
[[(0, 1.0000004), (49, 0.41182685), (48, 0.37594783), (373, 0.36813477), (325, 0.35876077)], [(1, 0.99999857), (88, 0.73734629), (808, 0.65770662), (284, 0.64293075), (965, 0.64045703)]]
[[(0, 1.0000004), (49, 0.41182685), (48, 0.37594783), (373, 0.36813477), (325, 0.35876077)], [(1, 0.99999857), (88, 0.73734629), (808, 0.65770662), (284, 0.64293075), (965, 0.64045703)], [(2, 0.99999988), (1235, 0.47848839), (1120, 0.45816061), (1184, 0.35574362), (803, 0.31199992)]]
[[(0, 1.0000004), (49, 0.41182685), (48, 0.37594783), (373, 0.36813477), (325, 0.35876077)], [(1, 0.99999857), (88, 0.73734629), (808, 0.65770662), (284, 0.64293075), (965, 0.64045703)], [(2, 0.99999988), (1235, 0.47848839), (1120, 0.45816061), (1184, 0.35574362), (803, 0.31199992)], [(3, 1.0000001), (825, 0.43960726), (824, 0.39911795), (112, 0.23059526), (999, 0.2194667)]]
[[(0, 1.0000004), (49, 0.41182685), (48, 0.37594783), (37

In [1]:
import luigi

In [20]:
with luigi.LocalTarget('../luigi/test/full/zurbaran.txt').open('r') as f:
    t = f.read()
t[:100].decode('utf-8')

u'V )j\n  i\n\x0c\x0c\x0c_\n\x0c\x0c\x0c   -   u f /-aP   -/\n       \u2022 >/\xbfy\n\n\n\n\nZUREARAN\n\x0c\xa9 Ediciones Pol\xedgrafo, S. A.\n'

In [13]:
a = 'áü'
a.decode('utf-8')

u'\xe1\xfc'

In [24]:
a.decode()

u'aeio\xe4'

In [30]:
'älo'

'\xc3\xa4lo'

In [41]:
import unicodedata
def remove_accents(input_str):
    if type(input_str) is not unicode:
        input_str = unicode(input_str, 'utf-8')
    nkfd_form = unicodedata.normalize('NFKD', input_str)
    return u"".join([c for c in nkfd_form if not unicodedata.combining(c)])
a = u'áü'
remove_accents(a)

u'au'

In [36]:
unicode('áü', encoding='utf-8')

u'\xe1\xfc'

In [43]:
remove_accents('áéíóúäëïöüñ')

u'aeiouaeioun'

In [44]:
import pickle

In [46]:
with open('../luigi/test/models/dictionary.pickle', 'r') as f:
    d = pickle.load(f)

In [50]:
len(d.token2id)

76830