In [1]:
import nltk
import re
import math
from bs4 import BeautifulSoup
from pickle import dump, load
from nltk.corpus import cess_esp
from nltk.corpus import PlaintextCorpusReader
from nltk.corpus import stopwords

In [2]:
##############################################################
#					NORMALIZE TEXT
##############################################################

#Parameters: File path, encoding
#Return: Lower-cased text of the file with the HTML markup removed
#Notes: path = '/Users/27AGO2019/Desktop/AbiiSnn/GitHub/Natural-Language-Processing/corpus/e961024.htm'
def getText(corpusRoot, code):
    """Read an HTML file and return its text content in lower case.

    Parameters:
        corpusRoot: Path of the HTML file to read.
        code: Encoding used to decode the file (e.g. 'utf-8', 'latin-1').

    Return:
        The string produced by BeautifulSoup's get_text() (parsed with the
        'lxml' parser), converted to lower case.
    """
    # The context manager closes the file even if decoding fails mid-read.
    with open(corpusRoot, encoding=code) as f:
        markup = f.read()
    soup = BeautifulSoup(markup, 'lxml')
    return soup.get_text().lower()

In [3]:
#Parameters: Text
#Return: List of original tokens
def getTokens(text):
    """Tokenize *text* with NLTK's word tokenizer.

    Parameters:
        text: String to split into tokens.

    Return:
        Whatever nltk.word_tokenize(text) returns (the token list).
    """
    return nltk.word_tokenize(text)

#Parameters: List of normalized tokens
#Return: Sorted list of the unique tokens (the vocabulary)
def getVocabulary(tokens):
    """Return the distinct tokens in ascending order.

    Parameters:
        tokens: Iterable of hashable, mutually comparable tokens.

    Return:
        A sorted list (not a set) with one entry per distinct token.
    """
    unique = list(set(tokens))
    unique.sort()
    return unique

In [4]:
#Parameters: List of (token, tag) tuples
#Return: List of (letters-only token, first letter of the tag in lower case) tuples
def getCleanTokensTags(tokens):
    """Keep only the letters of each token and reduce its tag to one letter.

    Parameters:
        tokens: Iterable of (token, tag) pairs.

    Return:
        List of (letters, tagLetter) tuples. ``letters`` is the token with
        every character that is not a Spanish/ASCII letter removed; tokens
        that end up empty are dropped. ``tagLetter`` is the first character
        of the tag in lower case, or 'v' when the tag is empty or None.
    """
    letters = re.compile(r'[a-záéíóúñüA-ZÁÉÍÓÚÜÑ]')
    clean = []
    for token in tokens:
        letterToken = ''.join(letters.findall(token[0]))
        if letterToken == '':
            continue
        # Each token gets its own tag. A missing tag falls back to 'v', the
        # same default used by getTag and by DefaultTagger('v') in this file,
        # instead of silently reusing the previous token's tag.
        tag = token[1][0].lower() if token[1] else 'v'
        clean.append((letterToken, tag))
    return clean

In [5]:
#Parameters: List of clean tokens, language of stopwords
#Return: List of tokens without stopwords
def removeStopwords(tokens, language):
    """Drop the pairs whose word is an NLTK stopword of *language*.

    Parameters:
        tokens: Iterable of (word, tag) pairs.
        language: Language name passed to stopwords.words (e.g. 'spanish').

    Return:
        List of (word, tag) tuples whose word is not a stopword, in the
        original order.
    """
    # A set makes each membership test O(1) instead of scanning the list.
    sw = set(stopwords.words(language))
    return [(tok[0], tok[1]) for tok in tokens if tok[0] not in sw]

In [6]:
##############################################################
#						TAGGING
##############################################################
def make_and_save_combined_tagger(fname):
    """Train a back-off tagger on the CESS-ESP corpus and pickle it to *fname*.

    The chain is: UnigramTagger trained on cess_esp.tagged_sents(), backing
    off to a RegexpTagger with word-ending patterns, backing off to
    DefaultTagger('v').

    Parameters:
        fname: Path of the pickle file to write (opened in binary mode).

    Return:
        Nothing.
    """
    default_tagger = nltk.DefaultTagger('v')
    patterns = [
        (r'.*o$', 'n'),   # noun masculine singular
        (r'.*os$', 'n'),  # noun masculine plural
        (r'.*a$', 'n'),   # noun feminine singular
        (r'.*as$', 'n'),  # noun feminine plural
    ]
    regexp_tagger = nltk.RegexpTagger(patterns, backoff=default_tagger)
    cess_tagged_sents = cess_esp.tagged_sents()
    combined_tagger = nltk.UnigramTagger(cess_tagged_sents, backoff=regexp_tagger)

    # -1 selects the highest pickle protocol; the context manager closes the
    # file even if pickling fails.
    with open(fname, 'wb') as output:
        dump(combined_tagger, output, -1)

In [7]:
def tag(fname, text):
    """Tag *text* with the tagger pickled in *fname*.

    Parameters:
        fname: Path of a pickle file holding an object with a tag() method.
        text: Value passed unchanged to the tagger's tag() method.

    Return:
        The result of tagger.tag(text).
    """
    # NOTE: unpickling executes code chosen by whoever wrote the file; only
    # load pickles produced by this project.
    with open(fname, 'rb') as tagger_file:
        tagger = load(tagger_file)
    return tagger.tag(text)

In [8]:
##############################################################
#				     		LEMMAS
##############################################################
def getWord(word):
    """Return *word* with every '#' character removed.

    Parameters:
        word: String (or iterable of strings) to clean.

    Return:
        The concatenation of all elements of *word* that are not '#'.
    """
    return ''.join(char for char in word if char != '#')

def getTag(word):
    """Return the first character of *word* in lower case.

    Parameters:
        word: Tag string.

    Return:
        word[0].lower(), or 'v' when *word* is empty.
    """
    first = word[0] if len(word) > 0 else 'v'
    return first.lower()

In [9]:

# Return: Dictionary 
def createDicLemmas(tokensLemmas):
    """Build a (word, tag letter) -> lemma dictionary from a flat field list.

    Parameters:
        tokensLemmas: Sequence read three items at a time as
            word, tag, lemma. Trailing items that do not complete a
            triple are ignored.

    Return:
        Dictionary keyed by (word, tag[0].lower()) whose value is the lemma.
        When a key repeats, the last triple wins.
    """
    words = tokensLemmas[0::3]
    tags = tokensLemmas[1::3]
    bases = tokensLemmas[2::3]
    # zip stops at the shortest slice, i.e. at the last complete triple.
    return {
        (word, tag[0].lower()): base
        for word, tag, base in zip(words, tags, bases)
    }

def lemmatizeText(tokens, lemmas):
    """Replace each word by its lemma when the (word, tag) pair is known.

    Parameters:
        tokens: Iterable of (word, tag) tuples.
        lemmas: Dictionary keyed by the whole (word, tag) tuple.

    Return:
        List of (lemma, tag) tuples; the word itself is kept when the pair
        is not a key of *lemmas*.
    """
    return [(lemmas.get(token, token[0]), token[1]) for token in tokens]

In [10]:
##############################################################
#						CONTEXT
##############################################################

#Parameters: clean Tokens, vocabulary
#Return: Map of positions of every word in vocabulary
def initializeContext(tokens, vocabulary, lemmas):
    """Record, for every vocabulary entry, the positions where it occurs.

    Parameters:
        tokens: Sequence of tokens; each one must be present in *vocabulary*.
        vocabulary: Iterable of the distinct tokens.
        lemmas: Not used by this function.

    Return:
        Dictionary mapping each vocabulary entry to the ascending list of
        indices at which it appears in *tokens*.
    """
    contexto = {word: [] for word in vocabulary}
    for index, token in enumerate(tokens):
        contexto[token].append(index)
    return contexto

In [11]:
#Parameters: Position of the word, size of window
#Return: Position of begin
def leftContextPosition(pos, window):
    """Return the index where the left context of position *pos* begins.

    Parameters:
        pos: Index of the word.
        window: Number of positions to look back.

    Return:
        pos - window, or 0 when that would be negative.
    """
    return max(pos - window, 0)

#Parameters: Position of the word, size of window, number of tokens in the text
#Return: Position of end (limited by the length of the text)
def rightContextPosition(pos, window, n):
    """Return the exclusive end index of the right context of *pos*.

    Parameters:
        pos: Index of the word.
        window: Size of the context window.
        n: Number of tokens in the text.

    Return:
        pos + window, limited to n.

    getContext uses the result as the stop value of range(pos + 1, stop), so
    the limit is n (one past the last token). The previous limit of n - 1
    dropped the last token whenever pos + window exceeded n, although the
    same token was included when pos + window was exactly n.
    """
    # NOTE(review): with an exclusive stop of pos + window the right side
    # covers window - 1 tokens while the left side covers window tokens;
    # confirm whether that asymmetry is intended.
    return min(pos + window, n)

In [12]:
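#Parameters: Token, map of token -> positions, size of window, list of all tokens, vocabulary
#Return: List with the context tokens found around every occurrence of the token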
def getContext(token, positions, window, originalText, vocabulary):
    """Collect the tokens surrounding every occurrence of *token*.

    Parameters:
        token: Entry to look up in *positions*.
        positions: Dictionary mapping a token to the indices where it occurs.
        window: Window size handed to leftContextPosition and
            rightContextPosition.
        originalText: Indexable sequence of all tokens.
        vocabulary: Not used by this function.

    Return:
        One flat list with, for each occurrence, the tokens from the left
        boundary up to (not including) the occurrence, followed by the
        tokens after it up to (not including) the right boundary. The
        list is empty when *token* is not a key of *positions*.
    """
    context = []
    if token not in positions:
        return context

    textLength = len(originalText)
    for pos in positions[token]:
        start = leftContextPosition(pos, window)
        stop = rightContextPosition(pos, window, textLength)
        # The occurrence itself (index pos) is skipped.
        context.extend(originalText[i] for i in range(start, pos))
        context.extend(originalText[i] for i in range(pos + 1, stop))
    return context

In [13]:
##############################################################
#				VECTOR OPERATIONS
##############################################################

#Parameters: vector a, vector b
#Return: Number, dot product of a and b
#Notes: Need vectors of the same size
def pointProduct(a, b):
    """Return the dot product of vectors *a* and *b*.

    Parameters:
        a: Sequence of numbers; its length drives the iteration.
        b: Sequence of numbers with at least len(a) elements.

    Return:
        Sum of a[i] * b[i] over the indices of *a* (0 for an empty *a*).
    """
    total = 0
    # Plain left-to-right accumulation keeps floating-point results stable.
    for i, left in enumerate(a):
        total += left * b[i]
    return total

#Parameters: vector of int
#Return: mag of a vector
def mag(v):
    """Return the Euclidean magnitude of vector *v*.

    Parameters:
        v: Sequence of numbers.

    Return:
        Square root of the sum of the squared components (0.0 when empty).
    """
    squares = 0
    for component in v:
        squares += component * component
    return math.sqrt(squares)

In [14]:
##############################################################
#						FREQUENCY
##############################################################
def sumElements(vector):
    """Add up the elements of *vector* from left to right.

    Parameters:
        vector: Iterable of numbers.

    Return:
        The accumulated total, starting from 0.
    """
    total = 0
    for value in vector:
        total = total + value
    return total

def getDocumentFrecuency(vocabulary, contexts):
    """Count in how many contexts each vocabulary term appears.

    Parameters:
        vocabulary: Iterable of terms; every one starts with a count of 0.
        contexts: Dictionary whose values are lists of context words. Every
            context word must be in *vocabulary* (KeyError otherwise).

    Return:
        Dictionary term -> number of contexts that contain the term.

    Each context counts at most once per word. Counting every repeated
    occurrence would let the value grow beyond the number of contexts,
    which makes log((len(vocabulary) + 1) / documentFreq) in getFrecuency
    negative for frequent words.
    """
    documentFreq = {term: 0 for term in vocabulary}
    for words in contexts.values():
        # set() removes repetitions inside a single context.
        for word in set(words):
            documentFreq[word] += 1
    return documentFreq

In [15]:
def getIndexTuples(vocabulary):
    """Map every vocabulary term to its position in *vocabulary*.

    Parameters:
        vocabulary: Iterable of hashable terms.

    Return:
        Dictionary term -> 0-based position (the last position wins if a
        term repeats).
    """
    return {term: position for position, term in enumerate(vocabulary)}

In [16]:
def getFrecuency(vocabulary, contexts, b, k, documentFreq, lemmas, indexTuples):
    """Build a BM25-weighted, IDF-scaled context vector for every term.

    Steps, per vocabulary term:
      1. Count the words of contexts[term] into a vector indexed through
         indexTuples.
      2. Turn each count t into ((k + 1) * t) / (t + k * (1 - b + b * len / avdl)),
         where len is the sum of the term's counts and avdl the average of
         those sums over the vocabulary.
      3. Divide by the sum of the step-2 values and multiply component i by
         log((len(vocabulary) + 1) / documentFreq[vocabulary[i]])
         (0 when that document frequency is 0).

    Parameters:
        vocabulary: Sequence of terms.
        contexts: Dictionary term -> list of context words.
        b, k: BM25 parameters.
        documentFreq: Dictionary term -> document frequency.
        lemmas: Not used by this function.
        indexTuples: Dictionary word -> vector index.

    Return:
        Dictionary term -> list with len(vocabulary) weights. An empty
        vocabulary yields an empty dictionary, and terms whose weights
        cannot be normalised (all zero) get a vector of zeros.

    Raises:
        KeyError when a term or context word is missing from contexts,
        documentFreq or indexTuples.
    """
    size = len(vocabulary)
    if size == 0:
        # Nothing to weight; also avoids dividing by zero for avdl.
        return {}

    # Step 1: raw counts. The vectors keep the original length of size + 1;
    # the extra trailing slot is never read when the result is built.
    counts = {}
    for term in vocabulary:
        row = [0] * (size + 1)
        for word in contexts[term]:
            row[indexTuples[word]] += 1
        counts[term] = row

    # Counts are integers, so the built-in sum is exact here.
    totalCount = sum(sum(row) for row in counts.values())
    avdl = totalCount / size

    idf = []
    for term in vocabulary:
        weight = 0
        if documentFreq[term] != 0:
            weight = math.log((size + 1) / documentFreq[term])
        idf.append(weight)

    finalFrecuency = {}
    for term in vocabulary:
        row = counts[term]
        length = sum(row)
        # avdl is 0 only when every context is empty; all counts are 0 then,
        # so the ratio is irrelevant and 0 avoids a ZeroDivisionError.
        ratio = (b * length) / avdl if avdl != 0 else 0
        # Loop-invariant part of the BM25 denominator.
        norm = k * (1 - b + ratio)

        # Step 2, accumulating the total left to right in the same pass.
        bm25 = []
        bm25Total = 0
        for t in row:
            den = t + norm
            value = ((k + 1) * t) / den if den != 0 else 0
            bm25.append(value)
            bm25Total = bm25Total + value

        # Step 3.
        if bm25Total != 0:
            vector = [(bm25[i] / bm25Total) * idf[i] for i in range(size)]
        else:
            vector = [0] * size
        finalFrecuency[term] = vector

    return finalFrecuency

In [17]:
##############################################################
#						SIMILARITY
##############################################################
def getSimilitud(vocabulary, vectors, lemma):
    """Compute the cosine similarity between *lemma* and every term.

    Parameters:
        vocabulary: Iterable of terms to compare against.
        vectors: Dictionary term -> numeric vector.
        lemma: Term whose vector is the reference.

    Return:
        Dictionary term -> cosine of the angle between the two vectors. The
        value is 0 when the term has no vector or either vector has zero
        magnitude. An empty dictionary is returned when *lemma* itself has
        no vector.
    """
    similitud = {}
    if lemma not in vectors:
        return similitud

    v = vectors[lemma]
    # The reference magnitude does not change inside the loop.
    normV = mag(v)
    for term in vocabulary:
        similitud[term] = 0
        if normV == 0 or term not in vectors:
            continue
        vec = vectors[term]
        normVec = mag(vec)
        if normVec != 0:
            similitud[term] = pointProduct(v, vec) / (normV * normVec)
    return similitud

def getWords(fpath, code):
    """Read a text file and split it on whitespace.

    Parameters:
        fpath: Path of the file to read.
        code: Encoding used to decode the file (e.g. 'utf-8', 'latin-1').

    Return:
        List of the whitespace-separated fields of the file, in order.
    """
    # The context manager closes the file even if decoding fails mid-read.
    with open(fpath, encoding=code) as f:
        text = f.read()
    # str.split() with no argument already splits on any run of whitespace,
    # so no prior substitution is needed.
    return text.split()

def filtredSimilitud(similitud, word):
    """Keep only the entries whose tag equals the tag of *word*.

    Parameters:
        similitud: Dictionary keyed by (term, tag) tuples.
        word: (term, tag) tuple; only its second element is compared.

    Return:
        New dictionary with the entries of *similitud* whose key has the
        same second element as *word*.
    """
    return {key: score for key, score in similitud.items() if key[1] == word[1]}

In [18]:
##############################################################
#						CREATE FILE
##############################################################

#Parameters: Output file path, iterable of vocabulary words (strings)
#Return: Nothing
def createFile(path, vocabulary):
    """Write every word of *vocabulary* to *path*, one per line.

    Parameters:
        path: File to create or overwrite (text mode).
        vocabulary: Iterable of strings.

    Return:
        Nothing.
    """
    # The context manager closes the file even if a write fails.
    with open(path, 'w') as f:
        f.writelines(word + '\n' for word in vocabulary)

#Parameters: Output file path, iterable of items to write (one per line)
#Return: Nothing
def createFileDic(path, l):
    """Write str(item) for every item of *l* to *path*, one per line.

    Parameters:
        path: File to create or overwrite (text mode).
        l: Iterable of objects.

    Return:
        Nothing.
    """
    # The context manager closes the file even if a write fails.
    with open(path, 'w') as f:
        for item in l:
            f.write(str(item) + '\n')

def printContext(context):
    """Print each row of *context* on its own line.

    Parameters:
        context: Sequence of rows, each a sequence of strings.

    Return:
        Nothing. Every element is followed by a single space, so each
        printed line ends with a trailing space unless the row is empty.
    """
    for row in context:
        print(''.join(piece + " " for piece in row))

def printDictionary(dic, n):
    """Print the leading entries of *dic* as "key value" lines.

    Parameters:
        dic: Dictionary to print, in iteration order.
        n: Limit; printing stops once more than n entries have been shown,
            so up to n + 1 entries are printed.

    Return:
        Nothing.
    """
    # NOTE(review): the n + 1 count is the existing behaviour; confirm
    # whether exactly n entries were intended.
    for shown, key in enumerate(dic, start=1):
        print(key, dic[key])
        if shown > n:
            break

def makePKL(fname, aux):
    """Pickle *aux* into the file *fname*.

    Parameters:
        fname: Path of the file to create or overwrite (binary mode).
        aux: Object to serialise.

    Return:
        Nothing.
    """
    # -1 selects the highest pickle protocol; the context manager closes
    # the file even if pickling fails.
    with open(fname, 'wb') as output:
        dump(aux, output, -1)

def getPKL(fname):
    """Load and return the object pickled in *fname*.

    Parameters:
        fname: Path of the pickle file (opened in binary mode).

    Return:
        The unpickled object.
    """
    # NOTE: unpickling executes code chosen by whoever wrote the file; only
    # load pickles produced by this project.
    with open(fname, 'rb') as pickled:
        return load(pickled)

In [19]:

##############################################################
#						SYNTAGMATIC RELATION
##############################################################
def getSyntagmatic(vectors, vocabulary, word):
    """Rank the vocabulary by the weight it has in the vector of *word*.

    Parameters:
        vectors: Dictionary term -> sequence of weights, where position i
            belongs to vocabulary[i].
        vocabulary: Sequence of terms.
        word: Term whose vector is ranked.

    Return:
        List of (weight, term) tuples sorted in descending order, or an
        empty list when *word* has no vector.
    """
    if word not in vectors:
        return []
    weights = vectors[word]
    ranking = [(weights[i], vocabulary[i]) for i in range(len(vocabulary))]
    ranking.sort(reverse=True)
    return ranking

In [21]:
# Read the lemma source file as one flat list of whitespace-separated fields.
# The printed sample shows the fields repeating as: word, tag, lemma.
fpathLemmas = './Normalize/generateClean.txt'
code = 'ISO-8859-1'
textLemmas = getWords(fpathLemmas, code)
print(textLemmas[:100])

['a', 'SPS00', 'a', 'a-já', 'I', 'a-já', 'abad', 'NCMS000', 'abad', 'abades', 'NCMP000', 'abad', 'abadesa', 'NCFS000', 'abadesa', 'abadesas', 'NCFP000', 'abadesa', 'abajo', 'I', 'abajo', 'abajo', 'RG000', 'abajo', 'abalancé', 'V0IS1S0', 'abalanzar', 'abalance', 'V0SP1S0', 'abalanzar', 'abalances', 'V0SP2S0', 'abalanzar', 'abalance', 'V0SP3S0', 'abalanzar', 'abalancemos', 'V0SP1P0', 'abalanzar', 'abalancéis', 'V0SP2P0', 'abalanzar', 'abalancen', 'V0SP3P0', 'abalanzar', 'abalance', 'V0R03S0', 'abalanzar', 'abalánceme', 'V0R03S0', 'abalanzar', 'abaláncese', 'V0R03S0', 'abalanzar', 'abaláncelo', 'V0R03S0', 'abalanzar', 'abaláncela', 'V0R03S0', 'abalanzar', 'abaláncelos', 'V0R03S0', 'abalanzar', 'abaláncelas', 'V0R03S0', 'abalanzar', 'abaláncele', 'V0R03S0', 'abalanzar', 'abalánceles', 'V0R03S0', 'abalanzar', 'abaláncenos', 'V0R03S0', 'abalanzar', 'abaláncemelo', 'V0R03S0', 'abalanzar', 'abaláncemela', 'V0R03S0', 'abalanzar', 'abaláncemelos', 'V0R03S0', 'abalanzar', 'abaláncemelas', 'V0R03S

In [27]:
def getPOS(path="./Normalize/generateClean.txt", encoding="ISO-8859-1"):
    """Map the first field of every line of the lemma file to its second field.

    Parameters:
        path: File to read. Defaults to the lemma file used by this notebook.
        encoding: Encoding used to decode the file. Defaults to
            'ISO-8859-1', the encoding this notebook uses when it reads
            the same file with getWords.

    Return:
        Dictionary first field -> second field. When a first field repeats,
        the last line wins. Lines with fewer than two whitespace-separated
        fields are skipped.
    """
    posDict = {}
    with open(path, 'r', encoding=encoding) as f:
        for line in f:
            fields = line.split()
            # Blank or single-field lines carry no (word, tag) pair.
            if len(fields) >= 2:
                posDict[fields[0]] = fields[1]
    return posDict

In [28]:
def dumpToJson(obj, name):
    """Serialise *obj* as indented JSON into ./files/<name>.

    Parameters:
        obj: JSON-serialisable object.
        name: File name appended to the "./files/" directory prefix.

    Return:
        Nothing. The file is written as UTF-8 with a 4-space indent.
    """
    import json

    target = "./files/" + name
    with open(target, 'w', encoding="utf8") as handle:
        json.dump(obj, handle, indent=4)

In [29]:
# Build the word -> tag mapping from the lemma file (first two fields of each line).
pos = getPOS()

In [31]:
# Save the word -> tag mapping as ./files/generatePOS.json.
dumpToJson(pos, "generatePOS.json")

In [33]:
# Build the (word, tag letter) -> lemma lookup from the flat field list.
# The empty dict on the first line is replaced immediately by the result.
lemmas = {}
lemmas = createDicLemmas(textLemmas)

<class 'dict'>


In [34]:
# Show the first 100 ((word, tag letter), lemma) entries.
print(list(lemmas.items())[:100])

[(('a', 's'), 'a'), (('a-já', 'i'), 'a-já'), (('abad', 'n'), 'abad'), (('abades', 'n'), 'abad'), (('abadesa', 'n'), 'abadesa'), (('abadesas', 'n'), 'abadesa'), (('abajo', 'i'), 'abajo'), (('abajo', 'r'), 'abajo'), (('abalancé', 'v'), 'abalanzar'), (('abalance', 'v'), 'abalanzar'), (('abalances', 'v'), 'abalanzar'), (('abalancemos', 'v'), 'abalanzar'), (('abalancéis', 'v'), 'abalanzar'), (('abalancen', 'v'), 'abalanzar'), (('abalánceme', 'v'), 'abalanzar'), (('abaláncese', 'v'), 'abalanzar'), (('abaláncelo', 'v'), 'abalanzar'), (('abaláncela', 'v'), 'abalanzar'), (('abaláncelos', 'v'), 'abalanzar'), (('abaláncelas', 'v'), 'abalanzar'), (('abaláncele', 'v'), 'abalanzar'), (('abalánceles', 'v'), 'abalanzar'), (('abaláncenos', 'v'), 'abalanzar'), (('abaláncemelo', 'v'), 'abalanzar'), (('abaláncemela', 'v'), 'abalanzar'), (('abaláncemelos', 'v'), 'abalanzar'), (('abaláncemelas', 'v'), 'abalanzar'), (('abaláncemele', 'v'), 'abalanzar'), (('abaláncemeles', 'v'), 'abalanzar'), (('abalánceselo'

In [37]:
# HTML file to analyse and the encoding used to decode it.
fpath = './files/e961024.htm'
code = 'utf-8'

In [38]:
textSource = getText(fpath, code)  # Lower-cased text with the HTML markup removed
tokensHtml = getTokens(textSource) # Tokens of that text (no HTML tags)
# Tokens at this point still include stopwords and punctuation.
print(tokensHtml[:100])

['e961024_mod.htm', 'http', ':', '//www.excelsior.com.mx/9610/961024/art01.html', 'excelsior', ':', 'editorial', 'jueves', '24', 'de', 'octubre', 'de', '1996', 'epigrama', 'jorge', 'mansilla', 'torres', 'critica', 'el', 'miami', 'herald', 'al', 'presidente', 'ecuatoriano', ',', 'por', 'autoproclamarse', 'loco', '.', 'el', 'neoliberalismo', 'que', 'se', 'aplica', 'sin', 'encomio', 'hace', 'que', 'sean', 'lo', 'mismo', 'un', 'país', 'y', 'un', 'manicomio', '.', 'editorial', '-', 'nota', 'siguiente', 'http', ':', '//www.excelsior.com.mx/9610/961024/art02.html', 'excelsior', ':', 'editorial', 'jueves', '24', 'de', 'octubre', 'de', '1996', 'hungría', 'la', 'rebelión', 'antiestalinista', 'de', '1956', 'oscar', 'gonzalez', 'lopez', 'en', 'el', 'curso', 'de', 'octubre', 'de', '1956', ',', 'una', 'insurgencia', 'popular', 'comandada', 'por', 'estudiantes', ',', 'intelectuales', 'y', 'obreros', 'partidarios', 'de', 'establecer', 'en', 'suelo', 'húngaro', 'un', 'régimen', 'socialista', 'con']


In [39]:
# Tagging: load the pickled tagger and tag every token.
fcombinedTagger = './Normalize/combined_tagger.pkl'
# Uncomment the next line to train the tagger and write the pickle; tag()
# needs that file to exist.
#make_and_save_combined_tagger(fcombinedTagger)
textTagged = tag(fcombinedTagger, tokensHtml)
print(textTagged[:100])

[('e961024_mod.htm', 'v'), ('http', 'v'), (':', 'Fd'), ('//www.excelsior.com.mx/9610/961024/art01.html', 'v'), ('excelsior', 'v'), (':', 'Fd'), ('editorial', 'ncms000'), ('jueves', 'W'), ('24', 'Z'), ('de', 'sps00'), ('octubre', 'W'), ('de', 'sps00'), ('1996', 'W'), ('epigrama', 'n'), ('jorge', 'v'), ('mansilla', 'n'), ('torres', 'v'), ('critica', 'vmip3s0'), ('el', 'da0ms0'), ('miami', 'v'), ('herald', 'v'), ('al', 'spcms'), ('presidente', 'ncms000'), ('ecuatoriano', 'aq0ms0'), (',', 'Fc'), ('por', 'sps00'), ('autoproclamarse', 'v'), ('loco', 'aq0ms0'), ('.', 'Fp'), ('el', 'da0ms0'), ('neoliberalismo', 'n'), ('que', 'pr0cn000'), ('se', 'p0300000'), ('aplica', 'vmip3s0'), ('sin', 'sps00'), ('encomio', 'n'), ('hace', 'vmip3s0'), ('que', 'pr0cn000'), ('sean', 'vssp3p0'), ('lo', 'da0ns0'), ('mismo', 'di0ms0'), ('un', 'di0ms0'), ('país', 'ncms000'), ('y', 'cc'), ('un', 'di0ms0'), ('manicomio', 'n'), ('.', 'Fp'), ('editorial', 'ncms000'), ('-', 'Fg'), ('nota', 'ncfs000'), ('siguiente', 'aq0

In [41]:
# Keep only letters in each token and reduce each tag to its first letter
# in lower case; tokens left without letters are dropped.
cleanTokens = getCleanTokensTags(textTagged)
print(cleanTokens[:100])

[('emodhtm', 'v'), ('http', 'v'), ('wwwexcelsiorcommxarthtml', 'v'), ('excelsior', 'v'), ('editorial', 'n'), ('jueves', 'w'), ('de', 's'), ('octubre', 'w'), ('de', 's'), ('epigrama', 'n'), ('jorge', 'v'), ('mansilla', 'n'), ('torres', 'v'), ('critica', 'v'), ('el', 'd'), ('miami', 'v'), ('herald', 'v'), ('al', 's'), ('presidente', 'n'), ('ecuatoriano', 'a'), ('por', 's'), ('autoproclamarse', 'v'), ('loco', 'a'), ('el', 'd'), ('neoliberalismo', 'n'), ('que', 'p'), ('se', 'p'), ('aplica', 'v'), ('sin', 's'), ('encomio', 'n'), ('hace', 'v'), ('que', 'p'), ('sean', 'v'), ('lo', 'd'), ('mismo', 'd'), ('un', 'd'), ('país', 'n'), ('y', 'c'), ('un', 'd'), ('manicomio', 'n'), ('editorial', 'n'), ('nota', 'n'), ('siguiente', 'a'), ('http', 'v'), ('wwwexcelsiorcommxarthtml', 'v'), ('excelsior', 'v'), ('editorial', 'n'), ('jueves', 'w'), ('de', 's'), ('octubre', 'w'), ('de', 's'), ('hungría', 'n'), ('la', 'd'), ('rebelión', 'n'), ('antiestalinista', 'n'), ('de', 's'), ('oscar', 'v'), ('gonzalez', 

In [42]:
# Drop the (word, tag) pairs whose word is a Spanish stopword.
language = 'spanish'
tokens = removeStopwords(cleanTokens, language)
print(tokens[:100])

[('emodhtm', 'v'), ('http', 'v'), ('wwwexcelsiorcommxarthtml', 'v'), ('excelsior', 'v'), ('editorial', 'n'), ('jueves', 'w'), ('octubre', 'w'), ('epigrama', 'n'), ('jorge', 'v'), ('mansilla', 'n'), ('torres', 'v'), ('critica', 'v'), ('miami', 'v'), ('herald', 'v'), ('presidente', 'n'), ('ecuatoriano', 'a'), ('autoproclamarse', 'v'), ('loco', 'a'), ('neoliberalismo', 'n'), ('aplica', 'v'), ('encomio', 'n'), ('hace', 'v'), ('mismo', 'd'), ('país', 'n'), ('manicomio', 'n'), ('editorial', 'n'), ('nota', 'n'), ('siguiente', 'a'), ('http', 'v'), ('wwwexcelsiorcommxarthtml', 'v'), ('excelsior', 'v'), ('editorial', 'n'), ('jueves', 'w'), ('octubre', 'w'), ('hungría', 'n'), ('rebelión', 'n'), ('antiestalinista', 'n'), ('oscar', 'v'), ('gonzalez', 'v'), ('lopez', 'v'), ('curso', 'n'), ('octubre', 'w'), ('insurgencia', 'n'), ('popular', 'a'), ('comandada', 'n'), ('estudiantes', 'n'), ('intelectuales', 'n'), ('obreros', 'n'), ('partidarios', 'n'), ('establecer', 'v'), ('suelo', 'n'), ('húngaro', '

In [43]:
# Lemmatize text: replace each word by its lemma when the (word, tag) pair
# is a key of lemmas; other words are kept unchanged.
tokens = lemmatizeText(tokens, lemmas)
print(tokens[:100])

[('emodhtm', 'v'), ('http', 'v'), ('wwwexcelsiorcommxarthtml', 'v'), ('excelsior', 'v'), ('editorial', 'n'), ('jueves', 'w'), ('octubre', 'w'), ('epigrama', 'n'), ('jorge', 'v'), ('mansilla', 'n'), ('torres', 'v'), ('criticar', 'v'), ('miami', 'v'), ('herald', 'v'), ('presidente', 'n'), ('ecuatoriano', 'a'), ('autoproclamarse', 'v'), ('loco', 'a'), ('neoliberalismo', 'n'), ('aplicar', 'v'), ('encomio', 'n'), ('hacer', 'v'), ('mismo', 'd'), ('país', 'n'), ('manicomio', 'n'), ('editorial', 'n'), ('nota', 'n'), ('siguiente', 'a'), ('http', 'v'), ('wwwexcelsiorcommxarthtml', 'v'), ('excelsior', 'v'), ('editorial', 'n'), ('jueves', 'w'), ('octubre', 'w'), ('hungría', 'n'), ('rebelión', 'n'), ('antiestalinista', 'n'), ('oscar', 'v'), ('gonzalez', 'v'), ('lopez', 'v'), ('curso', 'n'), ('octubre', 'w'), ('insurgencia', 'n'), ('popular', 'a'), ('comandada', 'n'), ('estudiante', 'n'), ('intelectuales', 'n'), ('obrero', 'n'), ('partidario', 'n'), ('establecer', 'v'), ('suelo', 'n'), ('húngaro', '

In [44]:
# Sorted list of the distinct (lemma, tag) pairs.
vocabulary = getVocabulary(tokens)
print("vocabulary:")
print(vocabulary[3100:3200])

vocabulary:
[('gasoducto', 'n'), ('gasolina', 'n'), ('gasolinerías', 'n'), ('gastar', 'v'), ('gasto', 'n'), ('gato', 'n'), ('genaro', 'n'), ('generable', 'v'), ('generación', 'n'), ('generador', 'a'), ('general', 'a'), ('generalizada', 'a'), ('generalizarlo', 'n'), ('generalmente', 'r'), ('generar', 'v'), ('generis', 'v'), ('gente', 'n'), ('genéricas', 'n'), ('genético', 'a'), ('geográfico', 'a'), ('ger', 'v'), ('gerardo', 'n'), ('gerencia', 'n'), ('gerente', 'n'), ('geriatría', 'n'), ('gerstenzang', 'v'), ('gestar', 'v'), ('gestión', 'n'), ('gesto', 'n'), ('gigante', 'n'), ('gigantesca', 'n'), ('gilberto', 'n'), ('giordano', 'n'), ('gir', 'v'), ('gira', 'n'), ('girardi', 'v'), ('girolamo', 'n'), ('glaciares', 'v'), ('global', 'a'), ('globalizacion', 'v'), ('globalización', 'n'), ('globalizador', 'v'), ('globalizante', 'v'), ('globito', 'n'), ('glándula', 'n'), ('gméxico', 'n'), ('gobbée', 'v'), ('gobernación', 'v'), ('gobernador', 'n'), ('gobernadores', 'v'), ('gobernar', 'v'), ('gobe

In [46]:
# Map every vocabulary entry to the indices where it occurs in tokens.
positions = initializeContext(tokens, vocabulary, lemmas) #Initialize Context
# With n = 10 printDictionary shows 11 entries (it stops after n + 1).
printDictionary(positions, 10)

('abad', 'v') [9673]
('abajo', 'n') [3277]
('abajo', 'r') [21353]
('abanderada', 'n') [8001]
('abandonar', 'v') [17224, 26183]
('abandonarnos', 'n') [496]
('abarcar', 'v') [16256]
('abascadocarranza', 'n') [26567]
('abastecer', 'v') [18449, 29188, 29250]
('abasteciera', 'n') [11866]
('abastecimiento', 'n') [6293, 6338, 27191, 27236]


In [47]:
# Collect, for every vocabulary entry, the tokens around each of its
# occurrences using a window size of 4.
contexts = {}
for term in vocabulary:
	contexts[term] = getContext(term, positions, 4, tokens, vocabulary)

In [48]:
# Show the first 100 (term, context tokens) entries.
print(list(contexts.items())[:100])

[(('abad', 'v'), [('propio', 'd'), ('deshacer', 'v'), ('molestar', 'v'), ('injerencia', 'n'), ('emérito', 'n'), ('guadalupe', 'v'), ('contexto', 'n')]), (('abajo', 'n'), [('raúl', 'v'), ('castellano', 'n'), ('explicar', 'v'), ('pleno', 'a'), ('tribuna', 'v'), ('sorpresivo', 'n'), ('voto', 'n')]), (('abajo', 'r'), [('ingreso', 'n'), ('minoría', 'n'), ('empujadas', 'n'), ('aún', 'r'), ('deber', 'v'), ('inmigrantes', 'n'), ('ocupar', 'v')]), (('abanderada', 'n'), [('tomar', 'v'), ('contar', 'v'), ('aquellas', 'd'), ('corriente', 'n'), ('poderoso', 'a'), ('irresistible', 'a'), ('dogma', 'n')]), (('abandonar', 'v'), [('lanzar', 'v'), ('regreso', 'n'), ('atribulados', 'n'), ('feligreses', 'v'), ('rebaño', 'n'), ('atraídos', 'n'), ('hedonismo', 'n'), ('carácter', 'n'), ('religioso', 'a'), ('necesario', 'a'), ('añadir', 'v'), ('postura', 'n'), ('intransigentes', 'v'), ('confrontación', 'n')]), (('abandonarnos', 'n'), [('dejar', 'v'), ('comer', 'v'), ('tener', 'v'), ('hijo', 'n'), ('morirnos', 

In [50]:
# Getting the document frequency of each word, treating each term's context
# as one document. The empty dict is replaced immediately by the result.
documentFreq = {}
documentFreq = getDocumentFrecuency(vocabulary, contexts)

In [52]:
# Map every vocabulary entry to its position in the vocabulary list; the
# position is the entry's index in the vectors built later.
indexTuples = {}
indexTuples = getIndexTuples(vocabulary)

In [53]:
# Build the weighted context vector of every vocabulary entry.
# b and k are the BM25 parameters handed to getFrecuency.
vectors = {}
b = 0.75
k = 1.2
vectors = getFrecuency(vocabulary, contexts, b, k, documentFreq, lemmas, indexTuples)

In [54]:
# Show the first 20 (term, vector) entries.
print(list(vectors.items())[:20])

[(('abad', 'v'), [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.