# Ejemplo de uso de modelos entrenados.

In [1]:
from __future__ import print_function

import os
import subprocess
import sys

import nltk

from util.preprocessing import addCharInformation, createMatrices, addCasingInformation
from neuralnets.BiLSTM import BiLSTM

Using TensorFlow backend.


# Carga de modelo preentrenado.

In [3]:
# Saved model file to restore; kept in a named constant so it is easy to swap.
MODEL_PATH = "models/am/AM_TAG/0.3880_0.4147_12.h5"

# Build an empty tagger and load the saved model into it.
lstmModel = BiLSTM()
lstmModel.loadModel(MODEL_PATH)

# Carga de texto de prueba "test_.txt"

In [4]:
# File holding the text to tag. It may be plain text or a tab-separated
# (CoNLL-style) file; the latter is converted to plain text first.
input_path = "test_.txt"

with open(input_path, 'r') as f:
    text = f.read()

# A tab anywhere in the content is taken as the sign of a column-based file.
if "\t" in text:
    converted_path = "{}_txt".format(input_path)
    # Run the project's converter with the same interpreter as this kernel.
    # An argument list (no shell) avoids quoting problems, and check_call
    # raises if the converter fails instead of silently reading a stale or
    # missing output file.
    subprocess.check_call([
        sys.executable, 'util/conll_to_txt.py',
        '--input={}'.format(input_path),
        '--output={}'.format(converted_path),
        '--target_column={}'.format(1),
    ])
    with open(converted_path, 'r') as newf:
        text = newf.read()

# Preview the first 1000 characters of the text that will be tagged.
text[:1000]

'International tourism is now more common than ever before The last 50 years have seen a significant increase in the number of tourist traveling worldwide . While some might think the tourism bring large profit for the destination countries , I would contend that this industry has affected the cultural attributes and damaged the natural environment of the tourist destinations . Firstly , it is an undeniable fact that tourists from different cultures will probably cause changes to the cultural identity of the tourist destinations . Take Thailand for example , in the Vietnam War , many American soldiers came to Thailand for a break and involved in sexual and drug activities , these huge demands caused many local businesses opened and expanded , even illegally involved in under - age prostitutes to maximize their profits . This was due to the lack of adequate controls by authorities and lead to a bad image of Thailand tourism . Therefore this proves that international tourism can create n

# Preparación previa de nuestro texto a evaluar

In [5]:
# Keep every non-empty line of the text as one paragraph. A list is built
# explicitly because on Python 3 `filter` returns a one-shot iterator, which
# would leave `paragraphs` empty after its first use.
paragraphs = [line for line in text.splitlines() if line != '']

# One dict per paragraph, holding its NLTK word tokens under 'tokens'.
sentences = [{'tokens': nltk.word_tokenize(par)} for par in paragraphs]

# Preview the first ten tokens of the first paragraph.
sentences[0]['tokens'][:10]

['International',
 'tourism',
 'is',
 'now',
 'more',
 'common',
 'than',
 'ever',
 'before',
 'The']

In [7]:
# Enrich the sentence dicts with extra per-token features. The return values
# are not used; judging by the printed output the helpers add 'characters'
# and 'casing' entries to each sentence in place.
addCharInformation(sentences)
addCasingInformation(sentences)

# Show the first ten values of every feature of the first sentence.
for featureName, featureValues in sentences[0].items():
    print(featureName, featureValues[:10])


tokens ['International', 'tourism', 'is', 'now', 'more', 'common', 'than', 'ever', 'before', 'The']
casing ['initialUpper', 'allLower', 'allLower', 'allLower', 'allLower', 'allLower', 'allLower', 'allLower', 'allLower', 'initialUpper']
characters [['I', 'n', 't', 'e', 'r', 'n', 'a', 't', 'i', 'o', 'n', 'a', 'l'], ['t', 'o', 'u', 'r', 'i', 's', 'm'], ['i', 's'], ['n', 'o', 'w'], ['m', 'o', 'r', 'e'], ['c', 'o', 'm', 'm', 'o', 'n'], ['t', 'h', 'a', 'n'], ['e', 'v', 'e', 'r'], ['b', 'e', 'f', 'o', 'r', 'e'], ['T', 'h', 'e']]


In [8]:
# Turn the sentence features into the model's input representation using the
# mappings stored on the loaded model (presumably feature value -> integer id;
# the printed output shows integer ids for tokens, casing and characters).
dataMatrix = createMatrices(sentences, lstmModel.mappings)

# Show the first ten entries of every field of the first sentence.
for fieldName, fieldValues in dataMatrix[0].items():
    print(fieldName, fieldValues[:10])

AM_TAG []
casing [6, 4, 4, 4, 4, 4, 4, 4, 4, 6]
raw_tokens ['International', 'tourism', 'is', 'now', 'more', 'common', 'than', 'ever', 'before', 'The']
label []
tokens [444, 5023, 5, 314, 88, 823, 246, 1681, 367, 2]
characters [[47, 26, 32, 17, 30, 26, 13, 32, 21, 27, 26, 13, 24], [32, 27, 33, 30, 21, 31, 25], [21, 31], [26, 27, 35], [25, 27, 30, 17], [15, 27, 25, 25, 27, 26], [32, 20, 13, 26], [17, 34, 17, 30], [14, 17, 18, 27, 30, 17], [58, 20, 17]]


# Ahora sí, usemos nuestro modelo para identificar argumentos

In [11]:
# Run the loaded model over the prepared input; `tags` is indexed by sentence.
tags = lstmModel.tagSentences(dataMatrix)

# Preview the predicted labels for the first 50 tokens of the first sentence.
firstSentenceTags = tags[0]
firstSentenceTags[:50]

[u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'O',
 u'MajorClaim',
 u'MajorClaim',
 u'MajorClaim',
 u'MajorClaim',
 u'MajorClaim',
 u'MajorClaim']

# Veamos los resultados humanizados

In [16]:
# Print "token<TAB>tag" for the first 50 tokens of the first two sentences,
# with a blank line after each sentence.
for sentenceIdx, sentence in enumerate(sentences[:2]):
    tokens = sentence['tokens']
    tokenTags = tags[sentenceIdx]
    # Tags are looked up by position so a token/tag length mismatch still
    # raises instead of being silently truncated.
    for tokenIdx, token in enumerate(tokens[:50]):
        print("{}\t{}".format(token, tokenTags[tokenIdx]))
    print("")

International	O
tourism	O
is	O
now	O
more	O
common	O
than	O
ever	O
before	O
The	O
last	O
50	O
years	O
have	O
seen	O
a	O
significant	O
increase	O
in	O
the	O
number	O
of	O
tourist	O
traveling	O
worldwide	O
.	O
While	O
some	O
might	O
think	O
the	O
tourism	O
bring	O
large	O
profit	O
for	O
the	O
destination	O
countries	O
,	O
I	O
would	O
contend	O
that	O
this	MajorClaim
industry	MajorClaim
has	MajorClaim
affected	MajorClaim
the	MajorClaim
cultural	MajorClaim

Living	O
and	O
studying	O
overseas	O
It	O
is	O
every	O
student	O
'	O
s	O
desire	O
to	O
study	O
at	O
a	O
good	O
university	O
and	O
experience	O
a	O
new	O
environment	O
.	O
While	O
some	O
students	O
study	O
and	O
live	O
overseas	O
to	O
achieve	O
this	O
,	O
some	O
prefer	O
to	O
study	O
home	O
because	O
of	O
the	O
difficulties	O
of	O
living	O
and	O
studying	O
overseas	O
.	O
In	O

