In [9]:
import numpy as np
import pandas as pd
from nltk.tokenize.stanford import StanfordTokenizer
import nltk
import spacy
sp = spacy.load('en_core_web_sm')
import collections
import tensorflow as tf
import keras

In [10]:
# Installs Spacy
# import sys
# !{sys.executable} -m pip install spacy
# !{sys.executable} -m spacy download en

In [11]:
# !pip install tensorflow

In [12]:
trainPath = "./semeval_train.txt"
testPath = "./semeval_test.txt"
outputTrain = "./trainSemeval.txt"
outputTest = "./testSemeval.txt"

In [13]:
def createFile(inputFilePath,outputFilePath):
    data = []
    doc = []
    f = open(inputFilePath, "r")
    fOut = open(outputFilePath, 'w')
    lines = f.readlines()
    for idx in range(0,len(lines),4):
        sentence_number = lines[idx].split("\t")[0]
        sentence = lines[idx].split("\t")[1][1:-1]
        label = lines[idx+1]
        sentence = sentence.replace("<e1>", " E1_START ").replace("</e1>", " E1_END ")
        sentence = sentence.replace("<e2>", " E2_START ").replace("</e2>", " E2_END ")
        tokens = nltk.word_tokenize(sentence)
        fOut.write(" ".join([ label, " ".join(tokens) ]))
        fOut.write("\n")
        e1 = tokens.index("E1_END") - 1
        e2 = tokens.index("E2_END") - 1
        sentence = " ".join(tokens)
        doc.append(sentence)
        spacy_sentence = sp(sentence)
        new_sentence = []
        for word in spacy_sentence:
#             print(word.text,  word.lemma_)
            new_sentence.append(word.lemma_)
    
        new_sentence = " ".join(new_sentence)
        data.append([sentence_number, new_sentence, e1, e2, label])
        
    fOut.close()
    print(outputFilePath," Has been created")
    return data,doc

In [14]:
train,doc = createFile(trainPath,outputTrain)
test = createFile(testPath,outputTest)

./trainSemeval.txt  Has been created
./testSemeval.txt  Has been created


In [None]:
print(doc)

In [15]:
dfTrain = pd.DataFrame(data=train, columns=["sentence_number", "sentence", "e1", "e2", "relation"])

dfTrain['word_count'] = dfTrain["sentence"].apply(lambda x: len(str(x).split(" ")))
dfTrain["relation"] = dfTrain["relation"].astype('category')
dfTrain["relation_cat"] = dfTrain["relation"].cat.codes
dfTrain.head()

Unnamed: 0,sentence_number,sentence,e1,e2,relation,word_count,relation_cat
0,1,the system as describe above have -PRON- great...,13,18,"Component-Whole(e2,e1)\n",22,3
1,2,the E1_START child E1_END be carefully wrap an...,2,12,Other\n,21,16
2,3,the E1_START author E1_END of a keygen use a E...,2,10,"Instrument-Agency(e2,e1)\n",21,11
3,4,a misty E1_START ridge E1_END uprise from the ...,3,9,Other\n,13,16
4,5,the E1_START student E1_END E2_START associati...,2,5,"Member-Collection(e1,e2)\n",26,12


In [16]:
dfTrain["tags"] = dfTrain["sentence"].apply(lambda x: [(tag.text, tag.label_) 
                                for tag in sp(x).ents] )

In [17]:
dfTrain.head()

Unnamed: 0,sentence_number,sentence,e1,e2,relation,word_count,relation_cat,tags
0,1,the system as describe above have -PRON- great...,13,18,"Component-Whole(e2,e1)\n",22,3,[]
1,2,the E1_START child E1_END be carefully wrap an...,2,12,Other\n,21,16,[]
2,3,the E1_START author E1_END of a keygen use a E...,2,10,"Instrument-Agency(e2,e1)\n",21,11,[]
3,4,a misty E1_START ridge E1_END uprise from the ...,3,9,Other\n,13,16,[]
4,5,the E1_START student E1_END E2_START associati...,2,5,"Member-Collection(e1,e2)\n",26,12,"[(the State University of New York at Buffalo,..."


In [18]:
def utils_lst_count(lst):
    dic_counter = collections.Counter()
    for x in lst:
        dic_counter[x] += 1
    dic_counter = collections.OrderedDict( 
                     sorted(dic_counter.items(), 
                     key=lambda x: x[1], reverse=True))
    lst_count = [ {key:value} for key,value in dic_counter.items() ]
    return lst_count

## count tags
dfTrain["tags"] = dfTrain["tags"].apply(lambda x: utils_lst_count(x))

In [19]:
def utils_ner_features(lst_dics_tuples, tag):
    if len(lst_dics_tuples) > 0:
        tag_type = []
        for dic_tuples in lst_dics_tuples:
            for tuple in dic_tuples:
                type, n = tuple[1], dic_tuples[tuple]
                tag_type = tag_type + [type]*n
                dic_counter = collections.Counter()
                for x in tag_type:
                    dic_counter[x] += 1
        return dic_counter[tag]
    else:
        return 0

In [20]:
tags_set = []
for lst in dfTrain["tags"].tolist():
     for dic in lst:
          for k in dic.keys():
              tags_set.append(k[1])
tags_set = list(set(tags_set))
for feature in tags_set:
     dfTrain["tags_"+feature] = dfTrain["tags"].apply(lambda x: 
                             utils_ner_features(x, feature))

In [21]:
dfTrain.head()

Unnamed: 0,sentence_number,sentence,e1,e2,relation,word_count,relation_cat,tags,tags_MONEY,tags_QUANTITY,...,tags_TIME,tags_ORDINAL,tags_CARDINAL,tags_PERCENT,tags_WORK_OF_ART,tags_FAC,tags_LANGUAGE,tags_PERSON,tags_GPE,tags_LOC
0,1,the system as describe above have -PRON- great...,13,18,"Component-Whole(e2,e1)\n",22,3,[],0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,the E1_START child E1_END be carefully wrap an...,2,12,Other\n,21,16,[],0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,the E1_START author E1_END of a keygen use a E...,2,10,"Instrument-Agency(e2,e1)\n",21,11,[],0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,a misty E1_START ridge E1_END uprise from the ...,3,9,Other\n,13,16,[],0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,the E1_START student E1_END E2_START associati...,2,5,"Member-Collection(e1,e2)\n",26,12,[{('the State University of New York at Buffal...,0,0,...,0,0,0,0,0,0,0,0,0,0


In [23]:
print(doc)



In [24]:
vocab_size = len(doc)
encoded_docs = [tf.keras.preprocessing.text.one_hot(d, vocab_size) for d in doc]
print(encoded_docs)


[[4757, 6093, 7251, 6143, 3957, 159, 4296, 3207, 6708, 1421, 3982, 774, 5627, 2463, 1240, 5627, 1143, 195, 1205, 2639, 2463, 2285, 2639, 1143, 2890], [4757, 5627, 2463, 45, 5627, 1143, 6106, 3728, 7499, 3827, 5146, 6575, 4757, 2639, 2463, 6591, 2639, 1143, 621, 2803, 195, 5999, 7260, 2890], [4757, 5627, 2463, 194, 5627, 1143, 195, 5999, 2270, 1328, 5999, 2639, 2463, 4448, 2639, 1143, 1178, 2300, 4871, 4757, 4518, 5587, 2939, 2890], [5999, 6580, 5627, 2463, 7863, 5627, 1143, 7812, 2619, 4757, 2639, 2463, 3817, 2639, 1143, 2890], [4757, 5627, 2463, 5059, 5627, 1143, 2639, 2463, 324, 2639, 1143, 2463, 4757, 6293, 195, 4757, 278, 5059, 2896, 195, 4757, 6759, 6806, 195, 2129, 979, 4871, 1803, 2890], [2639, 2463, 4757, 1130, 5627, 2463, 5428, 5627, 1143, 4914, 2463, 1463, 5467, 42, 2639, 2463, 2958, 2639, 1143, 195, 7562, 2890], [4757, 1092, 7435, 2463, 4914, 4757, 6997, 5627, 2463, 4737, 5627, 1143, 1421, 4757, 3180, 7594, 195, 4757, 4658, 6581, 621, 5818, 2534, 2639, 2463, 2842, 2639, 1143

In [26]:
max_length = 200
padded_docs = tf.keras.preprocessing.sequence.pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

[[4757 6093 7251 ...    0    0    0]
 [4757 5627 2463 ...    0    0    0]
 [4757 5627 2463 ...    0    0    0]
 ...
 [1178 5374 2639 ...    0    0    0]
 [4757 3693 3827 ...    0    0    0]
 [4757 5627 2463 ...    0    0    0]]


In [30]:
model = tf.keras.Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summarize the model
print(model.summary())
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, dfTrain['relation_cat'], verbose=0)
print('Accuracy: %f' % (accuracy*100))

AttributeError: module 'tensorflow.keras' has no attribute 'Embedding'