In [2]:
import numpy as np
import pandas as pd
from nltk.tokenize.stanford import StanfordTokenizer
import nltk
import spacy
sp = spacy.load('en_core_web_sm')
import collections
import tensorflow as tf
import keras

In [3]:
# Installs Spacy
# import sys
# !{sys.executable} -m pip install spacy
# !{sys.executable} -m spacy download en

In [4]:
# !pip install tensorflow

In [5]:
trainPath = "./semeval_train.txt"
testPath = "./semeval_test.txt"
outputTrain = "./trainSemeval.txt"
outputTest = "./testSemeval.txt"

In [6]:
def createFile(inputFilePath,outputFilePath):
    data = []
    f = open(inputFilePath, "r")
    fOut = open(outputFilePath, 'w')
    lines = f.readlines()
    for idx in range(0,len(lines),4):
        sentence_number = lines[idx].split("\t")[0]
        sentence = lines[idx].split("\t")[1][1:-1]
        label = lines[idx+1]
        sentence = sentence.replace("<e1>", " E1_START ").replace("</e1>", " E1_END ")
        sentence = sentence.replace("<e2>", " E2_START ").replace("</e2>", " E2_END ")
        tokens = nltk.word_tokenize(sentence)
        fOut.write(" ".join([ label, " ".join(tokens) ]))
        fOut.write("\n")
        e1 = tokens.index("E1_END") - 1
        e2 = tokens.index("E2_END") - 1
        sentence = " ".join(tokens)
        spacy_sentence = sp(sentence)
        new_sentence = []
        
        for word in spacy_sentence:
#             print(word.text,  word.lemma_)
            new_sentence.append(word.lemma_)
    
        new_sentence = " ".join(new_sentence)
        data.append([sentence_number, new_sentence, e1, e2, label])
        
    fOut.close()
    print(outputFilePath," Has been created")
    return data

In [7]:
train = createFile(trainPath,outputTrain)
test = createFile(testPath,outputTest)


./trainSemeval.txt  Has been created
./testSemeval.txt  Has been created


In [8]:
dfTrain = pd.DataFrame(data=train, columns=["sentence_number", "sentence", "e1", "e2", "relation"])

dfTrain['word_count'] = dfTrain["sentence"].apply(lambda x: len(str(x).split(" ")))
dfTrain["relation"] = dfTrain["relation"].astype('category')
dfTrain["relation_cat"] = dfTrain["relation"].cat.codes
dfTrain.head()

Unnamed: 0,sentence_number,sentence,e1,e2,relation,word_count,relation_cat
0,1,the system as describe above have -PRON- great...,13,18,"Component-Whole(e2,e1)\n",22,3
1,2,the E1_START child E1_END be carefully wrap an...,2,12,Other\n,21,16
2,3,the E1_START author E1_END of a keygen use a E...,2,10,"Instrument-Agency(e2,e1)\n",21,11
3,4,a misty E1_START ridge E1_END uprise from the ...,3,9,Other\n,13,16
4,5,the E1_START student E1_END E2_START associati...,2,5,"Member-Collection(e1,e2)\n",26,12


In [9]:
dfTrain["tags"] = dfTrain["sentence"].apply(lambda x: [(tag.text, tag.label_) 
                                for tag in sp(x).ents] )

In [10]:
dfTrain.head()

Unnamed: 0,sentence_number,sentence,e1,e2,relation,word_count,relation_cat,tags
0,1,the system as describe above have -PRON- great...,13,18,"Component-Whole(e2,e1)\n",22,3,[]
1,2,the E1_START child E1_END be carefully wrap an...,2,12,Other\n,21,16,[]
2,3,the E1_START author E1_END of a keygen use a E...,2,10,"Instrument-Agency(e2,e1)\n",21,11,[]
3,4,a misty E1_START ridge E1_END uprise from the ...,3,9,Other\n,13,16,[]
4,5,the E1_START student E1_END E2_START associati...,2,5,"Member-Collection(e1,e2)\n",26,12,"[(the State University of New York at Buffalo,..."


In [11]:
def utils_lst_count(lst):
    dic_counter = collections.Counter()
    for x in lst:
        dic_counter[x] += 1
    dic_counter = collections.OrderedDict( 
                     sorted(dic_counter.items(), 
                     key=lambda x: x[1], reverse=True))
    lst_count = [ {key:value} for key,value in dic_counter.items() ]
    return lst_count

## count tags
dfTrain["tags"] = dfTrain["tags"].apply(lambda x: utils_lst_count(x))

In [12]:
def utils_ner_features(lst_dics_tuples, tag):
    if len(lst_dics_tuples) > 0:
        tag_type = []
        for dic_tuples in lst_dics_tuples:
            for tuple in dic_tuples:
                type, n = tuple[1], dic_tuples[tuple]
                tag_type = tag_type + [type]*n
                dic_counter = collections.Counter()
                for x in tag_type:
                    dic_counter[x] += 1
        return dic_counter[tag]
    else:
        return 0

In [13]:
tags_set = []
for lst in dfTrain["tags"].tolist():
     for dic in lst:
          for k in dic.keys():
              tags_set.append(k[1])
tags_set = list(set(tags_set))
for feature in tags_set:
     dfTrain["tags_"+feature] = dfTrain["tags"].apply(lambda x: 
                             utils_ner_features(x, feature))

In [14]:
dfTrain.head()

Unnamed: 0,sentence_number,sentence,e1,e2,relation,word_count,relation_cat,tags,tags_NORP,tags_MONEY,...,tags_LAW,tags_PERCENT,tags_PRODUCT,tags_EVENT,tags_PERSON,tags_ORG,tags_QUANTITY,tags_FAC,tags_WORK_OF_ART,tags_LOC
0,1,the system as describe above have -PRON- great...,13,18,"Component-Whole(e2,e1)\n",22,3,[],0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,the E1_START child E1_END be carefully wrap an...,2,12,Other\n,21,16,[],0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,the E1_START author E1_END of a keygen use a E...,2,10,"Instrument-Agency(e2,e1)\n",21,11,[],0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,a misty E1_START ridge E1_END uprise from the ...,3,9,Other\n,13,16,[],0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,the E1_START student E1_END E2_START associati...,2,5,"Member-Collection(e1,e2)\n",26,12,[{('the State University of New York at Buffal...,0,0,...,0,0,0,0,0,1,0,0,0,0
