## Information extracted from the train dataset

In [1]:
# Word endings collected from the train dataset; a token ending in one of
# them is tagged as a drug by the suffix rule.
DRUG_SUFFIXES = [
	"oin", "ine", "ide", "cin", "fil", "ion", "mil", "tal", "rin",
	"lin", "ium", "ril", "nol", "lyn", "hol", "ole", "mic", "xic",
	"xib", "vir", "mab", "vec", "ast", "lax", "sin", "pam", "tan",
]


# Second words of two-word drug names: when one of these follows a token that
# ends in a drug suffix, both tokens together are tagged as a single drug.
# Each entry is followed by a comma on purpose: adjacent string literals
# without a comma are silently concatenated into one entry.
DRUG_COMPOSED_WORDS = [
	'acid',
	'sodium',
	'alkaloids',
	'hydrochloride',
	'f2alpha',
	'edisylate',
	'iodide',
	'sulfate',
	'acetate',
	'antiinflammatory',
	'mustard',
	'hcl',
	'mofetil',
	'gallate',
	'cations',
	'nitrate',
]

# Final words of multi-word "group" entities. Each entry is used as a regex
# fragment with an optional trailing "s", so singular and plural both match.
GROUP_COMPOSED_LAST_WORDS = [
	'adjuvant', 'agonist', 'alkaloid', 'antibiotic',
	'antidepressent', 'anti-inflammatory', 'blocker', 'class',
	'compound', 'depressants', 'diuretic', 'drug',
	'hormone', 'inhibitor', 'medication', 'product',
	'solution', 'steroid', 'vaccine', 'vasodilator',
]

# Words allowed between the first and the last word of a multi-word "group"
# entity; the backward scan in the group rule steps over them.
GROUP_COMPOSED_MIDDLE_WORDS = [
	'oxidase', 'channel', 'anti-inflammatory', 'blocking', 'reuptake',
	'serotonin', 'reductase', 'pump', 'depressant', 'anhydrase',
]
# New list holding the drug composed words followed by the group last words;
# the two source lists are left untouched.
ALL_COMPOSED_WORDS = list(DRUG_COMPOSED_WORDS)
ALL_COMPOSED_WORDS.extend(GROUP_COMPOSED_LAST_WORDS)

## Auxiliary functions used by the extract entities function

In [None]:
def append_token(ret, name, offset1, offset2, classtype):
	"""Append one entity record to ``ret`` and return ``ret``.

	The record is a dict with the keys ``name``, ``offset`` (the two offsets
	converted to strings and joined by a dash) and ``type``. ``ret`` is
	modified in place; the same list object is returned.
	"""
	entity = {
		'name': name,
		'offset': '-'.join((str(offset1), str(offset2))),
		'type': classtype,
	}
	ret.append(entity)
	return ret


def match_words(list_words_1, list_words_2):
	"""Tell whether the tokens in ``list_words_1`` start with the words in ``list_words_2``.

	Args:
		list_words_1: sequence of tokens; only element ``[0]`` of each token
			is compared.
		list_words_2: sequence of expected values, compared position by
			position with the tokens.

	Returns:
		True when every entry of ``list_words_2`` equals element ``[0]`` of
		the token at the same position (trivially True for an empty
		``list_words_2``). False on the first mismatch, or when there are
		fewer tokens than words.
	"""
	# Without this guard only the common prefix would be compared, so a
	# truncated token list would "match" a longer word sequence.
	if len(list_words_1) < len(list_words_2):
		return False
	return all(tok[0] == word for tok, word in zip(list_words_1, list_words_2))


def match_suffix(token, suffixes):
	"""Return True if ``token`` ends with any entry of ``suffixes``.

	Each suffix is spliced into a regular expression that requires at least
	one character in front of it, so a token that is exactly equal to a
	suffix does not match. An empty ``suffixes`` gives False.
	"""
	return any(re.search(".+" + ending + "$", token) for ending in suffixes)

## Extract entities function

- If a word appears in the training data, we tag it with the same class it has there
- If a token matches any of the suffixes declared above, we tag it as a drug (also taking the drug composed words into account)
- We saw that several vitamins are usually of type drug and some others of type group
- We detect composed (multi-word) entities of type group (which is usually the type with the most composed words)
- Tokens written in capital letters are most probably brands

In [None]:
def extract_entities(token_list):
	"""Tag drug, brand and group entities in a token list with hand-written rules.

	The training dictionary is loaded from
	'entities/trainingFeatures/entity_dict.json' on every call. For each
	token the rules below are tried in order and the first one that fires
	wins; tokens swallowed by a multi-token entity are skipped afterwards.

	1. The token is a key of the training dictionary and its word sequence
	   matches: tagged with the class stored in the dictionary.
	2. The token ends with one of DRUG_SUFFIXES: "drug" (two tokens when the
	   next one is in DRUG_COMPOSED_WORDS).
	3. "vitamin" followed by C, E, B* or D*: "drug".
	4. "vitamin D analogue/preparations", "vitamin K antagonists", or
	   "vitamin" followed by A, D or K: "group".
	5. Upper-case token longer than four characters: "brand".
	6. Token ending in one of GROUP_COMPOSED_LAST_WORDS (optional plural
	   "s"), joined with the preceding token and any
	   GROUP_COMPOSED_MIDDLE_WORDS in between: "group".

	Args:
		token_list: sequence of tokens; ``token[0]`` is the token text and
			``token[1]`` / ``token[2]`` are used as the start / end offsets
			of the emitted entities.

	Returns:
		A list of dicts with the keys ``name``, ``offset`` and ``type``, as
		built by ``append_token``.
	"""
	ret = []
	with open('entities/trainingFeatures/entity_dict.json', 'r') as f:
		training_words_dict = json.load(f)
	n_tokens = len(token_list)
	# Raw string: "\w" inside a plain literal is an invalid escape sequence.
	vitamin_drug_re = re.compile(r"^(C|E|[BD][\w-]+)$")
	skipwords = 0
	for i, token in enumerate(token_list):
		# Tokens already consumed by a multi-token entity are not re-examined.
		if skipwords != 0:
			skipwords -= 1
			continue
		text = token[0]
		has_next = i + 1 < n_tokens
		next_text = token_list[i + 1][0] if has_next else None

		# Rule 1: the token appears in the training data.
		if text in training_words_dict:
			entry = training_words_dict[text]
			words = entry[0]
			tag = entry[1]
			last = i + len(words)
			# NOTE(review): match_words receives the tokens starting at the
			# current one, while the name and end offset below treat `words`
			# as the tokens *after* it -- confirm against the layout of
			# entity_dict.json whether the slice should start at i+1.
			# The bounds check keeps token_list[last] from raising IndexError
			# when the entity would run past the end of the sentence.
			if last < n_tokens and match_words(token_list[i:], words):
				append_token(ret, ' '.join([text] + list(words)), token[1], token_list[last][2], tag)
				skipwords = len(words)
				continue

		# Rule 2: the token ends with a known drug suffix.
		if match_suffix(text.lower(), DRUG_SUFFIXES):
			if has_next and next_text.lower() in DRUG_COMPOSED_WORDS:
				# The following token completes the drug name; skip it so it
				# is not tagged a second time on its own.
				append_token(ret, text + " " + next_text, token[1], token_list[i + 1][2], "drug")
				skipwords = 1
			else:
				append_token(ret, text, token[1], token[2], "drug")
			continue

		if text.lower() == 'vitamin' and has_next:
			# Rule 3: vitamins C, E, B*, D* are drugs.
			if vitamin_drug_re.search(next_text):
				append_token(ret, text + " " + next_text, token[1], token_list[i + 1][2], "drug")
				skipwords = 1
				continue
			# Rule 4: vitamins D (analogue/preparations), K (antagonists), A, D, K are groups.
			third = token_list[i + 2] if i + 2 < n_tokens else None
			third_lower = third[0].lower() if third is not None else None
			if (next_text == 'D' and third_lower in ('analogue', 'preparations')) or \
					(next_text == 'K' and third_lower == 'antagonists'):
				append_token(ret, text + " " + next_text + " " + third[0], token[1], third[2], "group")
				skipwords = 2
				continue
			if next_text in ('A', 'D', 'K'):
				append_token(ret, text + " " + next_text, token[1], token_list[i + 1][2], "group")
				skipwords = 1
				continue

		# Rule 5: capital letters most probably are brands (extremely useful).
		if text.isupper() and len(text) > 4:
			append_token(ret, text, token[1], token[2], "brand")
			continue

		# Rule 6: composed words belonging to a group.
		# This rule creates a lot of hits and a lot of misses (not very precise).
		if i >= 1 and any(re.search(last_word + "s?$", text) for last_word in GROUP_COMPOSED_LAST_WORDS):
			# Walk backwards over middle words to the first word of the
			# entity; stop at index 0 so the scan never wraps around to the
			# end of the list through a negative index.
			start = i - 1
			while start > 0 and token_list[start][0] in GROUP_COMPOSED_MIDDLE_WORDS:
				start -= 1
			# Include the middle words in the name so it agrees with the
			# offsets, which span from the first word to the current token.
			name = " ".join(token_list[k][0] for k in range(start, i + 1))
			append_token(ret, name, token_list[start][1], token[2], "group")
	return ret