# ANN Feature Extraction

In [2]:
# Conor O'Sullivan 
# 22 October 2018
# Extract ANN features from text

In [3]:
import json
import string
import unicodedata
from collections import Counter
from pathlib import Path

import numpy as np
import pandas as pd

### Extract trigram features from training set 

In [8]:
#Load the training sentences and discard the CSV's leading 'Unnamed: 0' column
#(presumably an index saved along with the data -- confirm against how train.csv was written)
train = pd.read_csv("train.csv").drop(columns=['Unnamed: 0'])
print(len(train))
train.head()

210000


Unnamed: 0,lang,sent
0,eng,Do you intend to pursue your education?
1,fra,Les étages du bas sont très sombres.
2,deu,"Einen Wodka Tonic, bitte."
3,por,Está muito calor lá fora?
4,deu,Ich habe dreizehn Katzen.


In [4]:
#Remove punctuation and numbers
def clean_text(sent):
    """Return `sent` without punctuation or numeric characters and with normalized spaces.

    Removes every character in ``string.punctuation``, plus any character whose
    Unicode general category is punctuation (P*) or number (N*). The Unicode check
    matters for the non-English sentences: marks such as "¿", "¡", "«", "»", "…"
    and the typographic apostrophe "’" are not part of ``string.punctuation`` and
    would otherwise end up inside the trigram features.

    Parameters
    ----------
    sent : str
        The sentence to clean.

    Returns
    -------
    str
        The cleaned sentence; runs of whitespace are collapsed to single spaces
        and leading/trailing whitespace is dropped.
    """
    ascii_punct = set(string.punctuation)  # also covers ASCII symbols such as $ + = ~
    kept = [
        ch for ch in sent
        if ch not in ascii_punct and unicodedata.category(ch)[0] not in "PN"
    ]
    # split()/join collapses any whitespace run (including tabs/newlines) to one space
    return " ".join("".join(kept).split())

#Quick check on a sentence that mixes digits, punctuation and irregular spacing
sent = "Hello. #This function908 Removes  numbers12,    punctuation... and     normalizes spaces"
clean_text(sent)

'Hello This function Removes numbers punctuation and normalizes spaces'

In [5]:
def char_trigram(sent):
    """Return every run of three consecutive characters in `sent`, in order.

    Spaces and any other characters are treated like letters (no cleaning is done
    here). A string shorter than three characters yields an empty list.
    """
    # Zipping the string with itself shifted by one and two positions lines up
    # each character with its two successors.
    return [a + b + c for a, b, c in zip(sent, sent[1:], sent[2:])]

#Example on an uncleaned sentence: char_trigram does no cleaning, so the final period stays in the last trigram
sent = "This is a sentence."
char_trigram(sent)

['Thi',
 'his',
 'is ',
 's i',
 ' is',
 'is ',
 's a',
 ' a ',
 'a s',
 ' se',
 'sen',
 'ent',
 'nte',
 'ten',
 'enc',
 'nce',
 'ce.']

In [9]:
def trigram_list(sent_list):
    """Clean each sentence in `sent_list` and return all of their character trigrams as one flat list.

    Each sentence is passed through `clean_text` and then `char_trigram`, so no
    trigram spans two sentences. Trigrams appear in sentence order, and repeats
    are kept.
    """
    trigrams = []
    for sentence in sent_list:
        trigrams.extend(char_trigram(clean_text(sentence)))
    return trigrams

#Preview: all trigrams from the first 10 training sentences
sent_list = train['sent'][0:10]
trigram_list(sent_list)

['Do ',
 'o y',
 ' yo',
 'you',
 'ou ',
 'u i',
 ' in',
 'int',
 'nte',
 'ten',
 'end',
 'nd ',
 'd t',
 ' to',
 'to ',
 'o p',
 ' pu',
 'pur',
 'urs',
 'rsu',
 'sue',
 'ue ',
 'e y',
 ' yo',
 'you',
 'our',
 'ur ',
 'r e',
 ' ed',
 'edu',
 'duc',
 'uca',
 'cat',
 'ati',
 'tio',
 'ion',
 'Les',
 'es ',
 's é',
 ' ét',
 'éta',
 'tag',
 'age',
 'ges',
 'es ',
 's d',
 ' du',
 'du ',
 'u b',
 ' ba',
 'bas',
 'as ',
 's s',
 ' so',
 'son',
 'ont',
 'nt ',
 't t',
 ' tr',
 'trè',
 'rès',
 'ès ',
 's s',
 ' so',
 'som',
 'omb',
 'mbr',
 'bre',
 'res',
 'Ein',
 'ine',
 'nen',
 'en ',
 'n W',
 ' Wo',
 'Wod',
 'odk',
 'dka',
 'ka ',
 'a T',
 ' To',
 'Ton',
 'oni',
 'nic',
 'ic ',
 'c b',
 ' bi',
 'bit',
 'itt',
 'tte',
 'Est',
 'stá',
 'tá ',
 'á m',
 ' mu',
 'mui',
 'uit',
 'ito',
 'to ',
 'o c',
 ' ca',
 'cal',
 'alo',
 'lor',
 'or ',
 'r l',
 ' lá',
 'lá ',
 'á f',
 ' fo',
 'for',
 'ora',
 'Ich',
 'ch ',
 'h h',
 ' ha',
 'hab',
 'abe',
 'be ',
 'e d',
 ' dr',
 'dre',
 'rei',
 'eiz',
 'ize',


In [10]:
def most_frequent(n,trigrams):
    """Return the `n` most common items of `trigrams`, most frequent first.

    Only the items are returned, not their counts. If there are fewer than `n`
    distinct items, all of them are returned.
    """
    ranked = Counter(trigrams).most_common(n)
    return [trigram for trigram, _count in ranked]

#Example: the 20 most frequent trigrams in the first 1000 training sentences (not filtered by language)
sent_list = train['sent'][0:1000]
trigrams = trigram_list(sent_list)
most_frequent(20, trigrams)


[' de',
 'en ',
 'er ',
 'es ',
 'om ',
 'Tom',
 ' qu',
 'de ',
 'que',
 'est',
 'as ',
 'ent',
 ' es',
 ' co',
 'te ',
 'to ',
 ' di',
 'ch ',
 'ue ',
 ' a ']

In [15]:
def lang_features(n_list,lang):
    """Return the most frequent trigrams of one language for several list sizes.

    Reads the notebook-level `train` DataFrame (columns 'lang' and 'sent'), so the
    data-loading cell must have been run first.

    Parameters
    ----------
    n_list : iterable
        The list sizes to produce. Each element is converted with ``int()`` to get
        the size and is used unchanged as the key of the result (the notebook
        passes strings such as '50').
    lang : str
        Value of the 'lang' column selecting the sentences to use.

    Returns
    -------
    dict
        Maps each element n of `n_list` to the list of the int(n) most frequent
        trigrams of that language, most frequent first.
    """
    train_lang = train[train['lang'] == lang]
    trigrams = trigram_list(train_lang['sent'])

    # Rank the trigrams once, up to the largest requested size, and slice per n.
    # The shorter lists are prefixes of the longest one, so this avoids
    # re-counting every trigram for each n.
    sizes = {n: int(n) for n in n_list}
    ranked = most_frequent(max(sizes.values(), default=0), trigrams)
    # max(size, 0) keeps non-positive sizes returning an empty list
    return {n: ranked[:max(size, 0)] for n, size in sizes.items()}
    
#Example: top-20 and top-30 trigrams for English; the n values are given as strings and become the dictionary keys
lang_features(['20','30'],'eng')

{'20': [' th',
  ' to',
  'he ',
  'Tom',
  'om ',
  'the',
  'to ',
  'hat',
  'nt ',
  'ing',
  'at ',
  'ed ',
  'tha',
  ' do',
  'ng ',
  'is ',
  'you',
  ' yo',
  ' wa',
  'e t'],
 '30': [' th',
  ' to',
  'he ',
  'Tom',
  'om ',
  'the',
  'to ',
  'hat',
  'nt ',
  'ing',
  'at ',
  'ed ',
  'tha',
  ' do',
  'ng ',
  'is ',
  'you',
  ' yo',
  ' wa',
  'e t',
  're ',
  't t',
  'ry ',
  ' ha',
  ' he',
  ' an',
  'ou ',
  ' a ',
  'as ',
  'er ']}

In [12]:
#Create a dictionary of the features for all the languages:
#language code -> {n: list of the n most frequent trigrams}
lang = ['eng', 'deu', 'spa', 'fra', 'por', 'ita']
n_list = ['50', '100', '200']
lang_trigrams = dict()
for l in lang:
    lang_trigrams[l] = lang_features(n_list, l)
    print(l)  #progress marker: printed once the language has been processed
lang_trigrams

eng
deu
spa
fra
por
ita


{'eng': {'50': [' th',
   ' to',
   'he ',
   'Tom',
   'om ',
   'the',
   'to ',
   'hat',
   'nt ',
   'ing',
   'at ',
   'ed ',
   'tha',
   ' do',
   'ng ',
   'is ',
   'you',
   ' yo',
   ' wa',
   'e t',
   're ',
   't t',
   'ry ',
   ' ha',
   ' he',
   ' an',
   'ou ',
   ' a ',
   'as ',
   'er ',
   'd t',
   ' be',
   'nd ',
   'ary',
   ' is',
   'Mar',
   'her',
   'thi',
   ' To',
   ' in',
   've ',
   'and',
   'll ',
   'o t',
   ' Ma',
   ' of',
   'ere',
   'e a',
   'e w',
   'in '],
  '100': [' th',
   ' to',
   'he ',
   'Tom',
   'om ',
   'the',
   'to ',
   'hat',
   'nt ',
   'ing',
   'at ',
   'ed ',
   'tha',
   ' do',
   'ng ',
   'is ',
   'you',
   ' yo',
   ' wa',
   'e t',
   're ',
   't t',
   'ry ',
   ' ha',
   ' he',
   ' an',
   'ou ',
   ' a ',
   'as ',
   'er ',
   'd t',
   ' be',
   'nd ',
   'ary',
   ' is',
   'Mar',
   'her',
   'thi',
   ' To',
   ' in',
   've ',
   'and',
   'll ',
   'o t',
   ' Ma',
   ' of',
   'ere',
   'e a',

In [14]:
#From lang_trigrams, build the final feature list: for each n, the sorted set of
#unique trigrams that are among the n most frequent in at least one language
features = {} #maps each key of n_list to its sorted list of unique trigrams
for n in n_list:
    #Pool every language's top-n list; the same trigram can appear for several languages
    n_trigrams = [tri for code in lang for tri in lang_trigrams[code][n]]
    features[n] = sorted(set(n_trigrams))

#Create the output folder if it is missing so a fresh run does not stop with FileNotFoundError
out_path = Path('ANN_features/features.json')
out_path.parent.mkdir(parents=True, exist_ok=True)
with open(out_path, 'w') as outfile:
    json.dump(features, outfile)