In [3]:

'''
These are functions for letter bigram language identification
some details: 
    space and punctuations are preserved when training the model.
'''
import sys
import os
import numpy as np
import codecs
import collections
import pandas as pd
import itertools


In [2]:
def get_data():
    '''get the preprocessed training data and testing data and store 
    them in the dictionary'''
    workdir = get_workdir() 
    os.path.join = join_path
    data_path = path_join(workdir, 'database')
    data = {}
    data['test'] = readtxt(os.path.join(data_path,'LangId.test'))
    data['train'] = {}
    data['train']['English'] = preprocess(readtxt(os.path.join(data_path,'LangId.train.English')))
    data['train']['French'] = preprocess(readtxt(os.path.join(data_path,'LangId.train.French')))
    data['train']['Italian'] = preprocess(readtxt(os.path.join(data_path,'LangId.train.Italian')))
    return data

def get_workdir():
    bin_path = os.getcwd()
    return os.path.dirname(bin_path)
    

In [3]:
def readtxt(path):
    f = codecs.open(path, encoding='latin-1', mode='r')
    data = [(u'≤'+line).replace(u'\n', u'≥') for line in f.readlines()]
    f.close()
    return data

def preprocess(sentences):
    #sentences is a list containing multiple string
    s = np.array(list(''.join(sentences)))
    ss_ = bicharconc(s)
    return {'s': s, 'ss_': ss_} 

def bicharconc(s):
    #s_ is the shifted array of s by one unit
    s_ = np.append(s[1:], s[0])
    #element wise addition of two array, get bigram letter array now
    ss_ = np.core.defchararray.add(s, s_)
    #since the combintation '≥≤' is useless
    return np.delete(ss_, np.where(ss_==u'≥≤'))


In [4]:
def count(train_data):
    """get the model.
    since every element cc_ in ss_, c appears first, then c_,
    P(c_|c) = P(cc_) / P(c) = #(cc_) / #(c)
    """
    model = pd.DataFrame()
    sskeys = pd.DataFrame(data['train']).loc['ss_',:].values
    sskeys =  list(set(itertools.chain(*map(list, sskeys))))
    for dkey in train_data.keys():
        s, ss_ = train_data[dkey]['s'], train_data[dkey]['ss_']
        s_count, ss_count = map(collections.Counter, [s, ss_])
        for sskey in sskeys: 
            model.loc[sskey, dkey] = smooth(ss_count[sskey], s_count[sskey[0]], len(s_count.keys()))
    return model

def smooth(joint, marg, n_letter):
    'smooth function'
    #joint represents joint prob, marg represents marginal prob
    return laplace(joint, marg, n_letter)

def laplace(joint, marg, n_letter):
    return float(joint+1) / (marg+n_letter)

In [43]:
def predict(test_data, model):
    predicted_values = []
    for i in xrange(len(test_data)):
        ss_ = bicharconc(list(test_data[i]))     
        logp_sum = np.zeros(3,)
        for bichar in ss_:
            if bichar in model.index.values:
                logp_sum += np.log(model.loc[bichar, :].values)       
        predicted_values.append(model.columns.values[logp_sum.argmax()])
    return np.array(predicted_values)   
                

def evaluate(predicted_values):
    data_path = get_datapath()
    groundtruth_values = np.loadtxt(os.path.join(data_path, 'LangId.sol'), dtype='S', delimiter=' ', usecols=[-1])
    return (groundtruth_values == predicted_values).sum() / float(groundtruth_values.size)

In [91]:
def write_solution(predicted_values):
    idx = np.arange(predicted_values.size)
    out = np.array(zip(idx, predicted_values))
    workdir = get_workdir()
    out_path = os.path.join(workdir, 'output', 'LangId.sol')
    np.savetxt(out_path, out, fmt='%s', delimiter=' ')
    return out

In [94]:
sen = u"Signora Presidente , vorrei sapere perché questo Parlamento non rispetta le norme in materia di salute e sicurezza che esso"


In [95]:
sen

u'Signora Presidente , vorrei sapere perch\xe9 questo Parlamento non rispetta le norme in materia di salute e sicurezza che esso'

In [97]:
sen.split(' ')

[u'Signora',
 u'Presidente',
 u',',
 u'vorrei',
 u'sapere',
 u'perch\xe9',
 u'questo',
 u'Parlamento',
 u'non',
 u'rispetta',
 u'le',
 u'norme',
 u'in',
 u'materia',
 u'di',
 u'salute',
 u'e',
 u'sicurezza',
 u'che',
 u'esso']

In [16]:
workdir = get_workdir()
path = os.path.join(workdir, 'database', 'LangId.train.English')
f = codecs.open(path, encoding='latin-1', mode='r')
s = [[u'≤']+line.strip().split(' ')+[u'≥'] for line in f.readlines()]
f.close()

In [30]:
a = np.array(list(itertools.chain(*s)))
a

array([u'\u2264', u'Approval', u'of', ..., u')', u'.', u'\u2265'], 
      dtype='<U18')

In [31]:
t = a
t_ = np.append(t[1:], t[0])
t_

array([u'Approval', u'of', u'the', ..., u'.', u'\u2265', u'\u2264'], 
      dtype='<U18')

In [38]:

ss_ = reduce(np.core.defchararray.add, [t, u' ', t_])

In [39]:
np.where(ss_==u'≥ ≤')

(array([    9,    21,    28, ..., 88829, 88840, 88852]),)

In [114]:
# %load wordLangId.py
'''
Author: Yifeng Chu (ychu26)

These are functions for word bigram language identification
some details: 
    space and punctuations are preserved when training the model.
'''
import os
import numpy as np
import codecs
import collections
import pandas as pd
import itertools

def get_data():
    '''get the preprocessed training data and testing data and store 
    them in the dictionary'''
    workdir = get_workdir()
    join_path = os.path.join
    data_path = join_path(workdir, 'database')
    testin_path = join_path(workdir, 'input') 
    data = {}
    data['test'] = readtxt(join_path(testin_path,'LangId.test'), 'test')
    data['train'] = {}
    data['train']['English'] = preprocess(readtxt(join_path(data_path,'LangId.train.English')))
    data['train']['French'] = preprocess(readtxt(join_path(data_path,'LangId.train.French')))
    data['train']['Italian'] = preprocess(readtxt(join_path(data_path,'LangId.train.Italian')))
    return data

def get_workdir():
    bin_path = os.getcwd()
    return  os.path.dirname(bin_path)

def readtxt(path, *args):
    f = codecs.open(path, encoding='latin-1', mode='r')
    #'â¤â¥' are usd as start and end symbol for every sentence
    data = [[u'â¤'] + line.strip().split(' ') + [u'â¥'] for line in f.readlines()]
    f.close()
    print not args
    return data if args else data[:100]

def preprocess(sentences):
    w = np.array(list(itertools.chain(*sentences)))
    ww_ = biwordconc(w)
    return {'w': w, 'ww_': ww_}

def biwordconc(w):
    w_ = np.append(w[1:], w[0])
    # in order to separate two words, add a space between them
    ww_ = reduce(np.core.defchararray.add, [w, u' ', w_])
    return np.delete(ww_, np.where(ww_==u'â¥ â¤'))
    
def train(train_data):
    """get the model.
    since every element vv_ in ww_, v appears first, then v_,
    P(v_|v) = P(vv_) / P(v) = #(vv_) / #(v)
    """
    model = pd.DataFrame()
    #fetch all bichars and remove the redundancy
    wwkeys = pd.DataFrame(train_data).loc['ww_',:].values
    wwkeys =  list(set(itertools.chain(*map(list, wwkeys))))
    priors = []
    #fill the table with probs
    for dkey in train_data.keys():
        w, ww_ = train_data[dkey]['w'], train_data[dkey]['ww_']
        w_cont, ww_cont = map(collections.Counter, [w, ww_])
        #optimization 
        
        for wwkey in wwkeys: 
            model.loc[wwkey, dkey] = smooth(ww_cont[wwkey], w_cont[wwkey[0]], len(w_cont.keys()))
            if wwkeys.index(wwkey) % 10000 == 0:
                print "every 10000 period!!"
        
        print dkey, 'finished!'    
    return model

def smooth(joint, marg, n_word):
    'smooth function'
    #joint represents joint prob, marg represents marginal prob
    return laplace(joint, marg, n_word)

def laplace(joint, marg, n_word):
    return float(joint+1) / (marg+n_word)

def predict(test_data, model, priors):
    'predict the language class using trained model'
    predicted_values = []
    for i in xrange(len(test_data)):
        #same technique for preprocessing the training data, concat the neighbor chars
        ww_ = biwordconc(test_data[i])   
        #to avoid the underflow, use log of prob
        logp_sum = np.zeros(3,)
        for biword in ww_:
            if biword in model.index.values:
                logp_sum += np.log(model.loc[biword, :].values)
        predicted_values.append(model.columns.values[logp_sum.argmax()])

    return np.array(predicted_values)


In [107]:
data = get_data()

False
True
True
True


In [85]:
data['train']['English']['ww_'].size

2440

In [115]:
model = train(data['train'])

every 10000 period!!
Italian finished!
every 10000 period!!
French finished!
every 10000 period!!
English finished!


array([ 0.03388682,  0.03328895,  0.03937008])

In [105]:
u'â¤' in data['train']['English']['w']

True

In [116]:
predicted_values = predict(data['test'], model, priors)

In [117]:
predicted_values.size

300

In [89]:
model

Unnamed: 0,Italian,French,English
avons conjointement,0.000975,0.002301,0.001253
in Brussels,0.000985,0.001179,0.002639
relazione che,0.002004,0.001179,0.001319
Vicepresidente in,0.002004,0.001179,0.001319
objectif qui,0.000996,0.002358,0.001319
contre les,0.001002,0.002345,0.001319
e soprattutto,0.001919,0.001179,0.001314
perfectly well,0.001002,0.001179,0.005277
point -,0.001002,0.002358,0.001319
British but,0.001002,0.001179,0.002639


In [118]:
accuracy = evaluate(predicted_values)
accuracy

0.94999999999999996

In [35]:
wwkeys = pd.DataFrame(data['train']).loc['ww_',:].values

In [36]:
len(collections.Counter(data['train']['English']['ww_']))

5119

In [30]:
wwkeys_ =  list(itertools.chain(*map(list, wwkeys)))

In [31]:
wwkeys__ = list(set(wwkeys_))

In [50]:
len(wwkeys__)
len(data['train']['English']['ww_'])

85873

In [34]:
wwkeys__

[u'Eurocontrol has',
 u'tempi rapidi',
 u'cette position',
 u'collega ,',
 u'own level',
 u'pi\xf9 appropriati',
 u'citizenship that',
 u'asking :',
 u'approach the',
 u'and environmentally',
 u'cinquantina di',
 u'asking ,',
 u'ethnic tensions',
 u'in lyrical',
 u'sono gi\xe0',
 u'past twenty',
 u'costatare l',
 u'changes in',
 u'de quelques',
 u'communities from',
 u'essere condiviso',
 u'say a',
 u'faits sur',
 u'( the',
 u'and complaints',
 u'bode well',
 u'to prevent',
 u'the Freedom',
 u'chose doit',
 u'into toxicity',
 u'dire con',
 u'et reconnus',
 u'called for',
 u'point 9',
 u'point 8',
 u'utilisateurs de',
 u'strong voice',
 u'be consistent',
 u'Between 1986',
 u'colleghi non',
 u'will make',
 u'information sur',
 u'point -',
 u'point ,',
 u'point .',
 u'tre mesi',
 u'entail .',
 u'Unione esistono',
 u'point d',
 u'Mr Cossutta',
 u'of hand',
 u'ma qualit\xe9',
 u'semblent v\xe9ritablement',
 u'minuto si',
 u'lotta contro',
 u'accessibile a',
 u'tables .',
 u'voler in',
 u'ar

In [45]:
s1 = set([1,2,3])
s2 = set([2,76,4])
dir(s1)

['__and__',
 '__class__',
 '__cmp__',
 '__contains__',
 '__delattr__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__iand__',
 '__init__',
 '__ior__',
 '__isub__',
 '__iter__',
 '__ixor__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__or__',
 '__rand__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__ror__',
 '__rsub__',
 '__rxor__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__sub__',
 '__subclasshook__',
 '__xor__',
 'add',
 'clear',
 'copy',
 'difference',
 'difference_update',
 'discard',
 'intersection',
 'intersection_update',
 'isdisjoint',
 'issubset',
 'issuperset',
 'pop',
 'remove',
 'symmetric_difference',
 'symmetric_difference_update',
 'union',
 'update']

In [46]:
help(s1.difference)

Help on built-in function difference:

difference(...)
    Return the difference of two or more sets as a new set.
    
    (i.e. all elements that are in this set but not the others.)



In [47]:
s1.difference(s2)

{1, 3}

In [14]:
def evaluate(predicted_values):
    'calculate the accuracy'
    workdir = get_workdir()
    #read the test solution
    sol_path = os.path.join(workdir, 'solution')
    groundtruth_values = np.loadtxt(os.path.join(sol_path, 'LangId.sol'), dtype='S', delimiter=' ', usecols=[-1])
    return (groundtruth_values == predicted_values).sum() / float(groundtruth_values.size)

In [20]:
workdir = get_workdir()
sol_path = os.path.join(workdir, 'solution')
groundtruth_values = np.loadtxt(os.path.join(sol_path, 'LangId.sol'), dtype='S', delimiter=' ', usecols=[-1])

In [24]:
groundtruth_values

array(['Italian', 'English', 'Italian', 'French', 'French', 'English',
       'English', 'English', 'French', 'French', 'English', 'Italian',
       'English', 'English', 'French', 'English', 'French', 'French',
       'Italian', 'English', 'Italian', 'French', 'Italian', 'French',
       'Italian', 'English', 'English', 'English', 'English', 'English',
       'English', 'English', 'English', 'French', 'French', 'Italian',
       'French', 'English', 'French', 'Italian', 'Italian', 'French',
       'Italian', 'Italian', 'French', 'Italian', 'French', 'Italian',
       'French', 'Italian', 'English', 'Italian', 'Italian', 'Italian',
       'Italian', 'Italian', 'Italian', 'English', 'French', 'French',
       'Italian', 'French', 'English', 'Italian', 'Italian', 'French',
       'French', 'English', 'Italian', 'Italian', 'English', 'Italian',
       'English', 'English', 'English', 'English', 'English', 'Italian',
       'Italian', 'Italian', 'Italian', 'English', 'Italian', 'English',


In [26]:
predicted_values.size

100