In [1]:

'''
These are functions for letter bigram language identification
some details: 
    space and punctuations are preserved when training the model.
'''
import sys
import os
import numpy as np
import codecs
import collections
import pandas as pd
import itertools


In [3]:
# %load letterLangId.py
'''
Author: Yifeng Chu (ychu26)

These are functions for letter bigram language identification
some details: 
    space and punctuations are preserved when training the model.
'''
import os
import numpy as np
import codecs
import collections
import pandas as pd
import itertools

def get_data():
    '''get the preprocessed training data and testing data and store 
    them in the dictionary'''
    workdir = get_workdir()
    join_path = os.path.join
    data_path = join_path(workdir, 'database')
    testin_path = join_path(workdir, 'input') 
    data = {}
    data['test'] = readtxt(join_path(testin_path,'LangId.test'))
    data['targets'] = set(preprocess(data['test'])['ss_'])
    data['train'] = {}
    data['train']['English'] = preprocess(readtxt(join_path(data_path,'LangId.train.English')))
    data['train']['French'] = preprocess(readtxt(join_path(data_path,'LangId.train.French')))
    data['train']['Italian'] = preprocess(readtxt(join_path(data_path,'LangId.train.Italian')))
    return data

def get_workdir():
    bin_path = os.getcwd()
    return  os.path.dirname(bin_path)

def readtxt(path):
    f = codecs.open(path, encoding='latin-1', mode='r')
    #'â¤â¥' are usd as start and end symbol for every sentence
    data = [(u'â¤'+line).replace(u'\n', u'â¥') for line in f.readlines()]
    f.close()
    return data

def preprocess(sentences):
    #sentences is a list containing multiple string
    s = np.array(list(''.join(sentences)))
    ss_ = bicharconc(s)
    return {'s': s, 'ss_': ss_} 

def bicharconc(s):
    #s_ is the shifted array of s by one unit
    s_ = np.append(s[1:], s[0])
    #element wise addition of two array, get bigram letter array now
    ss_ = np.core.defchararray.add(s, s_)
    #since the combintation 'â¥â¤' is useless
    return np.delete(ss_, np.where(ss_==u'â¥â¤'))

def train(data):
    """get the model.
    since every element cc_ in ss_, c appears first, then c_,
    P(c_|c) = P(cc_) / P(c) = #(cc_) / #(c)
    """
    train_data = data['train']
    targ_sskeys = data['targets']
    #fetch all bichars and remove the redundancy
    train_sskeys = pd.DataFrame(train_data).loc['ss_',:].values
    train_sskeys =  set(itertools.chain(*map(list, train_sskeys)))
    sskeys = targ_sskeys.intersection(train_sskeys)
    model = pd.DataFrame(index=sskeys, columns=train_data.keys(), dtype='float')
    #fill the table with probs
    for dkey in train_data.keys():
        s, ss_ = train_data[dkey]['s'], train_data[dkey]['ss_']
        s_cont, ss_cont = map(collections.Counter, [s, ss_])
        for sskey in sskeys: 
            model.loc[sskey, dkey] = smooth(ss_cont[sskey], s_cont[sskey[0]], len(s_cont.keys()))
    return model

def smooth(joint, marg, n_letter):
    'smooth function'
    #joint represents joint prob, marg represents marginal prob
    return laplace(joint, marg, n_letter)

def laplace(joint, marg, n_letter):
    return float(joint+1) / (marg+n_letter)

def predict(test_data, model):
    'predict the language class using trained model'
    predicted_values = []
    for i in xrange(len(test_data)):
        #same technique for preprocessing the training data, concat the neighbor chars
        ss_ = bicharconc(list(test_data[i]))   
        #to avoid the underflow, use log of prob
        logp_sum = np.zeros(3,)
        for bichar in ss_:
            if bichar in model.index.values:
                
                logp_sum += np.log(model.loc[bichar, :].values)
        predicted_values.append(model.columns.values[logp_sum.argmax()])

    return np.array(predicted_values)

def write_solution(predicted_values):
    idx = np.arange(1, predicted_values.size+1)
    out = np.array(zip(idx, predicted_values))
    workdir = get_workdir()
    out_path = os.path.join(workdir, 'output', 'letterLangId.out')
    np.savetxt(out_path, out, fmt='%s', delimiter=' ')
    print "Refer to output file under work directory for solution\n"

def evaluate(predicted_values):
    'calculate the accuracy'
    workdir = get_workdir()
    #read the test solution
    sol_path = os.path.join(workdir, 'solution')
    groundtruth_values = np.loadtxt(os.path.join(sol_path, 'LangId.sol'), dtype='S', delimiter=' ', usecols=[-1])
    return (groundtruth_values == predicted_values).sum() / float(groundtruth_values.size)





In [None]:
# %load LangIdScript.py
#!/usr/bin/env python
import letterLangId
import wordLangId
def letter_lang_id_script():
    'letter bigram model for language ID'
    data = letterLangId.get_data()
    model = letterLangId.train(data['train'])
    predicted_values = letterLangId. predict(data['test'], model)
    accuracy = letterLangId. evaluate(predicted_values)
    print "accuracy is {}".format(accuracy)
    letterLangId. write_solution(predicted_values)

def word_lang_id_script():
    'word bigram model for language ID'
    data = wordLangId.get_data()
    model = wordLangId.train(data['train'])
    predicted_values = wordLangId.predict(data['test'], model)
    accuracy = letterLangId.evaluate(predicted_values)
    print "accuracy is {}".format(accuracy)
    wordLangId.write_solution(predicted_values)

if __name__ == '__main__':
    print "This is part I: language identification by letter"
    letter_lang_id_script()

    print "This is part II: language identification by word"
    word_lang_id_script()


In [5]:
data = get_data()


In [39]:
data['targets']

[u' b',
 u' c',
 u'gu',
 u' a',
 u' f',
 u' g',
 u' d',
 u' e',
 u' j',
 u' k',
 u' h',
 u' i',
 u' n',
 u' o',
 u' l',
 u' m',
 u' r',
 u' s',
 u' p',
 u' q',
 u' v',
 u' w',
 u' t',
 u' u',
 u'go',
 u'gn',
 u' x',
 u'gl',
 u'gi',
 u'gr',
 u' B',
 u' C',
 u' A',
 u' F',
 u' G',
 u' D',
 u'ty',
 u' J',
 u'tw',
 u'tt',
 u'tu',
 u'tr',
 u'ts',
 u' L',
 u' M',
 u' R',
 u'to',
 u' P',
 u' Q',
 u' V',
 u' W',
 u'th',
 u'ti',
 u' Z',
 u'te',
 u'tc',
 u'ta',
 u' "',
 u' #',
 u' !',
 u' &',
 u" '",
 u': ',
 u' (',
 u' )',
 u' .',
 u' ,',
 u' -',
 u' 2',
 u' 3',
 u' 1',
 u' 6',
 u' 4',
 u'g ',
 u' :',
 u' ;',
 u'E ',
 u' ?',
 u'gg',
 u'- ',
 u'--',
 u'ge',
 u't.',
 u'\xf3n',
 u'Go',
 u"t'",
 u'Gl',
 u"g'",
 u't ',
 u'rz',
 u' \xe0',
 u'zo',
 u'ga',
 u' \xea',
 u'ze',
 u' \xe8',
 u' \xe9',
 u'za',
 u' z',
 u'aw',
 u'zz',
 u'\xe8 ',
 u'gm',
 u'g\xe9',
 u' y',
 u' \xc9',
 u'\xe2\x89',
 u't\xe8',
 u't\xe9',
 u'wn',
 u'gh',
 u't\xe0',
 u'Me',
 u'Ma',
 u'\xe8r',
 u'\xe8s',
 u'Mo',
 u'z ',
 u':\xe2',


In [6]:
model = train(data)
predicted_values = predict(data['test'], model)
accuracy = evaluate(predicted_values)
print "accuracy is {}".format(accuracy)

accuracy is 0.996666666667


In [50]:
model

Unnamed: 0,Italian,French,English
b,0.00628983,0.00691864,0.0322184
c,0.0830713,0.0550979,0.0371705
gu,0.0547049,0.0679671,0.03822
a,0.0744018,0.0569924,0.0963179
f,0.0177455,0.0160144,0.0282785
g,0.0154735,0.00685584,0.00945829
d,0.109319,0.118852,0.021242
e,0.0460736,0.0648008,0.0184467
j,5.97893e-05,0.00755712,0.00220493
k,2.39157e-05,0.000104669,0.00193985


In [18]:
%%time
model = train(data)


CPU times: user 3.71 s, sys: 72 ms, total: 3.79 s
Wall time: 3.76 s


In [8]:
%%time
model = train(data)

CPU times: user 2.55 s, sys: 154 ms, total: 2.7 s
Wall time: 2.62 s


In [42]:
data['train'].keys()

['Italian', 'French', 'English']

In [5]:
# %load wordLangId.py
'''
Author: Yifeng Chu (ychu26)

These are functions for word bigram language identification
some details: 
    space and punctuations are preserved when training the model.
'''
import os
import numpy as np
import codecs
import collections
import pandas as pd
import itertools

def get_data():
    '''get the preprocessed training data and testing data and store 
    them in the dictionary'''
    workdir = get_workdir()
    join_path = os.path.join
    data_path = join_path(workdir, 'database')
    testin_path = join_path(workdir, 'input') 
    data = {}
    data['test'] = readtxt(join_path(testin_path,'LangId.test'), 'test')
    data['targets'] = set(preprocess(data['test'])['ww_'])
    data['train'] = {}
    data['train']['English'] = preprocess(readtxt(join_path(data_path,'LangId.train.English')))
    data['train']['French'] = preprocess(readtxt(join_path(data_path,'LangId.train.French')))
    data['train']['Italian'] = preprocess(readtxt(join_path(data_path,'LangId.train.Italian')))
    return data

def get_workdir():
    bin_path = os.getcwd()
    return  os.path.dirname(bin_path)

def readtxt(path, *args):
    f = codecs.open(path, encoding='latin-1', mode='r')
    #'â¤â¥' are usd as start and end symbol for every sentence
    data = [[u'â¤'] + line.strip().split(' ') + [u'â¥'] for line in f.readlines()]
    f.close()
    return data if args else data[:100]

def preprocess(sentences):
    w = np.array(list(itertools.chain(*sentences)))
    ww_ = biwordconc(w)
    return {'w': w, 'ww_': ww_}

def biwordconc(w):
    w_ = np.append(w[1:], w[0])
    # in order to separate two words, add a space between them
    ww_ = reduce(np.core.defchararray.add, [w, u' ', w_])
    return np.delete(ww_, np.where(ww_==u'â¥ â¤'))
    
def train(data):
    """get the model.
    since every element vv_ in ww_, v appears first, then v_,
    P(v_|v) = P(vv_) / P(v) = #(vv_) / #(v)
    """
    #fetch all bichars and remove the redundancy
    train_data = data['train']
    targ_wwkeys = data['targets']
    train_wwkeys = pd.DataFrame(train_data).loc['ww_',:].values
    train_wwkeys =  set(itertools.chain(*map(list, train_wwkeys)))
    wwkeys = targ_wwkeys.intersection(train_wwkeys) 
    model = pd.DataFrame(index=wwkeys, columns=train_data.keys(), dtype='float')
    #fill the table with probs
    for dkey in train_data.keys():
        w, ww_ = train_data[dkey]['w'], train_data[dkey]['ww_']
        w_cont, ww_cont = map(collections.Counter, [w, ww_])
        for wwkey in wwkeys: 
            model.loc[wwkey, dkey] = smooth(ww_cont[wwkey], w_cont[wwkey[0]], len(w_cont.keys()))
    return model

def smooth(joint, marg, n_word):
    'smooth function'
    #joint represents joint prob, marg represents marginal prob
    return laplace(joint, marg, n_word)

def laplace(joint, marg, n_word):
    return float(joint+1) / (marg+n_word)

def predict(test_data, model):
    'predict the language class using trained model'
    predicted_values = []
    for i in xrange(len(test_data)):
        #same technique for preprocessing the training data, concat the neighbor chars
        ww_ = biwordconc(test_data[i])   
        #to avoid the underflow, use log of prob
        logp_sum = np.zeros(3,)
        for biword in ww_:
            if biword in model.index.values:
                logp_sum += np.log(model.loc[biword, :].values)
        predicted_values.append(model.columns.values[logp_sum.argmax()])

    return np.array(predicted_values)

def write_solution(predicted_values):
    idx = np.arange(1, predicted_values.size+1)
    out = np.array(zip(idx, predicted_values))
    workdir = get_workdir()
    out_path = os.path.join(workdir, 'output', 'wordLangId.out')
    np.savetxt(out_path, out, fmt='%s', delimiter=' ')
    print "Refer to output file under work directory for solution\n"


In [3]:
data = get_data()

In [8]:
model = train(data)
predicted_values = predict(data['test'], model)
accuracy = evaluate(predicted_values)
print "accuracy is {}".format(accuracy)

accuracy is 0.95


In [7]:
def evaluate(predicted_values):
    'calculate the accuracy'
    workdir = get_workdir()
    #read the test solution
    sol_path = os.path.join(workdir, 'solution')
    groundtruth_values = np.loadtxt(os.path.join(sol_path, 'LangId.sol'), dtype='S', delimiter=' ', usecols=[-1])
    return (groundtruth_values == predicted_values).sum() / float(groundtruth_values.size)

In [9]:
%%time
model = train(data)

CPU times: user 422 ms, sys: 6.46 ms, total: 428 ms
Wall time: 425 ms


In [10]:
model

Unnamed: 0,Italian,French,English
que j,0.001002,0.004717,0.001319
qui est,0.001002,0.003538,0.001319
al Parlamento,0.005848,0.001151,0.001253
et de,0.000960,0.003538,0.001314
que l,0.001002,0.004717,0.001319
it in,0.000985,0.001179,0.002639
to have,0.001002,0.001175,0.002639
", but",0.000893,0.001028,0.003601
the most,0.001002,0.001175,0.003958
concerning the,0.001002,0.001172,0.002639
