In [8]:
import sys
sys.path.append('../')
from utility.file_utility import FileUtility
from alignment.fastalign_utility import FastAlignUtility
from sklearn.feature_extraction.text import TfidfVectorizer
from chi2analysis.chi2analysis import Chi2Analysis
import itertools
import codecs

As in the first notebook, we start by generating the word alignments from the forward alignment again.

In [9]:
# Step 1: run the aligner on the parallel corpus, using 'output/' as its second argument
# (presumably the directory that receives 'english2german_fwd.align' -- confirm in FastAlignUtility).
FastAlignUtility.run_fastalign_file(
    'dataset/english2german.txt',
    'output/',
)
# Step 2: combine the corpus with the forward alignment file into 'output/fwd_words.txt',
# the file that is later passed to Chi2Alignment.
FastAlignUtility.generate_word_alignemnts(
    'dataset/english2german.txt',
    'output/english2german_fwd.align',
    'output/fwd_words.txt',
)

This time we would like to use the Chi-Square test to find the words that are significantly correlated with the word 'was'.

# Word-level Chi-Square

In [19]:
def produce_labels(pairs):
    '''
    Label the second part of a colon-separated token.

    :param pairs: a token of the form "<first>:<second>"
    :return: a one-element list [(label, second)], where label is 1 when
        <first> is contained in the module-level pos_words and 0 otherwise
    '''
    parts = pairs.split(':')
    # Membership is evaluated before parts[1] is touched, so a token without
    # ':' still fails with IndexError only after the lookup in pos_words.
    label = 1 if parts[0] in pos_words else 0
    return [(label, parts[1])]

def Chi2Alignment(tagged_file, level, output_path='output/chi2_res.txt'):
    '''
    Run a chi-square feature analysis over the labelled words of an alignment file.

    Every whitespace-separated token of tagged_file is passed to
    produce_labels, giving a (label, word) pair. The non-empty words are
    vectorized with raw term counts (TfidfVectorizer with use_idf=False and
    norm=None) and handed to Chi2Analysis together with the labels.

    :param tagged_file: path to a UTF-8 text file of whitespace-separated tokens
        in the format expected by produce_labels
    :param level: 'word' for whole-word unigram features, or 'char' for
        character 2- to 6-grams over each word wrapped as '$' + word + '@'
    :param output_path: file path passed to Chi2Analysis.extract_features_fdr;
        defaults to the location that was previously hard-coded
    :return: the value returned by Chi2Analysis.extract_features_fdr
    :raises ValueError: if level is not 'word' or 'char', or if the file
        yields fewer than two tokens
    '''
    # Validate the level up front instead of failing later on an unbound tfvec.
    if level == 'word':
        tfvec = TfidfVectorizer(use_idf=False, ngram_range=(1, 1), norm=None, stop_words=[], lowercase=True, binary=False)
    elif level == 'char':
        tfvec = TfidfVectorizer(use_idf=False, ngram_range=(2, 6), norm=None, analyzer='char', stop_words=[], lowercase=True, binary=False)
    else:
        raise ValueError("level must be 'word' or 'char', got %r" % (level,))

    # The context manager closes the file handle, which the original left open.
    with codecs.open(tagged_file, 'r', 'utf-8') as tagged:
        lines = tagged.readlines()

    # Flatten to one [label, word] list per token across all lines.
    tagged_word_reduced = [
        list(itertools.chain(*produce_labels(pairs)))
        for line in lines
        for pairs in line.split()
    ]

    # Previously this case skipped the assignment of X and Y and failed on the
    # following lines with an unrelated error.
    if len(tagged_word_reduced) <= 1:
        raise ValueError('%s does not contain enough aligned tokens for a chi-square analysis' % tagged_file)

    # Tokens with an empty word part are dropped from the corpus and the labels alike.
    labelled = [item for item in tagged_word_reduced if len(item[1]) > 0]
    if level == 'char':
        # '$' and '@' mark the start and the end of each word in the character n-grams.
        corp = ['$' + item[1].strip() + '@' for item in labelled]
    else:
        corp = [item[1] for item in labelled]
    X = tfvec.fit_transform(corp)
    Y = [item[0] for item in labelled]

    # get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
    # when it exists and convert to a plain list to keep the previous type.
    if hasattr(tfvec, 'get_feature_names_out'):
        feature_names = list(tfvec.get_feature_names_out())
    else:
        feature_names = tfvec.get_feature_names()

    CHA = Chi2Analysis(X, Y, feature_names)
    # NOTE(review): the meaning of 50 is defined by Chi2Analysis.extract_features_fdr -- confirm there.
    res = CHA.extract_features_fdr(output_path, 50)
    return res

In [20]:
# Words that produce_labels treats as the positive class (label 1).
# A `global` statement has no effect at module scope, so a plain assignment is enough.
pos_words = ['was']

In [21]:
# Word-level analysis: features are whole words.
res = Chi2Alignment(tagged_file='output/fwd_words.txt', level='word')


In [22]:
# Display the result of the word-level Chi2Alignment run.
res

[['wurde', 21720.05, 0.0, 247.0, 203.0, 1011.0, 203399.0],
 ['war', 19402.44, 0.0, 279.0, 360.0, 979.0, 203242.0],
 ['geriet', 1087.96, 1.3643309780469353e-238, 9.0, 3.0, 1249.0, 203599.0],
 ['begriff', 424.47, 2.5914390499432313e-94, 9.0, 21.0, 1249.0, 203581.0],
 ['pflegte', 362.4, 8.46538489828032e-81, 5.0, 6.0, 1253.0, 203596.0],
 ['worden', 349.04, 6.874968575342696e-78, 29.0, 311.0, 1229.0, 203291.0],
 ['dicht', 323.69, 2.274105886817308e-72, 2.0, 0.0, 1256.0, 203602.0],
 ['ehrfurchtsvoll', 323.69, 2.274105886817308e-72, 2.0, 0.0, 1256.0, 203602.0],
 ['fürchtete', 319.7, 1.6803180397175876e-71, 4.0, 4.0, 1254.0, 203598.0],
 ['fand', 290.81, 3.318721092523675e-65, 9.0, 34.0, 1249.0, 203568.0],
 ['kam', 263.24, 3.37671111783482e-59, 19.0, 178.0, 1239.0, 203424.0],
 ['schwäche', 214.46, 1.459450727583522e-48, 2.0, 1.0, 1256.0, 203601.0],
 ['zerrissen', 214.46, 1.459450727583522e-48, 2.0, 1.0, 1256.0, 203601.0],
 ['freute', 204.67, 1.995895310913875e-46, 3.0, 4.0, 1255.0, 203598.0],


# Char-level Chi-Square

In [23]:
# Character-level analysis: features are character n-grams of each word.
res = Chi2Alignment(tagged_file='output/fwd_words.txt', level='char')


In [24]:
# Display the result of the character-level Chi2Alignment run.
res

[['urde@', 21720.05, 0.0, 247.0, 203.0, 1011.0, 203399.0],
 ['wurde@', 21720.05, 0.0, 247.0, 203.0, 1011.0, 203399.0],
 ['$war@', 19402.44, 0.0, 279.0, 360.0, 979.0, 203242.0],
 ['war@', 19004.55, 0.0, 279.0, 373.0, 979.0, 203229.0],
 ['$wurd', 15629.05, 0.0, 252.0, 393.0, 1006.0, 203209.0],
 ['$wurde', 15629.05, 0.0, 252.0, 393.0, 1006.0, 203209.0],
 ['urd', 15629.05, 0.0, 252.0, 393.0, 1006.0, 203209.0],
 ['urde', 15629.05, 0.0, 252.0, 393.0, 1006.0, 203209.0],
 ['wurd', 15629.05, 0.0, 252.0, 393.0, 1006.0, 203209.0],
 ['wurde', 15629.05, 0.0, 252.0, 393.0, 1006.0, 203209.0],
 ['$wur', 15214.88, 0.0, 252.0, 410.0, 1006.0, 203192.0],
 ['wur', 15027.26, 0.0, 252.0, 418.0, 1006.0, 203184.0],
 ['$wu', 13526.43, 0.0, 256.0, 509.0, 1002.0, 203093.0],
 ['ar@', 13413.96, 0.0, 279.0, 634.0, 979.0, 202968.0],
 ['wu', 12730.54, 0.0, 256.0, 555.0, 1002.0, 203047.0],
 ['$war', 11250.2, 0.0, 286.0, 848.0, 972.0, 202754.0],
 ['war', 10526.66, 0.0, 286.0, 922.0, 972.0, 202680.0],
 ['rde@', 9782.8, 0