Four words
==========
Determine which of four words is the odd one out. 

The data is read from a comma-separated file with four columns, where the fourth column
contains the odd one out. The task is to identify that word
(without using the fact that it is always in this specific column).

In [1]:
from everything import *
from dasem.semantic import Semantic

In [30]:
# Instantiate the semantic model without arguments, i.e. with its default settings
semantic = Semantic()

In [2]:
# Read the four-words dataset, decoded as UTF-8.
# The path is relative, so it resolves against the current working directory.
four_words = read_csv('../dasem/data/four_words.csv', encoding='utf-8')

In [4]:
# Preview the first rows of the dataset
four_words.head()

Unnamed: 0,word1,word2,word3,word4
0,æble,pære,kirsebær,stol
1,stol,bord,reol,græs
2,græs,træ,blomst,bil
3,bil,cykel,tog,vind
4,vind,regn,solskin,mandag


In [71]:
# Predict the odd one out for every row: the first word returned by
# sort_by_outlierness is taken as the prediction. Only the first four
# values of a row are passed on, so an already added 'outlier' column is ignored.
outlier = [
    semantic.sort_by_outlierness(row.values[:4])[0]
    for _, row in four_words.iterrows()
]

# Keep the predictions next to the words so they can be inspected together
four_words['outlier'] = outlier

In [72]:
# Accuracy: fraction of rows where the predicted outlier equals the fourth word
(four_words.word4 == outlier).mean()

0.78000000000000003

In [73]:
# Display the dataset, which at this point also carries the 'outlier' prediction column
four_words

Unnamed: 0,word1,word2,word3,word4,outlier
0,æble,pære,kirsebær,stol,stol
1,stol,bord,reol,græs,græs
2,græs,træ,blomst,bil,bil
3,bil,cykel,tog,vind,vind
4,vind,regn,solskin,mandag,mandag
5,mandag,tirsdag,søndag,tømrer,tømrer
6,tømrer,vvs-mand,snedker,barn,barn
7,barn,far,mormor,lampe,lampe
8,lampe,stearinlys,lommelygte,jern,jern
9,jern,guld,magnesium,sjov,sjov


Parameters that might affect performance
----------------------------------------

- Number of pages read
- Use of stopwords
- Exclusion of short pages
- Scaling of the matrix (tf-idf versus raw counts)
- Normalization of the documents
- Factorization of matrix

In [3]:
def compute_accuracy(semantic, four_words):
    """Compute the odd-one-out accuracy of a semantic model.

    For each row the first four values are handed to
    ``semantic.sort_by_outlierness`` and the first element of the returned
    sequence is taken as the predicted outlier.

    Parameters
    ----------
    semantic : object
        Model exposing a ``sort_by_outlierness(words)`` method.
    four_words : pandas.DataFrame
        Data frame whose first four columns hold the words and which has a
        ``word4`` column containing the true odd one out.

    Returns
    -------
    accuracy : float
        Mean of the element-wise comparison between ``word4`` and the
        predicted outliers.

    """
    predicted = [
        semantic.sort_by_outlierness(row.values[:4])[0]
        for _, row in four_words.iterrows()
    ]
    return mean(four_words.word4 == predicted)

In [4]:
# Hyperparameter grid for the Semantic model; every combination is evaluated.
max_n_pagess = [10, 100, 1000, 3000]
norms = ['l1', 'l2', None]
stop_wordss = [None, set(nltk.corpus.stopwords.words('danish'))]
use_idfs = [True, False]
sublinear_tfs = [True, False]

columns = ['accuracy', 'stop_words', 'use_idf', 'norm', 'sublinear_tf', 'max_n_pages']

n_total = len(max_n_pagess) * len(norms) * len(stop_wordss) * len(use_idfs) * \
    len(sublinear_tfs)

# Collect one record per configuration and build the data frame once at the
# end. Filling a pre-allocated float frame cell by cell through DataFrame.ix
# no longer works: .ix has been removed from pandas, and writing strings and
# booleans into float columns relies on implicit dtype changes.
records = []
for stop_words_index, stop_words in enumerate(stop_wordss):
    for norm in norms:
        for use_idf in use_idfs:
            for sublinear_tf in sublinear_tfs:
                for max_n_pages in max_n_pagess:
                    # NOTE: this rebinds the notebook-level `semantic` variable
                    semantic = Semantic(stop_words=stop_words, norm=norm,
                                        use_idf=use_idf, sublinear_tf=sublinear_tf,
                                        max_n_pages=max_n_pages)
                    records.append({
                        'accuracy': compute_accuracy(semantic, four_words),
                        # Store the position in stop_wordss rather than the
                        # stop word set itself
                        'stop_words': stop_words_index,
                        'use_idf': use_idf,
                        # None is stored as the string 'None'
                        'norm': str(norm),
                        'sublinear_tf': sublinear_tf,
                        'max_n_pages': max_n_pages,
                    })

results = DataFrame(records, columns=columns)

# Every combination of the grid must have produced exactly one row
assert len(results) == n_total

In [5]:
# Display the grid-search results, one row per evaluated configuration
results

Unnamed: 0,accuracy,stop_words,use_idf,norm,sublinear_tf,max_n_pages
0,0.02,0.0,True,l1,True,10.0
1,0.08,0.0,True,l1,True,100.0
2,0.28,0.0,True,l1,True,1000.0
3,0.36,0.0,True,l1,True,3000.0
4,0.02,0.0,True,l1,False,10.0
5,0.08,0.0,True,l1,False,100.0
6,0.26,0.0,True,l1,False,1000.0
7,0.34,0.0,True,l1,False,3000.0
8,0.00,0.0,False,l1,True,10.0
9,0.06,0.0,False,l1,True,100.0


In [6]:
# Regress accuracy on the grid-search settings to see which of them matter.
# `smf` comes from the star import; presumably statsmodels.formula.api — confirm.
formula = 'accuracy ~ stop_words + use_idf + norm + sublinear_tf + max_n_pages'
# No family is passed to glm; the summary reports a Gaussian family with identity link.
# `model` holds the fitted result returned by .fit(), not the unfitted model.
model = smf.glm(formula, data=results).fit()
model.summary()

0,1,2,3
Dep. Variable:,accuracy,No. Observations:,96.0
Model:,GLM,Df Residuals:,89.0
Model Family:,Gaussian,Df Model:,6.0
Link Function:,identity,Scale:,0.00336693637528
Method:,IRLS,Log-Likelihood:,140.72
Date:,"Fri, 30 Sep 2016",Deviance:,0.29966
Time:,14:16:05,Pearson chi2:,0.3
No. Iterations:,2,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.0778,0.015,5.067,0.000,0.048,0.108
use_idf[T.True],0.0008,0.012,0.070,0.944,-0.022,0.024
norm[T.l1],-0.0125,0.015,-0.862,0.389,-0.041,0.016
norm[T.l2],3.296e-17,0.015,2.27e-15,1.000,-0.028,0.028
sublinear_tf[T.True],-0.0025,0.012,-0.211,0.833,-0.026,0.021
stop_words,7.199e-17,0.012,6.08e-15,1.000,-0.023,0.023
max_n_pages,0.0001,4.92e-06,23.076,0.000,0.000,0.000
