In [84]:
import codecs
import numpy as np
import random
from sklearn import metrics
import matplotlib.pyplot as plt
from nltk import FreqDist
from sklearn.feature_selection import SelectKBest, SelectFdr
from sklearn.feature_selection import chi2


class Chi2Anlaysis(object):
    """Rank features with a chi-squared test against the labels and write reports.

    Each ``extract_features_*`` method fits a scikit-learn univariate selector
    using the ``chi2`` score function and writes a tab-separated report with one
    row per reported feature: name, chi2 score, p-value, and the mean(+/-)std of
    the feature over the rows labelled 1 and over the rows labelled 0.

    The constructor stores its arguments unchanged:
        X: feature matrix, indexed as ``X[:, column]``.
        Y: labels, one per row of ``X``; only the values 1 and 0 are summarised.
        feature_names: name of each column of ``X``, in column order.
    """

    # Shared by both reports. The last two columns hold mean(+/-)std for the
    # label-1 and label-0 rows, so they are titled as averages rather than counts.
    _HEADER = ['feature', 'score', 'p-value', 'avg I', 'avg O']

    def __init__(self,X, Y, feature_names):
        self.X=X
        self.Y=Y
        self.feature_names=feature_names

    def extract_features_kbest(self, N):
        """Write the N features with the highest chi2 score to 'test_kbest.txt'.

        Args:
            N: maximum number of features to report.
        """
        selector = SelectKBest(chi2, k='all')
        # Only scores_ and pvalues_ are needed, so the transformed matrix is not built.
        selector.fit(self.X, self.Y)
        scores = np.asarray(selector.scores_, dtype=float)
        # Negating gives a descending order; mergesort is stable, so ties stay in
        # column order, and NaN scores sort last instead of corrupting the order.
        columns = np.argsort(-scores, kind='mergesort')[:N]
        self._write_report('test_kbest.txt', selector, columns)

    def extract_features_fdr(self, N):
        """Write up to N features that pass FDR control to 'test_fdr.txt'.

        Features are selected with the Benjamini-Hochberg procedure (alpha=0.05,
        an upper bound on the expected false discovery rate) and reported in
        order of increasing p-value.
        See https://brainder.org/2011/09/05/fdr-corrected-fdr-adjusted-p-values/

        Args:
            N: maximum number of features to report.
        """
        selector = SelectFdr(chi2, alpha=5e-2)
        selector.fit(self.X, self.Y)
        pvalues = np.asarray(selector.pvalues_, dtype=float)
        by_pvalue = np.argsort(pvalues, kind='mergesort')
        # Keep only the columns the FDR procedure accepted; without this filter
        # alpha would have no influence on what is reported.
        accepted = selector.get_support()
        columns = by_pvalue[accepted[by_pvalue]][:N]
        self._write_report('test_fdr.txt', selector, columns)

    def _write_report(self, path, selector, columns):
        """Write one tab-separated row per column index in ``columns`` to ``path``."""
        labels = np.asarray(self.Y)
        in_class = labels == 1
        out_class = labels == 0
        # The encoding is explicit so non-ASCII feature names are written
        # regardless of the platform default; 'with' closes the file on error.
        with codecs.open(path, 'w', 'utf-8') as f:
            f.write('\t'.join(self._HEADER) + '\n')
            for col in columns:
                values = self._column(col)
                row = [
                    str(self.feature_names[col]),
                    str(selector.scores_[col]),
                    str(selector.pvalues_[col]),
                    self._mean_std(values[in_class]),
                    self._mean_std(values[out_class]),
                ]
                f.write('\t'.join(row) + '\n')

    def _column(self, col):
        """Return column ``col`` of X as a flat numpy array (dense or sparse X)."""
        column = self.X[:, col]
        if hasattr(column, 'toarray'):
            # Sparse matrices expose toarray(); densify just this one column.
            column = column.toarray()
        return np.asarray(column).ravel()

    @staticmethod
    def _mean_std(values):
        """Format ``values`` as 'mean(+/-)std', both rounded to 2 decimals."""
        return str(round(np.average(values), 2)) + '(+/-)' + str(round(np.std(values), 2))
        
        
from sklearn.feature_extraction.text import TfidfVectorizer
    
class TextFeature(object):
    """Raw term-frequency features for a corpus of text documents.

    Args:
        corpus: iterable of documents passed to the vectorizer.
        analyzer: vectorizer ``analyzer`` setting (default 'word').
        ngram: ``(min_n, max_n)`` passed as ``ngram_range`` (default (1, 1)).

    Attributes:
        tf_vec: dense array produced by ``fit_transform(corpus).toarray()``.
        feature_names: list of feature names, in column order of ``tf_vec``.
    """

    def __init__(self, corpus, analyzer='word', ngram=(1,1)):
        # use_idf=False and norm=None turn the TF-IDF vectorizer into a plain
        # term counter; text is neither lowercased nor stop-word filtered.
        tfm = TfidfVectorizer(use_idf=False, analyzer=analyzer, ngram_range=ngram, norm=None, stop_words=[], lowercase=False)
        self.tf_vec = tfm.fit_transform(corpus).toarray()
        # get_feature_names() was removed in scikit-learn 1.2; prefer the
        # replacement when available and keep a list so callers can use .index().
        if hasattr(tfm, 'get_feature_names_out'):
            self.feature_names = list(tfm.get_feature_names_out())
        else:
            self.feature_names = tfm.get_feature_names()
        

In [83]:
# Surah_test
# Each non-blank line is split on whitespace: the part of token 1 before '##'
# becomes the label key and tokens 2 onward become the document text.
with codecs.open('test/id_quran_roots.txt','r','utf-8') as roots_file:
    data=[(l.split()[1].split('##')[0],l.strip().split()[2::]) for l in roots_file.readlines() if l.strip()]
corpus=[' '.join(x) for y,x in data]
# Documents whose key is '36' form the positive class; everything else is negative.
label=[1 if y=='36' else 0 for y,x in data]
# TF must exist before CA is built: CA reads TF.tf_vec and TF.feature_names.
TF=TextFeature(corpus,ngram=(1,5))
CA=Chi2Anlaysis(TF.tf_vec, label,TF.feature_names)
CA.extract_features_kbest(100)
CA.extract_features_fdr(100)

In [109]:
# Toy mapping from a name to a 2-tuple, used to try out sorting by the first element.
A = dict(
    a=(2.00, 2),
    d=(1.00, 0.5),
    c=(23, 0.1),
)

In [110]:
import operator
# Sort the (name, (first, second)) items by the first tuple element, largest first.
# The previous key, operator.itemgetter([1][0]), evaluated to itemgetter(1) and
# therefore compared whole tuples; the key below compares only the first element,
# so items that tie on it keep their original order.
sorted_x = sorted(A.items(), key=lambda item: item[1][0], reverse=True)

In [112]:
# Show the first element of each value tuple, in sorted order.
for _name, pair in sorted_x:
    first = pair[0]
    print(first)

23
2.0
1.0
