In [18]:
#Feature Extraction with 
#1) SelectKBest function with algorithm f_regression -> F-value between label/feature for regression tasks.
#2) Recursive Feature Elimination
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.linear_model import LogisticRegression
from copy import deepcopy
from patsy import dmatrices

import os
import sys
import pickle
import cPickle

In [2]:
print "pandas version, needs to be 0.18+", pd.__version__
#print "sklearn version, needs to be 0.18+", sklearn.__version__

pandas version, needs to be 0.18+ 0.19.0


In [3]:
#args: rootdir -> root directory, ending -> file ending
#return: list of pathes in rootdir 
def lsdir(rootdir, ending):
    pathlist = []
    for root,  dirs,  files in os.walk(rootdir,  topdown=False):
        for filename in files:
            if ending in filename:
                pathlist.append(filename)
    return pathlist

#read in a pickled dataframe for an episode in TAKE
#input: speaker, episode number, directorypath
#return: unpickled dataframe for episode
def open_pkl_ep(speaker,ep,ep_path):
    
    fname = ep_path+'r'+str(int(speaker))+'_'+str(ep)+'.pkl'
    
    with open(fname,'rb') as fp:
        ep_df = pickle.load(fp)
    #ep_df = pd.read_pickle(fname)
    ep_df = ep_df[['speaker', 'episode', 'time_in_sec',\
                   'label','label_dur',\
                   'pcm_LOGenergy_sma', 'pcm_RMSenergy_sma',\
                   'pcm_intensity_sma', 'intensity_mean','intensity_slope', \
                   'pcm_loudness_sma', \
                   'phones',\
                   'duration', 'zscore', \
                   'wml',  'wml_trigram',\
                   'rms_minus_four',  'rms_minus_one', \
                   'rms_minus_three',  'rms_minus_two',\
                   'voicingFinalUnclipped_sma',\
                   'voicingFinalUnclipped_slope', 'words' ]]
    #if ep==2.0:
    #    print ep_df.keys()
    return ep_df

In [4]:
#create the training set
#input: speaker, path to directory of pickled episodes
#return: dataframe for train
def get_train_all(eps_path,return_individual_fold=False, limit_of_final_trps=.75):
      
    train = pd.DataFrame() if not return_individual_fold else [] #list if folds
    pathlist = []
    
    print 'Get list of paths...'
    for sp in range(2,8):
        pathlist += lsdir(eps_path+'r'+str(int(sp))+'/', '.pkl')

    print 'Load pickled episodes into Dataframe...'
    lsp=1
    speaker_dict = {}
    out=[]
    
    for path in pathlist:
        ep = path.split('_')[1].split('.pkl')[0]
        speak = path.split('_')[0][1]
        eppath = eps_path+'r'+str(int(speak))+'/'#+path
        

        try:
            train_ep = open_pkl_ep(speak,ep,eppath)
        except:
            print speak, ep
            continue

        train_ep = train_ep.drop(train_ep[(train_ep.label == 2)&(train_ep.label_dur>limit_of_final_trps)].index)
        train_ep = train_ep.drop(train_ep[(train_ep.time_in_sec < 1.01)].index)
        #print len(train_ep)
        #break
        
        if int(speak) != int(lsp):
            print "speaker", speak
            speaker_dict[int(speak)] = pd.DataFrame()
            lsp = speak
            #print train_ep.keys()
        
        if return_individual_fold:
            speaker_dict[int(speak)] = speaker_dict[int(speak)].append(deepcopy(train_ep))
        else:
            out_tupel = (int(speak),int(float(ep)))
            if out_tupel in out:
                print 'out ',speak,ep
                continue
            else:
                train = train.append(deepcopy(train_ep))
    print "training data loaded"
    
    if return_individual_fold:
        train = [speaker_dict[key] for key in sorted(speaker_dict.keys())]
    return train

In [29]:
eps_path = './../../Data/pickled_episodes_1/'
train_all = get_train_all(eps_path,return_individual_fold=False, limit_of_final_trps=.75)

Get list of paths...
Load pickled episodes into Dataframe...
speaker 2
speaker 3
speaker 4
speaker 5
speaker 6
6 129
6 138
6 150
speaker 7
training data loaded


In [14]:
#some possible feature combinations
rmsfeats = 'label ~ time_in_sec +\
pcm_LOGenergy_sma + pcm_loudness_sma +\
pcm_RMSenergy_sma + rms_minus_one + rms_minus_two + rms_minus_three + rms_minus_four'

rmsfeats_raw = 'label ~ time_in_sec +\
pcm_RMSenergy_sma + rms_minus_one + rms_minus_two + rms_minus_three + rms_minus_four'

durfeats = 'label ~ time_in_sec +\
zscore + duration'

intensity = 'label ~ time_in_sec +\
pcm_intensity_sma  + intensity_mean + intensity_slope'

rmsintens = 'label ~ time_in_sec +\
pcm_RMSenergy_sma + rms_minus_one + rms_minus_two + rms_minus_three + rms_minus_four+\
pcm_intensity_sma  + intensity_mean + intensity_slope'

lm_features = 'label ~ time_in_sec + wml + wml_trigram'

rmsfeatslm = rmsfeats + " + "+ lm_features[lm_features.find('time_in_sec +')+13:]
durfeatslm = durfeats + " + "+ lm_features[lm_features.find('time_in_sec +')+13:]
intensitylm = intensity + " + "+ lm_features[lm_features.find('time_in_sec +')+13:]
rmsintenslm = rmsintens + " + "+ lm_features[lm_features.find('time_in_sec +')+13:]

In [24]:
##################################################################################
##################################################################################
##################################################################################
########################acoustic + lm features:###################################
#the best features are changing with respect to the heldout speaker!

acoustic = 'label ~ time_in_sec +\
pcm_RMSenergy_sma + rms_minus_one + rms_minus_two + rms_minus_three + rms_minus_four +\
pcm_LOGenergy_sma + pcm_loudness_sma + pcm_intensity_sma  +\
zscore + duration + intensity_mean + intensity_slope + voicingFinalUnclipped_sma + \
voicingFinalUnclipped_slope'

lm_features = 'label ~ time_in_sec + wml + wml_trigram'
acousticlm = acoustic + " + " + lm_features[lm_features.find('time_in_sec +')+13:]

features = acousticlm

##################################################################################
##################################################################################
##################################################################################

In [25]:
names = [feat.strip() for feat in features.split('~')[1].split('+')]
print names

['time_in_sec', 'pcm_RMSenergy_sma', 'rms_minus_one', 'rms_minus_two', 'rms_minus_three', 'rms_minus_four', 'pcm_LOGenergy_sma', 'pcm_loudness_sma', 'pcm_intensity_sma', 'zscore', 'duration', 'intensity_mean', 'intensity_slope', 'voicingFinalUnclipped_sma', 'voicingFinalUnclipped_slope', 'wml', 'wml_trigram']


# SelectKBest

In [31]:
'''Apply the SelectKBest function on the data'''
for out in range(2,8):
    #transform data
    print out
    train = deepcopy(train_all)
    train = train[train['speaker']!=out]
    #transform data
    y, X = dmatrices(features, train, return_type="dataframe")
    y = np.ravel(y)
    # feature extraction
    number_best_features = 6
    test = SelectKBest(score_func=f_regression, k=number_best_features)
    fit = test.fit(X, y)
    ###################
    # summarize scores
    np.set_printoptions(precision=3)
    name_dict = {str(i+1):names[i] for i in range(len(names))}
    name_dict['0'] = 'NAN'
    all_results = sorted([(list(fit.scores_)[i],name_dict[str(i)]) for i in range(len(list(fit.scores_)))])
    all_results = list(reversed(all_results)) 
    results = all_results[:number_best_features+1]
    print "\n".join([str(tupel[0])+"\t"+tupel[1] for tupel in results])
    #ranked_features = fit.transform(X)
    # summarize selected features
    #print(ranked_features[0:5,:])

2
24181.2743604	duration
17407.3516293	wml
6195.8368421	time_in_sec
1975.45283182	zscore
1861.78644428	wml_trigram
1654.67304781	pcm_LOGenergy_sma
1034.57107893	voicingFinalUnclipped_sma
3
35656.2548934	duration
21178.3788933	pcm_LOGenergy_sma
13316.8443968	pcm_loudness_sma
8356.45248668	zscore
6938.69133701	time_in_sec
6244.50874177	intensity_mean
4804.60936546	wml
4
35023.6663408	duration
17068.6481341	pcm_LOGenergy_sma
10382.9180605	time_in_sec
9454.72793341	pcm_loudness_sma
5525.67335702	zscore
4515.13170003	intensity_mean
3310.14603072	rms_minus_two
5
45064.8041336	duration
15113.1079834	pcm_LOGenergy_sma
9413.79196808	time_in_sec
8171.16782011	pcm_loudness_sma
5335.71987502	zscore
3996.53030784	intensity_mean
3617.02651878	wml
6
39240.9185981	duration
9500.41290694	time_in_sec
8151.80435382	wml
5843.40671776	pcm_LOGenergy_sma
3130.97534668	pcm_loudness_sma
1846.37825995	zscore
1437.2505186	intensity_mean
7
36435.0740785	duration
18552.3220307	pcm_LOGenergy_sma
11908.8013552	pcm_l

# Recursive Feature Elimination (RFE)

In [32]:
#this algorithm takes a little more time (~5min)... so don't get impatient
out = 2
train = deepcopy(train_all)
train = train[train['speaker']!=out]
#transform data
y, X = dmatrices(features, train, return_type="dataframe")
y = np.ravel(y)
# feature extraction
number_of_Features = 3
model = LogisticRegression()
rfe = RFE(model, number_of_Features)
fit = rfe.fit(X, y)

print("Num Features: %d") % fit.n_features_
print("Selected Features: %s") % fit.support_
print("Feature Ranking: %s") % fit.ranking_

#name_dict = {str(i+1):names[i] for i in range(len(names))}
#name_dict['0'] = 'NAN'
#results = sorted([(list(fit.scores_)[i],name_dict[str(i)]) for i in range(len(list(fit.scores_)))])
#results = list(reversed(results)) 
#print "\n".join([str(tupel[0])+"\t"+tupel[1] for tupel in results])


Num Features: 3
Selected Features: [False False  True  True False False  True False False False False False
 False False False False False False]
Feature Ranking: [ 7 13  1  1  3  2  1  9  4 16 12  5 15 14  6  8 10 11]


In [39]:
print features

label ~ time_in_sec +pcm_RMSenergy_sma + rms_minus_one + rms_minus_two + rms_minus_three + rms_minus_four +pcm_LOGenergy_sma + pcm_loudness_sma + pcm_intensity_sma  +zscore + duration + intensity_mean + intensity_slope + voicingFinalUnclipped_sma + voicingFinalUnclipped_slope +  wml + wml_trigram


In [49]:
name_dict = {str(i+1):names[i] for i in range(len(names))}
name_dict['0'] = 'label'
frank = '7 13  1  1  3  2  1  9  4 16 12  5 15 14  6  8 10 11'.split(' ')
frank = [f for f in frank if f != ""]
print len(frank)
print len(names)
tupel_rank = [(int(frank[i]), name_dict[str(i)]) for i in range(len(frank))]
tupel_rank = sorted(tupel_rank)
#if feat != '':
    #    print name_dict[feat.strip()]
print "\n".join([str(t[0])+'\t'+t[1] for t in tupel_rank])

18
17
1	pcm_RMSenergy_sma
1	rms_minus_four
1	rms_minus_one
2	rms_minus_three
3	rms_minus_two
4	pcm_loudness_sma
5	duration
6	voicingFinalUnclipped_sma
7	label
8	voicingFinalUnclipped_slope
9	pcm_LOGenergy_sma
10	wml
11	wml_trigram
12	zscore
13	time_in_sec
14	intensity_slope
15	intensity_mean
16	pcm_intensity_sma


In [None]:
############################
#Don't know how useful the following is
#it's really really slow -> i let it run an hour and there was no output
#it's another option to get the n best features
###########################
#from sklearn.svm import SVR
#n_features = 5
#estimator = SVR(kernel="linear")
#selector = RFE(estimator, n_features, step=1)
#selector = selector.fit(X, y)
#print selector.support_ 
#print selector.ranking_