### Import of Functions

In [11]:
#basics
import os
import pandas as pd
import numpy as np
#import prep
import math
from statsmodels.stats.descriptivestats import sign_test
        
from __future__ import division # correcti dividing
from sklearn import preprocessing


#model builiding
from sklearn import metrics, svm, linear_model
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR


#model selection
from sklearn.metrics import roc_curve, auc, precision_recall_curve, mean_absolute_error, mean_squared_error
from sklearn.model_selection import GridSearchCV, LeaveOneOut, StratifiedKFold, StratifiedShuffleSplit, train_test_split
from sklearn.grid_search import GridSearchCV, ParameterGrid
from sklearn.model_selection import LeaveOneOut, StratifiedKFold, StratifiedShuffleSplit, train_test_split, KFold

from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
from scipy import stats, interp
from scipy.stats import skew, kurtosis
from statsmodels.stats.descriptivestats import sign_test

#graphics
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt 
import seaborn as sns
# fix random seed for reproducibility

np.random.seed(42)

In [12]:
# load dataframe
def load(study):
   
    if study=='mb':
        path='./mb/mb_EMG.csv'
    if study=='charite':
        path='./charite/charite_OpenFace_2.0.0_final.csv'
        audio_path='./charite/charite_audio.csv'

    #Audio
    df_audio=pd.read_csv(audio_path, sep=',', na_values=['?'])
    df_audio['frame']=df_audio.counter
        
    #Face
    df=pd.read_csv(path, sep=',', na_values=['?'])
    
    df=df[df.frame<=11310].reset_index(drop=True)
    
    ## Probandin 35 should be excluded as for her the sucess was only around 70% and all other were around 99%
    df=df[df.vpn!=35].reset_index(drop=True)
    
    
    action_r=['AU01_r', 'AU02_r', 'AU04_r', 'AU05_r', 'AU06_r', 'AU07_r', 'AU09_r', 
    'AU10_r', 'AU12_r', 'AU14_r', 'AU15_r', 'AU17_r', 'AU20_r', 'AU23_r',
    'AU25_r', 'AU26_r', 'AU45_r']
    
    action_c=['AU01_c', 'AU02_c', 'AU04_c', 'AU05_c', 'AU06_c',
           'AU07_c', 'AU09_c', 'AU10_c', 'AU12_c', 'AU14_c', 'AU15_c',
           'AU17_c', 'AU20_c', 'AU23_c', 'AU25_c', 'AU26_c', 'AU28_c',
           'AU45_c']

    gaze=['gaze_angle_x', 'gaze_angle_y']
        
    audio=['pitch','spectrum_0', 'spectrum_1', 'spectrum_2', 'spectrum_4', 'spectrum_3', 
                     'spectrum_5', 'spectrum_6', 'spectrum_7', 'spectrum_8', 'spectrum_9,',
                     'spectrum_10', 'spectrum_11', 'spectrum_12',  'spectrum_13', 'spectrum_14',
                     'spectrum_15', 'spectrum_16', 'spectrum_17', 'spectrum_18', 'spectrum_19',
                     'spectrum_20', 'spectrum_21',  'spectrum_22', 'spectrum_23', 'spectrum_24', 
                     'spectrum_25', 'spectrum_26', 'spectrum_27', 'spectrum_28', 'spectrum_29', 
                     'spectrum_30', 'spectrum_31', 'spectrum_32', 'spectrum_33', 'spectrum_34',
                     'spectrum_35', 'spectrum_36', 'spectrum_37', 'spectrum_38', 'spectrum_39']



    ## Probandin 38 could be excluded due to inclusion criteria
    #df=df[df.vpn!=38].reset_index(drop=True)

    #to make the sequences comparable in length as there is around one second variation

    return df, df_audio, action_r, action_c, gaze, audio

def aq_charite(df):
    path='./charite/asq.csv'

    aq=pd.read_csv(path, sep=';', na_values=['-99']) 
    aq = aq.rename(columns={'Probanden-ID': 'vpn', 'ASQ': 'asq'})
    aq.vpn=aq.vpn.astype(str)
    aq.vpn=aq.vpn.str[-9:-7]
    aq.vpn=aq.vpn.astype(str).astype(int)
    df.vpn=df.vpn.astype(int)
    final=pd.merge(df, aq, on='vpn')

    return final

def exclude_outlier(df):
    df['success_rate']=df['success'].groupby(df['vpn']).transform('mean') 
    print (set(df[df['success_rate']<0.9].vpn))
    df=df[df['success_rate']>0.9].reset_index(drop=True)
    ## Probandin 35 should be excluded as for her the sucess was only around 70% and all other were around 99%
    df['duration_total']=df['frame'].groupby(df['vpn']).transform('max') 
    print (set(df[df['duration_total']<11310].vpn))
    df=df[df['duration_total']>11300].reset_index(drop=True)
    return df
def get_convparts(df):

    conversation_parts=['intro', 'neutral_speaker' ,'neutral_proband', 'joy_speaker', 
                        'joy_proband', 'ekel_speaker', 'ekel_proband']

    expected_times=['183s', '40s' ,'26s', '29s', 
                        '26s', '29s', '26s']

    start=[]
    end=[]
    n=0
    for i in conversation_parts:
        print (expected_times[n])
        n=n+1
        start_time=np.min(df[df[i]==True].groupby('vpn').min()['frame'])
        end_time=np.min(df[df[i]==True].groupby('vpn').max()['frame'])

        print ((end_time-start_time)) #np.float(30) #ist gecheckt: entspricht den Zeiten in der Tabelle von Chris

        start.append(start_time) 
        end.append(end_time)

        # Nans entstehen, weil es uneindeutige Abschnitte gibt
        # die Zuordnungen der Abschnitte ist nicht hundertpro genau und varriert, avber nicht mehr als +- 1 seconds

    n=0

    for i in conversation_parts:
        df.loc[df[i]==True, 'conversation']=i
        n=n+1

    #df=df.dropna(subset=['conversation'])

    return df, start, end

### BUILT FEATURES

In [13]:
def features(df, df_audio, action_r, action_c, gaze, audio):
    
    features_AU_intense=[]
    features_AU_occurence=[]
    features_gaze=[]
    features_audio=[]

    operation=['mean', 'max', 'std', 
               lambda x: (skew(x)), 
               lambda x: (kurtosis(x))]
    operation_name=['mean', 'max', 'std', 'argmax', 'skew', 'kurtosis']
                
    # for each conversation-part  
    #try:
    for part in set(df.conversation.dropna()):
        for unit in action_r: #for every action unit (intensity )
            for i, op in enumerate(operation):
                feature_name=unit+'_'+operation_name[i]+'_'+str(part)
                df[feature_name]=df[df.conversation==part].groupby('vpn')[unit].transform(op)                
                features_AU_intense.append(feature_name)
                print (feature_name + "_has_been_calculated" )

        for unit in action_c: #for every action unit (calculates mean occurence)
            feature_name=unit+'_occ'+'_'+str(part)
            df[feature_name]=df[df.conversation==part].groupby(df['vpn'])[unit].transform('mean') 
            features_AU_occurence.append(feature_name)
            print (feature_name + "_has_been_calculated") 

        for unit in gaze: #for every gaze value
            for i, op in enumerate(operation):
                feature_name=unit+'_'+operation_name[i]+'_'+str(part)
                df[feature_name]=df[df.conversation==part].groupby('vpn')[unit].transform(op)                
                features_gaze.append(feature_name)
                print (feature_name + "_has_been_calculated" )
#except:
    print ('no face features')

#try:
    for part in set(df_audio.conversation.dropna()):   #for every audio value              
        for unit in audio: #for every action unit
            feature_name=unit+'_mean_'+str(part)
            df_audio[feature_name]=df_audio[df_audio.conversation==part].groupby('vpn')[unit].transform('mean')                
            features_audio.append(feature_name)
            print (feature_name + "_has_been_calculated" )           
#except:
    print ('no audio features')
            
    return df, df_audio, features_AU_intense, features_AU_occurence, features_gaze, features_audio

    # es entstehen NaNs dadurch, dass nur für jeden Abschnitt die jeweiligen Abschnitte berechnet werden.

# Regression (ADOS and ADI-R)

In [14]:
def ados_charite(df):
    path='./ados.csv'
    ados=pd.read_csv(path, sep=';', na_values=['-99']) 
    ados=ados.rename(columns={'id': 'vpn'})
    ados.vpn=ados.vpn.astype(str)
    ados.vpn=ados.vpn.str[-9:-7]
    ados.vpn=ados.vpn.astype(str).astype(int)
    df.vpn=df.vpn.astype(int)
    final=pd.merge(df, ados, on='vpn')
    return final

In [15]:
def prepare_for_reg(df, df_audio, features_AU_intense, features_AU_occurence, features_gaze, features_audio, goal='ADOS', rescaled=False):

    #different feature spaces  

    df=df.groupby('vpn').mean().reset_index()
    
    
    if goal=='ADOS':        
        df=df[~df.ados_total.isnull()].reset_index(drop=True)
        df=df[df.ados_total!=-97].reset_index(drop=True)
        df_audio=df_audio[~df_audio.ados_total.isnull()].reset_index(drop=True)
        df_audio=df_audio[df_audio.ados_total!=-97].reset_index(drop=True)
        
        df_audio=df_audio.groupby('vpn').mean()
        df_total=pd.merge(df, df_audio[features_audio].reset_index(), how='inner', on='vpn')
    
        y=np.array(df.ados_total) 
        y_audio=np.array(df_audio.ados_total)
        y_total=np.array(df_total.ados_total)

    if goal=='ADOS_social':     
        df=df[~df.ados_total.isnull()].reset_index(drop=True)
        df=df[df.ados_total!=-97].reset_index(drop=True)
        df_audio=df_audio[~df_audio.ados_total.isnull()].reset_index(drop=True)
        df_audio=df_audio[df_audio.ados_total!=-97].reset_index(drop=True)
      
        
        df_audio=df_audio.groupby('vpn').mean()
        df_total=pd.merge(df, df_audio[features_audio].reset_index(), how='inner', on='vpn')
        
        y=np.array(df.ados_social)
        y_audio=np.array(df_audio.ados_social)
        y_total=np.array(df_total.ados_social)

    if goal=='ADOS_communication':      
        df=df[~df.ados_total.isnull()].reset_index(drop=True)
        df=df[df.ados_total!=-97].reset_index(drop=True)
        df_audio=df_audio[~df_audio.ados_total.isnull()].reset_index(drop=True)
        df_audio=df_audio[df_audio.ados_total!=-97].reset_index(drop=True)
        
        
        df_audio=df_audio.groupby('vpn').mean()
        df_total=pd.merge(df, df_audio[features_audio].reset_index(), how='inner', on='vpn')
        
        y=np.array(df.ados_commu)
        y_audio=np.array(df_audio.ados_commu)
        y_total=np.array(df_total.ados_commu)

    if goal=='ADI_social': 
        df=df[~df.adir_social.isnull()].reset_index(drop=True)
        df=df[df.adir_social>0].reset_index(drop=True)
        
        
        df_audio=df_audio.groupby('vpn').mean()
        df_total=pd.merge(df, df_audio[features_audio].reset_index(), how='inner', on='vpn')
        
        y=np.array(df.adir_social)
        y_audio=np.array(df_audio.adir_social)
        y_total=np.array(df_total.adir_social)
        
    if goal=='ADI': 
        df=df[~df.adir_behav.isnull()].reset_index(drop=True)
        df=df[df.adir_behav>0].reset_index(drop=True)
        
        
        df_audio=df_audio.groupby('vpn').mean()
        df_total=pd.merge(df, df_audio[features_audio].reset_index(), how='inner', on='vpn')
        
        y=np.array(df.adir_behav)
        y_audio=np.array(df_audio.adir_behav)
        y_total=np.array(df_total.adir_behav)

    diagnosis=np.array(df['asq'])
    
    
    X_face = np.array(df[features_AU_intense + features_AU_occurence + features_gaze])
     
    X_int=np.array(df[features_AU_intense])
    X_occ=np.array(df[features_AU_occurence])
    X_gaze=np.array(df[features_gaze])
    
    X_audio=np.array(df_audio[features_audio])
    
    df_total['asc']=df_total.asc
    
    X_total = np.array(df_total[features_AU_intense + features_AU_occurence + features_gaze + features_audio])

    
    #shuffle
    index=np.array(np.random.choice(len(y), size=len(y), replace=False))
    index_audio=np.array(np.random.choice(len(y_audio), size=len(y_audio), replace=False))
    index_total=np.array(np.random.choice(len(y_total), size=len(y_total), replace=False))
    
    print (index)
    y=y[index]
   
    X_occ = X_occ[index]
    X_int = X_int[index]
    X_gaze = X_gaze[index]
    
    X_face = X_face[index]
    
    y_audio=y_audio[index_audio]   
    X_audio = X_audio[index_audio]
    
    y_total=y_total[index_total]
    X_total = X_total[index_total]
 

    return X_occ, X_int, X_gaze, X_face, X_audio, y, y_audio, X_total,  y_total
    

## Regression Functions

In [28]:
def regression(X, y, name='name'):
    #Initialisierung verschiedener Variablen
    rf_parameter=[]
    svm_parameter=[]
    
    y_pred_svr = []
    
    y_pred_tree = []
    
    y_true=np.zeros(len(y)) 
    
    y_base=np.zeros(len(y)) 
       
    #plt.figure(1)

    #Nested Cross-Validation
    i=0   
    loo = LeaveOneOut()
    
    X=preprocessing.scale(X)

    for train, test in loo.split(X):
        
        y_true[i]=int(y[test])
        y_base[i]=np.mean(y[train])
            
        tree = GridSearchCV(RandomForestRegressor(n_estimators=5000), cv=3,
                   param_grid={"max_depth": [2, 4, 5, 10, 15, 20, 30]})
        tree.fit(X[train], y[train])
        
        y_pred_tree = np.append(y_pred_tree, tree.predict(X[test])) 
        
        svm_clf =  GridSearchCV(SVR(epsilon=0.001,  kernel='rbf'), cv=3,
                   param_grid={"C": [0.001, 0.01, 0.1, 1.0, 10., 100],
                                'gamma': [0.001, 0.01, 0.1, 1.0, 10., 100]})
        svm_clf.fit(X[train], y[train])
        y_pred_svr = np.append(y_pred_svr, svm_clf.predict(X[test])) 
   
        rf_parameter=np.append(rf_parameter, tree.best_params_)   
        svm_parameter=np.append(svm_parameter, svm_clf.best_params_)
        i=i+1   
     
    #if y_pred_tree=

    print ('Crossvalidierte Ergebnisse fuer RFR')
    print (math.sqrt(mean_squared_error(y_true, y_pred_tree)))
    print ('Mean_Absolute_Error:')
    errors=np.abs(y_true-y_pred_tree)
    print (np.mean(errors))
    print ('Standard Deviation of the Error:')
    print (np.std(errors))
             
    print ('Crossvalidierte Ergebnisse fuer SVR')
    print (math.sqrt(mean_squared_error(y_true, y_pred_svr))   ) 
    print ('Mean_Absolute_Error:')
    errors=np.abs(y_true-y_pred_svr)
    print (np.mean(errors))
    print ('Standard Deviation of the Error:')
    print (np.std(errors))
    
    print ('Baseline: Root Mean Squared Error')
    print (math.sqrt(mean_squared_error(y_true, y_base)))
    print ('Mean_Absolute_Error:')
    errors=np.abs(y_true-y_base)
    print (np.mean(errors))
    print ('Standard Deviation of the Error:')
    print (np.std(errors))
    
    results=pd.DataFrame([y_true, y_pred_tree, rf_parameter, y_pred_svr, svm_parameter])
    results.to_csv(name+'regression_rf.csv')

    evaluate(y_pred_svr, y_pred_tree, y_true, y_base)
    
    return y_pred_svr, y_pred_tree, y_true, y_base, tree, svm_clf

In [17]:
def evaluate(y_pred_svr, y_pred_tree, y_true, y_base):

    tree_error=np.abs(y_true-y_pred_tree)
    svr_error=np.abs(y_true-y_pred_svr)
    base_error=np.abs(y_true-y_base)
    
    print ('SVR: ' + str(np.mean(svr_error)))
    print ('Tree: ' + str(np.mean(tree_error)))
    print ('base: ' + str(np.mean(base_error)))

    print ('Tree better then Baseline')
    print (stats.ttest_rel(tree_error, base_error, axis=0, nan_policy='omit'))
    
    print (stats.kruskal(tree_error, base_error))

    print ('SVR better then Baseline')
    print (stats.ttest_rel(svr_error, base_error, axis=0, nan_policy='omit'))
    
    print (stats.kruskal(svr_error, base_error))
    

In [18]:
def reg_cor(x, y, x_label, y_label):
    plt.figure(figsize=(12,8))
    sns.regplot(x, y)
    plt.yticks(fontsize=14)    
    plt.xticks(fontsize=14)  
    plt.xlabel(x_label, fontsize=16) 
    plt.ylabel(y_label, fontsize=16) 
    plt.savefig(x_label + y_label + '_Regression_Correlation.png')
    plt.show()
    plt.close()

## Main Code for Regression

In [19]:
df, df_audio, action_r, action_c, gaze, audio = load('charite', 'lab')

df=aq_charite(df, 'lab')
df=exclude_outlier(df)
df, start, end=get_convparts(df)
df=ados_charite(df)

df_audio, start_audio, end_audio=get_convparts(df_audio)
df_audio=aq_charite(df_audio, 'lab')
df_audio=ados_charite(df_audio)

set([])
set([2, 4, 8, 12, 15, 16, 18, 23, 27, 28, 33, 34, 36, 51, 53, 54, 56, 60, 61, 65, 72, 75])
183s
5578
40s
1289
26s
839
29s
929
26s
869
29s
959
26s
779
183s
7995
40s
1866
26s
1184
29s
1355
26s
1248
29s
1377
26s
1119


In [20]:
df, df_audio, features_AU_intense, features_AU_occurence, features_gaze, features_audio=features(df, df_audio, action_r, action_c, gaze, audio)

AU01_r_mean_0.0_has_been_calculated
AU01_r_max_0.0_has_been_calculated
AU01_r_std_0.0_has_been_calculated
AU01_r_argmax_0.0_has_been_calculated
AU01_r_skew_0.0_has_been_calculated
AU01_r_kurtosis_0.0_has_been_calculated
AU02_r_mean_0.0_has_been_calculated
AU02_r_max_0.0_has_been_calculated
AU02_r_std_0.0_has_been_calculated
AU02_r_argmax_0.0_has_been_calculated
AU02_r_skew_0.0_has_been_calculated
AU02_r_kurtosis_0.0_has_been_calculated
AU04_r_mean_0.0_has_been_calculated
AU04_r_max_0.0_has_been_calculated
AU04_r_std_0.0_has_been_calculated
AU04_r_argmax_0.0_has_been_calculated
AU04_r_skew_0.0_has_been_calculated
AU04_r_kurtosis_0.0_has_been_calculated
AU05_r_mean_0.0_has_been_calculated
AU05_r_max_0.0_has_been_calculated
AU05_r_std_0.0_has_been_calculated
AU05_r_argmax_0.0_has_been_calculated
AU05_r_skew_0.0_has_been_calculated
AU05_r_kurtosis_0.0_has_been_calculated
AU06_r_mean_0.0_has_been_calculated
AU06_r_max_0.0_has_been_calculated
AU06_r_std_0.0_has_been_calculated
AU06_r_argmax_

### ADOS

In [21]:
X_occ_ados, X_int_ados, X_gaze_ados, X_face_ados, X_audio_ados, y_ados, y_audio_ados, X_total_ados,  y_total_ados=prepare_for_reg(df, df_audio, features_AU_intense, features_AU_occurence, features_gaze, features_audio, 'ADOS')
X_occ_ados_social, X_int_ados_social, X_gaze_ados_social, X_face_ados_social, X_audio_ados_social, y_ados_social, y_audio_ados_social, X_total_ados_social,  y_total_ados_social=prepare_for_reg(df, df_audio, features_AU_intense, features_AU_occurence, features_gaze, features_audio, 'ADOS_social')
X_occ_ados_communication, X_int_ados_communication, X_gaze_ados_communication, X_face_ados_communication, X_audio_ados_communication, y_ados_communication, y_audio_ados_communication, X_total_ados_communication,  y_total_ados_communication=prepare_for_reg(df, df_audio, features_AU_intense, features_AU_occurence, features_gaze, features_audio, 'ADOS_communication')

[35 13 26 30 16 31 21 12  8 17  9 34  0  4 29 15 19  5 11  1 24  2 33  3
 32 23 27 10 22 18 25  6 20  7 14 28]
[10 28  3 19 20 12  5 31 35  9 26 17 16 13 14 24 18 33 21 30 29  2 34 23
 25 22 15  0  1 11  7  8  6 27  4 32]
[12 15 19 34  6  7  9 13 24  2 17 21 28  8 33 14 23 35 30  4 26 16 10 27
  3 22  5 25 11 20  1 29 18  0 32 31]


In [29]:
#ados?# Prediction of ADOS - Social
print ('social')
y_pred_svr_ADOS_social, y_pred_tree_ADOS_social, y_true_ADOS_social, y_base_ADOS_social, tree_ADOS_social, svr_ADOS_social=regression(X_total_ados_social, y_total_ados_social, name='ADOS_social')

## Prediction of ADOS - Communication
print ('communication')
y_pred_svr_ADOS_communication, y_pred_tree_ADOS_communication, y_true_ADOS_communication, y_base_ADOS_communication, tree_ADOS_communication, svr_ADOS_communication=regression(X_total_ados_communication, y_total_ados_communication, name='ADOS_communication')


print ('total')
y_pred_svr_ADOS, y_pred_tree_ADOS, y_true_ADOS, y_base_ADOS, tree_ADOS, svr_ADOS=regression(X_total_ados, y_total_ados, name='ADOS_total')

social
Crossvalidierte Ergebnisse fuer RFR
2.14851573007
Mean_Absolute_Error:
1.84804271306051
Standard Deviation of the Error:
1.095836654364687
Crossvalidierte Ergebnisse fuer SVR
2.08520731063
Mean_Absolute_Error:
1.7787956880087767
Standard Deviation of the Error:
1.0881063498676251
Baseline: Root Mean Squared Error
2.03379608875
Mean_Absolute_Error:
1.7428571428571429
Standard Deviation of the Error:
1.0482249330196654
SVR: 1.7787956880087767
Tree: 1.84804271306051
base: 1.7428571428571429
Tree better then Baseline
Ttest_relResult(statistic=2.7899834031279167, pvalue=0.00847347469007062)
KruskalResult(statistic=0.0795700081587108, pvalue=0.7778809700534696)
SVR better then Baseline
Ttest_relResult(statistic=1.0978201623027093, pvalue=0.27977841030077444)
KruskalResult(statistic=0.0, pvalue=1.0)
communication
Crossvalidierte Ergebnisse fuer RFR
1.99376401498
Mean_Absolute_Error:
1.6329969071157937
Standard Deviation of the Error:
1.1438601526257177
Crossvalidierte Ergebnisse fuer S

In [23]:
#reg_cor(x=y_pred_svr_ADOS, y=y_true_ADOS, x_label='Predicition of SVR', y_label='ADOS Value')
#reg_cor(x=y_pred_tree_ADOS, y=y_true_ADOS, x_label='Predicition of RF', y_label='ADOS Value')

In [24]:
### ADOS

#sns.regplot(df_Reg[df_Reg.ados_total>0].groupby('vpn').mean().asq, df_Reg[df_Reg.ados_total>0].groupby('vpn').mean().ados_total)
#plt.show()

#sns.regplot(df_Reg[df_Reg.ados_total>0].groupby('vpn').mean().asq, df_Reg[df_Reg.ados_total>0].groupby('vpn').mean().ados_social)
#plt.show()

#plt.scatter(df_Reg[df_Reg.ados_total>0].groupby('vpn').mean().asq, df_Reg[df_Reg.ados_total>0].groupby('vpn').mean().ados_commu)
#plt.show()

### ADI-R

In [25]:
X_occ_adi, X_int_adi, X_gaze_adi, X_face_adi, X_audio_adi, y_adi, y_audio_adi, X_total_adi,  y_total_adi=prepare_for_reg(df, df_audio, features_AU_intense, features_AU_occurence, features_gaze, features_audio, 'ADI')
X_occ_adi_social, X_int_adi_social, X_gaze_adi_social, X_face_adi_social, X_audio_adi_social, y_adi_social, y_audio_adi_social, X_total_adi_social,  y_total_adi_social=prepare_for_reg(df, df_audio, features_AU_intense, features_AU_occurence, features_gaze, features_audio, 'ADI_social')

[16 14  2 19 15  1 18  8  4  7  3  5 11 20 17 13  9  6 12  0 10]
[14 11 21 17  2 13  9 10 16  0  1 15  4 19 12  7  3 18  8  6  5 20]


In [30]:
print 'social'
y_pred_svr_ADI_social, y_pred_tree_ADI_social, y_true_ADI_social, y_base_ADI_social, tree_ADI_social, svr_ADI_social=regression(X_total_adi_social,  y_total_adi_social, name='ADIsocial')
print 'total'
y_pred_svr_ADI_total, y_pred_tree_ADI_total, y_true_ADI_total, y_base_ADI_total, tree_ADI_total, svr_ADI_total=regression(X_total_adi,  y_total_adi, name='ADIsocial')


## Predicti
## Prediction of ADI Social based on AQ
#print 'social (based on AQ)'
#y_pred_svr_AQ, y_pred_tree_AQ, y_true_AQ, y_base_AQ, tree_AQ, svr_AQ=regression(diagnosis[diagnosis>0].reshape(-1, 1), y_adi_rf_social[diagnosis>0], name='ADI')

social
Crossvalidierte Ergebnisse fuer RFR
7.73481948253
Mean_Absolute_Error:
6.41339273605183
Standard Deviation of the Error:
4.323867023909681
Crossvalidierte Ergebnisse fuer SVR
7.78391075283
Mean_Absolute_Error:
6.253126662467727
Standard Deviation of the Error:
4.635479862022532
Baseline: Root Mean Squared Error
7.73263543649
Mean_Absolute_Error:
6.415584415584414
Standard Deviation of the Error:
4.316703302308507
SVR: 6.253126662467727
Tree: 6.41339273605183
base: 6.415584415584414
Tree better then Baseline
Ttest_relResult(statistic=-0.008865317386010853, pvalue=0.9930102737608972)
KruskalResult(statistic=0.013783818438259121, pvalue=0.9065395121163305)
SVR better then Baseline
Ttest_relResult(statistic=-0.9214448759430756, pvalue=0.3672836033413154)
KruskalResult(statistic=0.17863828695985415, pvalue=0.6725460961836786)
total
Crossvalidierte Ergebnisse fuer RFR
3.13019699423
Mean_Absolute_Error:
2.499485155904631
Standard Deviation of the Error:
1.8843320243734476
Crossvalidier

In [31]:
evaluate(y_pred_svr_ADI_social[diagnosis>0], y_pred_tree_ADI_social[diagnosis>0], y_adi_rf_social[diagnosis>0], y_pred_svr_AQ)

NameError: name 'diagnosis' is not defined

In [None]:
evaluate(y_pred_svr_ADI_social[diagnosis>0], y_pred_tree_ADI_social[diagnosis>0], y_adi_rf_social[diagnosis>0], y_pred_svr_AQ)
print ('ADI Prediction vs. True Value')
plt.scatter(y_pred_tree_ADI_social[diagnosis>0], y_adi_rf_social[diagnosis>0])
print (stats.spearmanr(y_pred_tree_ADI_social, y_adi_rf_social))
plt.show()

In [None]:
plt.savefig('ADI_WTAS')

In [None]:
plt.scatter(df_Reg[df_Reg.asc==1].groupby('vpn').adir_social.mean(), df_Reg[df_Reg.asc==1].groupby('vpn').asq.mean())
plt.show()

plt.figure(figsize=(12,8))
plt.scatter(df_Reg[df_Reg.asc==1].groupby('vpn').ados_total.mean(), df_Reg[df_Reg.asc==1].groupby('vpn').asq.mean())
plt.yticks(fontsize=16)    
plt.xticks(fontsize=16)  
plt.xlabel('ADOS', fontsize=18)
plt.ylabel('AQ', fontsize=18)
plt.savefig('ADOSandAQ.png')