### Importing the necessary dependencies:

In [55]:
import pandas as pd
import numpy as np

In [56]:
from sklearn.utils import resample
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler

### Importing the data files:

In [57]:
#statistical analysis on features:
df=pd.read_csv('/home/vboxuser/VoiceFiles/Voice4PD-MSA-Tool-global_V2/ArticulationData/stat_ar/Articulationfeaturesst_train_stat.csv')
#train and test data:
df_train=pd.read_csv('/home/vboxuser/VoiceFiles/Voice4PD-MSA-Tool-global_V2/ArticulationData/articulationfeaturesst_seg_dev.csv')
df_test=pd.read_csv('/home/vboxuser/VoiceFiles/Voice4PD-MSA-Tool-global_V2/ArticulationData/articulationfeaturesst_seg_test.csv')

## Building a model with feature selection:

### Preprocessing the data:

#### 1. Extract the features that are a good discriminant:

In [58]:
df.head()

Unnamed: 0,feature,p_HC_DP,p_HC_DP.1
0,avg BBEon_1,0.023689,-0.440402
1,avg BBEon_2,0.068999,-0.355201
2,avg BBEon_3,0.027615,-0.381407
3,avg BBEon_4,0.006813,-0.469835
4,avg BBEon_5,0.002928,-0.504544


#### 2. Obtain the selected good features as a list:

In [59]:
#from the {stat..} file obtained earlier from feature_analysis we take those values that have p_val<0.05 and effect_size>|0.4|
req_features=df[(df['p_HC_DP']<0.05) & (abs(df['p_HC_DP.1'])>0.4)]

In [60]:
feature_list=req_features['feature'].tolist()
print(len(feature_list))

139


#### 3. Obtaining X_train & Y_train:

In [61]:
X_train=df_train[df_train.columns.intersection(feature_list)]
X_train.head()

Unnamed: 0,avg BBEon_1,avg BBEon_4,avg BBEon_5,avg BBEon_6,avg BBEon_7,avg BBEon_8,avg BBEon_9,avg BBEon_10,avg BBEon_11,avg BBEon_12,...,skewness DDMFCCon_5,skewness DDMFCCon_10,skewness BBEoff_6,skewness BBEoff_22,skewness MFCCoff_5,skewness DMFCCoff_12,skewness DDMFCCoff_5,kurtosis BBEon_22,kurtosis BBEoff_2,kurtosis BBEoff_22
0,-1.236051,-2.007595,-2.151657,-2.313118,-2.74141,-3.02162,-3.317513,-3.573329,-3.479685,-3.259063,...,-0.390031,-0.64142,-0.331088,-0.006502,0.004985,-0.081187,-0.487728,-0.815727,-0.681074,-1.390721
1,-0.094454,-1.256991,-1.342239,-1.376264,-1.561052,-1.530774,-2.029228,-2.03029,-1.702845,-1.903537,...,0.168796,0.4646,0.736957,0.949908,0.260764,0.355529,-0.104616,0.807542,-1.264207,-0.079672
2,-0.279615,-1.148637,-1.395024,-1.330388,-1.775505,-1.99457,-2.286245,-2.571636,-2.583259,-2.723721,...,0.790789,-0.613135,-0.710361,0.895773,-0.589876,-0.673822,-0.129467,-0.92305,-0.515471,1.220242
3,-0.869392,-1.789387,-2.052317,-2.407308,-2.76429,-2.708282,-3.032211,-3.328638,-3.416836,-3.325427,...,0.524319,0.313464,0.810267,-0.092846,-1.419273,0.196726,0.469045,0.434434,-1.09241,-1.136694
4,-1.470171,-2.429003,-2.444694,-2.423398,-2.645003,-2.80451,-2.778626,-3.112789,-3.369516,-3.530014,...,-1.154369,-0.171489,-0.430283,-0.602846,0.765616,1.338386,-0.414994,-1.350462,-0.921273,-0.813865


In [62]:
#replace string values to 0s & 1s in target column
df_train.replace({'Group':{"HC":0,"DP":1}},inplace=True)

In [63]:
Y_train=df_train['Group']
Y_train.head()

0    0
1    0
2    0
3    0
4    0
Name: Group, dtype: int64

#### 4. Resampling the data: 

In [64]:
df_train.isnull().sum()

subject          0
avg BBEon_1      0
avg BBEon_2      0
avg BBEon_3      0
avg BBEon_4      0
                ..
kurtosis DDF1    0
kurtosis F2      0
kurtosis DF2     0
kurtosis DDF2    0
Group            0
Length: 490, dtype: int64

In [65]:
#we notice an imbalance in the data classes 
val_count=df_train.groupby(['Group'])['Group'].count()
print(val_count)

Group
0    3878
1    1145
Name: Group, dtype: int64


##### 4.1. Upsampling the data

In [66]:
#hence we resample the data to obtain equal number of data points for each of the two classes:
df_1=df_train[df_train['Group']==1]
df_other=df_train[df_train['Group']!=1]
df_upsampled=resample(df_1,random_state=42,n_samples=3878,replace=True)
df_train_upsampled=pd.concat([df_upsampled,df_other])
#df_train_upsampled.reset_index(inplace=True)
#we upsample the minority class to the number of data points available for majority class

In [67]:
X_train_resampled=df_train_upsampled[df_train_upsampled.columns.intersection(feature_list)]
Y_train_resampled=df_train_upsampled['Group']

##### 4.2. Normalizing the data

In [68]:
#for normalization:
std_Scaler=StandardScaler()
std_Scaler

In [69]:
#standard scalar normalization on feature set:
X=std_Scaler.fit_transform(X_train_resampled)

In [70]:
Y=Y_train_resampled

In [71]:
#checking if normalized data has a mean~0 & standard deviation~1:
print(np.mean(X))
print(np.std(X))

1.2588412640404563e-17
1.0


In [72]:
df_train_upsampled['Group'].value_counts()

1    3878
0    3878
Name: Group, dtype: int64

### Building the model:

#### 1. Random Forest Classifier:

In [73]:
RF_model=RandomForestClassifier(n_estimators=100,class_weight='balanced')

In [74]:
#for unsampled data:
##RF_model.fit(X_train,Y_train)

In [75]:
#for resampled data:
RF_model.fit(X,Y)

#### 1.1 Predicting the output on test data:

In [76]:
df_test.head(3)

Unnamed: 0,avg BBEon_1,avg BBEon_2,avg BBEon_3,avg BBEon_4,avg BBEon_5,avg BBEon_6,avg BBEon_7,avg BBEon_8,avg BBEon_9,avg BBEon_10,...,kurtosis DDMFCCoff_11,kurtosis DDMFCCoff_12,kurtosis F1,kurtosis DF1,kurtosis DDF1,kurtosis F2,kurtosis DF2,kurtosis DDF2,subject,Group
0,-1.703007,-2.309823,-2.475417,-2.664827,-2.76341,-2.814644,-3.196568,-3.554883,-4.059189,-4.299266,...,-0.249404,-0.036873,11.851328,14.477019,9.393443,1.989389,6.540897,3.62254,600_AUDIO_0.wav,HC
1,-1.766605,-2.153507,-2.443244,-2.806753,-3.044377,-3.240641,-3.615207,-3.814603,-4.026244,-4.265777,...,-0.428055,-0.122785,1.518593,2.88077,1.509239,0.05555,1.513768,0.952246,600_AUDIO_1.wav,HC
2,-1.145307,-1.52091,-1.464773,-1.700818,-2.19517,-2.509134,-2.902016,-3.090786,-3.373614,-3.423954,...,-0.858743,-0.339214,0.86943,3.148013,3.679037,0.693122,2.969877,2.440902,600_AUDIO_10.wav,HC


In [77]:
df_test.isnull().sum()

avg BBEon_1      0
avg BBEon_2      0
avg BBEon_3      0
avg BBEon_4      0
avg BBEon_5      0
                ..
kurtosis F2      0
kurtosis DF2     0
kurtosis DDF2    0
subject          0
Group            0
Length: 490, dtype: int64

In [78]:
df_test=df_test.fillna(0)

In [79]:
#obtaining X & Y for test data:
X_test=df_test[df_test.columns.intersection(feature_list)]
df_test.replace({'Group':{"HC":0,"DP":1}},inplace=True)
Y_test=df_test['Group']

In [80]:
X_test=std_Scaler.transform(X_test)

In [81]:
#predicting on test data:
Y_predict_RF=RF_model.predict(X_test)

In [82]:
#viewing accuracy score
print("The RF_model accuracy is given as : ",metrics.accuracy_score(Y_test,Y_predict_RF))

The RF_model accuracy is given as :  0.6886462882096069


#### 1.2 Obtaining a Classification Report:

In [83]:
from sklearn.metrics import classification_report

In [84]:
print(classification_report(Y_test,Y_predict_RF))

              precision    recall  f1-score   support

           0       0.72      0.95      0.81      3311
           1       0.12      0.02      0.03      1269

    accuracy                           0.69      4580
   macro avg       0.42      0.48      0.42      4580
weighted avg       0.55      0.69      0.60      4580



##### 1.2.1 inference:

* Precision : gives accuracy of positive predictions
* Recall    : gives fraction of correctly identified positive predictions
* F1_score  : gives harmonic mean of precision and recall
* Support   : gives number of occurences of each class in Y_test

--> high precision indicates not many true values were predicted as false.                        
--> high recall indicates most true values were predicted correctly.

#### 2. Decision Tree Classifier:

In [85]:
DF_model=DecisionTreeClassifier(max_depth=7,random_state=42,class_weight='balanced')

In [86]:
DF_model.fit(X,Y)

#### 2.1 Predicting the output on test data:

In [87]:
#predicting on test data:
Y_predict_DF=DF_model.predict(X_test)

In [88]:
#viewing accuracy score:
print("The DF_model accuracy is given as : ",metrics.accuracy_score(Y_test,Y_predict_DF))

The DF_model accuracy is given as :  0.527292576419214


#### 2.2 Obtaining a Classification Report:

In [89]:
print(classification_report(Y_test,Y_predict_DF))

              precision    recall  f1-score   support

           0       0.71      0.59      0.64      3311
           1       0.26      0.37      0.30      1269

    accuracy                           0.53      4580
   macro avg       0.48      0.48      0.47      4580
weighted avg       0.58      0.53      0.55      4580



#### 3. Logistic Regression Model:

In [90]:
LR_model=LogisticRegression(class_weight='balanced')

In [91]:
LR_model.fit(X,Y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### 3.1 Predicting the output on test data:

In [92]:
#predicting on test data:
Y_predict_LR=LR_model.predict(X_test)

In [93]:
#viewing accuracy score:
print("The LR_model accuracy is given as : ",metrics.accuracy_score(Y_test,Y_predict_LR))

The LR_model accuracy is given as :  0.5194323144104803


#### 3.2 Obtaining a Classification Report:

In [94]:
print(classification_report(Y_test,Y_predict_LR))

              precision    recall  f1-score   support

           0       0.70      0.58      0.63      3311
           1       0.25      0.37      0.30      1269

    accuracy                           0.52      4580
   macro avg       0.48      0.47      0.47      4580
weighted avg       0.58      0.52      0.54      4580



#### 4. Support Vector Machine Model:

In [95]:
SVM_model = SVC(kernel='rbf',C=10, gamma=1, class_weight='balanced')
#rbf: gaussian radial basis function

In [96]:
SVM_model.fit(X,Y)

#### 4.1 Predicting the output on test data:

In [97]:
#predicting on test data:
Y_predict_SVM=SVM_model.predict(X_test)

In [98]:
#viewing accuracy score:
print("The SVM_accuracy is given as : ",metrics.accuracy_score(Y_test,Y_predict_SVM))

The SVM_accuracy is given as :  0.7229257641921397


#### 4.2 Obtaining a Classification Report:

In [99]:
print(classification_report(Y_test,Y_predict_SVM))

              precision    recall  f1-score   support

           0       0.72      1.00      0.84      3311
           1       0.00      0.00      0.00      1269

    accuracy                           0.72      4580
   macro avg       0.36      0.50      0.42      4580
weighted avg       0.52      0.72      0.61      4580



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


<font color=red>**C=10 & gamma=1 with resampled gives the best results**</font>

#### 4.3 Hyperparameter Tuning (by Grid Search):

In [100]:
from sklearn.model_selection import GridSearchCV

#### 4.3.1 Building the model:

In [101]:
param_grid= {'C':[0.1,1,10,100,1000],
             'gamma':[1,0.1,0.01,0.001,0.0001],
             'kernel':['rbf']
            }

In [102]:
grid= GridSearchCV(SVC(),param_grid=param_grid,refit=True,verbose=3)

In [103]:
grid.fit(X,Y)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
[CV 1/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.865 total time=  21.4s
[CV 2/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.581 total time=  21.4s
[CV 3/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.567 total time=  25.6s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.500 total time=  28.6s
[CV 5/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.500 total time=  21.1s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.965 total time=  19.6s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.881 total time=  17.8s
[CV 3/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.894 total time=  18.9s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.500 total time=  18.1s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.500 total time=  14.9s
[CV 1/5] END .....C=0.1, gamma=0.01, kernel=rbf;, score=0.779 total time=  14.6s
[CV 2/5] END .....C=0.1, gamma=0.01, kernel=rbf

KeyboardInterrupt: 

In [None]:
#finding the best parameters:
print(grid.best_params_)

#### 4.4 Understanding Feature Importance for selected features:

## Building a model without feature selection:

### Feature Combination:

#### PCA (on selected features):

In [104]:
from sklearn.decomposition import PCA

#### reducing to a single dimension:

In [105]:
pca=PCA(n_components=1)

In [106]:
x_trainpca=pca.fit_transform(X)
x_testpca=pca.transform(X_test)

In [107]:
#convert principal components for each sample from numpy array to df:
pca_df=pd.DataFrame(x_trainpca,columns=["PCA1"])
pca_df['y']=Y
pca_df.tail()

  pca_df['y']=Y


ValueError: cannot reindex on an axis with duplicate labels

In [None]:
#evr provides the amount of info or variance each principal component holds after projecting the data to a lower dimensional space:
print("evr is {}".format(pca.explained_variance_ratio_))

__inference__: 
* principal component 1 holds 52.9%

#### reducing to 2 dimensions:

In [None]:
pca=PCA(n_components=2)

In [None]:
x_trainpca=pca.fit_transform(X)
x_testpca=pca.transform(X_test)

In [None]:
##convert principal components for each sample from numpy array to df:
pca_df=pd.DataFrame(x_trainpca,columns=["PCA1","PCA2"])
#pca_df['y']=Y
pca_df.head()

In [None]:
new_df=pd.DataFrame(x_trainpca)
y=Y.tolist()
new_df['y']=y
new_df.tail(3)
#type(new_df)

In [None]:
#evr provides the amount of info or variance each principal component holds after projecting the data to a lower dimensional space:
print("evr is {}".format(pca.explained_variance_ratio_))

__inference__: 
* principal component 1 holds 52.9%
* principal component 2 holds 24.3%

In [None]:
ClassAIndices=np.where(Y==0) #this gives an array
ClassAIndices=ClassAIndices[0].tolist()
ClassBIndices=np.where(Y==1)
ClassBIndices=ClassBIndices[0].tolist()

In [None]:
x_HC=x_trainpca[ClassAIndices,:]
x_DP=x_trainpca[ClassBIndices,:]
y_HC=Y[ClassAIndices]
y_DP=Y[ClassBIndices]

In [None]:
plt.scatter(x_HC[:,0],x_HC[:,1], color='green',s=50, marker='o',alpha=0.9,edgecolor='k',label='HC')
plt.scatter(x_DP[:,0],x_DP[:,1], color='red',s=50, marker='o',alpha=0.9,edgecolor='k',label='DP') 
plt.xlabel('Principal Component 1') 
plt.ylabel('Principal Component 2') 
plt.legend()
plt.show()

#### reducing to 3 dimensions:

In [None]:
pca=PCA(n_components=3)

In [None]:
x_trainpca=pca.fit_transform(X)
x_testpca=pca.transform(X_test)

In [None]:
##convert principal components for each sample from numpy array to df:
pca_df=pd.DataFrame(x_trainpca,columns=["PCA1","PCA2","PCA3"])
pca_df.head()

In [None]:
#evr provides the amount of info or variance each principal component holds after projecting the data to a lower dimensional space:
print("evr is {}".format(pca.explained_variance_ratio_))

In [None]:
pca=PCA(n_components=7)

In [None]:
x_trainpca=pca.fit_transform(X)
x_testpca=pca.transform(X_test)

In [None]:
##convert principal components for each sample from numpy array to df:
pca_df=pd.DataFrame(x_trainpca,columns=["PCA1","PCA2","PCA3","PCA4","PCA4","PCA4","PCA7"])

pca_df.head()

In [None]:
#evr provides the amount of info or variance each principal component holds after projecting the data to a lower dimensional space:
print("evr is {}".format(pca.explained_variance_ratio_))

__inference__: 
* principal component 1 holds 52.9%
* principal component 2 holds 24.3%
* principal component 3 holds 9.91%
* principal component 4 holds 5.33% 

adding them all up we get over 92.4%(>90). Hence  we decide to take 4 components.

In [None]:
import seaborn as sns

In [None]:
sns.pairplot(pca_df,hue="y")

##### RF model:

In [None]:
RF_model.fit(x_trainpca,Y)

In [None]:
Y_predict_RFpca=RF_model.predict(x_testpca)

In [None]:
#viewing accuracy score
print("The RF_model accuracy is given as : ",metrics.accuracy_score(Y_test,Y_predict_RFpca))

In [None]:
print(classification_report(Y_test,Y_predict_RFpca))

##### DF model:

In [None]:
DF_model.fit(x_trainpca,Y)

In [None]:
Y_predict_DFpca=DF_model.predict(x_testpca)

In [None]:
print("The DF_model accuracy is given as : ",metrics.accuracy_score(Y_test,Y_predict_DFpca))

In [None]:
print(classification_report(Y_test,Y_predict_DFpca))

##### LR model:

In [None]:
LR_model.fit(x_trainpca,Y)

In [None]:
Y_predict_LRpca=LR_model.predict(x_testpca)

In [None]:
print("The LR_model accuracy is given as : ",metrics.accuracy_score(Y_test,Y_predict_LRpca))

In [None]:
print(classification_report(Y_test,Y_predict_LRpca))

##### SVM model:

In [None]:
grid.fit(x_trainpca,Y)

In [None]:
#finding the best parameters:
print(grid.best_params_)

In [None]:
SVM_modelpca= SVC(kernel='rbf',C=10, gamma=1, class_weight='balanced')
SVM_modelpca.fit(x_trainpca,Y)

In [None]:
Y_predict_SVMpca=SVM_modelpca.predict(x_testpca)

In [None]:
#viewing accuracy score
print("The SVM_model accuracy is given as : ",metrics.accuracy_score(Y_test,Y_predict_SVMpca))

In [None]:
print(classification_report(Y_test,Y_predict_SVMpca))

In [None]:
print(type(feature_list))

In [None]:
import itertools

flist = feature_list
for l in range(len(flist) + 1):
    for subset in itertools.combinations(flist, l):
        print(subset)