### Importing the necessary dependencies:

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [3]:
# Directory holding the prosody feature CSVs. Kept in one constant so the
# notebook can be pointed at another machine's copy by editing a single line.
PROSODY_DATA_DIR = '/home/vboxuser/VoiceFiles/Voice4PD-MSA-Tool-global_V2/ProsodyData'

# Segment-level prosody features for the training and test splits.
prosody_train = pd.read_csv(f'{PROSODY_DATA_DIR}/prosodyfeaturesst_seg_train.csv')
prosody_test = pd.read_csv(f'{PROSODY_DATA_DIR}/prosodyfeaturesst_seg_test.csv')

### Preprocessing training data:

In [4]:
# Peek at the first three training rows to check the column layout.
prosody_train.head(n=3)

Unnamed: 0,subject,F0avg,F0std,F0max,F0min,F0skew,F0kurt,F0tiltavg,F0mseavg,F0tiltstd,...,kurtosisdurpause,maxdurpause,mindurpause,PVU,PU,UVU,VVU,VP,UP,Group
0,302_P_0.wav,205.149139,83.769669,327.51477,84.000198,-0.1684,-1.775526,-124.519433,56.990329,313.762164,...,-1.595486,0.42,0.24,1.406408,7.372219,0.190771,0.809229,0.575387,0.135644,HC
1,302_P_1.wav,102.336777,12.75884,162.222671,84.562759,1.034953,1.608071,-68.756823,14.322919,134.003465,...,-1.093525,0.86,0.26,1.573629,4.208813,0.373889,0.626111,0.397877,0.237597,HC
2,302_P_10.wav,100.019585,8.637549,128.070557,78.891289,0.045826,-0.090325,-52.007956,6.935584,55.938566,...,-1.510324,0.749977,0.16,1.157104,4.461451,0.259356,0.740644,0.640084,0.224142,HC


__Note: 105 columns: 103 features (X), 1 subject identifier, and 1 target label (Y); no null values.__

#### 1. Checking if the classes have a balanced amount of samples:

In [5]:
# Recode the string class labels in the target column as integers:
# "HC" -> 0 and "DP" -> 1. The frame is updated in place.
prosody_train['Group'] = prosody_train['Group'].replace({"HC": 0, "DP": 1})

In [6]:
# Number of training rows per class label.
val_count = (
    prosody_train
    .groupby(['Group'])['Group']
    .count()
)
print(val_count)

Group
0    13090
1     3529
Name: Group, dtype: int64


In [7]:
# Split the training rows by class: df1 holds label 1 (the smaller class in the
# counts printed above), df2 holds everything else.
df1 = prosody_train[prosody_train['Group'] == 1]
df2 = prosody_train[prosody_train['Group'] != 1]

# Sample df1 with replacement until it matches df2 in size. The target size is
# taken from the data rather than hard-coded, so the classes stay balanced if the
# training CSV changes. random_state fixes the draw for reproducibility.
df_upsampled = resample(df1, replace=True, n_samples=len(df2), random_state=42)

# Balanced training frame: upsampled class 1 followed by the untouched remainder.
pr_train_upsampled = pd.concat([df_upsampled, df2])

In [8]:
# Re-count rows per class after upsampling to confirm both classes are now equal.
val_count = (
    pr_train_upsampled
    .groupby(['Group'])['Group']
    .count()
)
print(val_count)

Group
0    13090
1    13090
Name: Group, dtype: int64


#### 2. Obtaining X & Y values:

In [9]:
# Target vector: the integer-coded class label.
Y = pr_train_upsampled['Group']
# Feature matrix: every column except the recording identifier and the label.
X = pr_train_upsampled.drop(columns=['subject', 'Group'])

#### 3. Normalizing the data:

In [10]:
import Normalization as n

In [11]:
# Normalize the training features with the project-local Normalization module.
# NOTE(review): X is rebound to its own normalized version, so re-running this
# cell feeds already-normalized data back into normalize_train. Restart the kernel
# and run all cells instead of re-running this one.
X = n.normalize_train(X)

In [12]:
# Sanity check on the normalized training features: print the mean, then the
# standard deviation.
for summary_stat in (np.mean, np.std):
    print(summary_stat(X))

3.133561606228393e-17
1.0000000000000002


### Preprocessing test data:

In [13]:
# Peek at the first three test rows to check the column layout.
prosody_test.head(n=3)

Unnamed: 0,F0avg,F0std,F0max,F0min,F0skew,F0kurt,F0tiltavg,F0mseavg,F0tiltstd,F0msestd,...,maxdurpause,mindurpause,PVU,PU,UVU,VVU,VP,UP,subject,Group
0,217.328567,43.345333,285.029968,103.759995,-1.099916,0.759685,-81.82057,84.405609,285.252458,205.829912,...,0.56,0.229977,1.974597,5.086207,0.388226,0.611774,0.309822,0.19661,600_AUDIO_0.wav,HC
1,209.782654,19.156879,279.613312,175.59906,0.854379,0.991213,-457.414492,51.429299,745.770969,54.816928,...,0.450023,0.209977,1.332862,4.485377,0.297157,0.702843,0.527319,0.222947,600_AUDIO_1.wav,HC
2,212.537704,27.163887,331.514069,170.454742,1.845468,5.051382,-233.376614,60.06242,347.079622,49.608685,...,0.369977,0.22,0.992319,3.77746,0.262695,0.737305,0.743012,0.264728,600_AUDIO_10.wav,HC


In [14]:
# Count missing values per column in the raw test frame.
prosody_test.isna().sum()

F0avg      0
F0std      0
F0max      0
F0min      0
F0skew     0
          ..
VVU        0
VP         0
UP         0
subject    0
Group      0
Length: 105, dtype: int64

In [15]:
# Work on a copy of the test frame with any missing values replaced by 0.
# The null check above reports 0 for every column it displays, so this is
# presumably a defensive step.
pr_test = prosody_test.fillna(value=0)

In [16]:
# Re-count missing values per column after the fill.
pr_test.isna().sum()

F0avg      0
F0std      0
F0max      0
F0min      0
F0skew     0
          ..
VVU        0
VP         0
UP         0
subject    0
Group      0
Length: 105, dtype: int64

In [17]:
# Build X and Y for the test data.
# Labels are recoded the same way as for training: "HC" -> 0 and "DP" -> 1.
pr_test['Group'] = pr_test['Group'].replace({"HC": 0, "DP": 1})
# Target vector: the integer-coded class label.
Y_test = pr_test['Group']
# Feature matrix: every column except the recording identifier and the label.
X_test = pr_test.drop(columns=['subject', 'Group'])

In [18]:
# Normalize the test features with the project-local Normalization module.
# NOTE(review): presumably this reuses statistics captured by normalize_train, so
# the training normalization cell must have run first - confirm in Normalization.py.
# X_test is rebound to its normalized version, so do not re-run this cell alone.
X_test = n.normalize_test(X_test)

In [19]:
# Sanity check on the normalized test features: print the mean, then the
# standard deviation.
for summary_stat in (np.mean, np.std):
    print(summary_stat(X_test))

-0.07571469008695046
1.0242422252747432


### Finding Best Model:

In [20]:
import model 

In [21]:
# Helper object from the project-local `model` module; its *_model methods are
# called once per classifier in the sections that follow.
obj = model.classification_models()

#### 1. Random Forest Classifier:

In [30]:
# Random Forest, given the balanced training set and the held-out test set.
# The call prints the accuracy and classification report shown below.
obj.RF_model(X, Y, X_test, Y_test)

The RF model accuracy is given as :  0.7168122270742358
              precision    recall  f1-score   support

           0       0.72      0.99      0.83      3311
           1       0.18      0.01      0.01      1269

    accuracy                           0.72      4580
   macro avg       0.45      0.50      0.42      4580
weighted avg       0.57      0.72      0.61      4580



#### 2. Decision Tree Classifier:

In [31]:
# Decision Tree (the helper method is named DF_model), given the balanced training
# set and the held-out test set.
# The call prints the accuracy and classification report shown below.
obj.DF_model(X, Y, X_test, Y_test)

The DF model accuracy is given as :  0.6026200873362445
              precision    recall  f1-score   support

           0       0.74      0.70      0.72      3311
           1       0.31      0.36      0.33      1269

    accuracy                           0.60      4580
   macro avg       0.52      0.53      0.52      4580
weighted avg       0.62      0.60      0.61      4580



#### 3. Logistic Regression Model:

In [32]:
# Logistic Regression, given the balanced training set and the held-out test set.
# The call prints the accuracy and classification report shown below.
obj.LR_model(X, Y, X_test, Y_test)

The LR_model accuracy is given as :  0.6576419213973799
              precision    recall  f1-score   support

           0       0.78      0.74      0.76      3311
           1       0.39      0.44      0.41      1269

    accuracy                           0.66      4580
   macro avg       0.58      0.59      0.59      4580
weighted avg       0.67      0.66      0.66      4580



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### 4. Support Vector Machine:

In [1]:
#import grid_search as g

In [None]:
#parameters=g.best_param(X,Y)

In [None]:
# Support Vector Machine, given the balanced training set and the held-out test set.
# NOTE(review): 10 and 1 are presumably C and gamma (the SVC built in the
# feature-importance section uses C=10, gamma=1) - confirm against model.SVM_model.
obj.SVM_model(X, Y, 10, 1, X_test, Y_test)

### Extracting Feature Importance:

In [20]:
from sklearn.inspection import permutation_importance
from matplotlib import pyplot as plt

In [None]:
# RBF-kernel SVM used for the feature-importance analysis.
# class_weight='balanced' reweights classes by inverse frequency.
SVM_model = SVC(
    C=10,
    gamma=1,
    kernel='rbf',
    class_weight='balanced',
)
# Fit on the normalized, upsampled training data.
SVM_model.fit(X, Y)

In [22]:
# Recover the feature column names (same column selection as X) so importance
# scores can be mapped back to names; kept as an array for index-based lookup.
r = pr_train_upsampled.drop(columns=['subject', 'Group'])
feature_names = list(r.columns)
features = np.asarray(feature_names)

In [22]:
# Number of training samples after upsampling.
X.shape[0]

26180

In [None]:
# Permutation importance of every feature for the fitted SVM, scored on the test
# set. random_state pins the shuffles so the ranking is reproducible.
# The result gets its own name because the next cell defines a function called
# feat_imp_all, which would otherwise overwrite it.
perm_importance_all = permutation_importance(SVM_model, X_test, Y_test, random_state=42)

# Order features by ascending mean importance. sorted_importance_all, f and v are
# read by the cells below, so these assignments must stay active.
sorted_importance_all = perm_importance_all.importances_mean.argsort()
f = features[sorted_importance_all]
v = perm_importance_all.importances_mean[sorted_importance_all]

# One horizontal bar per feature; the figure is made tall so every name stays readable.
plt.figure(figsize=(8, max(4, 0.2 * len(f))))
plt.barh(f, v)
plt.xlabel("Feature Importance for all Features")
plt.show()

In [None]:
def feat_imp_all(names=None, scores=None, threshold=0.005):
    """Return the names of features whose importance magnitude exceeds ``threshold``.

    Every feature is checked. The earlier version looked at the first 28 entries
    only, which skipped most features and raised IndexError on shorter inputs.

    Parameters
    ----------
    names : sequence, optional
        Feature names aligned with ``scores``. Defaults to the notebook-level
        ``f`` so the zero-argument call keeps working.
    scores : sequence of float, optional
        One importance value per feature. Defaults to the notebook-level ``v``.
    threshold : float, default 0.005
        A feature is kept when ``abs(score) > threshold``, i.e. its score is
        below ``-threshold`` or above ``threshold``.

    Returns
    -------
    list
        The selected names, in the order they appear in ``names``.
    """
    # Fall back to the notebook globals only when the caller passes nothing.
    if names is None:
        names = f
    if scores is None:
        scores = v
    return [name for name, score in zip(names, scores) if abs(score) > threshold]

# Collect the features that pass the importance threshold, then show the type of
# the returned container.
imp_features_all = feat_imp_all()
print(type(imp_features_all))

In [None]:
# Display the argsort index array produced in the permutation-importance cell.
# That cell's sorted_importance_all assignment must be active for this cell to run.
sorted_importance_all