### Importing the necessary dependencies:

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.utils import resample
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [3]:
# Load the articulation feature tables: the "dev" CSV becomes the training frame,
# the "test" CSV the held-out evaluation frame.
# NOTE(review): these are absolute, machine-specific paths — consider a configurable data directory.
train_csv_path = '/home/vboxuser/VoiceFiles/Voice4PD-MSA-Tool-global_V2/ArticulationData/articulationfeaturesst_seg_dev.csv'
test_csv_path = '/home/vboxuser/VoiceFiles/Voice4PD-MSA-Tool-global_V2/ArticulationData/articulationfeaturesst_seg_test.csv'
articulation_train = pd.read_csv(train_csv_path)
articulation_test = pd.read_csv(test_csv_path)

### Preprocessing training data:

In [4]:
# Preview the first three rows of the training frame.
articulation_train.head(3)

Unnamed: 0,subject,avg BBEon_1,avg BBEon_2,avg BBEon_3,avg BBEon_4,avg BBEon_5,avg BBEon_6,avg BBEon_7,avg BBEon_8,avg BBEon_9,...,kurtosis DDMFCCoff_10,kurtosis DDMFCCoff_11,kurtosis DDMFCCoff_12,kurtosis F1,kurtosis DF1,kurtosis DDF1,kurtosis F2,kurtosis DF2,kurtosis DDF2,Group
0,300_P_0.wav,-1.236051,-1.141281,-1.401868,-2.007595,-2.151657,-2.313118,-2.74141,-3.02162,-3.317513,...,0.204765,-0.634935,-1.341562,1.619932,3.838965,3.611199,-0.82367,6.217124,4.769782,HC
1,300_P_1.wav,-0.094454,-0.378113,-0.758611,-1.256991,-1.342239,-1.376264,-1.561052,-1.530774,-2.029228,...,0.373218,-0.68417,-0.839666,1.249121,3.385462,3.629113,0.910676,5.565311,4.313787,HC
2,300_P_10.wav,-0.279615,-0.118771,-0.317394,-1.148637,-1.395024,-1.330388,-1.775505,-1.99457,-2.286245,...,1.121168,0.033208,-0.894862,1.845131,2.680086,2.076892,0.1345,4.527984,1.688521,HC


In [5]:
# Count missing values per column of the training frame.
articulation_train.isnull().sum()

subject          0
avg BBEon_1      0
avg BBEon_2      0
avg BBEon_3      0
avg BBEon_4      0
                ..
kurtosis DDF1    0
kurtosis F2      0
kurtosis DF2     0
kurtosis DDF2    0
Group            0
Length: 490, dtype: int64

__Note: 490 columns in total: 488 features (X), 1 subject identifier, 1 target label (Y), and no null values.__

#### 1. Checking if the classes have a balanced amount of samples:

In [6]:
# Encode the string class labels of the target column as integers: "HC" -> 0, "DP" -> 1.
articulation_train['Group'] = articulation_train['Group'].replace({"HC": 0, "DP": 1})

In [7]:
# Number of training rows per class label, to see how balanced the classes are.
val_count = articulation_train.groupby('Group')['Group'].count()
print(val_count)

Group
0    3878
1    1145
Name: Group, dtype: int64


In [8]:
# Split the training frame by label: rows with Group == 1 versus all remaining rows.
df1=articulation_train[articulation_train['Group']==1]
df2=articulation_train[articulation_train['Group']!=1]
# Resample the Group == 1 rows with replacement until there are as many as the remaining rows.
# The target size is derived from the data rather than hard-coded, so the classes stay
# balanced even if the input file changes. random_state keeps the draw reproducible.
df_upsampled=resample(df1,random_state=42,n_samples=len(df2),replace=True)
# Balanced training frame: resampled Group == 1 rows followed by the untouched remaining rows.
ar_train_upsampled=pd.concat([df_upsampled,df2])

In [9]:
# Re-count rows per class label on the upsampled frame to confirm the classes are now balanced.
val_count = ar_train_upsampled.groupby('Group')['Group'].count()
print(val_count)

Group
0    3878
1    3878
Name: Group, dtype: int64


#### 2. Obtaining X & Y values:

In [10]:
# Feature matrix: every column except the recording identifier and the target label.
X = ar_train_upsampled.drop(columns=['subject', 'Group'])
# Target vector: the encoded class label.
Y = ar_train_upsampled['Group']

In [11]:
# Preview the first rows of the training feature matrix.
X.head()

Unnamed: 0,avg BBEon_1,avg BBEon_2,avg BBEon_3,avg BBEon_4,avg BBEon_5,avg BBEon_6,avg BBEon_7,avg BBEon_8,avg BBEon_9,avg BBEon_10,...,kurtosis DDMFCCoff_9,kurtosis DDMFCCoff_10,kurtosis DDMFCCoff_11,kurtosis DDMFCCoff_12,kurtosis F1,kurtosis DF1,kurtosis DDF1,kurtosis F2,kurtosis DF2,kurtosis DDF2
4956,-0.475175,-0.859556,-1.297662,-1.717759,-1.866748,-2.151923,-2.144907,-2.618861,-3.230052,-3.392544,...,0.358259,1.385686,1.010749,-0.162845,5.6678,8.230013,7.784058,1.12135,6.557898,6.224988
3968,0.231278,-0.069804,-0.499173,-0.421388,-0.836844,-1.303022,-1.253273,-1.801145,-2.170398,-2.335746,...,0.78158,1.391514,-0.503886,-0.235511,0.971578,4.701117,4.703937,1.004097,4.950843,4.069507
4960,-0.692121,-1.113524,-1.760829,-1.725387,-1.733169,-1.829505,-2.114013,-2.468663,-2.874081,-3.166756,...,0.176264,3.327044,-0.518708,-0.802192,1.58153,4.529301,4.935638,1.442199,4.375305,5.791429
4925,-1.131536,-1.267419,-1.541213,-1.855265,-1.839662,-2.00619,-2.132606,-2.599513,-2.861912,-2.757145,...,-0.386593,-1.218741,1.172943,1.512261,0.524117,7.787244,8.766577,-0.098337,12.643663,6.863167
4874,-0.650918,-1.012266,-1.530301,-1.64453,-1.88513,-2.319021,-2.440954,-2.743552,-2.967133,-3.027859,...,4.82719,1.249371,2.030302,-0.800158,3.093182,4.502417,3.367101,1.064401,3.252496,3.437303


#### 3. Normalizing the data:

In [12]:
# Per-feature mean and standard deviation of the raw (not yet normalized) training features,
# as a baseline to compare against once the data has been normalized.
# The DataFrame methods are used instead of np.mean/np.std: calling the NumPy reductions on a
# DataFrame reduces with axis=None, which pandas has deprecated and warns about.
# ddof=0 keeps the population standard deviation that np.std computes by default.
print(X.mean())
print(X.std(ddof=0))

  return mean(axis=axis, dtype=dtype, out=out, **kwargs)


avg BBEon_1     -0.801600
avg BBEon_2     -1.181056
avg BBEon_3     -1.633905
avg BBEon_4     -1.901343
avg BBEon_5     -2.073679
                   ...   
kurtosis DF1     5.171507
kurtosis DDF1    4.431384
kurtosis F2      0.166724
kurtosis DF2     4.264533
kurtosis DDF2    3.310257
Length: 488, dtype: float64


In [16]:
import Normalization as n

In [17]:
# Normalize the training features with the project-local Normalization helper.
# NOTE(review): X is overwritten, so re-running this cell feeds already-normalized data back in.
# NOTE(review): presumably normalize_train also fits/stores the scaling statistics that
# normalize_test reuses — confirm in Normalization.py.
X=n.normalize_train(X)

In [18]:
# Sanity check after normalization: the mean should be close to 0 and the standard deviation close to 1.
print(np.mean(X))
print(np.std(X))

5.721995923295728e-18
0.9999999999999999


### Preprocessing test data:

In [19]:
# Preview the first three rows of the test frame.
articulation_test.head(3)

Unnamed: 0,avg BBEon_1,avg BBEon_2,avg BBEon_3,avg BBEon_4,avg BBEon_5,avg BBEon_6,avg BBEon_7,avg BBEon_8,avg BBEon_9,avg BBEon_10,...,kurtosis DDMFCCoff_11,kurtosis DDMFCCoff_12,kurtosis F1,kurtosis DF1,kurtosis DDF1,kurtosis F2,kurtosis DF2,kurtosis DDF2,subject,Group
0,-1.703007,-2.309823,-2.475417,-2.664827,-2.76341,-2.814644,-3.196568,-3.554883,-4.059189,-4.299266,...,-0.249404,-0.036873,11.851328,14.477019,9.393443,1.989389,6.540897,3.62254,600_AUDIO_0.wav,HC
1,-1.766605,-2.153507,-2.443244,-2.806753,-3.044377,-3.240641,-3.615207,-3.814603,-4.026244,-4.265777,...,-0.428055,-0.122785,1.518593,2.88077,1.509239,0.05555,1.513768,0.952246,600_AUDIO_1.wav,HC
2,-1.145307,-1.52091,-1.464773,-1.700818,-2.19517,-2.509134,-2.902016,-3.090786,-3.373614,-3.423954,...,-0.858743,-0.339214,0.86943,3.148013,3.679037,0.693122,2.969877,2.440902,600_AUDIO_10.wav,HC


In [20]:
# Count missing values per column of the test frame.
articulation_test.isnull().sum()

avg BBEon_1      0
avg BBEon_2      0
avg BBEon_3      0
avg BBEon_4      0
avg BBEon_5      0
                ..
kurtosis F2      0
kurtosis DF2     0
kurtosis DDF2    0
subject          0
Group            0
Length: 490, dtype: int64

In [21]:
# Encode the test labels exactly like the training labels ("HC" -> 0, "DP" -> 1),
# then separate the feature matrix from the target vector.
articulation_test['Group'] = articulation_test['Group'].replace({"HC": 0, "DP": 1})
X_test = articulation_test.drop(columns=['subject', 'Group'])
Y_test = articulation_test['Group']

In [22]:
# Normalize the test features with the project-local Normalization helper.
# NOTE(review): presumably this applies the statistics learned from the training data rather
# than refitting on the test set — confirm in Normalization.py.
X_test=n.normalize_test(X_test)

In [23]:
# Mean and standard deviation of the normalized test features, for comparison with the training values.
print(np.mean(X_test))
print(np.std(X_test))

-0.010282629917583129
1.0440515436666373


### Finding Best Model:

In [26]:
import model 

In [27]:
# Instantiate the project-local helper whose RF_model / DF_model / LR_model methods are called in the following cells.
obj=model.classification_models()

#### 1. Random Forest Classifier:

In [28]:
# Random forest run; prints the accuracy and a classification report.
# NOTE(review): presumably fits on (X, Y) and evaluates on (X_test, Y_test) — confirm in model.py.
obj.RF_model(X,Y,X_test,Y_test)

The RF model accuracy is given as :  0.6951965065502184
              precision    recall  f1-score   support

           0       0.72      0.95      0.82      3311
           1       0.19      0.03      0.05      1269

    accuracy                           0.70      4580
   macro avg       0.45      0.49      0.44      4580
weighted avg       0.57      0.70      0.61      4580



#### 2. Decision Tree Classifier:

In [29]:
# Decision tree run; prints the accuracy and a classification report.
# NOTE(review): presumably fits on (X, Y) and evaluates on (X_test, Y_test) — confirm in model.py.
obj.DF_model(X,Y,X_test,Y_test)

The DF model accuracy is given as :  0.518995633187773
              precision    recall  f1-score   support

           0       0.70      0.60      0.64      3311
           1       0.23      0.32      0.27      1269

    accuracy                           0.52      4580
   macro avg       0.46      0.46      0.46      4580
weighted avg       0.57      0.52      0.54      4580



#### 3. Logistic Regression Model:

In [30]:
# Logistic regression run; prints the accuracy and a classification report.
# NOTE(review): presumably fits on (X, Y) and evaluates on (X_test, Y_test) — confirm in model.py.
# NOTE(review): the recorded run warns that the solver reached its iteration limit, so
# max_iter likely needs to be raised inside model.LR_model.
obj.LR_model(X,Y,X_test,Y_test)

The LR_model accuracy is given as :  0.5331877729257642
              precision    recall  f1-score   support

           0       0.70      0.63      0.66      3311
           1       0.23      0.29      0.25      1269

    accuracy                           0.53      4580
   macro avg       0.46      0.46      0.46      4580
weighted avg       0.57      0.53      0.55      4580



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### 4. Support Vector Machine:

In [31]:
#import grid_search as g

ModuleNotFoundError: No module named 'grid_search'

In [None]:
#parameters=g.best_param(X,Y)

In [33]:
# Support vector classifier with an RBF kernel and fixed hyperparameters,
# using balanced class weights, fitted on the normalized training data.
SVM_model = SVC(
    kernel='rbf',
    C=10,
    gamma=1,
    class_weight='balanced',
)
SVM_model.fit(X, Y)

### Extracting Feature Importance:

In [34]:
from sklearn.inspection import permutation_importance
from matplotlib import pyplot as plt

In [35]:
# Recover the feature column names, in training-column order, as a NumPy array
# so they can later be reordered with an index array.
r = ar_train_upsampled.drop(columns=['subject', 'Group'])
feature_names = list(r.columns)
features = np.asarray(feature_names)

In [None]:
# Permutation importance of every feature for the fitted SVM, measured on the test split.
# random_state is fixed so the random shuffles, and therefore the resulting ranking, are reproducible.
feat_imp_all=permutation_importance(SVM_model,X_test,Y_test,random_state=42)
# Index array that orders the features by ascending mean importance.
sorted_importance_all=feat_imp_all.importances_mean.argsort()
# Feature names and their mean importances, both in that ascending order.
f=features[sorted_importance_all]
v=feat_imp_all.importances_mean[sorted_importance_all]
# Horizontal bar chart of the mean importance of each feature.
plt.barh(f,v)
plt.xlabel("Feature Importance for all Features")

In [None]:
def feat_imp_all(names=None, values=None, threshold=0.005):
    """Return the names of the features whose importance is non-negligible.

    A feature is kept when its importance is strictly below ``-threshold`` or
    strictly above ``threshold``. Every entry is examined, so the result no
    longer depends on a fixed number of features.

    Parameters
    ----------
    names : sequence, optional
        Feature names. Defaults to the notebook-level array ``f``.
    values : sequence of float, optional
        Importance scores aligned with ``names``. Defaults to the
        notebook-level array ``v``.
    threshold : float, optional
        Absolute importance a feature must exceed to be kept (default 0.005).

    Returns
    -------
    list
        The selected feature names, in the order they appear in ``names``.

    Raises
    ------
    ValueError
        If ``names`` and ``values`` have different lengths.
    """
    # Fall back to the notebook globals so the original zero-argument call keeps working.
    if names is None:
        names = f
    if values is None:
        values = v
    if len(names) != len(values):
        raise ValueError("names and values must have the same length")
    # abs(value) > threshold is equivalent to: value < -threshold or value > threshold.
    return [name for name, value in zip(names, values) if abs(value) > threshold]

# Collect the names of the features selected by feat_imp_all and show the type of the result.
imp_features_all=feat_imp_all()
print(type(imp_features_all))