In [1]:
import os

# Base directory of the github repo. The default is the original author's checkout;
# set the PARKINSON_BASE_DIR environment variable to run the notebook from another location.
baseDIR = os.environ.get('PARKINSON_BASE_DIR', '/home/pataki/synapse/gitParkinson')

In [2]:
cd $baseDIR/featureSelectors/

/home/pataki/synapse/gitParkinson/featureSelectors


In [3]:
# Executes the script inside this notebook's namespace (-i). No import statement is visible
# in the notebook itself, yet later cells use np, pd and several scikit-learn classes, so
# those names presumably come from this script or from helperFuncs.py — TODO confirm.
%run -i ../src/sc2FitModels.py

Welcome, Balint Armin Pataki!



In [4]:
def score(phenotype, trainX, trainY, testX, testY):
    """Fit the ensemble on the training split and score it on the test split.

    Parameters
    ----------
    phenotype : key into CATEGORY_WEIGHTS; the number of weights stored for it
        determines how many class indices are handed to getNonLinearInterpAupr.
    trainX, trainY : training features and labels, forwarded to train_ensemble.
    testX, testY : held-out features and labels, forwarded to getNonLinearInterpAupr.

    Returns
    -------
    (weighted_aupr, y_score, y_true) where y_score and y_true are passed through
    unchanged from getNonLinearInterpAupr.
    """
    model = train_ensemble(trainX, trainY)
    classIndices = np.arange(len(CATEGORY_WEIGHTS[phenotype]))
    results, y_score, y_true = getNonLinearInterpAupr(testX, testY, classIndices, model)

    # Only 'tremor' is reduced with the weighted mean; every other phenotype
    # uses the first entry of the results as-is.
    weighted_aupr = getWeightedMean(phenotype, results) if phenotype == 'tremor' else results[0]
    return weighted_aupr, y_score, y_true

In [5]:
def getFeatureImportances(clf, DF):
    """Return the classifier's feature importances as a table sorted from high to low.

    Parameters
    ----------
    clf : fitted estimator exposing ``feature_importances_``.
    DF : DataFrame whose columns name the features, in the order the estimator saw them.

    Returns
    -------
    DataFrame with the columns 'imp' and 'feature', ordered by descending 'imp'.
    """
    ranking = pd.DataFrame({'imp': clf.feature_importances_,
                            'feature': DF.columns.tolist()})
    ranking = ranking.sort_values(by='imp', ascending=False)
    return ranking

In [6]:
def featureEvaluator(classifier, features, featureDB, repeatNum = 5, baseSeed = 4242,
                     testPatientNum = 5, importance = True, labelCol = 'bradykinesiaScore'):
    """Score a classifier on repeated patient-wise hold-out splits of featureDB.

    For every repeat, ``testPatientNum`` distinct patients are drawn with a seed
    derived from ``baseSeed``; all of their rows form the test split and the
    remaining rows form the training split. The classifier is refit in place on
    each training split.

    Parameters
    ----------
    classifier : estimator with ``fit`` and ``predict_proba``; when ``importance``
        is True it must also expose ``feature_importances_``.
    features : list of column names of ``featureDB`` used as model inputs.
    featureDB : DataFrame holding a 'patient' column, the label column and the
        feature columns. It is not modified.
    repeatNum : number of hold-out repeats.
    baseSeed : repeat ``i`` uses the seed ``baseSeed + 137*i``.
    testPatientNum : number of patients held out in each repeat.
    importance : if True, return the feature-importance table.
    labelCol : name of the label column (defaults to the previously hard-coded
        'bradykinesiaScore').

    Returns
    -------
    The table built by getFeatureImportances when ``importance`` is True, else
    None. The table reflects only the fit of the LAST repeat, because the
    classifier is refit on every repeat.

    Side effects
    ------------
    Prints two lists, one entry per repeat: the score derived from
    nonLinearInterpAupr, and the mean of the test labels (described in the
    original code as the score for random guessing).
    """
    prAUClist = []
    basePRauc = []
    # The set of patients does not change between repeats, so compute it once.
    patients = pd.unique(featureDB.patient)
    for i in pb(range(repeatNum)):
        rndState = np.random.RandomState(seed=baseSeed+137*i)
        testPatients = list(rndState.choice(patients, testPatientNum, replace=False))

        # One mask drives both splits. Columns are selected directly instead of
        # popping bookkeeping columns off a filtered slice.
        isTest = featureDB.patient.isin(testPatients)
        trainX = featureDB.loc[~isTest, features]
        trainY = list(featureDB.loc[~isTest, labelCol])
        testX = featureDB.loc[isTest, features]
        testY = list(featureDB.loc[isTest, labelCol])

        classifier.fit(trainX, trainY)

        # Second column of predict_proba is used as the score.
        testScore = classifier.predict_proba(testX).T[1]
        prAUClist.append(nonLinearInterpAupr(y_true=testY, y_score=testScore)[0])
        basePRauc.append(sum(testY)/len(testY)) # score for random guessing

    # NOTE(review): elem[0] implies nonLinearInterpAupr(...)[0] is itself indexable — confirm.
    print([ '%.3f' % elem[0] for elem in prAUClist])
    print([ '%.3f' % elem    for elem in basePRauc])

    if importance:
        return getFeatureImportances(classifier, testX)
    return None

In [7]:
# Executes helperFuncs.py inside this notebook's namespace (-i). mainDFCreator and
# mainDFtrimmer, used in the next cell, are not defined in the notebook itself, so they
# presumably come from this script — TODO confirm.
%run -i ../src/helperFuncs.py

In [8]:
# Build the bradykinesia table, trim it, and keep only the file id, the label and the
# patient id; the feature columns are merged in later.
mainDF = mainDFtrimmer(mainDFCreator('bradykinesiaScore'), fileMinLen=1, plot=False)
mainDF = mainDF.loc[:, ['dataFileHandleId', 'bradykinesiaScore', 'patient']]

Train shape: (3016, 12)
Test shape:  (1409, 12)
Merged:      (4425, 12)
Remained shape: (4280, 14)


In [9]:
def _loadFeatureTable(fileName, keyCol='dataFileHandleId', featureDir='../sub2.3_brad/featureDB/'):
    """Read one tab-separated feature table and list its feature columns.

    Returns (table, featureNames) where featureNames holds every column except
    ``keyCol`` in file order. The earlier ``list(set(...) - set(...))`` form
    produced an order that depends on string hashing, so the column order handed
    to the classifiers could differ between kernel sessions.
    """
    table = pd.read_csv(featureDir + fileName, sep='\t')
    featureNames = [col for col in table.columns if col != keyCol]
    return table, featureNames


# One table per feature family; every table is keyed by dataFileHandleId.
baseDF,      baseFeatures      = _loadFeatureTable('baseFeatures.tsv')
empiricalDF, empiricalFeatures = _loadFeatureTable('empiricalFeature.tsv')
fourierDF,   fourierFeatures   = _loadFeatureTable('fourierFeatures.tsv')
rangeStdDF,  rangeStdFeatures  = _loadFeatureTable('rangeStdFeatures.tsv')
tsfreshDF,   tsfreshFeatures   = _loadFeatureTable('tsFresh_fillNA_dropConstant.tsv')
autocorrDF,  autocorrFeatures  = _loadFeatureTable('autoCorrFeatures.tsv')

In [10]:
# Attach every feature family to the label table. Inner joins on dataFileHandleId keep
# only the files that are present in all tables.
# NOTE: this cell rebinds mainDF from itself, so it must be run exactly once per kernel.
mainDF = (
    mainDF
    .merge(baseDF,      on='dataFileHandleId', how='inner')
    .merge(empiricalDF, on='dataFileHandleId', how='inner')
    .merge(fourierDF,   on='dataFileHandleId', how='inner')
    .merge(rangeStdDF,  on='dataFileHandleId', how='inner')
    .merge(tsfreshDF,   on='dataFileHandleId', how='inner')
    .merge(autocorrDF,  on='dataFileHandleId', how='inner')
)

In [11]:
# allDF keeps every row; mainDF is narrowed to the rows whose label is not the literal
# string 'Score'. The scaler is fit on all rows of allDF, i.e. before that filter.
allDF = mainDF.copy(deep=True)
# Everything from the fourth column onward is standardised; the first three columns are
# left untouched.
allDF[allDF.columns[3:]] = StandardScaler().fit_transform(allDF[allDF.columns[3:]])
mainDF = allDF.loc[allDF['bradykinesiaScore'] != 'Score']
mainDF.shape

(2911, 3015)

## Check scores

In [12]:
# Random forest restricted to the base feature family; featImp0 receives the
# importance table returned by featureEvaluator.
clf = RandomForestClassifier(random_state=42, n_jobs=30, n_estimators=500)
featImp0 = featureEvaluator(classifier=clf, features=baseFeatures, featureDB=mainDF)

100%|██████████| 5/5 [00:24<00:00,  4.81s/it]


['0.850', '0.704', '0.735', '0.766', '0.790']
['0.212', '0.132', '0.316', '0.318', '0.302']


In [13]:
# Same forest, now offered all six feature families at once; featImp0 is overwritten
# with the importance table of this run.
clf = RandomForestClassifier(random_state=42, n_jobs=30, n_estimators=500)
featImp0 = featureEvaluator(
    classifier=clf,
    features=(baseFeatures + empiricalFeatures + fourierFeatures
              + rangeStdFeatures + tsfreshFeatures + autocorrFeatures),
    featureDB=mainDF,
)

100%|██████████| 5/5 [00:37<00:00,  7.56s/it]


['0.735', '0.419', '0.772', '0.817', '0.840']
['0.212', '0.132', '0.316', '0.318', '0.302']


In [14]:
# Shortlists cut from whatever importance table featImp0 holds when this cell runs
# (the preceding evaluation, if the cells are executed in order).
featImp400 = featImp0['feature'].head(400).tolist()
featImp200 = featImp0['feature'].head(200).tolist()

In [15]:
# Re-evaluate on the 400-feature shortlist; featImp0 is overwritten with the new table.
clf = RandomForestClassifier(random_state=42, n_jobs=30, n_estimators=500)
featImp0 = featureEvaluator(classifier=clf, features=featImp400, featureDB=mainDF)

100%|██████████| 5/5 [00:30<00:00,  6.12s/it]


['0.819', '0.599', '0.825', '0.854', '0.873']
['0.212', '0.132', '0.316', '0.318', '0.302']


In [16]:
# Re-evaluate on the 200-feature shortlist; featImp0 is overwritten with the new table.
clf = RandomForestClassifier(random_state=42, n_jobs=30, n_estimators=500)
featImp0 = featureEvaluator(classifier=clf, features=featImp200, featureDB=mainDF)

100%|██████████| 5/5 [00:27<00:00,  5.62s/it]


['0.831', '0.608', '0.847', '0.870', '0.879']
['0.212', '0.132', '0.316', '0.318', '0.302']


In [17]:
# Shortlists cut from whatever importance table featImp0 holds when this cell runs
# (the preceding evaluation, if the cells are executed in order).
featImp100 = featImp0['feature'].head(100).tolist()
featImp50 = featImp0['feature'].head(50).tolist()

In [18]:
# Re-evaluate on the 100-feature shortlist; featImp0 is overwritten with the new table.
clf = RandomForestClassifier(random_state=42, n_jobs=30, n_estimators=500)
featImp0 = featureEvaluator(classifier=clf, features=featImp100, featureDB=mainDF)

100%|██████████| 5/5 [00:26<00:00,  5.39s/it]


['0.858', '0.677', '0.860', '0.882', '0.886']
['0.212', '0.132', '0.316', '0.318', '0.302']


In [19]:
# Re-evaluate on the 50-feature shortlist; featImp0 is overwritten with the new table.
clf = RandomForestClassifier(random_state=42, n_jobs=30, n_estimators=500)
featImp0 = featureEvaluator(classifier=clf, features=featImp50, featureDB=mainDF)

100%|██████████| 5/5 [00:26<00:00,  5.46s/it]


['0.878', '0.697', '0.875', '0.892', '0.895']
['0.212', '0.132', '0.316', '0.318', '0.302']


In [None]:
# Shortlists cut from whatever importance table featImp0 holds when this cell runs
# (the preceding evaluation, if the cells are executed in order).
featImp30 = featImp0['feature'].head(30).tolist()
featImp20 = featImp0['feature'].head(20).tolist()
featImp10 = featImp0['feature'].head(10).tolist()

In [21]:
# Re-evaluate on the 30-feature shortlist; featImp0 is overwritten with the new table.
clf = RandomForestClassifier(random_state=42, n_jobs=30, n_estimators=500)
featImp0 = featureEvaluator(classifier=clf, features=featImp30, featureDB=mainDF)

100%|██████████| 5/5 [00:25<00:00,  5.06s/it]


['0.886', '0.683', '0.878', '0.894', '0.895']
['0.212', '0.132', '0.316', '0.318', '0.302']


In [22]:
# Re-evaluate on the 20-feature shortlist; featImp0 is overwritten with the new table.
clf = RandomForestClassifier(random_state=42, n_jobs=30, n_estimators=500)
featImp0 = featureEvaluator(classifier=clf, features=featImp20, featureDB=mainDF)

100%|██████████| 5/5 [00:24<00:00,  4.98s/it]


['0.878', '0.681', '0.873', '0.889', '0.907']
['0.212', '0.132', '0.316', '0.318', '0.302']


In [23]:
# Re-evaluate on the 10-feature shortlist; featImp0 is overwritten with the new table.
clf = RandomForestClassifier(random_state=42, n_jobs=30, n_estimators=500)
featImp0 = featureEvaluator(classifier=clf, features=featImp10, featureDB=mainDF)

100%|██████████| 5/5 [00:23<00:00,  4.80s/it]


['0.855', '0.629', '0.859', '0.889', '0.922']
['0.212', '0.132', '0.316', '0.318', '0.302']


## The best scores come from the top 50, 30, 20 and 10 feature subsets

In [24]:
# Soft-voting ensemble of one-vs-rest random forest, logistic regression and SVM,
# evaluated on the featImp50 shortlist.
# SVC(probability=True) fits its probability model with an internal, randomly shuffled
# cross-validation, so it is seeded like the forest to keep the scores repeatable.
rf = OneVsRestClassifier(RandomForestClassifier(n_estimators=500, random_state=42))
lr = OneVsRestClassifier(LogisticRegressionCV())
svm = OneVsRestClassifier(SVC(probability=True, random_state=42))
ensemble = VotingClassifier(estimators=[('rf', rf), ('lr', lr), ('svm', svm)], voting='soft')

# importance=False: the call only prints the per-repeat scores and returns None.
featureEvaluator(ensemble, featImp50, mainDF, importance=False)

100%|██████████| 5/5 [01:20<00:00, 16.15s/it]

['0.898', '0.739', '0.855', '0.876', '0.888']
['0.212', '0.132', '0.316', '0.318', '0.302']





In [25]:
# Soft-voting ensemble of one-vs-rest random forest, logistic regression and SVM,
# evaluated on the featImp30 shortlist.
# SVC(probability=True) fits its probability model with an internal, randomly shuffled
# cross-validation, so it is seeded like the forest to keep the scores repeatable.
rf = OneVsRestClassifier(RandomForestClassifier(n_estimators=500, random_state=42))
lr = OneVsRestClassifier(LogisticRegressionCV())
svm = OneVsRestClassifier(SVC(probability=True, random_state=42))
ensemble = VotingClassifier(estimators=[('rf', rf), ('lr', lr), ('svm', svm)], voting='soft')

# importance=False: the call only prints the per-repeat scores and returns None.
featureEvaluator(ensemble, featImp30, mainDF, importance=False)

100%|██████████| 5/5 [01:15<00:00, 15.12s/it]

['0.896', '0.726', '0.874', '0.894', '0.894']
['0.212', '0.132', '0.316', '0.318', '0.302']





In [26]:
# Soft-voting ensemble of one-vs-rest random forest, logistic regression and SVM,
# evaluated on the featImp20 shortlist.
# SVC(probability=True) fits its probability model with an internal, randomly shuffled
# cross-validation, so it is seeded like the forest to keep the scores repeatable.
rf = OneVsRestClassifier(RandomForestClassifier(n_estimators=500, random_state=42))
lr = OneVsRestClassifier(LogisticRegressionCV())
svm = OneVsRestClassifier(SVC(probability=True, random_state=42))
ensemble = VotingClassifier(estimators=[('rf', rf), ('lr', lr), ('svm', svm)], voting='soft')

# importance=False: the call only prints the per-repeat scores and returns None.
featureEvaluator(ensemble, featImp20, mainDF, importance=False)

100%|██████████| 5/5 [01:09<00:00, 14.01s/it]

['0.876', '0.718', '0.871', '0.888', '0.897']
['0.212', '0.132', '0.316', '0.318', '0.302']





In [27]:
# Soft-voting ensemble of one-vs-rest random forest, logistic regression and SVM,
# evaluated on the featImp10 shortlist.
# SVC(probability=True) fits its probability model with an internal, randomly shuffled
# cross-validation, so it is seeded like the forest to keep the scores repeatable.
rf = OneVsRestClassifier(RandomForestClassifier(n_estimators=500, random_state=42))
lr = OneVsRestClassifier(LogisticRegressionCV())
svm = OneVsRestClassifier(SVC(probability=True, random_state=42))
ensemble = VotingClassifier(estimators=[('rf', rf), ('lr', lr), ('svm', svm)], voting='soft')

# importance=False: the call only prints the per-repeat scores and returns None.
featureEvaluator(ensemble, featImp10, mainDF, importance=False)

100%|██████████| 5/5 [01:07<00:00, 13.57s/it]

['0.846', '0.645', '0.853', '0.885', '0.917']
['0.212', '0.132', '0.316', '0.318', '0.302']





In [34]:
# Row count of the scaled table that still contains every file.
allDF.shape[0]

4278

In [35]:
# Only the file ids of the submission template are kept; the row count is displayed.
subTemplate = pd.read_csv('../bradykinesiaSubmissionTemplate.csv', sep=',')
subTemplate = subTemplate[['dataFileHandleId']]
subTemplate.shape[0]

4166

In [36]:
# Left join keeps every template row, including files that have no row in allDF.
fullDF = pd.merge(subTemplate, allDF, how = 'left', on = 'dataFileHandleId')
# A left join must not add rows; more rows would mean duplicated file ids in allDF.
assert len(fullDF) == len(subTemplate), 'left merge changed the number of template rows'
# Missing feature values are replaced by the column mean. numeric_only=True restricts the
# mean to numeric columns; without it, recent pandas raises on the non-numeric columns
# instead of silently skipping them.
fullDF = fullDF.fillna(fullDF.mean(numeric_only=True))

In [37]:
# Row count of the submission-aligned table.
fullDF.shape[0]

4166

In [38]:
# Write the file id plus the 50- and 30-feature shortlists as comma-separated files,
# without the index column.
fullDF.loc[:, ['dataFileHandleId'] + featImp50].to_csv(
    '../featureDB/final_brad_50.csv', index=False, sep=',')
fullDF.loc[:, ['dataFileHandleId'] + featImp30].to_csv(
    '../featureDB/final_brad_30.csv', index=False, sep=',')