In [1]:
#baseDIR = '/home/pataki/synapse/gitParkinson/' # base directory of the github repo
# Uncomment (and update) the assignment above when this notebook is run in an
# empty namespace, i.e. when nothing has defined baseDIR beforehand.
if 'baseDIR' not in globals():
    # Only reports the problem; execution continues with baseDIR still undefined.
    print('Error: baseDIR not found!')

# Keep an already defined coreNum, otherwise fall back to the default.
if 'coreNum' not in globals():
    coreNum = 4 # default CPU cores

In [2]:
cd $baseDIR/featureSelectors/

/home/pataki/synapse/gitParkinson/featureSelectors


In [3]:
%run -i ../src/sc2FitModels.py

In [4]:
def score(phenotype, trainX, trainY, testX, testY):
    """Train an ensemble on the training split and score it on the test split.

    The ensemble comes from train_ensemble(trainX, trainY). It is evaluated with
    getNonLinearInterpAupr on (testX, testY), passing one index per entry of
    CATEGORY_WEIGHTS[phenotype].

    Returns a tuple (weighted_aupr, y_score, y_true). For the 'tremor' phenotype
    weighted_aupr is getWeightedMean(phenotype, results); for every other
    phenotype it is the first element of results. y_score and y_true are passed
    through unchanged from getNonLinearInterpAupr.
    """
    model = train_ensemble(trainX, trainY)
    categoryIds = np.arange(len(CATEGORY_WEIGHTS[phenotype]))

    results, y_score, y_true = getNonLinearInterpAupr(testX, testY, categoryIds, model)

    # Only 'tremor' combines the per-category results; the others use the first one.
    weighted_aupr = getWeightedMean(phenotype, results) if phenotype == 'tremor' else results[0]
    return weighted_aupr, y_score, y_true

In [5]:
def getFeatureImportances(clf, DF):
    """Pair each column of DF with the matching entry of clf.feature_importances_.

    Returns a DataFrame with the columns 'imp' and 'feature', ordered by 'imp'
    from largest to smallest. The original row labels (column positions in DF)
    are kept as the index.
    """
    importanceTable = pd.DataFrame({'imp': clf.feature_importances_,
                                    'feature': DF.columns.tolist()})
    return importanceTable.sort_values('imp', ascending=False)

In [6]:
def featureEvaluator(classifier, features, featureDB, repeatNum = 5, baseSeed = 4242, 
                     testPatientNum = 5, importance = True):
    """Score a classifier on repeated patient-wise hold-out splits.

    In each of `repeatNum` repeats, `testPatientNum` distinct patients are drawn
    with a RandomState seeded by baseSeed + 137 * repeat index. All rows of these
    patients form the test set, the remaining rows the training set.
    `classifier` is refit on every split, using only the `features` columns as
    inputs and 'dyskinesiaScore' as the label.

    Two lists are printed with one entry per repeat: the score taken from
    nonLinearInterpAupr, and the mean of the test labels (the score for random
    guessing).

    Parameters:
        classifier: object with fit and predict_proba; it is refit in place.
        features: column names of featureDB used as model inputs.
        featureDB: DataFrame holding 'patient', 'dataFileHandleId',
            'dyskinesiaScore' and the feature columns.
        repeatNum: number of random splits.
        baseSeed: base value for the per-repeat seeds.
        testPatientNum: number of patients held out per split.
        importance: whether to return the feature importance table.

    Returns:
        getFeatureImportances(classifier, testX) for the classifier as fitted in
        the last repeat when `importance` is true, otherwise None.
    """
    def _splitXY(rows):
        # pop() edits the boolean-mask selection passed in, not featureDB itself.
        labels = rows.pop('dyskinesiaScore')
        for bookkeeping in ('patient', 'dataFileHandleId'):
            rows.pop(bookkeeping)
        return rows[features], labels

    auprScores = []
    chanceLevels = []
    for repeat in pb(range(repeatNum)):
        rng = np.random.RandomState(seed=baseSeed + 137 * repeat)
        heldOut = list(rng.choice(pd.unique(featureDB.patient), testPatientNum, replace=False))
        inTest = featureDB.patient.isin(heldOut)

        trainX, trainY = _splitXY(featureDB[~inTest])
        testX, testY = _splitXY(featureDB[inTest])

        classifier.fit(trainX, list(trainY))

        # Column 1 of predict_proba is used as the score
        # (presumably the positive class -- TODO confirm against classifier.classes_).
        classOneProba = classifier.predict_proba(testX).T[1]
        auprScores.append(nonLinearInterpAupr(y_true=list(testY), y_score=classOneProba)[0])
        chanceLevels.append(sum(testY) / len(testY)) # score for random guessing

    # Each stored score is itself indexable; its first element is what gets printed.
    print(['%.3f' % entry[0] for entry in auprScores])
    print(['%.3f' % level for level in chanceLevels])

    if not importance:
        return None
    return getFeatureImportances(classifier, testX)

In [7]:
%run -i ../src/helperFuncs.py

In [8]:
# Build the label table for 'dyskinesiaScore', trim it, and keep only the merge key,
# the label and the patient id.
keyColumns = ['dataFileHandleId', 'dyskinesiaScore', 'patient']
mainDF = mainDFtrimmer(mainDFCreator('dyskinesiaScore'), fileMinLen=1, plot=False)
mainDF = mainDF[keyColumns]

Train shape: (1556, 12)
Test shape:  (660, 12)
Merged:      (2216, 12)
Remained shape: (2143, 14)


In [9]:
def _featureNames(featureTable):
    """Return the sorted column names of featureTable, excluding 'dataFileHandleId'."""
    # Sorting removes the arbitrary ordering that the set difference introduces.
    return sorted(set(featureTable.columns.tolist()) - {'dataFileHandleId'})

# Every table is tab-separated and keyed by 'dataFileHandleId';
# all other columns are treated as features.
baseDF = pd.read_csv('../sub2.2_dysk/featureDB/baseFeatures.tsv', sep='\t')
baseFeatures = _featureNames(baseDF)

empiricalDF = pd.read_csv('../sub2.2_dysk/featureDB/empiricalFeature.tsv', sep='\t')
empiricalFeatures = _featureNames(empiricalDF)

fourierDF = pd.read_csv('../sub2.2_dysk/featureDB/fourierFeatures.tsv', sep='\t')
fourierFeatures = _featureNames(fourierDF)

rangeStdDF = pd.read_csv('../sub2.2_dysk/featureDB/rangeStdFeatures.tsv', sep='\t')
rangeStdFeatures = _featureNames(rangeStdDF)

tsfreshDF = pd.read_csv('../sub2.2_dysk/featureDB/tsFresh_fillNA_dropConstant.tsv', sep='\t')
tsfreshFeatures = _featureNames(tsfreshDF)

autocorrDF = pd.read_csv('../sub2.2_dysk/featureDB/autoCorrFeatures.tsv', sep='\t')
autocorrFeatures = _featureNames(autocorrDF)

In [10]:
# Inner-join every feature table onto the label table, one after the other, so only
# recordings present in all tables remain.
# NOTE: this cell rebinds mainDF; re-running it without re-running the cells that
# build mainDF merges the feature tables a second time.
for _featureTable in (baseDF, empiricalDF, fourierDF, rangeStdDF, tsfreshDF, autocorrDF):
    mainDF = pd.merge(mainDF, _featureTable, on='dataFileHandleId', how='inner')

In [11]:
# allDF keeps every row; the scaler is fitted on all of them.
allDF = mainDF.copy(deep=True)
# The first three columns are dataFileHandleId, dyskinesiaScore and patient;
# everything after them is a feature column and gets standardised.
scaledColumns = allDF.columns.tolist()[3:]
allDF[scaledColumns] = StandardScaler().fit_transform(allDF[scaledColumns])
# Rows whose label is the literal string 'Score' are left out of the modelling table.
mainDF = allDF[allDF.dyskinesiaScore != 'Score']
mainDF.shape

(1499, 3025)

## Check scores

In [16]:
# Random forest restricted to the base feature table.
clf = RandomForestClassifier(random_state=42, n_jobs=coreNum, n_estimators=500)
featImp0 = featureEvaluator(classifier=clf, features=baseFeatures, featureDB=mainDF)

100%|██████████| 2/2 [00:05<00:00,  2.90s/it]


['0.416', '0.262']
['0.276', '0.069']


In [13]:
# Random forest on the union of all six feature tables; the returned importance table
# (featImp0) drives the feature selection in the following cells.
everyFeature = (baseFeatures + empiricalFeatures + fourierFeatures +
                rangeStdFeatures + tsfreshFeatures + autocorrFeatures)
clf = RandomForestClassifier(random_state=42, n_jobs=coreNum, n_estimators=500)
featImp0 = featureEvaluator(classifier=clf, features=everyFeature, featureDB=mainDF)

100%|██████████| 5/5 [01:06<00:00, 13.21s/it]


['0.761', '0.115', '0.473', '0.493', '0.699']
['0.276', '0.069', '0.056', '0.170', '0.226']


In [14]:
# Names of the 400 and 200 highest-ranked features in the current featImp0 table.
featImp400, featImp200 = (featImp0['feature'].head(topN).tolist() for topN in (400, 200))

In [15]:
# Random forest on the top-400 feature list; overwrites featImp0 with the new ranking.
clf = RandomForestClassifier(random_state=42, n_jobs=coreNum, n_estimators=500)
featImp0 = featureEvaluator(classifier=clf, features=featImp400, featureDB=mainDF)

100%|██████████| 5/5 [00:37<00:00,  7.47s/it]


['0.781', '0.125', '0.427', '0.522', '0.725']
['0.276', '0.069', '0.056', '0.170', '0.226']


In [16]:
# Random forest on the top-200 feature list; overwrites featImp0 with the new ranking.
clf = RandomForestClassifier(random_state=42, n_jobs=coreNum, n_estimators=500)
featImp0 = featureEvaluator(classifier=clf, features=featImp200, featureDB=mainDF)

100%|██████████| 5/5 [00:34<00:00,  6.81s/it]


['0.785', '0.128', '0.517', '0.544', '0.752']
['0.276', '0.069', '0.056', '0.170', '0.226']


In [17]:
# Names of the 100 and 50 highest-ranked features in the current featImp0 table.
featImp100, featImp50 = (featImp0['feature'].head(topN).tolist() for topN in (100, 50))

In [18]:
# Random forest on the top-100 feature list; overwrites featImp0 with the new ranking.
clf = RandomForestClassifier(random_state=42, n_jobs=coreNum, n_estimators=500)
featImp0 = featureEvaluator(classifier=clf, features=featImp100, featureDB=mainDF)

100%|██████████| 5/5 [00:30<00:00,  6.05s/it]


['0.781', '0.152', '0.431', '0.546', '0.772']
['0.276', '0.069', '0.056', '0.170', '0.226']


In [19]:
# Random forest on the top-50 feature list; overwrites featImp0 with the new ranking.
clf = RandomForestClassifier(random_state=42, n_jobs=coreNum, n_estimators=500)
featImp0 = featureEvaluator(classifier=clf, features=featImp50, featureDB=mainDF)

100%|██████████| 5/5 [00:33<00:00,  6.78s/it]


['0.790', '0.184', '0.477', '0.530', '0.797']
['0.276', '0.069', '0.056', '0.170', '0.226']


In [20]:
# Names of the 30, 20, 10 and 5 highest-ranked features in the current featImp0 table.
featImp30, featImp20, featImp10, featImp5 = (
    featImp0['feature'].head(topN).tolist() for topN in (30, 20, 10, 5)
)

In [21]:
# Random forest on the top-30 feature list; overwrites featImp0 with the new ranking.
clf = RandomForestClassifier(random_state=42, n_jobs=coreNum, n_estimators=500)
featImp0 = featureEvaluator(classifier=clf, features=featImp30, featureDB=mainDF)

100%|██████████| 5/5 [00:24<00:00,  4.82s/it]


['0.792', '0.320', '0.520', '0.530', '0.793']
['0.276', '0.069', '0.056', '0.170', '0.226']


In [22]:
# Random forest on the top-20 feature list; overwrites featImp0 with the new ranking.
clf = RandomForestClassifier(random_state=42, n_jobs=coreNum, n_estimators=500)
featImp0 = featureEvaluator(classifier=clf, features=featImp20, featureDB=mainDF)

100%|██████████| 5/5 [00:32<00:00,  6.42s/it]

['0.815', '0.307', '0.614', '0.602', '0.826']
['0.276', '0.069', '0.056', '0.170', '0.226']





In [23]:
# Random forest on the top-10 feature list; overwrites featImp0 with the new ranking.
clf = RandomForestClassifier(random_state=42, n_jobs=coreNum, n_estimators=500)
featImp0 = featureEvaluator(classifier=clf, features=featImp10, featureDB=mainDF)

100%|██████████| 5/5 [00:24<00:00,  4.86s/it]


['0.806', '0.337', '0.646', '0.599', '0.850']
['0.276', '0.069', '0.056', '0.170', '0.226']


In [24]:
# Random forest on the top-5 feature list; overwrites featImp0 with the new ranking.
clf = RandomForestClassifier(random_state=42, n_jobs=coreNum, n_estimators=500)
featImp0 = featureEvaluator(classifier=clf, features=featImp5, featureDB=mainDF)

100%|██████████| 5/5 [00:21<00:00,  4.30s/it]

['0.804', '0.267', '0.616', '0.565', '0.778']
['0.276', '0.069', '0.056', '0.170', '0.226']





### The best are the top10, top30, top20, top50

In [25]:
# Soft-voting ensemble of a random forest, a cross-validated logistic regression and an
# SVC with probability estimates, each wrapped in OneVsRestClassifier.
rf = OneVsRestClassifier(RandomForestClassifier(random_state=42, n_estimators=500))
lr = OneVsRestClassifier(LogisticRegressionCV())
svm = OneVsRestClassifier(SVC(probability=True))
votingMembers = [('rf', rf), ('lr', lr), ('svm', svm)]
ensemble = VotingClassifier(voting='soft', estimators=votingMembers)

# importance=False: only the printed scores are wanted, no importance table is requested.
featureEvaluator(classifier=ensemble, features=featImp50, featureDB=mainDF, importance=False)

100%|██████████| 5/5 [01:45<00:00, 21.06s/it]

['0.749', '0.208', '0.417', '0.519', '0.708']
['0.276', '0.069', '0.056', '0.170', '0.226']





In [26]:
# Soft-voting ensemble of a random forest, a cross-validated logistic regression and an
# SVC with probability estimates, each wrapped in OneVsRestClassifier.
rf = OneVsRestClassifier(RandomForestClassifier(random_state=42, n_estimators=500))
lr = OneVsRestClassifier(LogisticRegressionCV())
svm = OneVsRestClassifier(SVC(probability=True))
votingMembers = [('rf', rf), ('lr', lr), ('svm', svm)]
ensemble = VotingClassifier(voting='soft', estimators=votingMembers)

# importance=False: only the printed scores are wanted, no importance table is requested.
featureEvaluator(classifier=ensemble, features=featImp30, featureDB=mainDF, importance=False)

100%|██████████| 5/5 [01:20<00:00, 16.07s/it]

['0.764', '0.239', '0.404', '0.487', '0.700']
['0.276', '0.069', '0.056', '0.170', '0.226']





In [27]:
# Soft-voting ensemble of a random forest, a cross-validated logistic regression and an
# SVC with probability estimates, each wrapped in OneVsRestClassifier.
rf = OneVsRestClassifier(RandomForestClassifier(random_state=42, n_estimators=500))
lr = OneVsRestClassifier(LogisticRegressionCV())
svm = OneVsRestClassifier(SVC(probability=True))
votingMembers = [('rf', rf), ('lr', lr), ('svm', svm)]
ensemble = VotingClassifier(voting='soft', estimators=votingMembers)

# importance=False: only the printed scores are wanted, no importance table is requested.
featureEvaluator(classifier=ensemble, features=featImp20, featureDB=mainDF, importance=False)

100%|██████████| 5/5 [01:01<00:00, 12.35s/it]

['0.777', '0.332', '0.439', '0.560', '0.766']
['0.276', '0.069', '0.056', '0.170', '0.226']





In [28]:
# Soft-voting ensemble of a random forest, a cross-validated logistic regression and an
# SVC with probability estimates, each wrapped in OneVsRestClassifier.
rf = OneVsRestClassifier(RandomForestClassifier(random_state=42, n_estimators=500))
lr = OneVsRestClassifier(LogisticRegressionCV())
svm = OneVsRestClassifier(SVC(probability=True))
votingMembers = [('rf', rf), ('lr', lr), ('svm', svm)]
ensemble = VotingClassifier(voting='soft', estimators=votingMembers)

# importance=False: only the printed scores are wanted, no importance table is requested.
featureEvaluator(classifier=ensemble, features=featImp10, featureDB=mainDF, importance=False)

100%|██████████| 5/5 [00:42<00:00,  8.49s/it]

['0.817', '0.344', '0.601', '0.563', '0.806']
['0.276', '0.069', '0.056', '0.170', '0.226']





In [29]:
# Only the id column of the submission template is kept.
subTemplate = pd.read_csv('../dyskinesiaSubmissionTemplate.csv', sep=',')
subTemplate = subTemplate[['dataFileHandleId']]
subTemplate.shape[0]

2143

In [30]:
# Number of rows in the scaled feature table, for comparison with the template length.
allDF.shape[0]

2142

In [31]:
# Left-join onto the submission template so that every template row is kept; template
# rows without a match in allDF come back with NaN in all feature columns.
fullDF = pd.merge(subTemplate, allDF, how = 'left', on = 'dataFileHandleId')
# Fill the gaps with the column means. numeric_only=True restricts the means to numeric
# columns: the frame also carries non-numeric data (dyskinesiaScore contains the string
# 'Score'), for which no mean exists and which pandas 2.x raises on instead of skipping.
fullDF = fullDF.fillna(fullDF.mean(numeric_only=True))

In [32]:
# Row count after the left merge onto the submission template.
fullDF.shape[0]

2143

In [33]:
# Write the id column plus the selected features, one CSV file per selected feature list.
for _outPath, _selected in (('../featureDB/final_dysk_10.csv', featImp10),
                            ('../featureDB/final_dysk_30.csv', featImp30)):
    fullDF[['dataFileHandleId'] + _selected].to_csv(_outPath, sep=',', index=False)