In [1]:
import os

# Base directory of the GitHub repo checkout. Set the PARKINSON_BASE_DIR
# environment variable to run the notebook from another location; the original
# author's path stays as the fallback, so existing setups behave as before.
baseDIR = os.environ.get('PARKINSON_BASE_DIR', '/home/pataki/synapse/gitParkinson')

In [2]:
# Work from the featureSelectors folder of the repo: the relative paths used in
# the rest of the notebook ('../src/...', '../sub2.1_tremor/...') start from here.
cd $baseDIR/featureSelectors/

/home/pataki/synapse/gitParkinson/featureSelectors


In [3]:
# Execute the shared script inside this notebook's namespace (-i). The notebook
# has no import cell of its own, so names used below such as np, pd,
# RandomForestClassifier and StandardScaler presumably come from this script or
# from helperFuncs.py -- TODO confirm.
%run -i ../src/sc2FitModels.py

Welcome, Balint Armin Pataki!



In [4]:
def score(phenotype, trainX, trainY, testX, testY):
    """Train an ensemble on the training split and score it on the test split.

    The ensemble comes from train_ensemble(trainX, trainY) and is evaluated with
    getNonLinearInterpAupr over the class indices
    0 .. len(CATEGORY_WEIGHTS[phenotype]) - 1.

    Returns a tuple (weighted_aupr, y_score, y_true). For the 'tremor'
    phenotype weighted_aupr is getWeightedMean(phenotype, results); for any
    other phenotype it is the first entry of the results.
    """
    model = train_ensemble(trainX, trainY)
    class_indices = np.arange(len(CATEGORY_WEIGHTS[phenotype]))
    results, y_score, y_true = getNonLinearInterpAupr(testX, testY, class_indices, model)

    # Only the tremor phenotype combines the per-class results into a weighted mean.
    weighted_aupr = getWeightedMean(phenotype, results) if phenotype == 'tremor' else results[0]
    return weighted_aupr, y_score, y_true

In [5]:
def getFeatureImportances(clf, DF):
    """Pair the fitted classifier's feature importances with DF's column names.

    Returns a DataFrame with the columns 'imp' and 'feature', ordered from the
    most to the least important feature.
    """
    featureNames = DF.columns.tolist()
    importanceTable = pd.DataFrame({'imp': clf.feature_importances_,
                                    'feature': featureNames})
    return importanceTable.sort_values(by='imp', ascending=False)

In [6]:
def _splitFeaturesAndTarget(featureDB, rowMask, features, targetCol):
    """Return (X, y) for the rows of featureDB selected by the boolean rowMask."""
    subset = featureDB[rowMask]
    return subset[features], subset[targetCol]


def featureEvaluator(classifier, features, featureDB, repeatNum = 5, baseSeed = 4242,
                     testPatientNum = 5, importance = True, targetCol = 'tremorScore'):
    """Evaluate a feature set with repeated patient-wise hold-out splits.

    In each of the repeatNum repetitions, testPatientNum patients are drawn
    (seed baseSeed + 137*i) and all their rows form the test set. The classifier
    is fitted on the remaining rows and scored with nonLinearInterpAupr on the
    predicted probability of the second class. Two lists are printed: the score
    per repetition, and the positive rate of each test set (the score expected
    from random guessing).

    Parameters
    ----------
    classifier : object with fit / predict_proba (and feature_importances_ when
        importance is True); it is refitted in place in every repetition.
    features : list of column names of featureDB used as model input.
    featureDB : DataFrame holding the features plus the 'patient' column and
        the label column.
    repeatNum, baseSeed, testPatientNum : number of splits, base random seed
        and number of held-out patients per split.
    importance : when True, return the feature-importance table.
    targetCol : name of the binary label column (default 'tremorScore').

    Returns
    -------
    The getFeatureImportances table of the classifier as left by the last fit
    when importance is True, otherwise None.

    Raises
    ------
    KeyError if features contains the label, 'patient' or 'dataFileHandleId'
    column; these are never valid model inputs.
    """
    features = list(features)
    reserved = {targetCol, 'patient', 'dataFileHandleId'}.intersection(features)
    if reserved:
        raise KeyError('non-feature columns requested as features: %s' % sorted(reserved))

    prAUClist = []
    basePRauc = []
    allPatients = pd.unique(featureDB.patient)  # loop-invariant
    for i in pb(range(repeatNum)):
        rndState = np.random.RandomState(seed=baseSeed+137*i)
        testPatients = list(rndState.choice(allPatients, testPatientNum, replace=False))
        isTest = featureDB.patient.isin(testPatients)

        trainX, trainY = _splitFeaturesAndTarget(featureDB, ~isTest, features, targetCol)
        testX, testY = _splitFeaturesAndTarget(featureDB, isTest, features, targetCol)

        classifier.fit(trainX, list(trainY))

        positiveProba = classifier.predict_proba(testX).T[1]
        prAUClist.append(nonLinearInterpAupr(y_true=list(testY), y_score=positiveProba)[0])
        basePRauc.append(sum(testY)/len(testY)) # score for random guessing

    print([ '%.3f' % elem[0] for elem in prAUClist])
    print([ '%.3f' % elem    for elem in basePRauc])

    if importance:
        # Only the column names are needed here, so take them from `features`
        # instead of relying on a variable that exists only after a loop pass.
        return getFeatureImportances(classifier, pd.DataFrame(columns=features))

In [7]:
# Run the project helper script in the notebook namespace (-i); presumably the
# source of mainDFCreator / mainDFtrimmer used in the next cell -- TODO confirm.
%run -i ../src/helperFuncs.py

In [8]:
# Build the label table for the 'tremorScore' target with the project helper.
mainDF = mainDFCreator('tremorScore')

# Trim with fileMinLen=1 and plotting disabled, then keep only the recording id,
# the label and the patient id; the feature tables are merged onto this frame
# later via 'dataFileHandleId'.
mainDF = mainDFtrimmer(mainDF, fileMinLen=1, plot=False)
mainDF = mainDF[['dataFileHandleId', 'tremorScore', 'patient']]

Train shape: (3667, 12)
Test shape:  (1500, 12)
Merged:      (5167, 12)
Remained shape: (5005, 14)


In [9]:
def _loadFeatureTable(fileName):
    """Read one tab-separated feature table from ../sub2.1_tremor/featureDB/.

    Returns (table, featureNames), where featureNames lists every column except
    the 'dataFileHandleId' key, in file order. Keeping file order (instead of
    going through a set) makes the feature order identical in every kernel
    session; set iteration order of strings changes with hash randomisation.
    """
    table = pd.read_csv('../sub2.1_tremor/featureDB/' + fileName, sep='\t')
    featureNames = [col for col in table.columns.tolist() if col != 'dataFileHandleId']
    return table, featureNames


baseDF,      baseFeatures      = _loadFeatureTable('baseFeatures.tsv')
empiricalDF, empiricalFeatures = _loadFeatureTable('empiricalFeature.tsv')
fourierDF,   fourierFeatures   = _loadFeatureTable('fourierFeatures.tsv')
rangeStdDF,  rangeStdFeatures  = _loadFeatureTable('rangeStdFeatures.tsv')
tsfreshDF,   tsfreshFeatures   = _loadFeatureTable('tsFresh_fillNA_dropConstant.tsv')
autocorrDF,  autocorrFeatures  = _loadFeatureTable('autoCorrFeatures.tsv')

In [10]:
# Inner-join every feature table onto the label table by recording id, in the
# same order as they were loaded.
for featureTable in (baseDF, empiricalDF, fourierDF, rangeStdDF, tsfreshDF, autocorrDF):
    mainDF = pd.merge(mainDF, featureTable, on = 'dataFileHandleId', how='inner')

In [11]:
# Scale a deep copy, so the merged frame object itself is left untouched.
allDF = mainDF.copy(deep=True)
# Standardise every column from position 3 onwards. The first three columns are
# skipped -- presumably dataFileHandleId / tremorScore / patient, as selected
# when mainDF was built (TODO confirm the merges keep them first).
# The scaler is fitted on all rows, i.e. before the 'Score' rows are dropped below.
allDF[allDF.columns.tolist()[3:]] = StandardScaler().fit_transform(allDF[allDF.columns.tolist()[3:]])
# Keep only rows whose tremorScore is not the literal string 'Score'
# (presumably the unlabeled submission rows -- TODO confirm). allDF keeps every row.
mainDF = allDF[allDF.tremorScore != 'Score']
mainDF.shape

(3545, 3015)

In [12]:
# Independent copy of the filtered table with the original multi-level tremorScore
# (DataFrame.copy is deep by default).
score01234df = mainDF.copy()

In [13]:
def _oneVsRest(multiClassDF, positiveScore):
    """Return a deep copy of multiClassDF whose tremorScore is 1 where the
    original score equals positiveScore and 0 everywhere else."""
    binaryDF = multiClassDF.copy(deep=True)
    binaryDF['tremorScore'] = [int(value == positiveScore) for value in binaryDF.tremorScore]
    return binaryDF


# One binary ("score k vs. rest") table per tremor score level.
score0df = _oneVsRest(score01234df, 0)
score1df = _oneVsRest(score01234df, 1)
score2df = _oneVsRest(score01234df, 2)
score3df = _oneVsRest(score01234df, 3)
score4df = _oneVsRest(score01234df, 4)

## Check scores

In [14]:
clf = RandomForestClassifier(n_estimators=500, n_jobs=30, random_state=42)

# Base features only: one-vs-rest evaluation for scores 0, 1 and 2, in that
# order. The same clf object is refitted inside every call.
featImp0, featImp1, featImp2 = [
    featureEvaluator(clf, baseFeatures, binaryDF)
    for binaryDF in (score0df, score1df, score2df)
]

100%|██████████| 5/5 [00:32<00:00,  6.32s/it]


['0.872', '0.963', '0.845', '0.844', '0.793']
['0.668', '0.820', '0.584', '0.557', '0.521']


100%|██████████| 5/5 [00:31<00:00,  6.20s/it]


['0.468', '0.279', '0.547', '0.551', '0.608']
['0.280', '0.097', '0.306', '0.326', '0.377']


100%|██████████| 5/5 [00:21<00:00,  4.27s/it]


['0.139', '0.775', '0.448', '0.306', '0.246']
['0.052', '0.083', '0.110', '0.117', '0.102']


In [15]:
clf = RandomForestClassifier(n_estimators=500, n_jobs=30, random_state=42)

# All six feature groups together: one-vs-rest evaluation for scores 0, 1 and 2.
featImp0, featImp1, featImp2 = [
    featureEvaluator(
        clf,
        baseFeatures + empiricalFeatures + fourierFeatures
        + rangeStdFeatures + tsfreshFeatures + autocorrFeatures,
        binaryDF,
    )
    for binaryDF in (score0df, score1df, score2df)
]

100%|██████████| 5/5 [00:38<00:00,  7.86s/it]


['0.884', '0.971', '0.904', '0.872', '0.865']
['0.668', '0.820', '0.584', '0.557', '0.521']


100%|██████████| 5/5 [00:35<00:00,  7.21s/it]


['0.459', '0.360', '0.560', '0.575', '0.563']
['0.280', '0.097', '0.306', '0.326', '0.377']


100%|██████████| 5/5 [00:34<00:00,  6.95s/it]


['0.080', '0.530', '0.545', '0.515', '0.454']
['0.052', '0.083', '0.110', '0.117', '0.102']


In [16]:
# Union of the top-ranked features of the three one-vs-rest models: score 0
# contributes its top N features, scores 1 and 2 contribute their top N/2 each.
# The unions are sorted so that the feature order (and therefore the forest
# fitted on it) is the same in every kernel session; plain list(set) order
# changes with string hash randomisation.
impFeat400_0 = set(featImp0.head(400).feature.tolist())
impFeat400_1 = set(featImp1.head(200).feature.tolist())
impFeat400_2 = set(featImp2.head(200).feature.tolist())
impFeat400   = sorted(impFeat400_0.union(impFeat400_1).union(impFeat400_2))

impFeat200_0 = set(featImp0.head(200).feature.tolist())
impFeat200_1 = set(featImp1.head(100).feature.tolist())
impFeat200_2 = set(featImp2.head(100).feature.tolist())
impFeat200   = sorted(impFeat200_0.union(impFeat200_1).union(impFeat200_2))

In [17]:
clf = RandomForestClassifier(n_estimators=500, n_jobs=30, random_state=42)

# Re-evaluate with the impFeat400 subset; the returned importances feed the
# next, smaller selection.
featImp0, featImp1, featImp2 = [
    featureEvaluator(clf, impFeat400, binaryDF)
    for binaryDF in (score0df, score1df, score2df)
]

100%|██████████| 5/5 [00:34<00:00,  6.97s/it]


['0.900', '0.980', '0.910', '0.886', '0.867']
['0.668', '0.820', '0.584', '0.557', '0.521']


100%|██████████| 5/5 [00:29<00:00,  5.86s/it]


['0.526', '0.386', '0.556', '0.596', '0.599']
['0.280', '0.097', '0.306', '0.326', '0.377']


100%|██████████| 5/5 [00:25<00:00,  5.19s/it]


['0.080', '0.633', '0.521', '0.514', '0.456']
['0.052', '0.083', '0.110', '0.117', '0.102']


In [18]:
clf = RandomForestClassifier(n_estimators=500, n_jobs=30, random_state=42)

# Re-evaluate with the impFeat200 subset (scores 0, 1 and 2 vs. rest).
featImp0, featImp1, featImp2 = [
    featureEvaluator(clf, impFeat200, binaryDF)
    for binaryDF in (score0df, score1df, score2df)
]

100%|██████████| 5/5 [00:34<00:00,  7.04s/it]


['0.912', '0.980', '0.905', '0.893', '0.872']
['0.668', '0.820', '0.584', '0.557', '0.521']


100%|██████████| 5/5 [00:28<00:00,  5.78s/it]


['0.555', '0.396', '0.572', '0.597', '0.611']
['0.280', '0.097', '0.306', '0.326', '0.377']


100%|██████████| 5/5 [00:23<00:00,  4.86s/it]


['0.085', '0.670', '0.514', '0.494', '0.428']
['0.052', '0.083', '0.110', '0.117', '0.102']


In [19]:
# Next selection round, based on the importances from the preceding evaluation:
# top N features for score 0 plus top N/2 for scores 1 and 2. Sorted so the
# feature order is identical in every kernel session (list(set) order is not).
impFeat100_0 = set(featImp0.head(100).feature.tolist())
impFeat100_1 = set(featImp1.head(50).feature.tolist())
impFeat100_2 = set(featImp2.head(50).feature.tolist())
impFeat100   = sorted(impFeat100_0.union(impFeat100_1).union(impFeat100_2))

impFeat50_0 = set(featImp0.head(50).feature.tolist())
impFeat50_1 = set(featImp1.head(25).feature.tolist())
impFeat50_2 = set(featImp2.head(25).feature.tolist())
impFeat50   = sorted(impFeat50_0.union(impFeat50_1).union(impFeat50_2))

In [20]:
clf = RandomForestClassifier(n_estimators=500, n_jobs=30, random_state=42)

# Re-evaluate with the impFeat100 subset (scores 0, 1 and 2 vs. rest).
featImp0, featImp1, featImp2 = [
    featureEvaluator(clf, impFeat100, binaryDF)
    for binaryDF in (score0df, score1df, score2df)
]

100%|██████████| 5/5 [00:33<00:00,  6.73s/it]


['0.914', '0.979', '0.903', '0.899', '0.871']
['0.668', '0.820', '0.584', '0.557', '0.521']


100%|██████████| 5/5 [00:28<00:00,  5.63s/it]


['0.567', '0.400', '0.575', '0.595', '0.630']
['0.280', '0.097', '0.306', '0.326', '0.377']


100%|██████████| 5/5 [00:22<00:00,  4.76s/it]


['0.094', '0.720', '0.499', '0.484', '0.423']
['0.052', '0.083', '0.110', '0.117', '0.102']


In [21]:
clf = RandomForestClassifier(n_estimators=500, n_jobs=30, random_state=42)

# Re-evaluate with the impFeat50 subset; these importances drive the final
# 30-feature selection.
featImp0, featImp1, featImp2 = [
    featureEvaluator(clf, impFeat50, binaryDF)
    for binaryDF in (score0df, score1df, score2df)
]

100%|██████████| 5/5 [00:32<00:00,  6.71s/it]


['0.914', '0.981', '0.917', '0.899', '0.878']
['0.668', '0.820', '0.584', '0.557', '0.521']


100%|██████████| 5/5 [00:28<00:00,  5.78s/it]


['0.584', '0.423', '0.599', '0.604', '0.627']
['0.280', '0.097', '0.306', '0.326', '0.377']


100%|██████████| 5/5 [00:23<00:00,  4.84s/it]


['0.110', '0.775', '0.507', '0.484', '0.437']
['0.052', '0.083', '0.110', '0.117', '0.102']


In [22]:
# Final selection round: top 30 features for score 0 plus top 15 each for
# scores 1 and 2. Sorted so the feature order (and the column order of any file
# written from it) is the same in every kernel session.
impFeat30_0   = set(featImp0.head(30).feature.tolist())
impFeat30_1   = set(featImp1.head(15).feature.tolist())
impFeat30_2   = set(featImp2.head(15).feature.tolist())
impFeat30 = sorted(impFeat30_0.union(impFeat30_1).union(impFeat30_2))

In [23]:
clf = RandomForestClassifier(n_estimators=500, n_jobs=30, random_state=42)

# Re-evaluate with the impFeat30 subset (scores 0, 1 and 2 vs. rest).
featImp0, featImp1, featImp2 = [
    featureEvaluator(clf, impFeat30, binaryDF)
    for binaryDF in (score0df, score1df, score2df)
]

100%|██████████| 5/5 [00:32<00:00,  6.67s/it]


['0.914', '0.979', '0.916', '0.898', '0.881']
['0.668', '0.820', '0.584', '0.557', '0.521']


100%|██████████| 5/5 [00:28<00:00,  5.76s/it]


['0.587', '0.458', '0.605', '0.605', '0.629']
['0.280', '0.097', '0.306', '0.326', '0.377']


100%|██████████| 5/5 [00:23<00:00,  4.88s/it]


['0.102', '0.801', '0.493', '0.479', '0.408']
['0.052', '0.083', '0.110', '0.117', '0.102']


In [24]:
# Only the recording ids of the submission template are needed.
subTemplate = pd.read_csv('../tremorSubmissionTemplate.csv', sep=',')
subTemplate = subTemplate[['dataFileHandleId']]

In [25]:
# Outer-join the submission template with the scaled feature table, so ids
# that appear in only one of the two are kept as well.
fullDF = pd.merge(subTemplate, allDF, how = 'outer', on = 'dataFileHandleId')
# Fill missing numeric values with the column mean. numeric_only=True keeps the
# mean restricted to numeric columns; without it, recent pandas raises a
# TypeError on string columns (tremorScore contains the string 'Score').
fullDF = fullDF.fillna(fullDF.mean(numeric_only=True))

In [26]:
# Row count of the table that is written out below.
fullDF.shape[0]

5005

In [28]:
# Write the two final feature tables. Each file name matches the feature list
# stored in it (the 30- and 50-feature lists used to be written to each other's files).
fullDF[['dataFileHandleId'] + impFeat30].to_csv('../featureDB/final_tremor_30.csv', sep=',', index=False)
fullDF[['dataFileHandleId'] + impFeat50].to_csv('../featureDB/final_tremor_50.csv', sep=',', index=False)