In [2]:
from functions import *

Read in the file of ensemble predictions. For each survey, predictions are generated by the following:
- Bayesian Ridge Regression
- Elastic Net
- Linear Regression
- Random Forest

We then create a CSV file for each state in which each column holds the predictions of one of these classifiers. Each file also includes a column with the unweighted average of the four classifiers' predictions.

In [4]:
# Long-format file holding every classifier's predictions for every state.
infile = '../Data (Algebra 1)/YearEnsemblePredictions.csv'

# Affective states (constructs) that get their own per-state CSV.
states = [
    'Happiness', 'Frustration', 'Confusion', 'Hopefulness', 'Contentment',
    'Disappointment', 'Relief', 'Pride', 'Pleasantness', 'Anxiety',
    'Engagement', 'Interest', 'Sadness', 'Mind Wandering', 'Boredom',
    'Arousal', 'Curiosity', 'Surprise',
]

# Classifier labels as they appear in the 'Classifier' column of the predictions file.
classifiers = [
    'Bayes Ridge',
    'Elastic Net',
    'Linear Regression',
    'Random Forest',
]


In [3]:
ensemblePreds = readData(infile)

for state in states:
    # One wide CSV per state: a column per classifier plus their plain mean.
    outfile = '../Data (Algebra 1)/State data/'+state+' ensemble predictions.csv'
    statedf = ensemblePreds[ensemblePreds['Construct'] == state]

    # One row per distinct survey, one column per classifier.
    nSurveys = len(np.unique(statedf['survey_id'].values))
    data = np.zeros((nSurveys, len(classifiers)))

    for col, name in enumerate(classifiers):
        classifydf = statedf[statedf['Classifier'] == name]
        # NOTE(review): assumes every classifier lists the surveys in the same
        # order, with exactly one prediction per survey -- confirm against the input file.
        data[:, col] = classifydf['Prediction'].values

    # The row labels are taken from the rows of the last classifier processed.
    outdf = pd.DataFrame(data=data, index=classifydf.index, columns=classifiers)
    outdf['Average'] = outdf.mean(axis=1)
    outdf.to_csv(outfile)

We then want to calculate a weighted average score. This is done by calculating the correlation between each classifier's predictions and the ground-truth labels, and using these correlations as the weights.

In [4]:
surveyfile = '../Data (Algebra 1)/YearSurvey.csv'
surveydf = readData(surveyfile)

# Columns of the per-state CSV that take part in the weighted average.
weightedColumns = ['Bayes Ridge', 'Linear Regression', 'Random Forest', 'Elastic Net']

for state in states:
    statefile = '../Data (Algebra 1)/State data/'+state+' ensemble predictions.csv'
    preds = readData(statefile)

    _, trueLabels = filterByState(state,surveydf)

    # Each classifier is weighted by the correlation between its predictions and
    # the true labels. A correlation that is not strictly positive (negative,
    # zero or NaN) is replaced by a weight of 0 so that classifier is left out.
    # The previous code assigned the clamp to differently-cased names
    # (bcor, lcor, ...), so negative weights were never actually zeroed.
    weights = {}
    for name in weightedColumns:
        cor = evaluateSpearman(trueLabels.values, preds[name].values)
        weights[name] = cor if cor > 0 else 0
    totalWeight = sum(weights.values())

    if totalWeight > 0:
        weightedSum = sum(weights[name]*preds[name] for name in weightedColumns)
        preds['Weighted Average'] = weightedSum/totalWeight
    else:
        # Every weight is zero, so the weighted mean would be 0/0; use the
        # unweighted mean of the same columns instead.
        preds['Weighted Average'] = preds[weightedColumns].mean(axis=1)

    # The per-state file is overwritten in place with the extra column.
    preds.to_csv(statefile)

Lastly, we want to find the correlation with the true labels for each individual classifier, as well as for the unweighted average and the weighted average.

In [6]:
# Prediction columns to score, in the order they appear in the result table.
scoreColumns = ['Bayes Ridge','Elastic Net', 'Random Forest',
                'Linear Regression','Average','Weighted Average']

# One row per state, one column per entry of scoreColumns.
scores = np.zeros((len(states),6))
surveyfile = '../Data (Algebra 1)/YearSurvey.csv'
surveydf = readData(surveyfile)

for row, state in enumerate(states):
    statefile = '../Data (Algebra 1)/State data/'+state+' ensemble predictions.csv'
    preds = readData(statefile)

    _, trueLabels = filterByState(state,surveydf)

    # Score every prediction column against the same true labels.
    for col, name in enumerate(scoreColumns):
        scores[row,col] = evaluateSpearman(trueLabels.values,preds[name].values)

ensembleScores = pd.DataFrame(data=scores, index=states, columns=scoreColumns)

In [29]:
# Column-wise mean of the scores over all states, one value per score column.
stateMean = ensembleScores.mean(axis='index')
# Named copy of the means; the name becomes the row label when it is added to the table.
toadd = stateMean.copy()
toadd.name = 'Classifier mean'

In [31]:
# Show the per-state scores with the mean row added at the bottom, rounded to 2 decimals.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
pd.concat([ensembleScores, toadd.to_frame().T]).round(2)

Unnamed: 0,Bayes Ridge,Elastic Net,Random Forest,Linear Regression,Average,Weighted Average
Happiness,0.28,0.24,0.21,0.28,0.26,0.27
Frustration,0.26,0.17,0.17,0.25,0.24,0.24
Confusion,0.28,0.16,0.19,0.27,0.27,0.27
Hopefulness,0.26,0.23,0.2,0.26,0.26,0.26
Contentment,0.28,0.19,0.23,0.27,0.28,0.28
Disappointment,0.28,0.21,0.22,0.27,0.27,0.28
Relief,0.25,0.1,0.18,0.25,0.24,0.25
Pride,0.26,0.15,0.18,0.26,0.24,0.25
Pleasantness,0.24,0.2,0.18,0.24,0.23,0.23
Anxiety,0.2,0.05,0.12,0.2,0.18,0.19


In [32]:
# Save the per-state scores together with the mean row.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
pd.concat([ensembleScores, toadd.to_frame().T]).to_csv('../Data (Algebra 1)/Scores ensemble.csv')