Import functions from other modules

In [1]:
from functions import *

%matplotlib inline

In [2]:
activityFile = '../Data (Algebra 1)/YearUsageActivity.csv'
surveyFile = '../Data (Algebra 1)/YearSurvey.csv'
states = ['Happiness','Frustration','Confusion','Hopefulness','Contentment',
         'Disappointment','Relief','Pride','Pleasantness','Anxiety','Engagement',
         'Interest','Sadness','Mind Wandering','Boredom','Arousal',
         'Curiosity','Surprise']
bayes = [0.28,0.27,0.28,0.27,0.29,0.28,0.26,0.28,0.25,0.20,0.21,0.20,0.17,0.16,
        0.13,0.12,0.09,0.08]
nClusters = 1
clusterAlgorithm = KMeans(n_clusters = nClusters)
nFolds = 10
replace = True

Process data

In [3]:
# load data
activitydf = readData(activityFile)
surveydf = readData(surveyFile)
# remove unnecessary features
surveydf = dropColumns(surveydf,['survey_id','time_window','question_id'])
# normalize activity data
activitydf = replaceDataFrameOutliers(activitydf,['sessionTime','NumSessions'])
activitydf, scalers = scaleDataframe(activitydf)

In [4]:
list(surveydf)

['survey_answer',
 'bio_video_watch',
 'karma_awarded',
 'leaderboard_load',
 'personal_profile_picture',
 'tys_answer',
 'tys_finish',
 'tys_load',
 'tys_previous',
 'tys_review_correct_question',
 'tys_review_incorrect_question',
 'tys_review_solution_video',
 'tys_review_topic_video',
 'tys_unload',
 'video_caption',
 'video_completed',
 'video_pause',
 'video_play',
 'video_seek',
 'video_watch',
 'wall_load_more',
 'wall_make_post',
 'wall_page_load',
 'survey_question']

In [None]:
activitydf.to_csv('../Data (Algebra 1)/Processed Usage Features.csv')

In [None]:
reverseScale(activitydf,scalers).to_csv('../Data (Algebra 1)/Unscaled Usage Features.csv')

Replicate Stephen's results by only using one cluster and training one classifier for each affective state.

In [4]:
scores = {}
clusterSeries, _ = cluster(clusterAlgorithm,activitydf)
for state in states:
    print(state)
    featuredf, surveySeries = filterByState(state,surveydf)
    featureDict, labelDict, groupDict = splitClusters(featuredf,surveySeries,clusterSeries)
    
    performance = np.zeros((len(featureDict),nFolds))
    kfold = GroupKFold(n_splits = nFolds)
    # do cross-validation
    for c in featureDict:
        features = featureDict[c]
        labels = labelDict[c]
        groups = groupDict[c]
        for i, (train, test) in enumerate(kfold.split(features,labels,groups)):
            predictions, model = classify(features[train],labels[train],features[test])
            rho = evaluateSpearman(predictions, labels[test])
            performance[c,i] = rho
    # key for each state, array nClusters x nFolds
    scores[state] = performance.reshape((nFolds,))

Happiness
Frustration
Confusion
Hopefulness
Contentment
Disappointment
Relief
Pride
Pleasantness
Anxiety
Engagement
Interest
Sadness
Mind Wandering
Boredom
Arousal
Curiosity
Surprise


In [5]:
scoredf = dataframeFromDict(scores)

In [6]:
scoredf.to_csv('../Data (Algebra 1)/Scores No Clusters.csv')

In [7]:
comparison = pd.DataFrame(scoredf.mean(axis=1),columns=['No Cluster'])
comparison['Paper'] = bayes
comparison.round(2)

Unnamed: 0,No Cluster,Paper
Happiness,0.29,0.28
Frustration,0.28,0.27
Confusion,0.29,0.28
Hopefulness,0.27,0.27
Contentment,0.29,0.29
Disappointment,0.28,0.28
Relief,0.26,0.26
Pride,0.27,0.28
Pleasantness,0.25,0.25
Anxiety,0.2,0.2


Now use 5 clusters, train one model per state and predict each cluster separately.

In [None]:
# run this...
nClusters = 5
clusterAlgorithm = KMeans(n_clusters = nClusters)
clusterSeries, model = cluster(clusterAlgorithm, activitydf)
joblib.dump(model,'kmeans.joblib')

In [8]:
#... or this
nClusters = 5
model = joblib.load('kmeans.joblib')
clusterSeries = pd.Series(model.predict(activitydf.values),index=activitydf.index)

In [None]:
displayClusters(clusterSeries.values, model.cluster_centers_,list(activitydf))

In [None]:
model.cluster_centers_

In [None]:
centersdf = pd.DataFrame(data=model.cluster_centers_,columns=list(activitydf))
reverseScale(centersdf,scalers).round(2)

In [None]:
centersdf.to_csv('../Data (Algebra 1)/KMeans cluster centers.csv')

In [9]:
clusterScores = {}
for state in states:
    print(state)
    # get students who answered surveys on current state
    featuredf, surveySeries = filterByState(state,surveydf)
    # get activity data from just these students
    usagedf = filterByIndex(activitydf, surveySeries.index)
    featuredf = filterByIndex(featuredf,usagedf.index)
    surveySeries = filterByIndex(surveySeries,usagedf.index)
    # split into clusters
    stateClusterSeries = filterByIndex(clusterSeries,usagedf.index)
    featureDict, labelDict, groupDict = splitClusters(featuredf,surveySeries,stateClusterSeries)
    # convert training data to arrays
    features = featuredf.values
    labels = surveySeries.values
    groups = featuredf.index.values
    # set up training
    performance = np.zeros((nClusters,nFolds))
    kfold = GroupKFold(nFolds)
    
    for i, (train, test) in enumerate(kfold.split(features,labels,groups)):
        _, model = classify(features[train],labels[train],features[test])
        for c in labelDict:
            trueLabels = labelDict[c]
            predLabels = model.predict(featureDict[c])
            rho = evaluateSpearman(trueLabels, predLabels)
            performance[c,i] = rho
    clusterScores[state] = performance

Happiness
Frustration
Confusion
Hopefulness
Contentment
Disappointment
Relief
Pride
Pleasantness
Anxiety
Engagement
Interest
Sadness
Mind Wandering
Boredom
Arousal
Curiosity
Surprise


In [10]:
for state in states:
    exportname = '../Data (Algebra 1)/'+state+' train on all scores.csv'
    pd.DataFrame(clusterScores[state]).to_csv(exportname)

In [11]:
comparison['Trained on all data'] = pd.Series(np.array([np.nanmean(clusterScores[state]) for state in clusterScores.keys()]),index=comparison.index)
comparison.round(2)

Unnamed: 0,No Cluster,Paper,Trained on all data
Happiness,0.29,0.28,0.28
Frustration,0.28,0.27,0.27
Confusion,0.29,0.28,0.27
Hopefulness,0.27,0.27,0.32
Contentment,0.29,0.29,0.28
Disappointment,0.28,0.28,0.28
Relief,0.26,0.26,0.27
Pride,0.27,0.28,0.28
Pleasantness,0.25,0.25,0.24
Anxiety,0.2,0.2,0.21


Now want to train one model per cluster per state

In [12]:
stateClusterScores = {}
for state in states:
    print(state)
    # get students who answered surveys on current state
    featuredf, surveySeries = filterByState(state,surveydf)
    # get activity data from just these students
    usagedf = filterByIndex(activitydf, surveySeries.index)
    featuredf = filterByIndex(featuredf,usagedf.index)
    surveySeries = filterByIndex(surveySeries,usagedf.index)
    # split into clusters
    stateClusterSeries = filterByIndex(clusterSeries,usagedf.index)
    featureDict, labelDict, groupDict = splitClusters(featuredf,surveySeries,stateClusterSeries)
    # set up training
    performance = np.zeros((nClusters,nFolds))
    kfold = KFold(nFolds)
    
    for c in labelDict:
        features = featureDict[c]
        labels = labelDict[c]
        groups = groupDict[c]
        for i, (train, test) in enumerate(kfold.split(features,labels,groups)):
            predLabels, _ = classify(features[train],labels[train],features[test])
            trueLabels = labels[test]
            rho = evaluateSpearman(trueLabels, predLabels)
            performance[c,i] = rho
    stateClusterScores[state] = performance

Happiness
Frustration
Confusion
Hopefulness
Contentment
Disappointment
Relief
Pride
Pleasantness
Anxiety
Engagement
Interest
Sadness
Mind Wandering
Boredom
Arousal
Curiosity
Surprise


In [13]:
for state in states:
    exportname = '../Data (Algebra 1)/'+state+' train on clusters scores.csv'
    pd.DataFrame(stateClusterScores[state]).to_csv(exportname)

In [15]:
comparison['Trained on clusters'] = pd.Series(np.array([np.mean(stateClusterScores[state]) for state in clusterScores.keys()]),index=comparison.index)
comparison.round(2)

Unnamed: 0,No Cluster,Paper,Trained on all data,Trained on clusters
Happiness,0.29,0.28,0.28,0.28
Frustration,0.28,0.27,0.27,0.25
Confusion,0.29,0.28,0.27,0.25
Hopefulness,0.27,0.27,0.32,0.3
Contentment,0.29,0.29,0.28,0.26
Disappointment,0.28,0.28,0.28,0.23
Relief,0.26,0.26,0.27,0.27
Pride,0.27,0.28,0.28,0.27
Pleasantness,0.25,0.25,0.24,0.23
Anxiety,0.2,0.2,0.21,0.16


In [None]:
pd.DataFrame(comparison.mean(),columns=['Mean correlation']).round(2)

In [None]:
d = dropColumns(comparison,['No Cluster','Paper'])
avg = d.mean(axis=0)
avg.name = 'Mean'
d = d.append(avg)
d.round(2)

Get mean survey response for each cluster for each state. Use clusters defined above.

In [None]:
counts, means = surveySummary(states, nClusters, surveydf, clusterSeries)

In [None]:
countdf = pd.DataFrame(counts,index=states)
countdf

In [None]:
meandf = pd.DataFrame(means,index=states).round(2)
meandf

In [None]:
countdf.to_csv('../Data (Algebra 1)/survey counts.csv')
meandf.to_csv('../Data (Algebra 1)/survey means.csv')

In [None]:
for state in states:
    exportname = '../Data (Algebra 1)/'+state+' student responses.csv'
    _, surveySeries = filterByState(state, surveydf)
    surveyVals = surveySeries.values
    clusterVals = filterByIndex(clusterSeries, surveySeries.index)
    exportdf = pd.DataFrame({'response':surveyVals, 'cluster':clusterVals}, index = surveySeries.index)
    exportdf.to_csv(exportname)

Difference in means for the following:
- Arousal (p = 0.013, F = 3.16)
- Boredom (p = 0.017, F = 3.02)
- Confusion (p = 0, F = 8.52)
- Contentment (p = 0.00028, F = 5.32)
- Curiosity (p = 0.013, F = 3.19)
- Disappointment (p = 0.017, F = 3.19)
- Frustration (p = 0.025, F = 1.38)
- Happiness (p = 0.00029, F = 5.31)
- Hopefulness (p = 0.021, F = 2.88)
- Interest (p = 0.033, F = 2.62)
- Mind Wandering (p = 0.00034, F = 5.22)
- Pleasantness (p = 0, F = 11.1)
- Pride (p = 0.04, F = 2.51)
- Relief (p = 0.00014, F = 5.73)
- Sadness (p = 0.00025, F = 5.39)
- Surprise (p = 0, F = 6.93)

No difference in means for the following:
- Anxiety
- Engagement

Compare personalized and general models with same sample size. Use the smallest cluster size for training. Only do positive/negative instead of each affective state.

In [5]:
featuredfP = pd.concat([surveydf.loc[surveydf['survey_question']=='Happiness'],surveydf.loc[surveydf['survey_question']=='Hopefulness'],
                       surveydf.loc[surveydf['survey_question']=='Contentment'],surveydf.loc[surveydf['survey_question']=='Relief'],
                       surveydf.loc[surveydf['survey_question']=='Pride'],surveydf.loc[surveydf['survey_question']=='Pleasantness'],
                       surveydf.loc[surveydf['survey_question']=='Interest'],surveydf.loc[surveydf['survey_question']=='Arousal'],
                       surveydf.loc[surveydf['survey_question']=='Engagement']]).sort_index()
featuredfN = pd.concat([surveydf.loc[surveydf['survey_question']=='Frustration'],surveydf.loc[surveydf['survey_question']=='Confusion'],
                       surveydf.loc[surveydf['survey_question']=='Disappointment'],surveydf.loc[surveydf['survey_question']=='Anxiety'],
                       surveydf.loc[surveydf['survey_question']=='Sadness'],surveydf.loc[surveydf['survey_question']=='Mind Wandering'],
                       surveydf.loc[surveydf['survey_question']=='Boredom']]).sort_index()

In [6]:
featuredfA = surveydf

labelSeriesP = featuredfP['survey_answer']
featuredfP = dropColumns(featuredfP,['survey_answer','survey_question'])

labelSeriesN = featuredfN['survey_answer']
featuredfN = dropColumns(featuredfN,['survey_answer','survey_question'])

labelSeriesA = featuredfA['survey_answer']
featuredfA = dropColumns(featuredfA,['survey_answer','survey_question'])

In [7]:
clustersP = filterByIndex(clusterSeries,featuredfP.index)
clustersN = filterByIndex(clusterSeries,featuredfN.index)
clustersA = filterByIndex(clusterSeries,featuredfA.index)

In [8]:
featureDictP, labelDictP, groupDictP = splitClusters(featuredfP,labelSeriesP,clustersP)
featureDictN, labelDictN, groupDictN = splitClusters(featuredfN,labelSeriesN,clustersN)
featureDictA, labelDictA, groupDictA = splitClusters(featuredfA,labelSeriesA,clustersA)

In [17]:
size = 1880
kfold = GroupKFold(n_splits = nFolds)
#(gen/personalized, cluster, pos/neg)
scoresP = np.zeros((2,5))
scoresN = scoresP.copy()

testFeatures = [featureDictP,featureDictN]
testLabels = [labelDictP,labelDictN]
testGroups = [groupDictP,groupDictN]
scores = [scoresP,scoresN]

In [18]:
# train general model
trainFeatures = [featuredfP,featuredfN]
trainLabels = [labelSeriesP,labelSeriesN]
# for positive and negative
for s in range(len(trainFeatures)):
    features = trainFeatures[s]
    groups = features.index.values
    features = features.values
    labels = trainLabels[s].values
    sample = np.random.choice(np.arange(len(labels)),size)
    features = features[sample]
    labels = labels[sample]
    groups = groups[sample]
    
    temp = np.zeros((nFolds,5))
    for i, (train, test) in enumerate(kfold.split(features,labels,groups)):
        # train a model
        _, model = classify(features[train],labels[train],features[test])
        # test on each cluster
        for cluster in testLabels[s]:
            predFeatures = testFeatures[s][cluster]
            predLabels = testLabels[s][cluster]
            preds = model.predict(predFeatures)
            rho = evaluateSpearman(predLabels,preds)
            temp[i,cluster] = rho
    scores[s][0,:] = np.mean(temp,axis=0)

In [34]:
# clusters
trainFeatures = testFeatures
trainLabels = testLabels
trainGroups = [groupDictP,groupDictN]
# for positive and negative
for s in range(len(trainFeatures)):
    temp = np.zeros((nFolds,5))
    # train a model for each cluster
    for cluster in testLabels[s]:
        features = trainFeatures[s][cluster]
        labels = trainLabels[s][cluster]
        groups = trainGroups[s][cluster]
        sample = np.random.choice(np.arange(len(labels)),size)
        features = features[sample]
        labels = labels[sample]
        groups = groups[sample]
        
        for i, (train, test) in enumerate(kfold.split(features,labels,groups)):
            # train a model
            _, model = classify(features[train],labels[train],features[test])
            predFeatures = testFeatures[s][cluster]
            predLabels = testLabels[s][cluster]
            preds = model.predict(predFeatures)
            rho = evaluateSpearman(predLabels,preds)
            temp[i,cluster] = rho
    scores[s][1,:] = np.mean(temp,axis=0)

In [35]:
dfP = pd.DataFrame(data=scores[0],index=['General','Personalized'])
dfN = pd.DataFrame(data=scores[1],index=['General','Personalized'])

In [38]:
dfP['Model Mean'] = dfP.mean(axis=1)
dfN['Model Mean'] = dfN.mean(axis=1)

In [40]:
dfN.round(2)

Unnamed: 0,0,1,2,3,4,Model Mean
General,0.18,0.19,0.19,0.17,0.17,0.18
Personalized,0.18,0.2,0.18,0.17,0.17,0.18
