# Summarizing Results 

In this notebook we will summarize the results of the unsupervised LDA topic models. We will review the coherence scores and determine whether we need to train additional models. Finally, we will select results to visualize and summarize. 

**Print Ranked Coherence for Each Corpus:**

In [2]:
import pandas as pd 
import glob 

db_root = 'D:/Student_Voices_Database/s3mirror/'
dta_root = 'C:/Projects/VirtualMachines/Student_Voices/svvm/Student_Voices/student_voices/data'

# Get all the files with ranked coherence
ranked_coherence_files = glob.glob(db_root+'/results/ranked_coherence_*.csv')

# Read every file first and concatenate once: DataFrame.append was removed in
# pandas 2.0 and re-copies the accumulated frame on every iteration.
ranked_coherence_frames = [pd.read_csv(file) for file in ranked_coherence_files]

# pd.concat raises on an empty list, so fall back to an empty frame when no file matches.
# Row indices from each file are kept as-is (no ignore_index), matching the previous behavior.
ranked_coherence = pd.concat(ranked_coherence_frames) if ranked_coherence_frames else pd.DataFrame()

In [3]:
from IPython.display import HTML, display_html

# For each corpus (review score range), show the top 10 models ranked by average coherence
top_n = 10
for rng in list(ranked_coherence['range'].unique()):
    print('The top '+str(top_n)+' ave coherence scores for range '+str(rng))
    # Only models with fewer than 15 topics are considered in this ranking
    in_range = ranked_coherence['range']==rng
    under_15_topics = ranked_coherence['num_topics']<15
    ranked = ranked_coherence[in_range & under_15_topics].sort_values('ave_coherence_score', ascending=False)
    display_html(HTML(ranked.head(top_n).to_html()))

The top 10 ave coherence scores for range [0, 35)


Unnamed: 0.1,Unnamed: 0,num_topics,ave_coherence_score,range,setting,config
3,3,12,0.494107,"[0, 35)",LDA4,D1
5,5,12,0.491486,"[0, 35)",LDA2,D1
6,6,9,0.490155,"[0, 35)",LDA2,D1
3,3,12,0.488613,"[0, 35)",LDA1,E1
5,5,12,0.486553,"[0, 35)",LDA1,D1
1,1,12,0.484843,"[0, 35)",LDA2,A1
6,6,9,0.483085,"[0, 35)",LDA1,D1
5,5,12,0.482984,"[0, 35)",LDA1,A1
1,1,12,0.482551,"[0, 35)",LDA2,B1
5,5,9,0.480963,"[0, 35)",LDA1,B1


The top 10 ave coherence scores for range [35, 60)


Unnamed: 0.1,Unnamed: 0,num_topics,ave_coherence_score,range,setting,config
12,3,12,0.487868,"[35, 60)",LDA4,D1
14,5,9,0.485207,"[35, 60)",LDA1,D1
14,5,12,0.481115,"[35, 60)",LDA1,E1
15,6,12,0.480556,"[35, 60)",LDA1,D1
14,5,12,0.476382,"[35, 60)",LDA2,D1
11,2,12,0.47477,"[35, 60)",LDA4,A1
14,5,12,0.470782,"[35, 60)",LDA1,B1
14,5,9,0.467212,"[35, 60)",LDA4,D1
12,3,12,0.46705,"[35, 60)",LDA3,A1
13,4,12,0.466346,"[35, 60)",LDA1,A1


The top 10 ave coherence scores for range [60, 65)


Unnamed: 0.1,Unnamed: 0,num_topics,ave_coherence_score,range,setting,config
19,1,12,0.456216,"[60, 65)",LDA3,D1
21,3,12,0.450805,"[60, 65)",LDA1,B1
23,5,12,0.449455,"[60, 65)",LDA1,D1
24,6,9,0.447498,"[60, 65)",LDA1,D1
23,5,12,0.446917,"[60, 65)",LDA1,E1
23,5,12,0.446053,"[60, 65)",LDA2,D1
18,0,12,0.445722,"[60, 65)",LDA4,A1
24,6,9,0.443559,"[60, 65)",LDA1,E1
23,5,12,0.44234,"[60, 65)",LDA1,A1
20,2,12,0.441687,"[60, 65)",LDA2,E1


The top 10 ave coherence scores for range [65, 75)


Unnamed: 0.1,Unnamed: 0,num_topics,ave_coherence_score,range,setting,config
31,4,12,0.482225,"[65, 75)",LDA2,D1
33,6,9,0.473684,"[65, 75)",LDA2,D1
32,5,12,0.466591,"[65, 75)",LDA1,A1
32,5,12,0.464238,"[65, 75)",LDA1,D1
31,4,9,0.464088,"[65, 75)",LDA2,E1
32,5,9,0.462952,"[65, 75)",LDA3,D1
33,6,12,0.461505,"[65, 75)",LDA3,D1
32,5,12,0.461148,"[65, 75)",LDA1,E1
33,6,12,0.461057,"[65, 75)",LDA2,E1
27,0,12,0.460069,"[65, 75)",LDA3,B1


The top 10 ave coherence scores for range [75, 85)


Unnamed: 0.1,Unnamed: 0,num_topics,ave_coherence_score,range,setting,config
41,5,12,0.46053,"[75, 85)",LDA1,D1
36,0,12,0.456361,"[75, 85)",LDA4,D1
38,2,12,0.455921,"[75, 85)",LDA3,E1
41,5,12,0.452789,"[75, 85)",LDA1,A1
37,1,9,0.451288,"[75, 85)",LDA4,D1
41,5,12,0.448652,"[75, 85)",LDA1,B1
41,5,12,0.447857,"[75, 85)",LDA2,D1
40,4,12,0.446941,"[75, 85)",LDA3,D1
37,1,12,0.446372,"[75, 85)",LDA4,E1
40,4,12,0.445164,"[75, 85)",LDA2,E1


The top 10 ave coherence scores for range [85, 95)


Unnamed: 0.1,Unnamed: 0,num_topics,ave_coherence_score,range,setting,config
50,5,12,0.489282,"[85, 95)",LDA1,D1
50,5,12,0.482325,"[85, 95)",LDA3,D1
50,5,12,0.479773,"[85, 95)",LDA2,D1
51,6,9,0.475258,"[85, 95)",LDA3,D1
49,4,9,0.474961,"[85, 95)",LDA4,D1
51,6,6,0.474116,"[85, 95)",LDA2,D1
50,5,9,0.472118,"[85, 95)",LDA2,A1
52,7,9,0.472038,"[85, 95)",LDA2,D1
47,2,12,0.470911,"[85, 95)",LDA2,B1
48,3,12,0.470848,"[85, 95)",LDA1,E1


The top 10 ave coherence scores for range [95, 101)


Unnamed: 0.1,Unnamed: 0,num_topics,ave_coherence_score,range,setting,config
59,5,12,0.487157,"[95, 101)",LDA1,D1
60,6,9,0.475901,"[95, 101)",LDA1,D1
59,5,12,0.464439,"[95, 101)",LDA1,E1
59,5,12,0.46049,"[95, 101)",LDA1,A1
60,6,9,0.458667,"[95, 101)",LDA1,E1
57,3,12,0.455712,"[95, 101)",LDA3,B1
59,5,12,0.454805,"[95, 101)",LDA2,D1
59,5,12,0.454129,"[95, 101)",LDA1,B1
59,5,12,0.452067,"[95, 101)",LDA4,D1
59,5,9,0.44953,"[95, 101)",LDA2,E1


### Looking at Coherence Scores in More Detail

Here we take a look at the coherence scores for each topic in each analysis. With this we can review coherence based on more than just average topic coherence. 

**Append all the coherence scores for review**:

In [4]:
#%%time

import re 
import numpy as np
from student_voices import sv_utils as bn

coherence_thresholds = [0.5,0.54,0.59,0.64,0.69]

# One list per output column; each threshold gets its own 'CS_<threshold>' column
# holding the share of topics whose coherence score exceeds that threshold.
data = {col: [] for col in ['Range', 'Setting', 'Config', 'N_Topics', 'CS_Ave', 'CS_Med']}
for ct in coherence_thresholds:
    data['CS_'+str(ct)] = []

file = db_root+'/results/complete_coherence.pbz2'
dta = bn.decompress_pickle(file)

# dta is walked as range -> config -> setting -> iterable of (n_topics, (scores, ave_score)).
# scores must support elementwise comparison and .astype (assumed to be a numpy array — TODO confirm).
ranges = list(dta.keys())
for rng in ranges:
    configs = list(dta[rng].keys())
    for config in configs:
        settings = list(dta[rng][config].keys())
        for setting in settings:
            for tn, (scores, ave_score) in dta[rng][config][setting]:
                # Assemble the full row first, then push each value onto its column list
                row = {
                    'Range': str(rng),
                    'Setting': setting,
                    'Config': config,
                    'N_Topics': tn,
                    'CS_Ave': ave_score,
                    'CS_Med': np.median(scores),
                }
                for ct in coherence_thresholds:
                    row['CS_'+str(ct)] = sum((scores>ct).astype(int))/len(scores)
                for col, value in row.items():
                    data[col].append(value)

full_coherence_data = pd.DataFrame(data)
bn.full_pickle(db_root+'/results/full_coherence_data', full_coherence_data)

In [5]:
# Display the assembled per-model coherence table (one row per range/setting/config/topic count)
full_coherence_data

Unnamed: 0,Range,Setting,Config,N_Topics,CS_Ave,CS_Med,CS_0.5,CS_0.54,CS_0.59,CS_0.64,CS_0.69
0,"[0, 35)",LDA3,D1,3,0.412061,0.407083,0.000000,0.000000,0.000000,0.00000,0.0
1,"[0, 35)",LDA3,D1,4,0.426712,0.432589,0.000000,0.000000,0.000000,0.00000,0.0
2,"[0, 35)",LDA3,D1,5,0.449169,0.437598,0.200000,0.200000,0.000000,0.00000,0.0
3,"[0, 35)",LDA3,D1,6,0.455692,0.441440,0.166667,0.166667,0.000000,0.00000,0.0
4,"[0, 35)",LDA3,D1,7,0.464874,0.470570,0.285714,0.142857,0.000000,0.00000,0.0
...,...,...,...,...,...,...,...,...,...,...,...
1500,"[0, 60)",LDA1,A1,16,0.489970,0.477574,0.250000,0.187500,0.062500,0.00000,0.0
1501,"[0, 60)",LDA1,A1,20,0.500209,0.484410,0.450000,0.300000,0.050000,0.00000,0.0
1502,"[0, 60)",LDA1,A1,24,0.490030,0.489901,0.416667,0.125000,0.041667,0.00000,0.0
1503,"[0, 60)",LDA1,A1,28,0.490145,0.488882,0.464286,0.214286,0.035714,0.00000,0.0


**Check if any models have not been scored**:

In [5]:
from student_voices import review 

# Split models into scored and unscored (presumably by matching result files against model
# files — see review.get_scored_unscored_models). Both directories are derived from db_root
# so the models path cannot drift from the results path when the database root changes.
models_scored, models_unscored = review.get_scored_unscored_models(db_root+'/results', db_root+'models/')
print('# of models not scored ', len(models_unscored))

both          1441
left_only       64
right_only       0
Name: _merge, dtype: int64
# of models not scored  64


**Exploring topic coherence distribution**: 

Average topic coherence is one way to check our unsupervised models. Another is to check what proportion of the topics have coherence scores that suggest high topic coherence. This method has not been approached in the literature but may help provide clarity on how to proceed. 

In [6]:
rng = '[0, 35)'
top_n = 5
ctd = ['CS_Ave', 'CS_Med', 'CS_0.5','CS_0.54', 'CS_0.59']

# Restrict to the corpus of interest once, then rank that subset by each criterion in turn
in_range = full_coherence_data[full_coherence_data['Range']==rng]
for c in ctd:
    print("Sort by",c)
    disp = in_range.sort_values(c, ascending=False)
    display_html(HTML(disp.head(top_n).to_html()))

Sort by CS_Ave


Unnamed: 0,Range,Setting,Config,N_Topics,CS_Ave,CS_Med,CS_0.5,CS_0.54,CS_0.59,CS_0.64,CS_0.69
45,"[0, 35)",LDA1,D1,27,0.528722,0.530162,0.62963,0.481481,0.185185,0.037037,0.0
40,"[0, 35)",LDA1,D1,22,0.52499,0.542166,0.636364,0.545455,0.136364,0.0,0.0
89,"[0, 35)",LDA2,D1,25,0.524282,0.512153,0.56,0.48,0.24,0.04,0.0
83,"[0, 35)",LDA2,D1,19,0.52383,0.513162,0.526316,0.421053,0.210526,0.0,0.0
91,"[0, 35)",LDA2,D1,27,0.521792,0.522522,0.592593,0.407407,0.222222,0.0,0.0


Sort by CS_Med


Unnamed: 0,Range,Setting,Config,N_Topics,CS_Ave,CS_Med,CS_0.5,CS_0.54,CS_0.59,CS_0.64,CS_0.69
40,"[0, 35)",LDA1,D1,22,0.52499,0.542166,0.636364,0.545455,0.136364,0.0,0.0
38,"[0, 35)",LDA1,D1,20,0.521245,0.53995,0.65,0.5,0.2,0.0,0.0
11,"[0, 35)",LDA3,D1,16,0.510561,0.537158,0.625,0.4375,0.25,0.0,0.0
22,"[0, 35)",LDA3,D1,27,0.516089,0.531605,0.592593,0.481481,0.185185,0.0,0.0
12,"[0, 35)",LDA3,D1,17,0.512924,0.531471,0.588235,0.294118,0.235294,0.0,0.0


Sort by CS_0.5


Unnamed: 0,Range,Setting,Config,N_Topics,CS_Ave,CS_Med,CS_0.5,CS_0.54,CS_0.59,CS_0.64,CS_0.69
16,"[0, 35)",LDA3,D1,21,0.520415,0.518828,0.666667,0.380952,0.190476,0.0,0.0
38,"[0, 35)",LDA1,D1,20,0.521245,0.53995,0.65,0.5,0.2,0.0,0.0
40,"[0, 35)",LDA1,D1,22,0.52499,0.542166,0.636364,0.545455,0.136364,0.0,0.0
37,"[0, 35)",LDA1,D1,19,0.515144,0.513396,0.631579,0.315789,0.157895,0.0,0.0
45,"[0, 35)",LDA1,D1,27,0.528722,0.530162,0.62963,0.481481,0.185185,0.037037,0.0


Sort by CS_0.54


Unnamed: 0,Range,Setting,Config,N_Topics,CS_Ave,CS_Med,CS_0.5,CS_0.54,CS_0.59,CS_0.64,CS_0.69
40,"[0, 35)",LDA1,D1,22,0.52499,0.542166,0.636364,0.545455,0.136364,0.0,0.0
38,"[0, 35)",LDA1,D1,20,0.521245,0.53995,0.65,0.5,0.2,0.0,0.0
22,"[0, 35)",LDA3,D1,27,0.516089,0.531605,0.592593,0.481481,0.185185,0.0,0.0
45,"[0, 35)",LDA1,D1,27,0.528722,0.530162,0.62963,0.481481,0.185185,0.037037,0.0
89,"[0, 35)",LDA2,D1,25,0.524282,0.512153,0.56,0.48,0.24,0.04,0.0


Sort by CS_0.59


Unnamed: 0,Range,Setting,Config,N_Topics,CS_Ave,CS_Med,CS_0.5,CS_0.54,CS_0.59,CS_0.64,CS_0.69
11,"[0, 35)",LDA3,D1,16,0.510561,0.537158,0.625,0.4375,0.25,0.0,0.0
89,"[0, 35)",LDA2,D1,25,0.524282,0.512153,0.56,0.48,0.24,0.04,0.0
85,"[0, 35)",LDA2,D1,21,0.514685,0.498531,0.428571,0.333333,0.238095,0.0,0.0
12,"[0, 35)",LDA3,D1,17,0.512924,0.531471,0.588235,0.294118,0.235294,0.0,0.0
86,"[0, 35)",LDA2,D1,22,0.519379,0.497814,0.454545,0.363636,0.227273,0.045455,0.0


## Create Topic Visualizations

In the following sections you can visualize the results of any given model for comparison. 

**Bubble plot for topics using PyLDAVis**:

In this block we use PyLDAVis to create an interactive graph where each bubble represents a topic, its size represents the topic's frequency, and its keywords can be found by hovering over the different graphs. 

*We're going to be doing this on AWS because running a single topic graph took 35 min on my laptop.*

In [2]:
from student_voices import ec2_scripts 
from spot_connect import bash_scripts

# Corpus configurations and LDA settings to visualize (one job per combination)
configs = ['A1']#,'A1','D1','B1']
settings = ['LDA1']#,'LDA1']#,'LDA3','LDA4']

# Locations on the EFS mount used by the remote job
model_dir = '/home/ec2-user/efs/models/'
config_path = '/home/ec2-user/efs/data/cleaned_data/'
fullpath = '/home/ec2-user/efs/data/'
numwords = '14'
graphpath = '/home/ec2-user/efs/graphs/LDAGraphs/'
desdir = '/home/ec2-user/efs/results/LDAdescriptions'
vecdir = '/home/ec2-user/efs/results/LDAdistributions'

filesystem = 'student_data'  # File system to connect to
region='us-east-2'           # Region

# In follow-ups we've added the "number of topics" option (ntop) to the scripts
# so that we can execute a custom topic number in each instance
ntop = 'I'
cg = 'C' # corresponding corpus group (See "run_lda.py" for coding)

scripts = []
uploads = []
for config in configs:
    for setting in settings:
        print('Prepping ',config, setting)
        log_file_name = 'ldavis_'+str(setting)+'_'+str(config)+'.txt'

        # Build the instance setup script, then pass it on to the LDAvis script builder
        script = ec2_scripts.get_instance_setup_script(filesystem,region,run_as_user='ec2-user')
        script = ec2_scripts.get_ldavis_script(
            config,
            setting,
            ntop,
            cg,
            model_dir,
            config_path,
            fullpath,
            numwords,
            graphpath,
            desdir,
            vecdir,
            log_file_name,
            region,
            cancel_fleet=True,
            script=script,
            run_as_user='ec2-user',
        )

        # Convert the working script to base-64 encoded so the fleet can run it
        user_data_script = bash_scripts.script_to_userdata(script)
        scripts.append(user_data_script)

n_jobs = len(scripts)

Prepping  A1 LDA1
...EFS file system already exists
Waiting for availability......Available


In [3]:
#print(n_jobs)
#print(script)

In [4]:
from spot_connect import instance_manager
aws_link = instance_manager.InstanceManager()

instance_type = 'r5.2xlarge'
n_cores = 2                  # Number of physical cores in the instance type 

account_number_file = 'C:/Users/Computer/Documents/AWS/account_number.txt'
# Read the account number inside a context manager so the file handle is closed
# deterministically instead of being left open until garbage collection.
with open(account_number_file) as account_file:
    account_num = account_file.read()

# Launch one instance per prepared user-data script
aws_link.run_distributed_jobs(account_num,
                              'student_data',                     # Instance prefix 
                              n_jobs,                             # Number of jobs 
                              instance_type,                      # Instance type to use
                              availability_zone='us-east-2c',
                              user_data=scripts,                  # List of scripts, 1 for each job 
                              instance_profile='instance_manager')

Default key-pair directory is "C:/Projects/VirtualMachines/Key_Pairs"
Key pair detected, re-using...
Security group detected, re-using...
