# Supervised Learning

In [1]:
import os
import pandas as pd
import bear_necessities as bn
import visuals as vs 
import numpy as np 

from importlib import reload
vs = reload(vs)

**Lead the data:**

In [4]:
data = bn.decompress_pickle(os.getcwd() + '/data/review_stats.pbz2')



**List the variables you want to use and display their availability over time:**

In [3]:
# Create dummy variables for year to illustrate data availability over time 
data=pd.concat([data,
                pd.get_dummies(data['DateTime'].apply(lambda x: x.split('-')[0]),
                               prefix='Year')],
                axis = 1)

characteristics = ['Clarity',
                   'Easiness',
                   'Exam Difficulty',
                   'Helpfulness',
                   'Knowledge',
                   'Textbook Use',
                   'Determination',
                   'Effective',
                   'Empathy',
                   'Homework',
                   'Integrity',
                   'Parent Relation',
                   'Respect']

# Show the availability of variables across years 
yvars = [c for c in data.columns if 'Year_' in c]
corrs = data[characteristics + yvars].corr()
corrs.loc[characteristics, yvars]

Unnamed: 0,Year_2001,Year_2002,Year_2003,Year_2004,Year_2005,Year_2006,Year_2007,Year_2008,Year_2009,Year_2010,Year_2011,Year_2012,Year_2013,Year_2014,Year_2015,Year_2016,Year_2017,Year_2018
Clarity,0.000527,0.001016,0.006069,-0.008172,-0.004438,-0.008973,-0.008382,-0.009673,0.0016,0.041908,0.023779,0.004017,-0.003509,0.001357,-0.021754,-0.013722,-0.016546,-0.004719
Easiness,-0.013498,-0.003028,-0.001202,-0.013927,-0.015329,-0.015031,-0.015084,-0.013463,0.037498,0.08891,0.058987,0.017889,-0.047095,-0.027665,-0.036051,-0.03422,-0.029205,-0.006071
Exam Difficulty,,,,,,,,,,,,,-0.014538,0.01067,-0.003946,-0.000193,0.000283,0.001059
Helpfulness,-0.00144,-0.000578,-0.001631,-0.015709,-0.009532,-0.012074,-0.011983,-0.014139,0.000373,0.040003,0.024262,0.006052,-0.000961,0.008771,-0.009562,0.003273,-0.001053,-0.002187
Knowledge,,,,,,,,,,,0.013741,0.010849,-0.00494,0.017439,-0.01909,-0.003748,-0.010738,-0.004327
Textbook Use,,,,,,,,,,,,,0.031407,0.059667,0.003305,-0.022251,-0.036976,-0.007009
Determination,,,,,,,,,,,,,,,0.001733,0.052486,-0.050895,-0.010435
Effective,,,,,,,,,,,,,,,0.002596,0.05532,-0.054122,-0.010677
Empathy,,,,,,,,,,,,,,,0.001939,0.054898,-0.0532,-0.011403
Homework,,,,,,,,,,,,,,,0.003382,0.034523,-0.034457,-0.007902


The above table shows that only `Helpfulness`, `Easiness` and `Clarity` have been around since 2001. The other variables were not added until after 2010. 

## Using Likert Scale Data 

We do not know whether a score of 1 for a single individual is equivalent to a score of 1 for another. To address this we will discretize the data. Check the distributions of the data, for the variables that do not have a large number of observations we make the variable binary, 5 being the high end and anything below the low end. 

In [4]:
chart = {} 
chart['Exam Difficulty'] = {}
chart['Exam Difficulty'][2] = 1

chart['Knowledge'] = {}
chart['Knowledge'][2] = 1
chart['Knowledge'][4] = 3

fullmaps = ['Determination','Effective','Empathy','Homework','Integrity','Parent Relation','Respect']
for c in fullmaps: 

    chart[c] = {}
    chart[c][1] = 1
    chart[c][2] = 1
    chart[c][3] = 1
    chart[c][4] = 1

    data[c] = data[c].replace(chart[c])

In [5]:
for c in characteristics:
    print(data[c].value_counts())
    print('\n')

5.0    2985925
4.0     784586
3.0     469103
1.0     348436
2.0     273265
Name: Clarity, dtype: int64


5.0    1914717
3.0    1169729
4.0     949123
2.0     521333
1.0     307276
Name: Easiness, dtype: int64


3.0    213756
5.0    158176
4.0    147273
1.0    108101
2.0     87353
Name: Exam Difficulty, dtype: int64


5.0    3147716
4.0     657923
3.0     417740
1.0     369453
2.0     268904
Name: Helpfulness, dtype: int64


5.0    790414
4.0    122363
3.0     61162
1.0     58304
2.0     33487
Name: Knowledge, dtype: int64


1.0    209921
5.0    193305
3.0    129412
4.0     94555
2.0     85917
Name: Textbook Use, dtype: int64


5.0    13612
1.0     8685
Name: Determination, dtype: int64


5.0    14048
1.0     8249
Name: Effective, dtype: int64


5.0    13728
1.0     8569
Name: Empathy, dtype: int64


5.0    11901
1.0    10396
Name: Homework, dtype: int64


5.0    14037
1.0     8260
Name: Integrity, dtype: int64


5.0    14043
1.0     8254
Name: Parent Relation, dtype: int64


5.0    139

In [6]:
# create an index for each of the attributes and store it in a dictionary
label_dict = {} 
for c in characteristics: 
    label_dict[c] = {}
    vals = list(data[data[c].notnull()][c].unique())
    val = vals[0]
    for val in vals:
        label_dict[c][val] = list(data.loc[data[c]==val].index)
        
bn.compressed_pickle(os.getcwd()+'/data/labeled_indices',label_dict)

In [2]:
label_dict = bn.decompress_pickle(os.getcwd()+'/data/labeled_indices.pbz2')

# Supervised Learning

Now that we have the document for each of the labels, we can get to predicting. First thing is first, we have different cleaning processes and each one results in an index that needs to be translated back to the original so that we can be applied it proper label. Fortunately we have everything we need to put that together.  The different cleaning setups are: 
* A1
* C1
* E1

In [5]:
configs = ['A1','C1','E1']
ranges = ['[0, 35)',
          '[35, 60)',
          '[60, 65)',
          '[65, 75)',
          '[75, 85)',
          '[85, 95)']

dconf = configs[0]
rng = ranges[0]

In [11]:
# bring in the indices to map the cleaned docs to the originals 
#clean_index = bn.decompress_pickle(os.getcwd()+'/data/cleaned_data/cleaned_index_'+rng+dconf+'.pbz2')

# import the cleaned data 
docs = bn.decompress_pickle(os.getcwd() + '/data/cleaned_data/cleaned_docs_'+rng+dconf+'.pbz2')

# we need the dictionary of indices 
range_indices = bn.loosen(os.getcwd()+'/data/by_rating_range.pickle')

# get the datapoints for this range 
rng_data = data.loc[range_indices[rng]]

# we load the full corpus of review texts
#text = bn.decompress_pickle(os.getcwd() + '/data/full_review_text.pbz2')

# get the documents pertaining to this range 
#reviews = [text[i] for i in range_indices[rng]]

In [13]:
text_data = docs[:20]

At this points things might get confusing so it's important to clarify the pieces we've got and how they fit together: 
* label_dict - a dictionary where each key is a characteristic (Clarity, Easiness, ...), each value is a dictionary of the values in that column, and each item in that second dictionary is the value mapped to the list of indices of reviews in the raw data that have that characteristic value (the indices for which that columns == value). 
<br> `column key -> value key -> <original index>`       ---- All ranges represented


* clean_index - the index where each *key* is an index in the cleaned docs and each *value* is an index in the original. Such that: <br>`cleaned_index[<cleaned document index>] = <original index>`        ---- Specific ranges represented

In [None]:
# First thing is first, we must know which value indices belong in each range 

# create an empty dictionary to store the intersections for each column value 
range_intersection = {} 
for key in label_dict: 
    range_intersection[key]={} 
    for val in label_dict[key]: 
        range_intersection[key][val] = {} 
        # create a copy of the index list for this column value 
        labels = label_dict[key][val].copy()
        for r in ranges: 
            rngidx = range_indices[r].copy() 
            # store the intersection of the labels and range indices in the dictionary 
            # we use the np.intersect1d function for crazy efficiency 
            range_intersection[key][val][r] = np.intersect1d(labels,rngidx,assume_unique=True)

In [174]:
# remove missing data from the original reviews 
dirty = [text[i] for i in range_indices[rng]]
dirty = pd.DataFrame({'text':dirty})
dirty = dirty[[type(row['text'])!=float for idx, row in dirty.iterrows()]].copy()

In [8]:
data.head(6)

Unnamed: 0,FID,Clarity,DateTime,Easiness,Exam Difficulty,Helpfulness,Knowledge,Rating,Recommended,School,...,Year_2009,Year_2010,Year_2011,Year_2012,Year_2013,Year_2014,Year_2015,Year_2016,Year_2017,Year_2018
0,0,5.0,2010-06-03,5.0,,5.0,,110,,/kingwood-christian-school/57725-s,...,0,1,0,0,0,0,0,0,0,0
1,1,5.0,2010-10-26,5.0,,5.0,,110,,/kingwood-christian-school/57725-s,...,0,1,0,0,0,0,0,0,0,0
2,2,5.0,2010-02-20,4.0,,5.0,,110,,/kingwood-christian-school/57725-s,...,0,1,0,0,0,0,0,0,0,0
3,3,5.0,2007-07-30,5.0,,5.0,,110,,/kingwood-christian-school/57725-s,...,0,0,0,0,0,0,0,0,0,0
4,4,4.0,2004-06-28,2.0,,5.0,,99,,/kingwood-christian-school/57725-s,...,0,0,0,0,0,0,0,0,0,0
5,5,5.0,2016-01-09,5.0,4.0,5.0,5.0,110,5.0,/thompson-high-school/472-s,...,0,0,0,0,0,0,0,1,0,0


In [184]:
(1,2)+(3,)

(1, 2, 3)

In [130]:

# get the original index for the key-value-range 
body = range_intersection[key][val][rng]

# identify where those indices land in the ordering of the subset of ranged documents 
# this will convert the original indices into indices that map to the cleaned documents 
converted = [range_indices[rng].index(idx) for idx in body if idx in range_indices[rng]]

assert len(body) == len(converted)

print(docs[clean_index[1][converted[len(converted)-1]]])
print(text[body[len(converted)-1]])

['cla', 'joke', 'learn', 'noth', 'far', 'direct', 'skil', 'stay', 'ofic', 'play_solitair', 'lazi', 'sem', 'care', 'play_favorit', 'glad', 'fire']
Kyle Kyle Kyle. Remember teacher is just a title...because Kyle don't teach. Once stating in class "I hope to be like Mr. Fin" He chooses to make "hard" quizzes because students exit the class and "are not nervous about their grade" This guy is something else. Constantly a distraction due to his leprechaunic stature, student struggle to pay attention simply because of this. Kyle is more concerned about handing out Ls than doing his job.


Submitted by a Parent


['watkin',
 'bad',
 'teacher',
 'explain',
 'anyth',
 'alway',
 'say',
 'okay',
 'confus',
 'expect',
 'everi_singl',
 'answer',
 'note',
 'everi_singl',
 'teach',
 'anyth',
 'talk',
 'life',
 'peopl',
 'bad',
 'grade',
 'cla',
 'favorit',
 'peopl',
 'help',
 'instead',
 'everybodi_els',
 'next',
 'god_luck']

In [165]:
text[body[2]]

"Extremely disrespectful towards the students and even staff members. I've heard he treats the teaching assistant like a slave.\r\n\r\n\r\nSubmitted by a Parent"

In [11]:
from random import sample 

# for each characteristic in the data 
for c in label_dict.keys():
    # for each value for that characteristic 
    for val in label_dict[c].keys():
        # get the indices for the reviews in the original data that had these characteristic values
        indices = label_dict[c][val]

        # for each index in the clean documents you currently have loaded, check which ones  
        labeled = [idx for idx in clean_index[1] if idx in clean_index[1] in indices]
        
        pool = [idx for idx in clean_index_[1] if idx not in labeled]

        negative_sample = sample(pool, len(labeled))

  params = attr.ib(convert=attr.converters.optional(tuple))
  ids = attr.ib(default=None, convert=_ensure_immutable_ids)


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [11]:
label_dict.keys()

dict_keys(['Clarity', 'Easiness', 'Exam Difficulty', 'Helpfulness', 'Knowledge', 'Textbook Use', 'Determination', 'Effective', 'Empathy', 'Homework', 'Integrity', 'Parent Relation', 'Respect'])

[4,
 16,
 27,
 31,
 37,
 43,
 44,
 49,
 103,
 106,
 111,
 112,
 117,
 136,
 140,
 144,
 145,
 147,
 148,
 151,
 163,
 166,
 179,
 181,
 183,
 203,
 208,
 213,
 228,
 234,
 240,
 245,
 247,
 251,
 257,
 270,
 272,
 286,
 289,
 292,
 299,
 315,
 322,
 335,
 345,
 346,
 359,
 361,
 362,
 365,
 383,
 390,
 398,
 406,
 415,
 421,
 423,
 436,
 452,
 453,
 454,
 463,
 482,
 483,
 490,
 495,
 500,
 501,
 510,
 512,
 517,
 544,
 563,
 578,
 582,
 584,
 588,
 589,
 601,
 614,
 619,
 625,
 628,
 630,
 632,
 638,
 640,
 660,
 661,
 664,
 699,
 707,
 710,
 730,
 781,
 786,
 795,
 796,
 819,
 822,
 823,
 831,
 848,
 851,
 853,
 855,
 859,
 862,
 868,
 874,
 878,
 880,
 891,
 899,
 911,
 916,
 918,
 942,
 944,
 946,
 954,
 968,
 970,
 976,
 978,
 982,
 993,
 1004,
 1005,
 1009,
 1010,
 1021,
 1025,
 1026,
 1040,
 1048,
 1054,
 1055,
 1056,
 1060,
 1063,
 1065,
 1071,
 1073,
 1074,
 1078,
 1079,
 1082,
 1083,
 1084,
 1093,
 1101,
 1105,
 1112,
 1118,
 1134,
 1140,
 1141,
 1155,
 1168,
 1174,
 1191,
 1

In [17]:
reviews[labels_in_range[:5]]

'[0, 35)'

In [None]:
import time 
st = time.time() 
labels = label_dict['Clarity'][4.0].copy()
rngidx = range_indices[rng].copy()

if len(labels)>len(rngidx):
    labels_in_range = [idx for idx in rngidx if idx in labels]
else: 
    labels_in_range = [idx for idx in labels if idx in rngidx]
duration = time.time() - st 
print(duration)

In [59]:
import numpy as np 

intersec = np.array([]) 
for r in ranges: 
    rngidx = range_indices[r].copy()
    st = time.time() 
    result = np.intersect1d(labels, rngidx, assume_unique=True)
    duration = time.time() - st 
    print(duration)
    

0.16699457168579102
0.11899900436401367
0.09100031852722168
0.08999991416931152
0.11300110816955566
0.14400172233581543
