### Description

### load 

In [1]:
import os
import random 
import pickle
import sys
import pandas as pd
import numpy as np

root_path = os.path.dirname(os.path.dirname(os.path.abspath(os.getcwd())))
# to import src is necessary to append the root_path to the path
#sys.path.append(root_path)

data_path = os.path.join(root_path,'data')
results_path = os.path.join(root_path,'results')

### Define global variables

In [3]:
WITH_STEMMING = True
REMOVE_STOPWORDS = True
MINIMUM_WORDS_PER_PHRASE = 10
GROUP = -1
SEED = 10
num_topics = 100
random.seed(SEED)

### split dataset

In [4]:
lessons_filename = 'clean_inquiry_phrases_by_phase_{}_{}_GROUP_{}_min_{}.pickle'.format(REMOVE_STOPWORDS,WITH_STEMMING,GROUP,MINIMUM_WORDS_PER_PHRASE)
with open(os.path.join(data_path,lessons_filename),'rb') as f:
    clean_phrases = pickle.load(f)

In [5]:
for key in clean_phrases:
    n = len(clean_phrases[key])
    print("key {}, total {}".format(key,n))

key 1, total 57
key 2, total 44
key 3, total 63
key 5, total 100
key 4, total 11


In [6]:
def split_sets(dataset,train=False):
    trainset = {}
    testset = {}
    for key in dataset:
        if len(dataset[key])>50 and key!=4:
            trainset[key]=[]
            testset[key]=[]
            n = len(dataset[key])
            ra = random.sample(range(1,n),int(n*0.2))
            for i in range(n):
                if i in ra:
                    testset[key].append(clean_phrases[key][i])
                else:
                    trainset[key].append(clean_phrases[key][i])                
    return trainset,testset

In [59]:
trainset,testset = split_sets(clean_phrases)

In [60]:
len(trainset[1])

46

### get topic distribution to test set and trainset

In [61]:
a_name = 'lda_textbooks_chunksize_alpha_auto_seed_{}_{}_{}_{}.pickle'.format(SEED,num_topics,REMOVE_STOPWORDS,WITH_STEMMING)
model_file = os.path.join(results_path,'lda_models',a_name)
with open(model_file,'rb') as f:
    ldamodel = pickle.load(f)

In [62]:
dict_file = os.path.join(data_path,'dictionary_seed_{}_{}_{}_{}.pickle'.format(SEED,num_topics,REMOVE_STOPWORDS,WITH_STEMMING))
with open(dict_file,'rb') as f:
    dictionary = pickle.load(f)
print("Dictionary length removing unfrequent words: {}".format(len(dictionary)))

Dictionary length removing unfrequent words: 11031


In [63]:
def get_topic_distribution(a_set):
    y = []
    X = []
    for key in a_set:
        for a_lesson in a_set[key]:
            bow = dictionary.doc2bow(a_lesson.split())
            T = ldamodel.get_document_topics(bow,minimum_probability=0,minimum_phi_value=0.001)
            X.append([x[1] for x in T])
            y.append(key)
    return X,y

In [64]:
X_train,y_train = get_topic_distribution(trainset)
X_test,y_test = get_topic_distribution(testset)

### Train decision tree


In [65]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier().fit(X_train, y_train)

print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

Accuracy of Decision Tree classifier on training set: 1.00
Accuracy of Decision Tree classifier on test set: 0.37


In [66]:
clf.predict(X_test)

array([3, 3, 5, 3, 5, 5, 5, 1, 5, 3, 5, 5, 3, 5, 1, 3, 3, 5, 3, 3, 3, 5,
       3, 5, 1, 5, 5, 5, 3, 5, 5, 3, 1, 1, 5, 1, 3, 3, 3, 3, 3, 1, 5],
      dtype=int64)

### Train without stem

In [67]:
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV

In [68]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
print('Accuracy of K-NN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'
     .format(knn.score(X_test, y_test)))

Accuracy of K-NN classifier on training set: 0.67
Accuracy of K-NN classifier on test set: 0.44


In [72]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, y_train)
print('Accuracy of GNB classifier on training set: {:.2f}'
     .format(gnb.score(X_train, y_train)))
print('Accuracy of GNB classifier on test set: {:.2f}'
     .format(gnb.score(X_test, y_test)))

Accuracy of GNB classifier on training set: 0.65
Accuracy of GNB classifier on test set: 0.30


In [73]:
C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
grid.fit(X_train, y_train)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))


The best parameters are {'C': 100.0, 'gamma': 10.0} with a score of 0.57


In [76]:
svm = SVC(decision_function_shape='ovo',gamma=10,C= 100.0,class_weight={1:0.4,5:0.2,3:0.4})#random_state=SEED,class_weight={1:0.8,2:0.4,3:0.7,5:1})
svm.fit(X_train, y_train)
print('Accuracy of SVM classifier on training set: {:.2f}'
     .format(svm.score(X_train, y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'
     .format(svm.score(X_test, y_test)))

Accuracy of SVM classifier on training set: 1.00
Accuracy of SVM classifier on test set: 0.40


In [77]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
pred = svm.predict(X_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

[[ 2  2  7]
 [ 6  1  5]
 [ 5  1 14]]
              precision    recall  f1-score   support

           1       0.15      0.18      0.17        11
           3       0.25      0.08      0.12        12
           5       0.54      0.70      0.61        20

   micro avg       0.40      0.40      0.40        43
   macro avg       0.31      0.32      0.30        43
weighted avg       0.36      0.40      0.36        43



### Train with stem

In [78]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
print('Accuracy of K-NN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'
     .format(knn.score(X_test, y_test)))

Accuracy of K-NN classifier on training set: 0.67
Accuracy of K-NN classifier on test set: 0.44


In [79]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, y_train)
print('Accuracy of GNB classifier on training set: {:.2f}'
     .format(gnb.score(X_train, y_train)))
print('Accuracy of GNB classifier on test set: {:.2f}'
     .format(gnb.score(X_test, y_test)))

Accuracy of GNB classifier on training set: 0.65
Accuracy of GNB classifier on test set: 0.30


#### SVM

In [80]:
C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
grid.fit(X_train, y_train)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))


The best parameters are {'C': 100.0, 'gamma': 10.0} with a score of 0.57


In [81]:
svm = SVC(decision_function_shape='ovo',gamma=10,C= 100.0, random_state=42)#random_state=SEED,class_weight={1:0.8,2:0.4,3:0.7,5:1})
svm.fit(X_train, y_train)
print('Accuracy of SVM classifier on training set: {:.2f}'
     .format(svm.score(X_train, y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'
     .format(svm.score(X_test, y_test)))

Accuracy of SVM classifier on training set: 1.00
Accuracy of SVM classifier on test set: 0.40


In [82]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
pred = svm.predict(X_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

[[ 2  2  7]
 [ 6  1  5]
 [ 5  1 14]]
              precision    recall  f1-score   support

           1       0.15      0.18      0.17        11
           3       0.25      0.08      0.12        12
           5       0.54      0.70      0.61        20

   micro avg       0.40      0.40      0.40        43
   macro avg       0.31      0.32      0.30        43
weighted avg       0.36      0.40      0.36        43



### Build classifier per class

In [83]:
def split_sets_key(dataset,the_key):
    trainset = {}
    testset = {}
    trainset[0]=[]
    testset[0]=[]
    for key in dataset:
        if key == the_key:
            trainset[key]=[]
            testset[key]=[]
            a_key = the_key
        else:
            a_key = 0
        n = len(dataset[key])
        ra = random.sample(range(1,n),int(n*0.2))
        for i in range(n):
            if i in ra:
                testset[a_key].append(clean_phrases[key][i])
            else:
                trainset[a_key].append(clean_phrases[key][i])
    return trainset,testset
trainset,testset = split_sets_key(clean_phrases,5)

### classify class 5

In [84]:
X_train,y_train = get_topic_distribution(trainset)
X_test,y_test = get_topic_distribution(testset)

In [85]:
C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
grid.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))


The best parameters are {'C': 10000000.0, 'gamma': 1e-07} with a score of 0.66


In [110]:
svm = SVC(decision_function_shape='ovo',gamma=1e-07,C= 10000.0, random_state=42)#random_state=SEED,class_weight={1:0.8,2:0.4,3:0.7,5:1})
svm.fit(X_train, y_train)
print('Accuracy of SVM classifier on training set: {:.2f}'
     .format(svm.score(X_train, y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'
     .format(svm.score(X_test, y_test)))

Accuracy of SVM classifier on training set: 0.64
Accuracy of SVM classifier on test set: 0.62


In [111]:
pred = svm.predict(X_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

[[33  0]
 [20  0]]
              precision    recall  f1-score   support

           0       0.62      1.00      0.77        33
           5       0.00      0.00      0.00        20

   micro avg       0.62      0.62      0.62        53
   macro avg       0.31      0.50      0.38        53
weighted avg       0.39      0.62      0.48        53



  'precision', 'predicted', average, warn_for)


### classify class 1

In [112]:
trainset,testset = split_sets_key(clean_phrases,1)
X_train,y_train = get_topic_distribution(trainset)
X_test,y_test = get_topic_distribution(testset)

In [113]:
C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
grid.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 1.0, 'gamma': 10.0} with a score of 0.80


In [119]:
svm = SVC(decision_function_shape='ovr',gamma=10,C= 1.0, random_state=42,class_weight={0:0.2,1:0.7})
svm.fit(X_train, y_train)
print('Accuracy of SVM classifier on training set: {:.2f}'
     .format(svm.score(X_train, y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'
     .format(svm.score(X_test, y_test)))

Accuracy of SVM classifier on training set: 0.83
Accuracy of SVM classifier on test set: 0.74


In [120]:
pred = svm.predict(X_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

[[34  8]
 [ 6  5]]
              precision    recall  f1-score   support

           0       0.85      0.81      0.83        42
           1       0.38      0.45      0.42        11

   micro avg       0.74      0.74      0.74        53
   macro avg       0.62      0.63      0.62        53
weighted avg       0.75      0.74      0.74        53



### classify class 2

In [47]:
trainset,testset = split_sets_key(clean_phrases,2)
X_train,y_train = get_topic_distribution(trainset)
X_test,y_test = get_topic_distribution(testset)

In [None]:

C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
grid.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

In [48]:
svm = SVC(decision_function_shape='ovo',gamma=10,C= 1000.0, random_state=42)#random_state=SEED,class_weight={1:0.8,2:0.4,3:0.7,5:1})
svm.fit(X_train, y_train)
print('Accuracy of SVM classifier on training set: {:.2f}'
     .format(svm.score(X_train, y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'
     .format(svm.score(X_test, y_test)))

Accuracy of SVM classifier on training set: 1.00
Accuracy of SVM classifier on test set: 0.79


In [49]:
svm.predict(X_test)

array([0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 2, 0, 0, 0, 2, 0, 0], dtype=int64)

In [51]:
np.array(y_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)