### Description

### load 

In [1]:
import os
import random 
import pickle
import sys
import pandas as pd
import numpy as np

root_path = os.path.dirname(os.path.dirname(os.path.abspath(os.getcwd())))
# to import src is necessary to append the root_path to the path
#sys.path.append(root_path)

data_path = os.path.join(root_path,'data')
results_path = os.path.join(root_path,'results')

### Define global variables

In [2]:
WITH_STEMMING = True
REMOVE_STOPWORDS = True
MINIMUM_WORDS_PER_PHRASE = 10
GROUP = -1
SEED = 10
num_topics = 60
random.seed(SEED)

### split dataset

In [4]:
lessons_filename = 'clean_inquiry_phrases_by_phase_{}_{}_GROUP_{}_min_{}.pickle'.format(REMOVE_STOPWORDS,WITH_STEMMING,GROUP,MINIMUM_WORDS_PER_PHRASE)
with open(os.path.join(data_path,lessons_filename),'rb') as f:
    clean_phrases = pickle.load(f)

In [5]:
for key in clean_phrases:
    n = len(clean_phrases[key])
    print("key {}, total {}".format(key,n))

key 1, total 57
key 2, total 44
key 3, total 63
key 5, total 100
key 4, total 11


In [6]:
def split_sets(dataset,train=False):
    trainset = {}
    testset = {}
    for key in dataset:
        if len(dataset[key])>50 and key!=4:
            trainset[key]=[]
            testset[key]=[]
            n = len(dataset[key])
            ra = random.sample(range(1,n),int(n*0.2))
            for i in range(n):
                if i in ra:
                    testset[key].append(clean_phrases[key][i])
                else:
                    trainset[key].append(clean_phrases[key][i])                
    return trainset,testset

In [7]:
trainset,testset = split_sets(clean_phrases)

In [8]:
len(trainset[1])

46

### get topic distribution to test set and trainset

In [9]:
a_name = 'lda_textbooks_chunksize_alpha_auto_seed_{}_{}_{}_{}.pickle'.format(SEED,num_topics,REMOVE_STOPWORDS,WITH_STEMMING)
model_file = os.path.join(results_path,'lda_models',a_name)
with open(model_file,'rb') as f:
    ldamodel = pickle.load(f)



In [10]:
dict_file = os.path.join(data_path,'dictionary_seed_{}_{}_{}_{}.pickle'.format(SEED,num_topics,REMOVE_STOPWORDS,WITH_STEMMING))
with open(dict_file,'rb') as f:
    dictionary = pickle.load(f)
print("Dictionary length removing unfrequent words: {}".format(len(dictionary)))

Dictionary length removing unfrequent words: 11032


In [11]:
def get_topic_distribution(a_set):
    y = []
    X = []
    for key in a_set:
        for a_lesson in a_set[key]:
            bow = dictionary.doc2bow(a_lesson.split())
            T = ldamodel.get_document_topics(bow,minimum_probability=0,minimum_phi_value=0.001)
            X.append([x[1] for x in T])
            y.append(key)
    return X,y

In [12]:
X_train,y_train = get_topic_distribution(trainset)
X_test,y_test = get_topic_distribution(testset)

### Train decision tree


In [13]:
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier().fit(X_train, y_train)

print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'
     .format(clf.score(X_test, y_test)))

Accuracy of Decision Tree classifier on training set: 1.00
Accuracy of Decision Tree classifier on test set: 0.40


In [14]:
clf.predict(X_test)

array([5, 3, 5, 5, 3, 3, 1, 5, 1, 1, 5, 5, 3, 5, 1, 3, 5, 5, 5, 1, 5, 3,
       5, 5, 1, 3, 5, 5, 5, 5, 5, 1, 5, 1, 3, 5, 5, 1, 5, 1, 3, 1, 5],
      dtype=int64)

### Train without stem

In [15]:
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV

In [16]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
print('Accuracy of K-NN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'
     .format(knn.score(X_test, y_test)))

Accuracy of K-NN classifier on training set: 0.60
Accuracy of K-NN classifier on test set: 0.40


In [17]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, y_train)
print('Accuracy of GNB classifier on training set: {:.2f}'
     .format(gnb.score(X_train, y_train)))
print('Accuracy of GNB classifier on test set: {:.2f}'
     .format(gnb.score(X_test, y_test)))

Accuracy of GNB classifier on training set: 0.54
Accuracy of GNB classifier on test set: 0.40


In [18]:
C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
grid.fit(X_train, y_train)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))


The best parameters are {'C': 10000000.0, 'gamma': 1e-06} with a score of 0.51


In [19]:
svm = SVC(decision_function_shape='ovo',gamma=1e-06,C= 10000000.0)#random_state=SEED,class_weight={1:0.8,2:0.4,3:0.7,5:1})
svm.fit(X_train, y_train)
print('Accuracy of SVM classifier on training set: {:.2f}'
     .format(svm.score(X_train, y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'
     .format(svm.score(X_test, y_test)))

Accuracy of SVM classifier on training set: 0.62
Accuracy of SVM classifier on test set: 0.49


In [23]:
svm = SVC(decision_function_shape='ovo',gamma=1,C= 100.0)#random_state=SEED,class_weight={1:0.8,2:0.4,3:0.7,5:1})
svm.fit(X_train, y_train)
print('Accuracy of SVM classifier on training set: {:.2f}'
     .format(svm.score(X_train, y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'
     .format(svm.score(X_test, y_test)))

Accuracy of SVM classifier on training set: 0.85
Accuracy of SVM classifier on test set: 0.44


In [24]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
pred = svm.predict(X_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

[[ 3  2  6]
 [ 4  5  3]
 [ 7  2 11]]
              precision    recall  f1-score   support

           1       0.21      0.27      0.24        11
           3       0.56      0.42      0.48        12
           5       0.55      0.55      0.55        20

   micro avg       0.44      0.44      0.44        43
   macro avg       0.44      0.41      0.42        43
weighted avg       0.47      0.44      0.45        43



### Train with stem

In [25]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier()
knn.fit(X_train, y_train)
print('Accuracy of K-NN classifier on training set: {:.2f}'
     .format(knn.score(X_train, y_train)))
print('Accuracy of K-NN classifier on test set: {:.2f}'
     .format(knn.score(X_test, y_test)))

Accuracy of K-NN classifier on training set: 0.60
Accuracy of K-NN classifier on test set: 0.40


In [26]:
from sklearn.naive_bayes import GaussianNB

gnb = GaussianNB()
gnb.fit(X_train, y_train)
print('Accuracy of GNB classifier on training set: {:.2f}'
     .format(gnb.score(X_train, y_train)))
print('Accuracy of GNB classifier on test set: {:.2f}'
     .format(gnb.score(X_test, y_test)))

Accuracy of GNB classifier on training set: 0.54
Accuracy of GNB classifier on test set: 0.40


#### SVM

In [27]:
C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
grid.fit(X_train, y_train)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))


The best parameters are {'C': 10000000.0, 'gamma': 1e-06} with a score of 0.51


In [28]:
svm = SVC(decision_function_shape='ovo',gamma=1e-06,C= 10000000.0, random_state=42)#random_state=SEED,class_weight={1:0.8,2:0.4,3:0.7,5:1})
svm.fit(X_train, y_train)
print('Accuracy of SVM classifier on training set: {:.2f}'
     .format(svm.score(X_train, y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'
     .format(svm.score(X_test, y_test)))

Accuracy of SVM classifier on training set: 0.62
Accuracy of SVM classifier on test set: 0.49


In [29]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
pred = svm.predict(X_test)
print(confusion_matrix(y_test, pred))
print(classification_report(y_test, pred))

[[ 4  1  6]
 [ 4  3  5]
 [ 6  0 14]]
              precision    recall  f1-score   support

           1       0.29      0.36      0.32        11
           3       0.75      0.25      0.38        12
           5       0.56      0.70      0.62        20

   micro avg       0.49      0.49      0.49        43
   macro avg       0.53      0.44      0.44        43
weighted avg       0.54      0.49      0.48        43



### Build classifier per class

In [30]:
def split_sets_key(dataset,the_key):
    trainset = {}
    testset = {}
    trainset[0]=[]
    testset[0]=[]
    for key in dataset:
        if key == the_key:
            trainset[key]=[]
            testset[key]=[]
            a_key = the_key
        else:
            a_key = 0
        n = len(dataset[key])
        ra = random.sample(range(1,n),int(n*0.2))
        for i in range(n):
            if i in ra:
                testset[a_key].append(clean_phrases[key][i])
            else:
                trainset[a_key].append(clean_phrases[key][i])
    return trainset,testset
trainset,testset = split_sets_key(clean_phrases,5)

### classify class 5

In [31]:
X_train,y_train = get_topic_distribution(trainset)
X_test,y_test = get_topic_distribution(testset)

In [32]:
C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
grid.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))


The best parameters are {'C': 1.0, 'gamma': 10.0} with a score of 0.65


In [44]:
svm = SVC(decision_function_shape='ovo',gamma=1.0,C= 10.0, random_state=42,class_weight={0:0.4,5:0.6})#random_state=SEED,class_weight={1:0.8,2:0.4,3:0.7,5:1})
svm.fit(X_train, y_train)
print('Accuracy of SVM classifier on training set: {:.2f}'
     .format(svm.score(X_train, y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'
     .format(svm.score(X_test, y_test)))

Accuracy of SVM classifier on training set: 0.69
Accuracy of SVM classifier on test set: 0.64


In [46]:
y_test

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5,
 5]

In [45]:
svm.predict(X_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 0, 0, 5, 0, 0, 0, 5, 5, 5, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5, 5, 0, 5, 0, 5, 0, 0, 5, 0, 0,
       0, 0, 0, 0, 0, 0, 5, 0, 0], dtype=int64)

### classify class 1

In [378]:
trainset,testset = split_sets_key(clean_phrases,1)
X_train,y_train = get_topic_distribution(trainset)
X_test,y_test = get_topic_distribution(testset)

In [380]:
C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
grid.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'C': 10000000.0, 'gamma': 1e-05} with a score of 0.75


In [395]:
svm = SVC(decision_function_shape='ovr',gamma=1e-05,C= 10000000.0, random_state=42)
svm.fit(X_train, y_train)
print('Accuracy of SVM classifier on training set: {:.2f}'
     .format(svm.score(X_train, y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'
     .format(svm.score(X_test, y_test)))

Accuracy of SVM classifier on training set: 0.65
Accuracy of SVM classifier on test set: 0.57


### classify class 2

In [47]:
trainset,testset = split_sets_key(clean_phrases,2)
X_train,y_train = get_topic_distribution(trainset)
X_test,y_test = get_topic_distribution(testset)

In [None]:

C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=42)
grid = GridSearchCV(SVC(), param_grid=param_grid, cv=cv)
grid.fit(X_train, y_train)
print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

In [48]:
svm = SVC(decision_function_shape='ovo',gamma=10,C= 1000.0, random_state=42)#random_state=SEED,class_weight={1:0.8,2:0.4,3:0.7,5:1})
svm.fit(X_train, y_train)
print('Accuracy of SVM classifier on training set: {:.2f}'
     .format(svm.score(X_train, y_train)))
print('Accuracy of SVM classifier on test set: {:.2f}'
     .format(svm.score(X_test, y_test)))

Accuracy of SVM classifier on training set: 1.00
Accuracy of SVM classifier on test set: 0.79


In [49]:
svm.predict(X_test)

array([0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0,
       0, 0, 2, 0, 0, 0, 2, 0, 0], dtype=int64)

In [51]:
np.array(y_test)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 2, 2, 2, 2, 2, 2, 2, 2], dtype=int64)