
# Atlas-voting decision regions-scorer

Developed by Frank Greco & Orest Xherja for analysing Atlas transcript data.

## Synopsis
Three classifiers are initialized (`DecisionTreeClassifier`,
`KNeighborsClassifier`, and `SVC`) and used to initialize a
soft-voting `VotingClassifier` with weights [2, 1, 2].

An N-B classifier is trained on the NLTK chat corpus to identify speech acts in text.

The classifier is used to build a vector of speech acts for each transcript.

These vectors become the training and test feature data fed to the above
classifiers, using a vector marking whether a specific tag is present or absent as the label data.


In [120]:
#This may no longer be needed as we are not plotting...

%matplotlib inline
%load_ext autoreload
%autoreload 2
#%reload_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Imports

In [121]:
import sys
import os
#sys.path.append('/Users/fjgreco/Dev-Atlas')
#sys.path.insert(0, os.path.abspath(".."))

print(__doc__)

Automatically created module for IPython interactive environment


In [123]:
# Run this once

import json
import numpy as np

from itertools import product

from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.utils import shuffle

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer
import collections

import dialogue_act as da


## External dependencies
dialogue_act.py depends on:
    nltk & nltk.corpus.nps_chat.xml_posts()
    frame_stack.py (called by dialogue_act)

text_dict5.json

Optional: X_tw08.p contains the 2D tone-analysis feature vectors for the transcripts.

In [124]:
# Using a dictionary containing the transcript text, 
# returns a tf-idf matrix for the transcript corpus.

def tfidf(train_dict):
    """Return the dense tf-idf matrix for the transcripts in *train_dict*.

    Args:
        train_dict: mapping of transcript id -> record. Each record must
            provide the transcript's raw text under the ``'text'`` key.

    Returns:
        The dense form (``.todense()``) of the unigram tf-idf matrix, with
        one row per transcript in the iteration order of ``train_dict``.
    """
    print ("# retained documents:", len(train_dict))

    # Row order follows the dict's iteration order; any label vector built
    # from the same JSON file must iterate it in the same order to stay
    # aligned with these rows.
    train_set = [record['text'] for record in train_dict.values()]

    # Only the tf-idf representation is returned, so the hashing/count
    # vectorisations and the pairwise cosine-similarity matrices that used
    # to be computed here (and then discarded) have been dropped.
    vectorizer = TfidfVectorizer(ngram_range=(1, 1))
    return vectorizer.fit_transform(train_set).todense()

In [125]:
# Proxy routine that reads a pre-built JSON file containing a curated set of transcripts and associated
# feature information. This JSON/dictionary approach was taken to minimize time-consuming file I/O
# and cloud-based service interactions.
def make_tfidf_vectors(text_dict='text_dict5.json'):
    """Load the transcript JSON file and return its tf-idf matrix.

    Args:
        text_dict: path of the JSON file holding the transcript dictionary.

    Returns:
        Whatever ``tfidf`` returns for the loaded dictionary.
    """
    import json

    print("Started. make_tfidf_vectors...")

    # Parse the whole file up front; the handle is closed before vectorising.
    with open(text_dict) as handle:
        transcripts = json.load(handle)

    return tfidf(transcripts)

In [126]:
# Returns a vector (y) of class labels, one for each transcript, indicating whether the input tag was assigned to
# the associated transcript. (Each transcript is associated with a y component that
# aligns with a row feature vector in the X training matrix.)

def make_y_vectors(text_dict='text_dict5.json', tag='TWXX',limit=200):
    """Build the 0/1 label vector saying which transcripts carry *tag*.

    Args:
        text_dict: path of the JSON file holding the transcript dictionary;
            every record must have a ``'tags'`` entry supporting ``in``.
        tag: the tag whose presence is being labelled.
        limit: stop after this many transcripts. A value that is never
            reached while counting from 1 (e.g. 0, or more than the number
            of transcripts) means every transcript is used.

    Returns:
        A 1-D integer NumPy array with one element per visited transcript,
        in the file's key order: 1 when *tag* is present, otherwise 0.
    """
    with open(text_dict) as data_file:
        transcripts = json.load(data_file)

    labels = []
    for count, record in enumerate(transcripts.values(), start=1):
        labels.append(1 if tag in record['tags'] else 0)
        if count == limit:
            break

    # Build the vector 1-D from the start. The previous column-vector +
    # ``ravel(1)`` passed an integer as ravel's ``order`` argument, which
    # NumPy only accepts as one of the strings 'C', 'F', 'A' or 'K'.
    return np.array(labels, dtype=int)

In [128]:
# Using the N-B classifier, returns the speech act vector for each transcript

def make_sac_vectors(text_dict='text_dict5.json', limit=217):
    """Build the speech-act feature matrix, one row per transcript.

    Args:
        text_dict: path of the JSON file holding the transcript dictionary;
            every record must have a ``'text'`` entry.
        limit: stop after this many transcripts. A value that is never
            reached while counting from 1 means every transcript is used.

    Returns:
        A NumPy array built from the per-transcript rows, where each row
        holds the values of ``da.speech_act_vector(text)`` in that mapping's
        key order. An empty ``(0, 0)`` array is returned when the file
        contains no transcripts.
    """
    print("Started make_sac_vectors...")

    with open(text_dict) as data_file:
        transcripts = json.load(data_file)

    rows = []
    for count, record in enumerate(transcripts.values(), start=1):
        sac = da.speech_act_vector(record['text'])
        # NOTE(review): assumes speech_act_vector yields the same keys in
        # the same order for every transcript, so columns line up - confirm.
        rows.append([sac[key] for key in sac.keys()])
        if count == limit:
            break

    print (len(rows))

    if not rows:
        return np.empty((0, 0))

    # Let the collected rows determine the shape. The former hard-coded
    # reshape((217, 15)) ignored `limit` and failed for any other corpus
    # size or speech-act count.
    return np.array(rows)

## Build X (train/test) matrix; stack various feature vectors as desired

In [130]:

# --- Optional feature set: tone analysis -----------------------------------
# Uncomment the next two lines to restore the pickled "tone analysis" vectors.
# import pickle
# X_twa = pickle.load(open("X_tw08.p","rb"))

# --- Feature set: tf-idf (currently active) --------------------------------
X_tfidf = make_tfidf_vectors()

# --- Optional feature set: speech acts -------------------------------------
# Uncomment to generate the speech act matrix.
#X_sac = make_sac_vectors(limit=217)

# X is the feature matrix picked up by the multi-method classifiers below.
# Exactly one assignment to X should be active; the alternatives are:
#X=X_sac
#X=np.hstack((X_tfidf,X_sac))
#X=np.hstack((X_twa,X_sac))
#X=X_twa
X = X_tfidf

print(X)


Started. make_tfidf_vectors...
# retained documents: 217
[[ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 ..., 
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]
 [ 0.  0.  0. ...,  0.  0.  0.]]


In [131]:
# The main multi-classifier function...

def fit(X,y,tag,n_shuffles=5,split=100):
    """Train and score the three classifiers and their soft-voting ensemble.

    For each of ``n_shuffles`` rounds the first ``split`` rows of ``X``/``y``
    are used for training and the remaining rows for testing; after each
    round ``X`` and ``y`` are shuffled together (``random_state=0``) so the
    next round sees a different partition. The first round therefore uses
    the data in the order it was passed in.

    Args:
        X: feature matrix, one row per transcript.
        y: label vector aligned with the rows of ``X``.
        tag: name of the tag being predicted; only used for the report.
        n_shuffles: number of train/score rounds to average over.
        split: number of leading rows used for training (default 100, the
            value that used to be hard-coded).

    Returns:
        A dict mapping each method name to its accuracy averaged over the
        rounds. The same averages are also printed.

    Raises:
        ValueError: if ``split`` leaves the training or the test part empty.
    """
    print ('\n'+tag)

    if len(X[:split]) == 0 or len(X[split:]) == 0:
        raise ValueError(
            "split={} leaves an empty train or test set for {} samples"
            .format(split, len(X)))

    methods=['Decision Tree (depth=4)', 'KNN (k=7)','Kernel SVM', 'Soft Voting']
    score_totals = [0] * len(methods)

    for _ in range(n_shuffles):
        clf1 = DecisionTreeClassifier(max_depth=4)
        clf2 = KNeighborsClassifier(n_neighbors=7)
        clf3 = SVC(kernel='rbf', probability=True)
        eclf = VotingClassifier(estimators=[('dt', clf1), ('knn', clf2),
                                            ('svc', clf3)],
                                voting='soft', weights=[2, 1, 2])

        X_train, y_train = X[:split], y[:split]
        X_test, y_test = X[split:], y[split:]

        # Fit each model exactly once per round; previously every model was
        # fitted up front and then fitted again inside the scoring loop.
        for idx, clf in enumerate((clf1, clf2, clf3, eclf)):
            clf.fit(X_train, y_train)
            score_totals[idx] += accuracy_score(y_test, clf.predict(X_test))

        # Re-partition for the next round, keeping rows and labels aligned.
        X, y = shuffle(X, y, random_state=0)

    averages = [total/n_shuffles for total in score_totals]

    for name, average in zip(methods, averages):
        print ("idx:{} average score: {}".format(name, average))

    return dict(zip(methods, averages))


In [132]:
# Calls the above classifier for each tag in the list. Within each instance, the data is 
# shuffled and the various classifiers are called ns times.

# Special note: to print the accuracy scores for each of the runs, uncomment the
# appropriate print statements in the fit routine.

# Number of shuffle/score rounds averaged per tag.
ns = 5
print(f'#shuffles:{ns}')

# For every tag: rebuild the label vector from the transcript file, then
# report the averaged accuracy of each classifier on the shared matrix X.
for tag in (
    'TW01', 'TW02', 'TW03', 'TW04', 'TW05', 'TW06', 'TW06s', 'TW07',
    'TW08', 'TW09', 'TW09s', 'TW10', 'TW11', 'TW13', 'TW15', 'TW16',
    'TW17', 'TW18', 'TW19', 'TW20', 'TW22', 'TW23', 'TW24',
):
    y = make_y_vectors(text_dict='text_dict5.json', tag=tag, limit=217)
    fit(X, y, tag, n_shuffles=ns)


#shuffles:5

TW01
idx:Decision Tree (depth=4) average score: 0.5452991452991454
idx:KNN (k=7) average score: 0.5521367521367522
idx:Kernel SVM average score: 0.4803418803418804
idx:Soft Voting average score: 0.52991452991453

TW02
idx:Decision Tree (depth=4) average score: 0.5846153846153845
idx:KNN (k=7) average score: 0.5282051282051282
idx:Kernel SVM average score: 0.5521367521367522
idx:Soft Voting average score: 0.5931623931623932

TW03
idx:Decision Tree (depth=4) average score: 0.8222222222222222
idx:KNN (k=7) average score: 0.9094017094017094
idx:Kernel SVM average score: 0.9094017094017094
idx:Soft Voting average score: 0.9025641025641026

TW04
idx:Decision Tree (depth=4) average score: 0.811965811965812
idx:KNN (k=7) average score: 0.8957264957264958
idx:Kernel SVM average score: 0.8991452991452992
idx:Soft Voting average score: 0.888888888888889

TW05
idx:Decision Tree (depth=4) average score: 0.5863247863247864
idx:KNN (k=7) average score: 0.6478632478632479
idx:Kernel SVM a