Import needed libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
import re
import os
import tokenizermodule as tm
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
%matplotlib inline

Load the Excel file containing our input data

In [None]:
# Load the input spreadsheet into a DataFrame; the path is relative to the
# notebook's working directory.
input_path = "oneandtwowithfilepathtest.xlsx"
df = pd.read_excel(input_path)

Split the data set into features and labels

In [None]:
# Y holds the labels ("Dialogue Move" column); features holds the raw text
# that is cleaned and vectorized further down ("Commander" column).
Y, features = df["Dialogue Move"], df["Commander"]

Print the labels value counts

In [None]:
# Number of rows per label, to check the class balance.
label_counts = Y.value_counts()
print(label_counts)

Get the number of sentences in the full data set (the train/test split happens later)

In [None]:
# Total number of entries in the feature column; used for progress output.
num_sentences = len(features)

Clean and parse the feature data

In [None]:
print("\n Cleaning and parsing the feature data\n")
clean_features = []
# Iterate over the Series values directly instead of using features[i]:
# features[i] is a label-based lookup and only works while the index is the
# default 0..n-1 range (it breaks after filtering or re-indexing the frame).
for position, sentence in enumerate(features, start=1):
    # Progress message every 100 sentences.
    if position % 100 == 0:
        print("Sentence %d of %d\n" % (position, num_sentences))

    # tm.processSentence returns an iterable of strings that is re-joined
    # into one space-separated string for the vectorizer.
    # NOTE(review): empty spreadsheet cells are read as NaN (a float); confirm
    # that tm.processSentence tolerates non-string input.
    clean_features.append(" ".join(tm.processSentence(sentence)))

Print the first 5 cleaned sentences of the feature set

In [None]:
# Sanity check: show the first five cleaned sentences.
preview = clean_features[:5]
print(preview)

Create the bag-of-words model for the feature set and convert the result to a NumPy array

In [None]:
print("Creating the bag of words...\n")
# Learn the vocabulary from the cleaned sentences and count the tokens in
# each one; .toarray() turns the resulting matrix into a dense NumPy array.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(clean_features).toarray()

Split features and target into train and test sets

In [None]:
# Hold out 30% of the rows for testing. The split is stratified on Y and
# seeded (random_state=1) so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.30,
    random_state=1,
    stratify=Y,
)

Print out the labels value counts for the test data

In [None]:
# Label distribution of the held-out test set. This is a bare expression, so
# the notebook displays the result as the cell output (no explicit print).
y_test.value_counts()

Initialize a Random Forest classifier with 100 trees
Fit the forest to the training set.
This may take a few minutes to run

In [None]:
print("Training the random forest (this may take a while)...")
# Seed the forest with the same random_state as the train/test split so the
# reported accuracy, confusion matrix and classification report are
# reproducible between runs (an unseeded forest gives different results on
# every execution).
# fit() returns the estimator itself; keeping the assignment form means the
# cell produces no output.
forest = RandomForestClassifier(n_estimators=100, random_state=1).fit(X_train, y_train)

Make predictions for the test set

In [None]:
# Predicted label for every row of the held-out test features.
y_pred_test = forest.predict(X=X_test)

View accuracy score

In [None]:
# Fraction of test rows whose predicted label equals the true label;
# displayed as the cell output.
accuracy_score(y_true=y_test, y_pred=y_pred_test)

View confusion matrix for test data and predictions

In [None]:
# Raw confusion-matrix counts (rows: true label, columns: predicted label);
# displayed as the cell output.
confusion_matrix(y_true=y_test, y_pred=y_pred_test)

Compute the confusion matrix and normalize each row (true label) so that it sums to 1

In [None]:
# Convert raw counts to per-row proportions: every entry is divided by the
# total of its row, i.e. by the number of test samples with that true label.
counts = confusion_matrix(y_test, y_pred_test)
row_totals = counts.sum(axis=1, keepdims=True)
matrix = counts.astype('float') / row_totals

Build the plot and add labels to the plot

In [None]:
plt.figure(figsize=(16,7))
sns.set(font_scale=1.4)
sns.heatmap(matrix, annot=True, annot_kws={'size':10},
            cmap=plt.cm.Greens, linewidths=0.2)

# NOTE(review): the names are hard-coded and must be in the same order as the
# rows/columns of `matrix` (confusion_matrix orders labels by sorted value) —
# confirm they match the labels actually present in the data.
class_names = ['Explore', 'Move', 'Send Image', 'Stop', 'Turn']
tick_marks = np.arange(len(class_names))
# The heatmap draws cell i over the interval [i, i + 1], so the label for a
# cell belongs at its centre (i + 0.5). The centred positions are used on
# BOTH axes; placing the x ticks at the integer positions put every
# predicted-label name on the left edge of its column.
tick_marks2 = tick_marks + 0.5
plt.xticks(tick_marks2, class_names, rotation=25)
plt.yticks(tick_marks2, class_names, rotation=0)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.title('Confusion Matrix for BOW Model')
plt.show()

View the classification report for test data and predictions

In [None]:
# Text summary of the per-class metrics for the test-set predictions.
report = classification_report(y_true=y_test, y_pred=y_pred_test)
print(report)