In [107]:
## Random forest classifying based on two training models
## One uses vectorized 5000 features, one uses Readability features
## vectorized 5000 features has a much better performance with homogeneity around 80%

%matplotlib inline 
from sklearn.cross_validation import train_test_split
import pandas as pd
import numpy as np


# #import vectorized feature arrays for acts and scenes
act_Vect = pd.read_csv('VectorizedFeatures/AllComplied/AllAct_Features_Vectorize.txt',sep=',',header = None)
scene_Vect = pd.read_csv('VectorizedFeatures/AllComplied/AllScene_Features_Vectorize.txt',sep=',',header = None)
play_Vect = pd.read_csv('VectorizedFeatures/AllComplied/ALLPlay_Features_Vectorize.txt',sep=',',header = None)

#import LSA reduced vectorized feature arrays for acts and scenes
act_Vect_LSA = pd.read_csv('LSA_VectorizedFeatures/Act_Features_Vectorize_LSA.txt',sep=',',header = None)
scene_Vect_LSA = pd.read_csv('LSA_VectorizedFeatures/Scene_Features_Vectorize_LSA.txt',sep=',',header = None )
play_Vect_LSA = pd.read_csv('LSA_VectorizedFeatures/Play_Features_Vectorize_LSA.txt',sep=',',header = None )

#import readability features
act_Readable = pd.read_csv('Readability_Features/Act_Readability_Feature.txt',sep=',',header = None)
scene_Readable = pd.read_csv('Readability_Features/Scene_Readability_Feature.txt',sep=',',header = None )
play_Readable = pd.read_csv('Readability_Features/Play_Readability_Feature.txt',sep=',',header = None )

#import labels
act_Labels = pd.read_csv('LSA_VectorizedFeatures/act_feature_labels.txt',sep='\n',header = None)
scene_Labels= pd.read_csv('LSA_VectorizedFeatures/scene_feature_labels.txt',sep='\n',header = None )
play_Labels= pd.read_csv('LSA_VectorizedFeatures/play_feature_labels.txt',sep='\n',header = None )

# print(scene_Labels)
act_genre = pd.read_csv('RandomForests/act_genre.txt',sep='\n',header = None)
scene_genre= pd.read_csv('RandomForests/scene_genre.txt',sep='\n',header = None )
play_genre= pd.read_csv('RandomForests/play_genre.txt',sep='\n',header = None )

In [108]:
## Remove the surrounding brackets from the genre labels
def _strip_brackets(genre):
    """Return the genre labels as a Series with leading/trailing '[' and ']' removed.

    Accepts either the single-column DataFrame produced by read_csv or the
    Series this cell itself produces. The original code always indexed
    column 0, so re-running the cell looked up element 0 of the Series and
    failed calling .map on a string; accepting both forms makes the cell
    safe to re-run.
    """
    labels = genre[0] if isinstance(genre, pd.DataFrame) else genre
    return labels.map(lambda x: x.strip('[').strip(']'))


act_genre = _strip_brackets(act_genre)
play_genre = _strip_brackets(play_genre)
scene_genre = _strip_brackets(scene_genre)

In [112]:
## Partition the scene-level data into training and testing subsets.
## Both feature tables and the genre labels go through a single call so the
## three train/test pairs come from the same split; a third of the samples
## are held out and the seed is fixed.
(vect_train, vect_test,
 readable_train, readable_test,
 genre_train, genre_test) = train_test_split(
    scene_Vect,
    scene_Readable,
    scene_genre,
    test_size=0.33,
    random_state=42,
)

In [113]:
print ("Training the random forest using scene vectorized data...")
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees.
# The seed is fixed so that re-running the notebook rebuilds the same forest;
# without it the predictions (and every score derived from them) change on
# each run even though the train/test split itself is seeded.
forest = RandomForestClassifier(n_estimators = 100, random_state = 42)

# Fit the forest to the training set, using the vectorized scene features as
# predictors and the genre labels as the response variable.
#
# This may take a few minutes to run
forest_vect = forest.fit(vect_train, genre_train)  # fit() returns the fitted estimator itself
results_vect = forest_vect.predict(vect_test)

Training the random forest using scene vectorized data...


In [122]:
from sklearn import metrics
# results = results.map(lambda x: x.strip('[').strip(']'))
predict_results_vect = []
for j in range(len(results_vect)):
    predict_results_vect.insert(-1,float(results_vect[j].strip('[').strip(']')))


## Convert actual results to float array for homogeneity comparison
# r = genre_test[0].map(lambda x: x.strip('[').strip(']'))
r = genre_test
r = np.array(r)
actual_results = []
for i in range(len(r)):
    actual_results.insert(-1,float(r[i]))

print("the predicted results using vectorized features trainig model are\n")
print(predict_results_vect)
print("the actual results are \n")
print(actual_results)
# b = [1,2,4,4]
print("Analysis of prediction results using vectorized features trainig model are\n")
print("Homogeneity: %0.3f" % metrics.homogeneity_score(predict_results_vect, actual_results))
print("Completeness: %0.3f" % metrics.completeness_score(predict_results_vect, actual_results))
print("V-measure: %0.3f" % metrics.v_measure_score(predict_results_vect, actual_results))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(predict_results_vect, actual_results))
# print("Silhouette Coefficient: %0.3f"
#       % metrics.silhouette_score(scene_Vect, predict_results, sample_size=1000))

the predicted results using vectorized features trainig model are

[3.0, 2.0, 1.0, 1.0, 2.0, 3.0, 1.0, 3.0, 3.0, 2.0, 2.0, 3.0, 1.0, 2.0, 1.0, 3.0, 3.0, 1.0, 1.0, 2.0, 2.0, 3.0, 2.0, 2.0, 1.0, 3.0, 1.0, 2.0, 1.0, 1.0, 2.0, 3.0, 1.0, 3.0, 2.0, 1.0, 2.0, 3.0, 1.0, 1.0, 2.0, 3.0, 3.0, 2.0, 1.0, 1.0, 2.0, 1.0, 2.0, 2.0, 3.0, 2.0, 1.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0, 2.0, 2.0, 1.0, 1.0, 2.0, 3.0, 2.0, 3.0, 2.0, 2.0, 2.0, 3.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 3.0, 1.0, 1.0, 2.0, 2.0, 2.0, 3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 3.0, 1.0, 2.0, 3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 3.0, 3.0, 1.0, 3.0, 3.0, 2.0, 3.0, 3.0, 2.0, 2.0, 3.0, 3.0, 3.0, 1.0, 2.0, 1.0, 3.0, 2.0, 1.0, 1.0, 2.0, 1.0, 1.0, 1.0, 2.0, 3.0, 1.0, 2.0, 3.0, 1.0, 3.0, 3.0, 2.0, 2.0, 1.0, 2.0, 1.0, 1.0, 2.0, 2.0, 1.0, 3.0, 1.0, 1.0, 1.0, 2.0, 3.0, 3.0, 2.0, 1.0, 1.0, 1.0, 2.0, 3.0, 3.0, 1.0, 1.0, 1.0, 2.0, 1.0, 1.0, 3.0, 1.0, 1.0, 2.0, 1.0, 2.0, 3.0, 3.0, 1.0, 1.0, 3.0, 1.0, 2.0, 2.0, 1.0, 3.0, 2.

In [115]:
print ("Training the random forest using scene readability feature data...")
from sklearn.ensemble import RandomForestClassifier

# Initialize a Random Forest classifier with 100 trees
forest = RandomForestClassifier(n_estimators = 100) 

# Fit the forest to the training set, using the bag of words as 
# features and the sentiment labels as the response variable
#
# This may take a few minutes to run
forest_read = forest.fit(readable_train, genre_train)
results_read = forest.predict(readable_test)


Training the random forest using scene readability feature data...


In [123]:
# results = results.map(lambda x: x.strip('[').strip(']'))
predict_results_read = []
for j in range(len(results_read)):
    predict_results_read.insert(-1,float(results_read[j].strip('[').strip(']')))


## Convert actual results to float array for homogeneity comparison
# r = genre_test[0].map(lambda x: x.strip('[').strip(']'))
# r = np.array(r)
# actual_results = []
# for i in range(len(r)):
#     actual_results.insert(-1,float(r[i]))

print("the predicted results using vectorized features trainig model are\n")
print(predict_results_read)
print("the actual results are \n")
print(actual_results)
print("Analysis of prediction results using vectorized features trainig model are\n")
print("Homogeneity: %0.3f" % metrics.homogeneity_score(predict_results_read, actual_results))
print("Completeness: %0.3f" % metrics.completeness_score(predict_results_read, actual_results))
print("V-measure: %0.3f" % metrics.v_measure_score(predict_results_read, actual_results))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(predict_results_read, actual_results))

the predicted results using vectorized features trainig model are

[3.0, 2.0, 3.0, 1.0, 2.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.0, 3.0, 1.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 1.0, 3.0, 3.0, 1.0, 2.0, 2.0, 3.0, 1.0, 2.0, 1.0, 3.0, 2.0, 2.0, 3.0, 1.0, 2.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0, 3.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 3.0, 2.0, 3.0, 1.0, 1.0, 1.0, 2.0, 2.0, 1.0, 3.0, 1.0, 1.0, 2.0, 3.0, 2.0, 2.0, 1.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 2.0, 3.0, 3.0, 1.0, 3.0, 3.0, 3.0, 1.0, 1.0, 3.0, 2.0, 1.0, 1.0, 2.0, 1.0, 3.0, 3.0, 1.0, 1.0, 3.0, 1.0, 3.0, 3.0, 2.0, 3.0, 2.0, 3.0, 1.0, 1.0, 3.0, 1.0, 1.0, 2.0, 1.0, 2.0, 1.0, 3.0, 3.0, 3.0, 3.0, 2.0, 1.0, 3.0, 2.0, 2.0, 1.0, 3.0, 3.0, 3.0, 2.0, 3.0, 3.0, 3.0, 3.0, 1.0, 2.0, 3.0, 1.0, 2.0, 1.0, 3.0, 2.0, 1.0, 3.0, 2.0, 3.0, 2.0, 2.0, 2.0, 2.0, 2.0, 1.0, 2.0, 2.0, 1.0, 3.0, 1.0, 1.0, 1.0, 3.0, 1.0, 2.0, 3.0, 2.0, 1.0, 3.0, 1.0, 2.0, 3.0, 3.0, 1.0, 3.0, 1.0, 1.0, 3.0, 1.0, 3.0, 1.0, 1.0, 2.0, 1.0, 3.0, 3.0, 1.0, 3.0, 3.0, 3.0, 1.0, 2.0, 1.0, 2.0, 3.0, 1.