In [1]:
import os
os.environ['SPARK_HOME']='/home/envmodules/lib/spark-2.2.0-bin-hadoop2.7/'
import findspark
findspark.init()

# Imports
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan, when, count, col

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.mllib.evaluation import MulticlassMetrics

spark = SparkSession.builder.appName('733').getOrCreate()

# Using the integrated file to start working on
integrated_df = spark.read.parquet('/user/vcs/annual_integrated_dataset_with_labels_ibes_fix_v2.parquet')
#spark.read.parquet('/user/vcs/annual_integrated_dataset_v2.parquet')

In [2]:
misstated_df = integrated_df.filter(integrated_df.label == 1.0)

In [3]:
misstated_count = misstated_df.count()

In [4]:
non_misstated_df = integrated_df.filter(integrated_df.label == 0.0).limit(misstated_count)

In [5]:
non_misstated_df.count()

1694

In [6]:
integrated_df = misstated_df.union(non_misstated_df).cache()

In [7]:
# Using nullcounts to filter columns to keep
nullcounts = integrated_df.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in integrated_df.columns])
nc = list(nullcounts.first())

# Extracting out an industrial segment and modelling on it instead of the whole dataset
# Services-packaged software category selection (from EDA)
services_prepacked_software = integrated_df #.filter(integrated_df.sic == '7372')
print('Total records in integrated file: ', integrated_df.count())
print(services_prepacked_software.show())
print('Number of records in Services-packaged software industrial category: ', services_prepacked_software.count())

# Reusing preprocessing steps implemented by Vincent
some_dict = {}
for x in services_prepacked_software.columns:
	some_dict[x] = 0

nwdf = services_prepacked_software.fillna(some_dict)

good_columns = []
for i in range(0, len(nc)):
	if nc[i] == 0:
		good_columns.append(i)

great_columns = [nwdf.columns[i] for i in good_columns]
great_columns.append('rea')
nwdf = nwdf.fillna(some_dict)

non_string_columns = [k for (k,v) in nwdf.dtypes if v != 'string']
nwdf_no_strings = nwdf.select(*non_string_columns)
feature_columns = [item for item in nwdf_no_strings.columns if item not in ['rea', 'features', 'label', 'rea_label']]

assembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
final_df = assembler.transform(nwdf_no_strings)
final_final_df = final_df.drop(*feature_columns).cache()

Total records in integrated file:  3388
+-----+------+--------------------+------+------+-------+-------+-------+-------+--------+--------+--------+----+--------+-------+-------+---------+-------+-------+----+--------+----+--------+-------+----+-----+-----+-----+-----+----+----+-------+------+-------+------+----+--------+-------+----+-------+------+------+-------+------+------+----+-----+-------+-------+----+-----+-----+-----+------+-------+-------+------+--------+-------+----+------+------+----+--------+----+-------+------+------+----+--------+----+-------+--------+-----+-----+-------+--------+-----+--------+------+------+-------+------+----+-----+-----+----+----+----+----+--------+--------+--------+--------+-----+-----+------+--------+-------+-------+------+-------+-------+--------+--------+-------+-------+------+------+-------+-------+-----+-------+-------+-----+-----+-----+-------+-----+-----+------+-------+-----+--------+--------+--------+--------+--------+------+------+-------+--

Number of records in Services-packaged software industrial category:  3388


In [8]:
final_final_df.columns

['rea', 'label', 'features']

In [9]:
# String indexing not required
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(final_final_df)
td = si_model.transform(final_final_df)

# Evaluators
evaluator = MulticlassClassificationEvaluator(metricName = 'accuracy')
eval = BinaryClassificationEvaluator()

# RandomForest classifier
rf = RandomForestClassifier(numTrees=100, maxDepth=16, labelCol="indexed", seed=42)
model = rf.fit(td)
result = model.transform(final_final_df)
print('Accuracy on training data: ', evaluator.evaluate(result))

# Train test split for model evaluation
train, test = final_final_df.randomSplit([0.7, 0.3], seed=12345)

rf = RandomForestClassifier(numTrees=100, maxDepth=16, labelCol="label", seed=42)
print('Training RandomForest model on training set. \n Model parameters: {}'.format(rf._paramMap))
trained_model = rf.fit(train)
res = trained_model.transform(test)
metrics = MulticlassMetrics(res.select(['label', 'prediction']).rdd)
print('Accuracy on test set: ', evaluator.evaluate(res))
print('Precision on test data: ', metrics.precision())
print('Recall on test data: ', metrics.recall())
print('F1 Score on test data: ', metrics.fMeasure())
print('Area under ROC curve: ', eval.evaluate(res))

Accuracy on training data:  0.9911452184179457
Training RandomForest model on training set. 
 Model parameters: {Param(parent='RandomForestClassifier_4484859be720c744bed3', name='seed', doc='random seed.'): 42, Param(parent='RandomForestClassifier_4484859be720c744bed3', name='maxDepth', doc='Maximum depth of the tree. (>= 0) E.g., depth 0 means 1 leaf node; depth 1 means 1 internal node + 2 leaf nodes.'): 16, Param(parent='RandomForestClassifier_4484859be720c744bed3', name='numTrees', doc='Number of trees to train (>= 1).'): 100, Param(parent='RandomForestClassifier_4484859be720c744bed3', name='labelCol', doc='label column name.'): 'label'}
Accuracy on test set:  0.8181818181818182




Precision on test data:  0.8181818181818182
Recall on test data:  0.8181818181818182
F1 Score on test data:  0.8181818181818182




Area under ROC curve:  0.910230196267877


In [10]:
# Logistic regression
print('Training LogisticRegression model on training set.')
logistic = LogisticRegression(regParam=0.1, labelCol="label")#, thresholds = [0.2, 0.5])
trained_model = logistic.fit(train)
res = trained_model.transform(test)
metrics = MulticlassMetrics(res.select(['label', 'prediction']).rdd)
print('Accuracy on test set: ', evaluator.evaluate(res))
print('Precision on test data: ', metrics.precision())
print('Recall on test data: ', metrics.recall())
print('F1 Score on test data: ', metrics.fMeasure())
print('Area under ROC curve: ', eval.evaluate(res))


from pyspark.ml.classification import LogisticRegression

# Extract the summary from the returned LogisticRegressionModel instance trained
# in the earlier example
trainingSummary = trained_model.summary

# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

# Set the model threshold to maximize F-Measure
fMeasure = trainingSummary.fMeasureByThreshold
maxFMeasure = fMeasure.groupBy().max('F-Measure').select('max(F-Measure)').head()
bestThreshold = fMeasure.where(fMeasure['F-Measure'] == maxFMeasure['max(F-Measure)']) \
    .select('threshold').head()['threshold']
logistic.setThreshold(bestThreshold)

res = trained_model.transform(test)

print('best threshold is:' + str(bestThreshold))

Training LogisticRegression model on training set.
Accuracy on test set:  0.7272727272727273




Precision on test data:  0.7272727272727273
Recall on test data:  0.7272727272727273
F1 Score on test data:  0.7272727272727273




Area under ROC curve:  0.7928720420322581
objectiveHistory:
0.693054780889867
0.6927449570920323
0.6865521824268418
0.6573807051265931
0.6458610500184775
0.608905822853849
0.5982274074795306
0.5899200989638572
0.5855751704514335
0.5848942940464251
0.5845607566022901
0.5844575005804169
0.5843962818977303
0.5843542824925069
0.5842710693776809
0.5840686379667835
0.5839878679688942
0.5836606303329172
0.5833755528413657
0.5832948559603599
0.5832747982269266
0.5830823139698014
0.582944344836495
0.582922037350081
0.582909445246052
0.5828923915865181
0.5828699972713551
0.5828682818912709
0.5828674201128912
0.5828672682559426
0.5828671129777523
0.5828667465123549
0.5828657161094727
0.5828634775517001
0.5828626967212738
0.582862266598108
0.5828603791362766
0.5828599061634633
0.5828598507413352
0.5828598364081701
0.5828597909002018
0.5828596046338972
0.5828593206590715
0.5828587999828764
0.5828581187817667
0.5828578467373607
0.582857134282419
0.5828532979027322
0.5828529634087118
0.58285174253448

In [11]:
test.groupby('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0|  533|
|  1.0|  501|
+-----+-----+



In [12]:
res = res.withColumn('correct', res.label == res.prediction)

#positive class (misstatements)
true_positives_df = res.filter(res.prediction == 1.0).filter(res.label == 1.0)
ground_truth_positives_df = res.filter(res.label == 1.0)
misstatement_recall = true_positives_df.count()/ground_truth_positives_df.count()

new_all_predicted_positive_df = res.filter(res.prediction == 1.0)
misstatement_precision = true_positives_df.count()/new_all_predicted_positive_df.count()

#negative class (non misstatements)
true_negative_df = res.filter(res.prediction == 0.0).filter(res.label == 0.0)
ground_truth_negative_df = res.filter(res.label == 0.0)
non_misstatement_recall = true_negative_df.count()/ground_truth_negative_df.count()

new_all_predicted_negative_df = res.filter(res.prediction == 0.0)
non_misstatement_precision = true_negative_df.count()/new_all_predicted_negative_df.count()

print('misstatement_precision is {}, misstatement recall is {}'.format(misstatement_precision, misstatement_recall))
print('non_misstatement_precision is {}, non_misstatement recall is {}'.format(non_misstatement_precision, non_misstatement_recall))


misstatement_precision is 0.689119170984456, misstatement recall is 0.7964071856287425
non_misstatement_precision is 0.7758241758241758, non_misstatement recall is 0.6622889305816135


In [13]:
new_all_predicted_positive_df.count()

579

In [14]:
res.count()

1034

In [15]:
true_positives_df = res.filter(res.prediction == 1.0).filter(res.label == 1.0)

In [16]:
true_positives_df.count()

399

In [17]:
res.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|       0.0|  455|
|       1.0|  579|
+----------+-----+



In [18]:
res.groupBy('label').count().show()

+-----+-----+
|label|count|
+-----+-----+
|  0.0|  533|
|  1.0|  501|
+-----+-----+

