In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import isnan, when, count, col
spark = SparkSession.builder.appName('733').getOrCreate()
sc = spark.sparkContext

In [2]:
annual_df = spark.read.csv('../annual_compustat.csv', header=True, inferSchema=True).limit(1000).cache()

In [3]:
nullcounts = spark.read.csv('annual_compustat_null_count.csv', header=False)

In [4]:
import csv

with open('annual_compustat_null_count.csv', 'r') as f:
  reader = csv.reader(f)
  your_list = list(reader)

# print(your_list)

In [5]:
null_count_list = your_list[0]

In [6]:
null_count_list = [float(x) for x in null_count_list]

In [7]:
good_columns = []
for i in range(0, len(null_count_list)):
    if null_count_list[i]==0:
        good_columns.append(i)

In [8]:
good_columns

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 14, 15, 18, 23, 26, 599, 601, 602]

In [9]:
great_columns = [annual_df.columns[i] for i in good_columns]

In [10]:
great_columns.append('rea')

In [11]:
great_columns

['gvkey',
 'datadate',
 'fyear',
 'indfmt',
 'consol',
 'popsrc',
 'datafmt',
 'tic',
 'cusip',
 'conm',
 'acctchg',
 'acctstd',
 'ajex',
 'ajp',
 'curcd',
 'fyr',
 'ogm',
 'prstkc',
 'prstkpc',
 'prvt',
 'rea']

In [12]:
print(great_columns)

['gvkey', 'datadate', 'fyear', 'indfmt', 'consol', 'popsrc', 'datafmt', 'tic', 'cusip', 'conm', 'acctchg', 'acctstd', 'ajex', 'ajp', 'curcd', 'fyr', 'ogm', 'prstkc', 'prstkpc', 'prvt', 'rea']


In [13]:
columns_num = [3, 10, 14]
annual_df = annual_df.select(*great_columns)
# df2.show()

In [14]:
annual_df.columns

['gvkey',
 'datadate',
 'fyear',
 'indfmt',
 'consol',
 'popsrc',
 'datafmt',
 'tic',
 'cusip',
 'conm',
 'acctchg',
 'acctstd',
 'ajex',
 'ajp',
 'curcd',
 'fyr',
 'ogm',
 'prstkc',
 'prstkpc',
 'prvt',
 'rea']

In [15]:
some_dict = {}
for x in annual_df.columns:
    some_dict[x] = 0
# some_dict

In [16]:
permuted_annual_df = annual_df.fillna(some_dict)

In [17]:
permuted_annual_dtypes = permuted_annual_df.dtypes

In [18]:
non_string_columns = [k for (k,v) in permuted_annual_dtypes if v != 'string']

In [19]:
permuted_annual_df_no_strings = permuted_annual_df.select(*non_string_columns)

In [20]:
feature_columns = [item for item in permuted_annual_df_no_strings.columns if item not in ['rea', 'features']]

In [23]:
permuted_annual_df_no_strings.show()

+-----+--------+-----+------+---+---+------+-------+----+------+
|gvkey|datadate|fyear|  ajex|ajp|fyr|prstkc|prstkpc|prvt|   rea|
+-----+--------+-----+------+---+---+------+-------+----+------+
| 1000|19611231| 1961|3.3418|1.0| 12|   0.0|    0.0| 0.0|   0.0|
| 1000|19621231| 1962|3.3418|1.0| 12|   0.0|    0.0| 0.0|   0.0|
| 1000|19631231| 1963|3.2445|1.0| 12|   0.0|    0.0| 0.0|   0.0|
| 1000|19641231| 1964|  3.09|1.0| 12|   0.0|    0.0| 0.0|   0.0|
| 1000|19651231| 1965|  3.09|1.0| 12|   0.0|    0.0| 0.0|   0.0|
| 1000|19661231| 1966|  3.09|1.0| 12|   0.0|    0.0| 0.0|   0.0|
| 1000|19671231| 1967|  3.09|1.0| 12|   0.0|    0.0| 0.0|   0.0|
| 1000|19681231| 1968|   3.0|1.0| 12|   0.0|    0.0| 0.0|   0.0|
| 1000|19691231| 1969|   1.0|1.0| 12|   0.0|    0.0| 0.0| 2.772|
| 1000|19701231| 1970|   1.0|1.0| 12|   0.0|    0.0| 0.0|   0.0|
| 1000|19711231| 1971|   1.0|1.0| 12| 0.086|    0.0| 0.0|   0.0|
| 1000|19721231| 1972|   1.0|1.0| 12| 4.067|    0.0| 0.0|   0.0|
| 1000|19731231| 1973|   

In [26]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# data = 
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=feature_columns, outputCol="features")

final_df = assembler.transform(permuted_annual_df_no_strings
)

In [27]:
final_final_df = final_df.drop(*feature_columns)

In [28]:
final_final_df.show()

+------+--------------------+
|   rea|            features|
+------+--------------------+
|   0.0|[1000.0,1.9611231...|
|   0.0|[1000.0,1.9621231...|
|   0.0|[1000.0,1.9631231...|
|   0.0|[1000.0,1.9641231...|
|   0.0|[1000.0,1.9651231...|
|   0.0|[1000.0,1.9661231...|
|   0.0|[1000.0,1.9671231...|
|   0.0|[1000.0,1.9681231...|
| 2.772|[1000.0,1.9691231...|
|   0.0|[1000.0,1.9701231...|
|   0.0|[1000.0,1.9711231...|
|   0.0|[1000.0,1.9721231...|
|   0.0|[1000.0,1.9731231...|
|   0.0|[1000.0,1.9741231...|
|-1.656|[1000.0,1.9751231...|
|   0.0|[1000.0,1.9761231...|
|   0.0|[1000.0,1.9771231...|
|   0.0|[1001.0,1.9781231...|
|   0.0|[1001.0,1.9791231...|
|   0.0|[1001.0,1.9801231...|
+------+--------------------+
only showing top 20 rows



In [29]:
final_final_df = final_final_df.withColumn('label', final_final_df.rea)

In [30]:
final_final_df.show()

+------+--------------------+------+
|   rea|            features| label|
+------+--------------------+------+
|   0.0|[1000.0,1.9611231...|   0.0|
|   0.0|[1000.0,1.9621231...|   0.0|
|   0.0|[1000.0,1.9631231...|   0.0|
|   0.0|[1000.0,1.9641231...|   0.0|
|   0.0|[1000.0,1.9651231...|   0.0|
|   0.0|[1000.0,1.9661231...|   0.0|
|   0.0|[1000.0,1.9671231...|   0.0|
|   0.0|[1000.0,1.9681231...|   0.0|
| 2.772|[1000.0,1.9691231...| 2.772|
|   0.0|[1000.0,1.9701231...|   0.0|
|   0.0|[1000.0,1.9711231...|   0.0|
|   0.0|[1000.0,1.9721231...|   0.0|
|   0.0|[1000.0,1.9731231...|   0.0|
|   0.0|[1000.0,1.9741231...|   0.0|
|-1.656|[1000.0,1.9751231...|-1.656|
|   0.0|[1000.0,1.9761231...|   0.0|
|   0.0|[1000.0,1.9771231...|   0.0|
|   0.0|[1001.0,1.9781231...|   0.0|
|   0.0|[1001.0,1.9791231...|   0.0|
|   0.0|[1001.0,1.9801231...|   0.0|
+------+--------------------+------+
only showing top 20 rows



In [None]:
# final_final_df.write.parquet("final_final_df2.parquet")

In [56]:
from pyspark.sql import SQLContext
sqlContext = SQLContext(sc)
ml_df = sqlContext.read.parquet("final_final_df2.parquet")

In [57]:
ml_df.show()

+------+--------------------+------+
|   rea|            features| label|
+------+--------------------+------+
|   0.0|[1000.0,1.9611231...|   0.0|
|   0.0|[1000.0,1.9621231...|   0.0|
|   0.0|[1000.0,1.9631231...|   0.0|
|   0.0|[1000.0,1.9641231...|   0.0|
|   0.0|[1000.0,1.9651231...|   0.0|
|   0.0|[1000.0,1.9661231...|   0.0|
|   0.0|[1000.0,1.9671231...|   0.0|
|   0.0|[1000.0,1.9681231...|   0.0|
| 2.772|[1000.0,1.9691231...| 2.772|
|   0.0|[1000.0,1.9701231...|   0.0|
|   0.0|[1000.0,1.9711231...|   0.0|
|   0.0|[1000.0,1.9721231...|   0.0|
|   0.0|[1000.0,1.9731231...|   0.0|
|   0.0|[1000.0,1.9741231...|   0.0|
|-1.656|[1000.0,1.9751231...|-1.656|
|   0.0|[1000.0,1.9761231...|   0.0|
|   0.0|[1000.0,1.9771231...|   0.0|
|   0.0|[1001.0,1.9781231...|   0.0|
|   0.0|[1001.0,1.9791231...|   0.0|
|   0.0|[1001.0,1.9801231...|   0.0|
+------+--------------------+------+
only showing top 20 rows



In [58]:
from pyspark.ml.regression import LinearRegression
lr = LinearRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
train = final_final_df
lrModel = lr.fit(train)

# Print the coefficients and intercept for linear regression
print("Coefficients: %s" % str(lrModel.coefficients))
print("Intercept: %s" % str(lrModel.intercept))

# Summarize the model over the training set and print out some metrics
trainingSummary = lrModel.summary
print("numIterations: %d" % trainingSummary.totalIterations)
print("objectiveHistory: %s" % str(trainingSummary.objectiveHistory))
trainingSummary.residuals.show()
print("RMSE: %f" % trainingSummary.rootMeanSquaredError)
print("r2: %f" % trainingSummary.r2)

Coefficients: [0.0,-1.06449397447e-06,-0.00244103246633,0.0,0.0,0.0,0.0,0.0,0.0]
Intercept: 25.74190069013414
numIterations: 11
objectiveHistory: [0.5000000000000001, 0.4999142911765882, 0.4998611581179943, 0.4998610757468472, 0.49986107450295986, 0.4998610630318895, 0.4998610567796463, 0.499861007730838, 0.49986088950678204, 0.4998608886775636, 0.4998608885466327]
+--------------------+
|           residuals|
+--------------------+
|-0.07899879220746797|
|-0.06591281999643073|
|-0.05282684778539348|
|-0.03974087557435624|
|-0.02665490336331...|
|-0.01356893115227...|
|-4.82958941244504...|
| 0.01260301326979274|
|    2.79768898548083|
| 0.03877495769186723|
| 0.05186092990290447|
| 0.06494690211394172|
| 0.07803287432498252|
| 0.09111884653601621|
| -1.5517951812529465|
|  0.1172907909580907|
| 0.13037676316912794|
| 0.14346273538016519|
| 0.15654870759120243|
| 0.16963467980224323|
+--------------------+
only showing top 20 rows

RMSE: 10.106638
r2: 0.001070


In [59]:
ml_df.show()

+------+--------------------+------+
|   rea|            features| label|
+------+--------------------+------+
|   0.0|[1000.0,1.9611231...|   0.0|
|   0.0|[1000.0,1.9621231...|   0.0|
|   0.0|[1000.0,1.9631231...|   0.0|
|   0.0|[1000.0,1.9641231...|   0.0|
|   0.0|[1000.0,1.9651231...|   0.0|
|   0.0|[1000.0,1.9661231...|   0.0|
|   0.0|[1000.0,1.9671231...|   0.0|
|   0.0|[1000.0,1.9681231...|   0.0|
| 2.772|[1000.0,1.9691231...| 2.772|
|   0.0|[1000.0,1.9701231...|   0.0|
|   0.0|[1000.0,1.9711231...|   0.0|
|   0.0|[1000.0,1.9721231...|   0.0|
|   0.0|[1000.0,1.9731231...|   0.0|
|   0.0|[1000.0,1.9741231...|   0.0|
|-1.656|[1000.0,1.9751231...|-1.656|
|   0.0|[1000.0,1.9761231...|   0.0|
|   0.0|[1000.0,1.9771231...|   0.0|
|   0.0|[1001.0,1.9781231...|   0.0|
|   0.0|[1001.0,1.9791231...|   0.0|
|   0.0|[1001.0,1.9801231...|   0.0|
+------+--------------------+------+
only showing top 20 rows



In [60]:
ml_df = ml_df.withColumn('boolean_label', ml_df.rea != 0)

In [61]:
ml_df = ml_df.withColumn('label', ml_df.boolean_label.cast('float'))

In [62]:
ml_df.show()

+------+--------------------+-----+-------------+
|   rea|            features|label|boolean_label|
+------+--------------------+-----+-------------+
|   0.0|[1000.0,1.9611231...|  0.0|        false|
|   0.0|[1000.0,1.9621231...|  0.0|        false|
|   0.0|[1000.0,1.9631231...|  0.0|        false|
|   0.0|[1000.0,1.9641231...|  0.0|        false|
|   0.0|[1000.0,1.9651231...|  0.0|        false|
|   0.0|[1000.0,1.9661231...|  0.0|        false|
|   0.0|[1000.0,1.9671231...|  0.0|        false|
|   0.0|[1000.0,1.9681231...|  0.0|        false|
| 2.772|[1000.0,1.9691231...|  1.0|         true|
|   0.0|[1000.0,1.9701231...|  0.0|        false|
|   0.0|[1000.0,1.9711231...|  0.0|        false|
|   0.0|[1000.0,1.9721231...|  0.0|        false|
|   0.0|[1000.0,1.9731231...|  0.0|        false|
|   0.0|[1000.0,1.9741231...|  0.0|        false|
|-1.656|[1000.0,1.9751231...|  1.0|         true|
|   0.0|[1000.0,1.9761231...|  0.0|        false|
|   0.0|[1000.0,1.9771231...|  0.0|        false|


In [63]:
ml_df = ml_df.drop('rea').drop('boolean_label')
# ml_df.drop('label')
# ml_df.drop('boolean_label')

In [64]:
ml_df.show()

+--------------------+-----+
|            features|label|
+--------------------+-----+
|[1000.0,1.9611231...|  0.0|
|[1000.0,1.9621231...|  0.0|
|[1000.0,1.9631231...|  0.0|
|[1000.0,1.9641231...|  0.0|
|[1000.0,1.9651231...|  0.0|
|[1000.0,1.9661231...|  0.0|
|[1000.0,1.9671231...|  0.0|
|[1000.0,1.9681231...|  0.0|
|[1000.0,1.9691231...|  1.0|
|[1000.0,1.9701231...|  0.0|
|[1000.0,1.9711231...|  0.0|
|[1000.0,1.9721231...|  0.0|
|[1000.0,1.9731231...|  0.0|
|[1000.0,1.9741231...|  0.0|
|[1000.0,1.9751231...|  1.0|
|[1000.0,1.9761231...|  0.0|
|[1000.0,1.9771231...|  0.0|
|[1001.0,1.9781231...|  0.0|
|[1001.0,1.9791231...|  0.0|
|[1001.0,1.9801231...|  0.0|
+--------------------+-----+
only showing top 20 rows



In [65]:
# Split the data into train and test
splits = ml_df.randomSplit([0.6, 0.4], 12)
train = splits[0]
test = splits[1]

# specify layers for the neural network:
# input layer of size 4 (features), two intermediate of size 5 and 4
# and output of size 3 (classes)
layers = [1514, 1514, 1514, 2]

# create the trainer and set its parameters
trainer = MultilayerPerceptronClassifier(maxIter=10, layers=layers, blockSize=128, seed=1234)

In [66]:
train.first()

Row(features=DenseVector([1000.0, 19611231.0, 1961.0, 3.3418, 1.0, 12.0, 0.0, 0.0, 0.0]), label=0.0)

In [72]:
train.first()

Row(features=DenseVector([1000.0, 19611231.0, 1961.0, 3.3418, 1.0, 12.0, 0.0, 0.0, 0.0]), label=0.0)

In [73]:
train.schema

StructType(List(StructField(features,VectorUDT,true),StructField(label,FloatType,true)))

In [70]:
# train the model
model = trainer.fit(train)

Py4JJavaError: An error occurred while calling o294.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 66.0 failed 1 times, most recent failure: Lost task 0.0 in stage 66.0 (TID 70, localhost, executor driver): java.lang.ArrayIndexOutOfBoundsException
	at java.lang.System.arraycopy(Native Method)
	at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3$$anonfun$apply$4.apply(Layer.scala:628)
	at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3$$anonfun$apply$4.apply(Layer.scala:627)
	at scala.collection.immutable.List.foreach(List.scala:381)
	at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3.apply(Layer.scala:627)
	at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3.apply(Layer.scala:623)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:216)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1038)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1029)
	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:969)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1029)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:760)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:285)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1499)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1487)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1486)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1486)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:814)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:814)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1714)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1669)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1658)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:630)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2022)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2043)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2062)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2087)
	at org.apache.spark.rdd.RDD.count(RDD.scala:1158)
	at org.apache.spark.mllib.optimization.LBFGS$.runLBFGS(LBFGS.scala:195)
	at org.apache.spark.mllib.optimization.LBFGS.optimize(LBFGS.scala:142)
	at org.apache.spark.ml.ann.FeedForwardTrainer.train(Layer.scala:817)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassifier.train(MultilayerPerceptronClassifier.scala:267)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassifier.train(MultilayerPerceptronClassifier.scala:145)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:118)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:82)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:280)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:214)
	at java.lang.Thread.run(Thread.java:748)
Caused by: java.lang.ArrayIndexOutOfBoundsException
	at java.lang.System.arraycopy(Native Method)
	at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3$$anonfun$apply$4.apply(Layer.scala:628)
	at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3$$anonfun$apply$4.apply(Layer.scala:627)
	at scala.collection.immutable.List.foreach(List.scala:381)
	at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3.apply(Layer.scala:627)
	at org.apache.spark.ml.ann.DataStacker$$anonfun$5$$anonfun$apply$3.apply(Layer.scala:623)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at scala.collection.Iterator$$anon$11.next(Iterator.scala:409)
	at org.apache.spark.storage.memory.MemoryStore.putIteratorAsValues(MemoryStore.scala:216)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1038)
	at org.apache.spark.storage.BlockManager$$anonfun$doPutIterator$1.apply(BlockManager.scala:1029)
	at org.apache.spark.storage.BlockManager.doPut(BlockManager.scala:969)
	at org.apache.spark.storage.BlockManager.doPutIterator(BlockManager.scala:1029)
	at org.apache.spark.storage.BlockManager.getOrElseUpdate(BlockManager.scala:760)
	at org.apache.spark.rdd.RDD.getOrCompute(RDD.scala:334)
	at org.apache.spark.rdd.RDD.iterator(RDD.scala:285)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:108)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:335)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	... 1 more


In [71]:
# compute accuracy on the test set
result = model.transform(test)
predictionAndLabels = result.select("prediction", "label")
evaluator = MulticlassClassificationEvaluator(metricName="accuracy")
print("Test set accuracy = " + str(evaluator.evaluate(predictionAndLabels)))

NameError: name 'model' is not defined