In [4]:
# install Spark
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://www-us.apache.org/dist/spark/spark-3.0.1/spark-3.0.1-bin-hadoop2.7.tgz
!tar xf spark-3.0.1-bin-hadoop2.7.tgz

In [10]:
!pip install -q findspark


In [11]:
import os

# Tell findspark where the JDK and the unpacked Spark distribution live.
# Both variables must be set before findspark.init() is called.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.0.1-bin-hadoop2.7",
    }
)

import findspark

# Make the pyspark package under SPARK_HOME importable in this kernel.
findspark.init()

In [87]:
# ---- Notebook configuration for the random forest experiment ----

# Input files: feature tables (CSV) and label lists (one label per line).
TRAIN_PATH = "X_small_train.csv"
LABELS = "project1_files_y_small_train.txt"
TEST_PATH = "X_small_test.csv"
LABELS_TEST = "y_small_test.txt"

# Spark session settings.
APP_NAME = "Random Forest Classifier"
SPARK_URL = "local[*]"

# Random forest hyper-parameters.
RF_NUM_TREES = 3
RF_MAX_DEPTH = 4
RF_MAX_BINS = 32
IMPURITY = "gini"

# Seed shared by the train/test split and the forest so runs are repeatable.
RANDOM_SEED = 13579

In [88]:
from pyspark import SparkContext
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.tree import RandomForest
from time import *
from pyspark.sql import *
from pyspark.sql.functions import *

# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .master(SPARK_URL)
    .appName(APP_NAME)
    .getOrCreate()
)

In [90]:

# Features: CSV file with a header row; column types are inferred by Spark.
df_train_X = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(TRAIN_PATH)
)

# Labels: headerless text file parsed as a space-separated CSV, exposed as a single column 'Y'.
# The labels are then shifted down by one (1,2,...,9 -> 0,1,...,8) so that class ids
# start at 0 for the classifier.
df_train_Y = (
    spark.read
    .format("csv")
    .options(sep=" ", inferSchema="true", header="false")
    .load(LABELS)
    .toDF("Y")
    .withColumn("Y", col("Y") - lit(1))
)

# Combine features and labels into one dataframe by pairing them on a row index.
#
# The window must number rows in the order they were read. Ordering by a constant
# (lit(1)) makes every row a tie, so Spark may number them in any order and a feature
# row can end up joined to the wrong label. monotonically_increasing_id() increases
# with the partition index and with the position inside each partition, so ordering
# by it keeps the order in which each dataframe was read.
w = Window.orderBy(monotonically_increasing_id())

# Add a zero-based row index 'rn' to dataframes X and Y.
df_X = df_train_X.withColumn("rn", row_number().over(w) - 1)
df_Y = df_train_Y.withColumn("rn", row_number().over(w) - 1)

# Join X and Y on the row index, then drop the index, which is not needed afterwards.
df_train = df_X.join(df_Y, ["rn"]).drop("rn")

### To view the dataframe, uncomment the line below.
# df_train.show()

### The two lines below are no longer used; they are kept for reference only.
# drop first two columns for training
# df_train = df_train.drop('_c0','hash')


# Turn the joined dataframe into an RDD of MLlib LabeledPoint objects.
# Row layout: position 0 is the index column and position 1 the hash column (neither is
# used for training), the last position is the label 'Y', and everything in between
# is a feature.
def _row_to_labeled_point(row):
    """Build a LabeledPoint from one row: label from the last field, features from field 2 on."""
    features = Vectors.dense(row[2:-1])
    return LabeledPoint(row[-1], features)


transformed_df = df_train.rdd.map(_row_to_labeled_point)

In [None]:
# Keep 90% of the labelled rows for training and hold out 10% for evaluation.
splits = [0.9, 0.1]

# Cache the parent RDD before splitting. Each action on a split re-evaluates the whole
# lineage, and the join that feeds transformed_df does not guarantee the same row order
# on every recomputation; without caching, the seeded sampling could put a row in both
# splits or in neither.
transformed_df.cache()
training_data, test_data = transformed_df.randomSplit(splits, RANDOM_SEED)

print("Number of training set rows: %d" % training_data.count())
print("Number of test set rows: %d" % test_data.count())

In [None]:
# Note the time just before training so the training duration can be reported below.
start_time = time()

# Train the random forest classifier. Parameter notes:
#   categoricalFeaturesInfo - Map storing arity of categorical features. An entry (n to k)
#                             indicates that feature n is categorical with k categories
#                             indexed from 0: {0, 1, ..., k-1}. Empty here, so no feature
#                             is declared categorical.
#   featureSubsetStrategy   - Number of features to consider for splits at each node.
#   impurity                - Criterion used for information gain calculation.
#                             See https://spark.apache.org/docs/latest/mllib-decision-tree.html
#   maxDepth                - Maximum depth of the tree (e.g. depth 0 means 1 leaf node,
#                             depth 1 means 1 internal node + 2 leaf nodes).
#   maxBins                 - Maximum number of bins used for splitting features.
model = RandomForest.trainClassifier(
    training_data,
    numClasses=9,
    categoricalFeaturesInfo={},
    numTrees=RF_NUM_TREES,
    featureSubsetStrategy="auto",
    impurity=IMPURITY,
    maxDepth=RF_MAX_DEPTH,
    maxBins=RF_MAX_BINS,
    seed=RANDOM_SEED,
)

# Report how long training took.
end_time = time()
training_time = end_time - start_time
print("Time to train model: %.3f seconds" % training_time)

In [None]:
# Predict a class for every held-out row, using only its feature vector.
predictions = model.predict(test_data.map(lambda point: point.features))

# Pair each true label with the corresponding prediction.
true_labels = test_data.map(lambda point: point.label)
labels_and_predictions = true_labels.zip(predictions)

# Accuracy = fraction of pairs whose label and prediction agree.
matches = labels_and_predictions.filter(lambda pair: pair[0] == pair[1])
acc = matches.count() / float(test_data.count())
print("Model accuracy: %.3f%%" % (acc * 100))