In [None]:
# --- Run configuration for the random-forest notebook ---

# Bucket prefix under which the count tables (counts/*.parquet) are read.
src_bucket = 'gs://micky-practicum/'
# Text file that receives one predicted label per line.
dest = 'home/zainmeekail/rf_predictions.txt'
# Prefix holding the X_train.txt / y_train.txt / X_test.txt listing files.
indicator_path = 'gs://uga-dsp/project1/files/'

# Seed kept alongside the model settings for reproducible training.
RANDOM_SEED = 13579
# Random-forest hyperparameters: number of trees and maximum tree depth.
RF_NUM_TREES = 300
RF_MAX_DEPTH = 15

In [None]:
from pyspark.mllib.linalg import Vectors
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql.types import StructType, StructField, IntegerType, LongType, FloatType, ArrayType, StringType
from pyspark.ml import Pipeline
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer, VectorAssembler

In [None]:
# Pull the training hash list and its parallel label list onto the driver.
train_hashes = sc.textFile(f'{indicator_path}X_train.txt').collect()
train_labels = sc.textFile(f'{indicator_path}y_train.txt').collect()

# Pair each hash with the label found at the same position and broadcast
# the resulting lookup table so that executors can resolve labels locally.
hash_labels = sc.broadcast(
    {file_hash: train_labels[pos] for pos, file_hash in enumerate(train_hashes)}
)

# The driver-side lists are not needed once the dict has been broadcast.
del train_hashes, train_labels

In [None]:
def buildSchema(mode):
    """Build the Spark schema for the labelled count table.

    The schema is a LongType column 'Y' followed by 257 count columns named
    hexGen(0) .. hexGen(256), all sharing the numeric type chosen by `mode`.

    @param mode: 'int', 'long' or 'float' -- selects IntegerType, LongType or
                 FloatType for the count columns
    @return: a StructType with 258 fields
    @raise ValueError: if `mode` is not one of the supported names
    """
    count_types = {'int': IntegerType, 'long': LongType, 'float': FloatType}
    if mode not in count_types:
        # Previously an unknown mode left the type as None and failed later
        # with "'NoneType' object is not callable"; fail early and clearly.
        raise ValueError(
            f"unsupported mode {mode!r}; expected one of {sorted(count_types)}"
        )
    t = count_types[mode]

    label_field = StructField('Y', LongType())
    count_fields = [StructField(hexGen(i), t()) for i in range(257)]
    return StructType([label_field] + count_fields)

In [None]:
def hash_to_label(x):
    """Swap the leading hash of a row for its integer training label.

    @param x: sequence whose first element is a key of the broadcast
              `hash_labels` dict
    @return: list holding int(label) followed by the remaining elements of x
    """
    label = int(hash_labels.value[x[0]])
    return [label, *x[1:]]

In [None]:
# Map a word index to its two-character column name.
# @param i: int in [0,256]; 0..255 become upper-case hex '00'..'FF',
#           and anything from 256 upwards becomes '??'
def hexGen(i):
    if i < 256:
        digits = hex(i)[2:].upper()
        # Left-pad with a zero, then keep only the last two characters.
        return ('0' + digits)[-2:]
    return '??'

In [None]:
# Load the training counts, replace each row's leading hash with its label,
# and convert the result to a typed DataFrame.
_data = (
    sqlContext.read.load(f'{src_bucket}counts/X_train.parquet')
    .rdd
    .map(hash_to_label)
)

df = spark.createDataFrame(_data, schema=buildSchema('long'))

# Pack every count column (everything after 'Y') into one feature vector.
assembler = VectorAssembler().setInputCols(df.schema.names[1:]).setOutputCol('features')

data = assembler.transform(df).selectExpr("Y as label", 'features')

# Index labels, adding metadata to the label column.
# Fit on the whole training set so that every label is present in the index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(data)

# All labelled rows are used for training; no hold-out split is made here.
trainingData = data

# The test table keeps its 'hash' column so predictions can be keyed by it.
_testData = sqlContext.read.load(f'{src_bucket}counts/X_test.parquet')

# NOTE(review): assumes the first test column is 'hash' and the remaining
# columns are the counts in the same order as the training columns -- confirm.
test_assembler = VectorAssembler().setInputCols(_testData.schema.names[1:]).setOutputCol('features')

testData = test_assembler.transform(_testData).select('hash', 'features')

# Configure the random forest. The seed is passed explicitly so that
# re-running the notebook trains the same forest.
rf = RandomForestClassifier(
    labelCol="indexedLabel",
    featuresCol="features",
    numTrees=RF_NUM_TREES,
    maxDepth=RF_MAX_DEPTH,
    seed=RANDOM_SEED,
)

# Convert indexed labels back to original labels.
labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)

# Chain the label indexer, the forest and the label converter in a Pipeline.
pipeline = Pipeline(stages=[labelIndexer, rf, labelConverter])

# Train model.  This also runs the indexers.
model = pipeline.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)

# Collect every (hash -> predicted label) pair to the driver as a dict.
output = predictions.select("hash", "predictedLabel").rdd.collectAsMap()



In [None]:
# Write the predictions in the order given by X_test.txt, one label per line.
test_hashes = sc.textFile(f'{indicator_path}X_test.txt').collect()

# Open in write mode: with append mode every re-run of this cell added a
# second full copy of the predictions to the same file.
with open(dest, 'w') as the_file:
    the_file.writelines(f'{output[h]}\n' for h in test_hashes)