In [1]:
# Tree-ensemble hyper-parameters: number of trees for the random forest and
# the maximum depth shared by the random-forest and gradient-boosted models.
RF_NUM_TREES = 300
RF_MAX_DEPTH = 15

In [2]:
import json
import os
import findspark
import numpy as np
from argparse import ArgumentParser
from string import punctuation

In [168]:
# Locates the local Spark installation and adds pyspark to the Python path;
# convenient when the environment is managed by a tool such as conda.

findspark.init()
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer, VectorAssembler
from pyspark.mllib.linalg import Vectors
from pyspark.ml.classification import RandomForestClassifier, GBTClassifier, MultilayerPerceptronClassifier, LinearSVC, LogisticRegression
from pyspark.sql.types import StructType, StructField, IntegerType, LongType, FloatType, ArrayType, StringType, DoubleType
from pyspark.ml import Pipeline

In [4]:
# Initial Spark configuration: run locally on every available core.
# The driver heap size is requested here, before the SparkContext launches the
# JVM; the same key passed to the SparkSession builder further down is applied
# only once the JVM is already running and therefore cannot resize it.
conf = (
    SparkConf()
    .setMaster("local[*]")
    .set("spark.driver.memory", "6g")
)

In [5]:
# Spark context through which RDD operations are processed.
# getOrCreate() keeps the cell re-runnable: constructing a second SparkContext
# in the same kernel raises an error, whereas this returns the running one.
sc = SparkContext.getOrCreate(conf=conf)

In [6]:
# Session used for all DataFrame work.
# NOTE(review): a SparkContext already exists by this point, so getOrCreate()
# presumably attaches to it instead of starting one with the master and memory
# given here — confirm.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Word Count")
    .config("spark.driver.memory", "6g")
    .getOrCreate()
)

In [7]:
def mold(df, labeled=True):
    """Project a raw CSV frame down to id, numeric features and optional label.

    Keeps the column at position 1 (cast to long; a column named "Face ID" is
    renamed to ``face_id``) together with every column from position 145
    onwards. Those trailing columns are cast to float, except that when
    ``labeled`` is true the last one is left uncast and used as the label
    source: it is cast to integer into a column ``Y`` and the column named
    ``Sex (subj)`` is dropped.

    Parameters
    ----------
    df : pyspark.sql.DataFrame
        Frame as read from CSV.
        NOTE(review): assumes position 1 is the "Face ID" column and, for
        labelled data, that the last column is "Sex (subj)" — confirm against
        the CSV headers.
    labeled : bool
        Whether the last selected column is a label rather than a feature.

    Returns
    -------
    pyspark.sql.DataFrame
        The projected frame with the casts described above.
    """
    selected = df.select([df.columns[1]] + df.columns[145:])
    names = selected.schema.names

    # Index of the last column that gets cast to float (index 0 is the id).
    last_float_index = len(names) - (2 if labeled else 1)

    # All casts go into ONE projection: calling withColumn once per column adds
    # a projection to the query plan each time, which becomes slow (and can
    # overflow the stack) for frames with many columns.
    projection = []
    for position, name in enumerate(names):
        column = col(name)
        if position == 0:
            column = column.cast("Long")
        elif position <= last_float_index:
            column = column.cast("Float")
        projection.append(column.alias("face_id" if name == "Face ID" else name))
    molded = selected.select(projection)

    if labeled:
        # The label is derived from whatever column ended up last.
        label_source = molded.schema.names[-1]
        molded = molded.withColumn('Y', col(label_source).cast("Integer")).drop('Sex (subj)')

    return molded

In [8]:
# Every split is read as a CSV file with a header row; the reader options are
# shared by all six loads.
_CSV_READ_OPTIONS = {"format": "csv", "header": True}

_train = spark.read.load("X_small_train.csv", **_CSV_READ_OPTIONS)
_test = spark.read.load("X_small_test.csv", **_CSV_READ_OPTIONS)
big_train = spark.read.load("X_train.csv", **_CSV_READ_OPTIONS)
_testA = spark.read.load("Xa_test.csv", **_CSV_READ_OPTIONS)
_testB = spark.read.load("Xb_test.csv", **_CSV_READ_OPTIONS)
_testC = spark.read.load("Xc_test.csv", **_CSV_READ_OPTIONS)

In [184]:
# Mould the full training split as labelled data. The small split is currently
# left unused; it could be moulded the same way:
#     trainingData = mold(_train)
trainingDataBig = mold(big_train, labeled=True)

In [205]:
# The small test split is moulded as labelled data; the A/B/C splits are
# moulded as unlabelled data.
testingData = mold(_test, labeled=True)
testingDataA, testingDataB, testingDataC = (
    mold(unlabeled_frame, labeled=False)
    for unlabeled_frame in (_testA, _testB, _testC)
)

In [208]:
def buildModel(train,model_type='rf'):
    """Fit a label-indexer -> classifier -> label-converter pipeline.

    Parameters
    ----------
    train : pyspark.sql.DataFrame
        Training frame containing ``face_id`` and ``Y`` columns. Every column
        except the first and the last is assembled into the feature vector.
    model_type : str
        Which classifier to fit: 'rf' (random forest), 'gbt' (gradient-boosted
        trees), 'per' (multilayer perceptron), 'svm' (linear SVC) or
        'lr' (logistic regression). For 'lr' a constant ``bias`` column is
        inserted as an extra feature before assembling.

    Returns
    -------
    pyspark.ml.PipelineModel
        Fitted pipeline; its ``transform`` adds a ``predictedLabel`` column
        holding the original (un-indexed) label values.

    Raises
    ------
    KeyError
        If ``model_type`` is not one of the supported keys.
    """
    supported_types = ('rf', 'gbt', 'per', 'svm', 'lr')
    if model_type not in supported_types:
        # Fail before any Spark job is launched instead of after the indexer fit.
        raise KeyError(f"unknown model_type {model_type!r}; expected one of {supported_types}")

    if model_type=='lr':
        # Insert the constant column right after the first column so that the
        # label stays last.
        train=train.withColumn("bias", lit(1)).select([train.schema.names[0],'bias']+train.schema.names[1:])

    feature_names = train.schema.names[1:-1]
    train_assembler = VectorAssembler().setInputCols(feature_names).setOutputCol('features')

    trainData=train_assembler.transform(train).selectExpr('face_id','features',"Y")

    # Index labels, adding metadata to the label column.
    # Fit on the whole training frame so every label value is in the index.
    labelIndexer = StringIndexer(inputCol="Y", outputCol="indexedLabel").fit(trainData)

    # Build only the estimator that was asked for.
    if model_type == 'rf':
        _model = RandomForestClassifier(labelCol="indexedLabel", featuresCol="features", numTrees=RF_NUM_TREES,maxDepth=RF_MAX_DEPTH)
    elif model_type == 'gbt':
        _model = GBTClassifier(labelCol="indexedLabel", featuresCol="features",maxDepth=RF_MAX_DEPTH, maxIter=100)
    elif model_type == 'per':
        # Input layer sized to the feature count, two hidden layers, two outputs.
        layers = [len(feature_names), 256, 256, 2]
        _model = MultilayerPerceptronClassifier(labelCol="indexedLabel", featuresCol="features",maxIter=400, layers=layers, blockSize=128)
    elif model_type == 'svm':
        _model = LinearSVC(labelCol="indexedLabel", featuresCol="features",maxIter=40, regParam=0.1)
    else:  # 'lr'
        _model = LogisticRegression(labelCol="indexedLabel", featuresCol="features",maxIter=400, regParam=0.0, elasticNetParam=0)

    # Convert indexed predictions back to the original label values.
    labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                                   labels=labelIndexer.labels)

    # Chain indexer, classifier and converter in a Pipeline.
    pipeline = Pipeline(stages=[labelIndexer, _model, labelConverter])

    # Train the model. This also runs the indexer.
    return pipeline.fit(trainData)

In [189]:
def testModel(test,model,model_type='rf',labeled=True):
    """Apply a fitted pipeline to a test frame and return its predictions.

    Parameters
    ----------
    test : pyspark.sql.DataFrame
        Frame with a ``face_id`` column (and a trailing ``Y`` column when
        ``labeled`` is true). Every column after the first, minus the last one
        for labelled data, is assembled into the feature vector.
    model : fitted pipeline
        Object whose ``transform`` adds a ``predictedLabel`` column.
    model_type : str
        When 'lr', a constant ``bias`` column is inserted after the first
        column before assembling, mirroring what buildModel does for 'lr'.
    labeled : bool
        Whether ``test`` carries the true label ``Y``.

    Returns
    -------
    RDD of rows ``(face_id, predictedLabel)`` plus ``Y`` when ``labeled``.
    """
    if model_type == 'lr':
        original_names = test.schema.names
        with_bias_order = [original_names[0], 'bias'] + original_names[1:]
        test = test.withColumn("bias", lit(1)).select(with_bias_order)

    # Everything after the first column is a feature, except a trailing label.
    feature_names = test.schema.names[1:]
    if labeled:
        feature_names = feature_names[:-1]
    assembler = VectorAssembler().setInputCols(feature_names).setOutputCol('features')

    label_columns = ['Y'] if labeled else []

    assembled = assembler.transform(test).select(*(['face_id', 'features'] + label_columns))
    predictions = model.transform(assembled)

    # Keep only the id, the predicted label and (if known) the true label.
    return predictions.select(*(['face_id', 'predictedLabel'] + label_columns)).rdd
    

In [190]:
# Fit the logistic-regression pipeline on the full training split.
model = buildModel(trainingDataBig, model_type='lr')

In [212]:
# Predict the C split (moulded without labels) with the fitted model; the model
# type is repeated so the 'bias' column is added exactly as at training time.
output = testModel(testingDataC, model, model_type='lr', labeled=False)

In [201]:
# Bring the predictions to the driver and show the rows from index 150 onwards.
collected_rows = output.collect()
collected_rows[150:]

[Row(face_id=280177, predictedLabel='1', Y=1),
 Row(face_id=9559, predictedLabel='1', Y=0),
 Row(face_id=393795, predictedLabel='0', Y=0),
 Row(face_id=157822, predictedLabel='0', Y=0),
 Row(face_id=962033, predictedLabel='1', Y=1),
 Row(face_id=142054, predictedLabel='0', Y=0),
 Row(face_id=6563, predictedLabel='1', Y=0),
 Row(face_id=840223, predictedLabel='1', Y=0),
 Row(face_id=963342, predictedLabel='0', Y=1),
 Row(face_id=757875, predictedLabel='0', Y=0),
 Row(face_id=315910, predictedLabel='0', Y=0),
 Row(face_id=678490, predictedLabel='1', Y=1),
 Row(face_id=319563, predictedLabel='0', Y=0),
 Row(face_id=549436, predictedLabel='0', Y=0),
 Row(face_id=731076, predictedLabel='0', Y=1),
 Row(face_id=163999, predictedLabel='1', Y=1),
 Row(face_id=641028, predictedLabel='0', Y=1),
 Row(face_id=375392, predictedLabel='1', Y=1),
 Row(face_id=926786, predictedLabel='1', Y=1),
 Row(face_id=545577, predictedLabel='0', Y=0),
 Row(face_id=281026, predictedLabel='0', Y=0),
 Row(face_id=1972

In [129]:
def eval(out):
    """Return the fraction of rows whose predicted label equals the true label.

    Parameters
    ----------
    out : iterable of indexable rows
        For each row, position 1 holds the predicted label (any value that
        ``int()`` accepts) and position 2 holds the true label it is compared
        with.

    Returns
    -------
    float
        Share of matching rows; ``0.0`` when ``out`` contains no rows.
    """
    # NOTE: this name shadows the built-in eval(); it is kept because the
    # notebook calls the function under this name.
    rows = list(out)  # also accepts iterators, which have no len()
    if not rows:
        # Nothing to score: avoid dividing by zero.
        return 0.0
    matches = sum(1 for row in rows if int(row[1]) == row[2])
    return matches / len(rows)

In [195]:
# Score the model on the labelled test split. `output` is not reused here: in
# file order it holds predictions for the C split, which was moulded without
# labels, so its rows have no true label for eval() to compare against.
# NOTE(review): assumes the small labelled test split is the intended
# evaluation set — confirm.
eval(testModel(testingData, model, model_type='lr', labeled=True).collect())

0.7604562737642585

In [213]:
# Write one predicted label per line. The file is opened in write mode so that
# re-running the cell replaces the previous predictions instead of appending a
# second copy after them.
dest='yc.txt'
with open(dest, 'w') as the_file:
    the_file.writelines(f'{row[1]}\n' for row in output.collect())