In [1]:
# This notebook uses PySpark ML logistic regression to train a model that
# classifies the images in our dataset by gender.
import pandas as pd
from pyspark.sql import SQLContext
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml import Pipeline
from pyspark.ml.feature import VectorAssembler
from pyspark import SparkContext

# Reuse the active SparkSession if there is one; otherwise start a new one.
# NOTE: `sc` holds a SparkSession here (not a SparkContext, despite the name).
sc = SparkSession.builder.getOrCreate()

In [2]:
# Load the training CSV (first row is the header, column types are inferred)
# and expose the 'Sex (subj)' target column under the name 'label'.
train_df = (
    sc.read
    .csv("gs://uga-dsp/project2/files/X_train.csv", header=True, inferSchema=True)
    .withColumnRenamed('Sex (subj)', 'label')
)

In [3]:
# Load the three test CSVs with the same reader options used for training
# (header row present, column types inferred).
test_df1, test_df2, test_df3 = (
    sc.read.csv(path, header=True, inferSchema=True)
    for path in (
        "gs://uga-dsp/project2/files/Xa_test.csv",
        "gs://uga-dsp/project2/files/Xb_test.csv",
        "gs://uga-dsp/project2/files/Xc_test.csv",
    )
)

In [4]:
# Select the model inputs by position: every column from index 9 up to, but
# not including, the last column of the training frame.
# NOTE(review): presumably the skipped columns are metadata and the renamed
# 'label' target lies outside this slice -- confirm against the CSV schema.
feature_columns = train_df.columns[9:-1]

# Pack the selected columns into a single vector column named "features".
assembler = VectorAssembler(
    outputCol="features",
    inputCols=feature_columns,
)

In [5]:
# Logistic regression estimator that reads its target from the "label" column.
lr = LogisticRegression(
    labelCol="label",
    maxIter=20,
    regParam=0.05,
    elasticNetParam=0.3,
)

# Chain feature assembly and the classifier, then fit both on the training data.
p = Pipeline(stages=[assembler, lr])
p_model = p.fit(train_df)

In [6]:
# Score the training set and each test set with the fitted pipeline.
predictions_train, predictions_test1, predictions_test2, predictions_test3 = (
    p_model.transform(frame)
    for frame in (train_df, test_df1, test_df2, test_df3)
)

# For every test set: keep only the 'prediction' column, merge it into a
# single partition, and convert each predicted value to a plain int.
# NOTE: despite their names, these RDDs carry predictions only (no labels).
predictionAndLabels1, predictionAndLabels2, predictionAndLabels3 = (
    scored.select('prediction').coalesce(1).rdd.map(lambda row: int(row[0]))
    for scored in (predictions_test1, predictions_test2, predictions_test3)
)

# Bring the predicted values back to the driver as Python lists.
answer_list1, answer_list2, answer_list3 = (
    predicted.collect()
    for predicted in (predictionAndLabels1, predictionAndLabels2, predictionAndLabels3)
)

In [7]:
def _write_predictions(path, values):
    """Write each item of ``values`` to ``path``, one item per line.

    Args:
        path: Destination file name; an existing file is overwritten.
        values: Iterable of items to write; each is formatted with ``%s``.
    """
    # Plain 'w' is enough because the file is never read back, and the
    # with-block closes the handle on exit, so no explicit close() is needed.
    with open(path, 'w') as out_file:
        out_file.writelines('%s\n' % value for value in values)


# write results to file: one output file per test set
for _path, _answers in (
    ('ya.txt', answer_list1),
    ('yb.txt', answer_list2),
    ('yc.txt', answer_list3),
):
    _write_predictions(_path, _answers)
    print("File written successfully")

File written successfully
File written successfully
File written successfully
