In [15]:
# import findspark
# findspark.init()
# import pyspark
# import random
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("Python Spark SQL basic example2").getOrCreate()
# This is for testing the connection
labels = spark.read.csv("../data/wiki_data.csv", header=True, inferSchema=True)
labels = labels.drop(labels._c0)
labels.show(5, False)

+------------------------------------+------+
|file                                |gender|
+------------------------------------+------+
|cropped_10000217_1981-05-05_2009.jpg|1.0   |
|cropped_10000548_1925-04-04_1964.jpg|1.0   |
|cropped_100012_1948-07-03_2008.jpg  |1.0   |
|cropped_10001965_1930-05-23_1961.jpg|1.0   |
|cropped_10002116_1971-05-31_2012.jpg|0.0   |
+------------------------------------+------+
only showing top 5 rows



In [30]:
df_images = spark.read.option(
    "maxColumns", 10000
).csv("data_processing/grayscale_data2.csv", inferSchema=True)
names = df_images.schema.names
types = df_images.schema.fields
print(names[0:10])
print(types[0:10])
names = names[1:]

['_c0', '_c1', '_c2', '_c3', '_c4', '_c5', '_c6', '_c7', '_c8', '_c9']
[StructField(_c0,StringType,true), StructField(_c1,DoubleType,true), StructField(_c2,DoubleType,true), StructField(_c3,DoubleType,true), StructField(_c4,DoubleType,true), StructField(_c5,DoubleType,true), StructField(_c6,DoubleType,true), StructField(_c7,DoubleType,true), StructField(_c8,DoubleType,true), StructField(_c9,DoubleType,true)]


In [36]:
df_labeled = df_images.join(labels, df_images._c0 == labels.file, "left_outer")
df_labeled = df_labeled.na.drop()
tmp = df_labeled.select(df_labeled._c0, df_labeled.file, df_labeled._c1)
tmp.show()

+--------------------+--------------------+-----+
|                 _c0|                file|  _c1|
+--------------------+--------------------+-----+
|cropped_729282_19...|cropped_729282_19...| 50.0|
|cropped_16744391_...|cropped_16744391_...|104.0|
|cropped_33822611_...|cropped_33822611_...|176.0|
|cropped_1531980_1...|cropped_1531980_1...| 22.0|
|cropped_6932212_1...|cropped_6932212_1...|131.0|
|cropped_159603_19...|cropped_159603_19...|131.0|
|cropped_6217898_1...|cropped_6217898_1...| 94.0|
|cropped_6617452_1...|cropped_6617452_1...|175.0|
|cropped_39206675_...|cropped_39206675_...|109.0|
|cropped_977529_19...|cropped_977529_19...|169.0|
|cropped_4978095_1...|cropped_4978095_1...|217.0|
|cropped_24998469_...|cropped_24998469_...|121.0|
|cropped_16919959_...|cropped_16919959_...| 28.0|
|cropped_3951105_1...|cropped_3951105_1...|113.0|
|cropped_40229958_...|cropped_40229958_...|  6.0|
|cropped_18067536_...|cropped_18067536_...| 13.0|
|cropped_10529091_...|cropped_10529091_...|174.0|


In [39]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(
    inputCols=names,
    outputCol="features")

# assembler = VectorAssembler(
#     inputCols=['_c1', '_c2'],
#     outputCol="features")
output = assembler.transform(df_labeled)
formatted = output.select(output.gender.alias('label'), output.features)
# formatted  = formatted.limit(100)
formatted.show(5)

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  1.0|[50.0,41.0,41.0,3...|
|  1.0|[104.0,130.0,154....|
|  0.0|[176.0,198.0,157....|
|  1.0|[22.0,36.0,28.0,2...|
|  1.0|[131.0,129.0,133....|
+-----+--------------------+
only showing top 5 rows



In [26]:
formatted  = formatted.limit(100)

In [40]:
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8)

# Fit the model
lrModel = lr.fit(formatted)

# Print the coefficients and intercept for logistic regression
print("Coefficients: " + str(lrModel.coefficients))
print("Intercept: " + str(lrModel.intercept))

Coefficients: (2500,[],[])
Intercept: 1.0582915115648464


In [41]:
# Extract the summary from the returned LogisticRegressionModel instance trained
# in the earlier example
trainingSummary = lrModel.summary

# Obtain the objective per iteration
objectiveHistory = trainingSummary.objectiveHistory
print("objectiveHistory:")
for objective in objectiveHistory:
    print(objective)

# Obtain the receiver-operating characteristic as a dataframe and areaUnderROC.
trainingSummary.roc.show()
print("areaUnderROC: " + str(trainingSummary.areaUnderROC))

objectiveHistory:
0.5705697849717789
+---+---+
|FPR|TPR|
+---+---+
|0.0|0.0|
|1.0|1.0|
|1.0|1.0|
+---+---+

areaUnderROC: 0.5


In [7]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

dataset = spark.createDataFrame(
    [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
    ["id", "hour", "mobile", "userFeatures", "clicked"])

assembler = VectorAssembler(
    inputCols=["hour", "mobile", "userFeatures"],
    outputCol="features")

output = assembler.transform(dataset)
print("Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'")
output.select("features", "clicked").show(truncate=False)

Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'
+-----------------------+-------+
|features               |clicked|
+-----------------------+-------+
|[18.0,1.0,0.0,10.0,0.5]|1.0    |
+-----------------------+-------+

