In [1]:
# Read the CSV file into a Spark DataFrame: the first line is treated as the
# header row and column types are inferred from the data.
df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('/FileStore/tables/adult.data')
)

# Print the first rows as a quick sanity check of the load.
df.show()

In [2]:
# Number of rows for each distinct value of the 'age' column.
ageCount = (
    df
    .groupBy('age')
    .count()
)
ageCount.show()


In [3]:
# Average 'age' within each 'marital-status' group.
maritalStatusAvgAge = (
    df
    .groupBy('marital-status')
    .avg('age')
)
maritalStatusAvgAge.show()

In [4]:
# Maximum 'capital-gain' observed within each 'native-country' group.
# Note: despite the variable name, this is neither per-state nor a salary;
# the name is kept unchanged because other cells may reference it.
stateMaxSalary = (
    df
    .groupBy('native-country')
    .max('capital-gain')
)
stateMaxSalary.show()

In [5]:
# Collect the DataFrame's column types (as reported by `df.dtypes`) and print
# them, so the schema inferred at load time can be checked before modelling.
types = df.dtypes
print(types)

In [6]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler, StringIndexer, IndexToString
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Train and evaluate a random-forest classifier that predicts the 'class'
# column from the numeric columns of `df`.

# Fixed seed shared by the train/test split and the random forest, so that the
# split, the trained model and the reported error are repeatable across runs
# (given the same input data and partitioning).
SEED = 42

# Instantiate the element that creates the vector of features
assembler = VectorAssembler(
    inputCols=['fnlwgt', 'age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week'],
    outputCol='features',
)

# Instantiate the element that converts the class column, which is of type string, into a numerical one.
# It is not the only way to do it; there are other "converting functions".
# We call fit here so that the StringIndexer sees the data in the dataset and builds the
# "mapping" between class values and indices accordingly.
# So, stringIndexer here is already a model.
# A StringIndexerModel exposes an attribute called `labels` (used below by IndexToString);
# without it we would not be able to put the labelConverter in the pipeline.
stringIndexer = StringIndexer(inputCol='class', outputCol='class-index').fit(df)

# Instantiate the random forest classifier (here it is still ONLY an algorithm, not a model).
# The seed makes the randomized tree construction repeatable.
rf = RandomForestClassifier(
    labelCol='class-index',
    featuresCol='features',
    numTrees=100,
    seed=SEED,
)

# Instantiate the IndexToString that converts the predicted class index back to its textual representation
labelConverter = IndexToString(inputCol='prediction', outputCol='predictedLabel', labels=stringIndexer.labels)

# We drop the rows containing nulls
df = df.dropna()

# 70% / 30% train/test split; seeded so the same rows land in each side on every run
(trainDF, testDF) = df.randomSplit([0.7, 0.3], seed=SEED)

pipeline = Pipeline(stages=[assembler, stringIndexer, rf, labelConverter])

# Fit the whole pipeline on the training split, then score the held-out test split
model = pipeline.fit(trainDF)
outDF = model.transform(testDF)

# NOTE(review): this assumes the CSV header provides an 'id' column — confirm against the data
outDF.select('id', 'class', 'rawPrediction', 'probability', 'prediction', 'predictedLabel').show()

# Accuracy of the predicted index against the indexed true class; the error is its complement
evaluator = MulticlassClassificationEvaluator(labelCol='class-index', predictionCol='prediction', metricName='accuracy')
accuracy = evaluator.evaluate(outDF)
print("Test Error = %g" % (1.0 - accuracy))

In [7]:
# Render the scored test DataFrame (`outDF`, produced by the pipeline cell) using the
# notebook's `display` helper.
display(outDF)