# Spark models

In [2]:
# Importations
from pyspark.ml.regression import RandomForestRegressor
from pyspark.ml.feature import IndexToString, StringIndexer, VectorIndexer
from pyspark.ml import Pipeline
from pyspark.sql import functions as fn
from pyspark.ml.evaluation import RegressionEvaluator

In [3]:
df = spark.read\
.format("csv")\
.option("header", "true")\
.option("sep", ";")\
.option("inferSchema", "true")\
.load("/FileStore/tables/student_por-3edc6.csv")


In [4]:
display(df).take(5)

school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,label,G3_discret
GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,4,0,11,11,1,1
GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,2,9,11,11,1,1
GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,0,yes,no,no,no,yes,yes,yes,no,4,3,2,2,3,3,6,12,13,12,1,1
GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,no,yes,yes,yes,yes,yes,3,2,2,1,1,5,0,14,14,14,2,2
GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,no,no,yes,yes,no,no,4,3,2,1,2,5,0,11,13,13,1,1
GP,M,16,U,LE3,T,4,3,services,other,reputation,mother,1,2,0,no,yes,no,yes,yes,yes,yes,no,5,4,2,1,2,5,6,12,12,13,1,1
GP,M,16,U,LE3,T,2,2,other,other,home,mother,1,2,0,no,no,no,no,yes,yes,yes,no,4,4,4,1,1,3,0,13,12,13,1,1
GP,F,17,U,GT3,A,4,4,other,teacher,home,mother,2,2,0,yes,yes,no,no,yes,yes,no,no,4,1,4,1,1,1,2,10,13,13,1,1
GP,M,15,U,LE3,A,3,2,services,other,home,mother,1,2,0,no,yes,no,no,yes,yes,yes,no,4,2,2,1,1,1,0,15,16,17,4,4
GP,M,15,U,GT3,T,3,4,other,other,home,mother,1,2,0,no,yes,no,yes,yes,yes,yes,no,5,5,1,1,1,5,0,12,12,13,1,1


In [5]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler

# ONE-HOT-ENCODING CATEGORICAL DATA
categorical_columns = df.columns
categorical_columns.remove('age')
categorical_columns.remove('absences')
categorical_columns.remove('G1')
categorical_columns.remove('G2')
categorical_columns.remove('G3')

# We start by indexing categorical columns (of type string)
indexers = [
  StringIndexer(inputCol=c, outputCol="{0}_indexed".format(c)) for c in categorical_columns
]

# We encode of the indexed columns
encoders = [
  OneHotEncoder(dropLast=False, inputCol=indexer.getOutputCol(), outputCol="{0}_encoded".format(indexer.getOutputCol())) for indexer in indexers
]


In [6]:
# We vectorize encoded columns
assembler = VectorAssembler(inputCols=[encoder.getOutputCol() for encoder in encoders], outputCol="features")

# We create a pipeline and fit the model
pipeline = Pipeline(stages = indexers + encoders + [assembler])
pipieliner = pipeline.fit(df)
transformed = pipieliner.transform(df)

display(transformed).take(5)

school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3,school_indexed,sex_indexed,address_indexed,famsize_indexed,Pstatus_indexed,Medu_indexed,Fedu_indexed,Mjob_indexed,Fjob_indexed,reason_indexed,guardian_indexed,traveltime_indexed,studytime_indexed,failures_indexed,schoolsup_indexed,famsup_indexed,paid_indexed,activities_indexed,nursery_indexed,higher_indexed,internet_indexed,romantic_indexed,famrel_indexed,freetime_indexed,goout_indexed,Dalc_indexed,Walc_indexed,health_indexed,school_indexed_encoded,sex_indexed_encoded,address_indexed_encoded,famsize_indexed_encoded,Pstatus_indexed_encoded,Medu_indexed_encoded,Fedu_indexed_encoded,Mjob_indexed_encoded,Fjob_indexed_encoded,reason_indexed_encoded,guardian_indexed_encoded,traveltime_indexed_encoded,studytime_indexed_encoded,failures_indexed_encoded,schoolsup_indexed_encoded,famsup_indexed_encoded,paid_indexed_encoded,activities_indexed_encoded,nursery_indexed_encoded,higher_indexed_encoded,internet_indexed_encoded,romantic_indexed_encoded,famrel_indexed_encoded,freetime_indexed_encoded,goout_indexed_encoded,Dalc_indexed_encoded,Walc_indexed_encoded,health_indexed_encoded,features
GP,F,18,U,GT3,A,4,4,at_home,teacher,course,mother,2,2,0,yes,no,no,no,yes,yes,no,no,4,3,4,1,1,3,4,0,11,11,0.0,0.0,0.0,0.0,1.0,1.0,3.0,2.0,3.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,1.0,"List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(1), List(1.0))","List(0, 5, List(1), List(1.0))","List(0, 5, List(3), List(1.0))","List(0, 5, List(2), List(1.0))","List(0, 5, List(3), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 3, List(0), List(1.0))","List(0, 4, List(1), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 2, List(1), List(1.0))","List(0, 2, List(1), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(1), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(2), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(1), List(1.0))","List(0, 95, List(0, 2, 4, 6, 9, 11, 18, 22, 28, 30, 34, 38, 41, 45, 50, 52, 53, 55, 57, 59, 62, 63, 65, 70, 77, 80, 85, 91), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
GP,F,17,U,GT3,T,1,1,at_home,other,course,father,1,2,0,no,yes,no,no,no,yes,yes,no,5,3,3,1,1,3,2,9,11,11,0.0,0.0,0.0,0.0,0.0,2.0,1.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,"List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 5, List(2), List(1.0))","List(0, 5, List(1), List(1.0))","List(0, 5, List(2), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 3, List(1), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(1), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 5, List(1), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(1), List(1.0))","List(0, 95, List(0, 2, 4, 6, 8, 12, 16, 22, 25, 30, 35, 37, 41, 45, 49, 51, 53, 55, 58, 59, 61, 63, 66, 70, 75, 80, 85, 91), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
GP,F,15,U,LE3,T,1,1,at_home,other,other,mother,1,2,0,yes,no,no,no,yes,yes,yes,no,4,3,2,2,3,3,6,12,13,12,0.0,0.0,0.0,1.0,0.0,2.0,1.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,2.0,1.0,"List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(1), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 5, List(2), List(1.0))","List(0, 5, List(1), List(1.0))","List(0, 5, List(2), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 4, List(3), List(1.0))","List(0, 3, List(0), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 2, List(1), List(1.0))","List(0, 2, List(1), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(1), List(1.0))","List(0, 5, List(1), List(1.0))","List(0, 5, List(2), List(1.0))","List(0, 5, List(1), List(1.0))","List(0, 95, List(0, 2, 4, 7, 8, 12, 16, 22, 25, 33, 34, 37, 41, 45, 50, 52, 53, 55, 57, 59, 61, 63, 65, 70, 76, 81, 87, 91), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
GP,F,15,U,GT3,T,4,2,health,services,home,mother,1,3,0,no,yes,no,yes,yes,yes,yes,yes,3,2,2,1,1,5,0,14,14,14,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,1.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,2.0,2.0,1.0,0.0,0.0,0.0,"List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 5, List(1), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(4), List(1.0))","List(0, 5, List(1), List(1.0))","List(0, 4, List(1), List(1.0))","List(0, 3, List(0), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 4, List(2), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(1), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(1), List(1.0))","List(0, 5, List(2), List(1.0))","List(0, 5, List(2), List(1.0))","List(0, 5, List(1), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 95, List(0, 2, 4, 6, 8, 11, 15, 24, 26, 31, 34, 37, 43, 45, 49, 51, 53, 56, 57, 59, 61, 64, 67, 72, 76, 80, 85, 90), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
GP,F,16,U,GT3,T,3,3,other,other,home,father,1,2,0,no,yes,no,no,yes,yes,no,no,4,3,2,1,2,5,0,11,13,13,0.0,0.0,0.0,0.0,0.0,3.0,2.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,"List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 5, List(3), List(1.0))","List(0, 5, List(2), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 4, List(1), List(1.0))","List(0, 3, List(1), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(1), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(1), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(1), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 95, List(0, 2, 4, 6, 8, 13, 17, 20, 25, 31, 35, 37, 41, 45, 49, 51, 53, 55, 57, 59, 62, 63, 65, 70, 76, 80, 86, 90), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
GP,M,16,U,LE3,T,4,3,services,other,reputation,mother,1,2,0,no,yes,no,yes,yes,yes,yes,no,5,4,2,1,2,5,6,12,12,13,0.0,1.0,0.0,1.0,0.0,1.0,2.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,"List(0, 2, List(0), List(1.0))","List(0, 2, List(1), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(1), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 5, List(1), List(1.0))","List(0, 5, List(2), List(1.0))","List(0, 5, List(1), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 4, List(2), List(1.0))","List(0, 3, List(0), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(1), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 5, List(1), List(1.0))","List(0, 5, List(1), List(1.0))","List(0, 5, List(1), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(1), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 95, List(0, 3, 4, 7, 8, 11, 17, 21, 25, 32, 34, 37, 41, 45, 49, 51, 53, 56, 57, 59, 61, 63, 66, 71, 76, 80, 86, 90), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
GP,M,16,U,LE3,T,2,2,other,other,home,mother,1,2,0,no,no,no,no,yes,yes,yes,no,4,4,4,1,1,3,0,13,12,13,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0,0.0,1.0,"List(0, 2, List(0), List(1.0))","List(0, 2, List(1), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(1), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 4, List(1), List(1.0))","List(0, 3, List(0), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(1), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(1), List(1.0))","List(0, 5, List(2), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(1), List(1.0))","List(0, 95, List(0, 3, 4, 7, 8, 10, 15, 20, 25, 31, 34, 37, 41, 45, 49, 52, 53, 55, 57, 59, 61, 63, 65, 71, 77, 80, 85, 91), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
GP,F,17,U,GT3,A,4,4,other,teacher,home,mother,2,2,0,yes,yes,no,no,yes,yes,no,no,4,1,4,1,1,1,2,10,13,13,0.0,0.0,0.0,0.0,1.0,1.0,3.0,0.0,3.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4.0,2.0,0.0,0.0,3.0,"List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(1), List(1.0))","List(0, 5, List(1), List(1.0))","List(0, 5, List(3), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(3), List(1.0))","List(0, 4, List(1), List(1.0))","List(0, 3, List(0), List(1.0))","List(0, 4, List(1), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 2, List(1), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(1), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(4), List(1.0))","List(0, 5, List(2), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(3), List(1.0))","List(0, 95, List(0, 2, 4, 6, 9, 11, 18, 20, 28, 31, 34, 38, 41, 45, 50, 51, 53, 55, 57, 59, 62, 63, 65, 74, 77, 80, 85, 93), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
GP,M,15,U,LE3,A,3,2,services,other,home,mother,1,2,0,no,yes,no,no,yes,yes,yes,no,4,2,2,1,1,1,0,15,16,17,0.0,1.0,0.0,1.0,1.0,3.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1.0,0.0,0.0,3.0,"List(0, 2, List(0), List(1.0))","List(0, 2, List(1), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(1), List(1.0))","List(0, 2, List(1), List(1.0))","List(0, 5, List(3), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(1), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 4, List(1), List(1.0))","List(0, 3, List(0), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(2), List(1.0))","List(0, 5, List(1), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(3), List(1.0))","List(0, 95, List(0, 3, 4, 7, 9, 13, 15, 21, 25, 31, 34, 37, 41, 45, 49, 51, 53, 55, 57, 59, 61, 63, 65, 72, 76, 80, 85, 93), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
GP,M,15,U,GT3,T,3,4,other,other,home,mother,1,2,0,no,yes,no,yes,yes,yes,yes,no,5,5,1,1,1,5,0,12,12,13,0.0,1.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,4.0,0.0,0.0,0.0,"List(0, 2, List(0), List(1.0))","List(0, 2, List(1), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 5, List(3), List(1.0))","List(0, 5, List(3), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 4, List(1), List(1.0))","List(0, 3, List(0), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 4, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(1), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 2, List(0), List(1.0))","List(0, 5, List(1), List(1.0))","List(0, 5, List(3), List(1.0))","List(0, 5, List(4), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 5, List(0), List(1.0))","List(0, 95, List(0, 3, 4, 6, 8, 13, 18, 20, 25, 31, 34, 37, 41, 45, 49, 51, 53, 56, 57, 59, 61, 63, 66, 73, 79, 80, 85, 90), List(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"


Split the dataset into a training and a test with a 80/20 ratio.

In [8]:
train, test = transformed.randomSplit([0.8, 0.2])

In [9]:
rf_regressor = RandomForestRegressor(labelCol="G3", featuresCol="features", numTrees=30, subsamplingRate=1.0)

rf_regressor_pipeline = Pipeline(stages=[rf_regressor])

fit = rf_regressor.fit(train)
predictions_rf = fit.transform(test)

regression_evaluator = RegressionEvaluator(predictionCol="prediction", \
                 labelCol="G3",metricName="mae")

In [10]:
mae = regression_evaluator.evaluate(predictions_rf)
print("MAE = %g" % (mae))

With gradient boosting

In [12]:

from pyspark.ml.regression import GBTRegressor

In [13]:
#Define the model
gbt = GBTRegressor(labelCol="G3", featuresCol="features", maxIter=50, maxDepth=4, stepSize=0.01)
pipeline = Pipeline(stages=[gbt])

# Train model
model = pipeline.fit(train)

# Make predictions
predictions_gbt = model.transform(test)

In [14]:
mae = regression_evaluator.evaluate(predictions_gbt)
print("MAE = %g" % (mae))