In [1]:
# Imports relativos ao sistema operacional
import os
import sys

In [2]:
# Passagem de caminhos do Spark e do diretório de dados
SPARK_PATH = "/Users/flavio.clesio/Documents/spark-2.1.0" 
ROOT_DIR = "/Users/flavio.clesio/Desktop/pyspark-regression/dataset/"

In [3]:
# Neste snippet passamos os caminhos do Spark como variaveis de ambiente para o sistema operacional
os.environ['SPARK_HOME'] = SPARK_PATH
os.environ['HADOOP_HOME'] = SPARK_PATH

# Neste snippet passamos todas as partes da instalacao do Spark
sys.path.append(SPARK_PATH + "/bin")
sys.path.append(SPARK_PATH + "/python")
sys.path.append(SPARK_PATH + "/python/pyspark/")
sys.path.append(SPARK_PATH + "/python/lib")
sys.path.append(SPARK_PATH + "/python/lib/pyspark.zip")
sys.path.append(SPARK_PATH + "/python/lib/py4j-0.10.4-src.zip") # Must be the same version of your Spark Version

In [4]:
%matplotlib inline

In [5]:
# Vamos fazer agora alguns imports iniciais em relação ao Spark
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.mllib.tree import DecisionTree, DecisionTreeModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.regression import LinearRegressionWithSGD
from pyspark.mllib.evaluation import RegressionMetrics
from pyspark.mllib.tree import DecisionTree
import matplotlib.pyplot
import numpy as np
import matplotlib
matplotlib.use('Agg')

This call to matplotlib.use() has no effect because the backend has already
been chosen; matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.



In [6]:
%pylab inline
pylab.rcParams['figure.figsize'] = (14, 9)

Populating the interactive namespace from numpy and matplotlib


In [7]:
# Instanciamento da sessao do Spark
sc = SparkContext("local", "pyspark-rapiddo")
sc

<pyspark.context.SparkContext at 0x105dd7e90>

# Extracao de features para o modelo linear

In [8]:
# Carga do arquivo .csv
raw_data = sc.textFile(ROOT_DIR + "base_maio_scalled_one_hotted.csv")

In [9]:
# Contagem simples em relacao ao numero de registros
num_data = raw_data.count()
print 'Quantidade de registros:',  num_data

Quantidade de registros: 118544


In [10]:
# Transformacao do arquivo em um RDD realizando a divisao pela virgula
records = raw_data.map(lambda x: x.split("|"))

In [11]:
# Primeiro registro ja sem os headers
first = records.first()
print first

[u'1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3.776331646,-0.303368592,-0.571108822,923']


In [12]:
# Vamos colocar os dados em cache, ja que vamos realizar inumeras leituras
records.cache()

PythonRDD[4] at RDD at PythonRDD.scala:48

In [13]:
# Primeiro registro do dataset records
records.first()

[u'1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3.776331646,-0.303368592,-0.571108822,923']

In [14]:
# Load and parse the data
def parsePoint(line):
    values = [float(x) for x in line.replace(',', ' ').split(' ')]
    return LabeledPoint(values[73], values[1:73])

In [15]:
parsedData = raw_data.map(parsePoint)

In [16]:
parsedData.take(1)

[LabeledPoint(923.0, [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.776331646,-0.303368592,-0.571108822])]

In [17]:
raw_data.take(1)

[u'1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3.776331646,-0.303368592,-0.571108822,923']

In [18]:
(trainingData, testData) = parsedData.randomSplit([0.7, 0.3], seed=100)

In [19]:
trainingData.take(1)

[LabeledPoint(923.0, [0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.776331646,-0.303368592,-0.571108822])]

In [20]:
testData.take(1)

[LabeledPoint(25.0, [1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.332191362,-0.52114871,-0.752435134])]

In [21]:
def get_categorical_fatures():
    categoricalFeaturesInfo = {}
    categoricalFeaturesInfo[0] = 2
    categoricalFeaturesInfo[1] = 2
    categoricalFeaturesInfo[2] = 2
    categoricalFeaturesInfo[3] = 2
    categoricalFeaturesInfo[4] = 2
    categoricalFeaturesInfo[5] = 2
    categoricalFeaturesInfo[6] = 2
    categoricalFeaturesInfo[7] = 2
    categoricalFeaturesInfo[8] = 2
    categoricalFeaturesInfo[9] = 2
    categoricalFeaturesInfo[10] = 2
    categoricalFeaturesInfo[11] = 2
    categoricalFeaturesInfo[12] = 2
    categoricalFeaturesInfo[13] = 2
    categoricalFeaturesInfo[14] = 2
    categoricalFeaturesInfo[15] = 2
    categoricalFeaturesInfo[16] = 2
    categoricalFeaturesInfo[17] = 2
    categoricalFeaturesInfo[18] = 2
    categoricalFeaturesInfo[19] = 2
    categoricalFeaturesInfo[20] = 2
    categoricalFeaturesInfo[21] = 2
    categoricalFeaturesInfo[22] = 2
    categoricalFeaturesInfo[23] = 2
    categoricalFeaturesInfo[24] = 2
    categoricalFeaturesInfo[25] = 2
    categoricalFeaturesInfo[26] = 2
    categoricalFeaturesInfo[27] = 2
    categoricalFeaturesInfo[28] = 2
    categoricalFeaturesInfo[29] = 2
    categoricalFeaturesInfo[30] = 2
    categoricalFeaturesInfo[31] = 2
    categoricalFeaturesInfo[32] = 2
    categoricalFeaturesInfo[33] = 2
    categoricalFeaturesInfo[34] = 2
    categoricalFeaturesInfo[35] = 2
    categoricalFeaturesInfo[36] = 2
    categoricalFeaturesInfo[37] = 2
    categoricalFeaturesInfo[38] = 2
    categoricalFeaturesInfo[39] = 2
    categoricalFeaturesInfo[40] = 2
    categoricalFeaturesInfo[41] = 2
    categoricalFeaturesInfo[42] = 2
    categoricalFeaturesInfo[43] = 2
    categoricalFeaturesInfo[44] = 2
    categoricalFeaturesInfo[45] = 2
    categoricalFeaturesInfo[46] = 2
    categoricalFeaturesInfo[47] = 2
    categoricalFeaturesInfo[48] = 2
    categoricalFeaturesInfo[49] = 2
    categoricalFeaturesInfo[50] = 2
    categoricalFeaturesInfo[51] = 2
    categoricalFeaturesInfo[52] = 2
    categoricalFeaturesInfo[53] = 2
    categoricalFeaturesInfo[54] = 2
    categoricalFeaturesInfo[55] = 2
    categoricalFeaturesInfo[56] = 2
    categoricalFeaturesInfo[57] = 2
    categoricalFeaturesInfo[58] = 2
    categoricalFeaturesInfo[59] = 2
    categoricalFeaturesInfo[60] = 2
    categoricalFeaturesInfo[61] = 2
    categoricalFeaturesInfo[62] = 2
    categoricalFeaturesInfo[63] = 2
    categoricalFeaturesInfo[64] = 2
    categoricalFeaturesInfo[65] = 2
    categoricalFeaturesInfo[66] = 2
    categoricalFeaturesInfo[67] = 2
    categoricalFeaturesInfo[68] = 2

    return categoricalFeaturesInfo

In [22]:
# Train a DecisionTree model.
# Empty categoricalFeaturesInfo indicates all features are continuous.
model = DecisionTree.trainRegressor(trainingData
                                    ,get_categorical_fatures()
                                    ,impurity='variance'
                                    ,maxDepth=8
                                    ,maxBins=200
                                    ,minInstancesPerNode=1
                                    ,minInfoGain = 0.0)

In [23]:
# Evaluate model on test instances and compute test error
predictions = model.predict(testData.map(lambda x: x.features))

In [24]:
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)

In [25]:
testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum()/float(testData.count())

In [26]:
print('Test Mean Squared Error = ' + str(testMSE))
print('Test Root Mean Squared Error = ' + str(np.sqrt(testMSE)))
print('Learned regression tree model:')
print(model.toDebugString())

Test Mean Squared Error = 78713.0239347
Test Root Mean Squared Error = 280.558414479
Learned regression tree model:
DecisionTreeModel regressor of depth 8 with 475 nodes
  If (feature 70 <= 0.983127614)
   If (feature 69 <= -0.332191362)
    If (feature 70 <= -0.081339107)
     If (feature 70 <= -0.362859746)
      If (feature 70 <= -0.444660083)
       If (feature 70 <= -0.494590159)
        If (feature 46 in {0.0})
         If (feature 71 <= -0.69199303)
          Predict: 24.88308366824991
         Else (feature 71 > -0.69199303)
          Predict: 36.66078865750997
        Else (feature 46 not in {0.0})
         If (feature 71 <= -0.69199303)
          Predict: 25.137724550898202
         Else (feature 71 > -0.69199303)
          Predict: 109.21428571428571
       Else (feature 70 > -0.494590159)
        If (feature 70 <= -0.464844582)
         If (feature 28 in {0.0})
          Predict: 47.93636885952313
         Else (feature 28 not in {0.0})
          Predict: 123.16666666666667

In [27]:
# Save and load model
#model.save(sc, "target/tmp/myDecisionTreeRegressionModel")
#sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeRegressionModel")

In [None]:
# Algumas funcoes de erro para a avaliacao das arvores
def squared_error(actual, pred):
    return (pred - actual)**2

def abs_error(actual, pred):
    return np.abs(pred - actual)

def squared_log_error(pred, actual):
    return (np.log(pred + 1) - np.log(actual + 1))**2

In [None]:
# Funcao para avaliar as arvores
def evaluate_dt(train, test, maxDepth, maxBins):
    model = DecisionTree.trainRegressor(train
                                        ,{}
                                        ,impurity='variance'
                                        ,maxDepth=maxDepth
                                        ,maxBins=maxBins)
    
    preds = model.predict(test.map(lambda p: p.features))
    
    actual = test.map(lambda p: p.label)
    
    tp = actual.zip(preds)
    
    rmsle = np.sqrt(tp.map(lambda (t, p): squared_log_error(t,p)).mean())
    
    rmse = np.sqrt(tp.map(lambda (t, p): squared_error(t,p)).mean())
    
    return rmse

In [None]:
# Tree Depth
params = [5,8,10,20,30]
metrics = [evaluate_dt(trainingData, testData, param, 32) for param in params]

print 'Parametros escolhidos:', params
print 'RMSE dos parametros', metrics

plot(params, metrics)
fig = matplotlib.pyplot.gcf()

In [None]:
# Maximo de bins
params = [2, 4, 8, 16, 32, 64, 100, 200]
metrics = [evaluate_dt(trainingData, testData, 5, param) for param in params]

print 'Parametros escolhidos:', params
print 'RMSE dos parametros', metrics

plot(params, metrics)
fig = matplotlib.pyplot.gcf()

In [None]:
# Maximo de bins
params = [200, 400, 500, 700, 1000]
metrics = [evaluate_dt(trainingData, testData, 5, param) for param in params]

print 'Parametros escolhidos:', params
print 'RMSE dos parametros', metrics

plot(params, metrics)
fig = matplotlib.pyplot.gcf()

In [None]:
# Alguns experimentos com Regressao Linear
help(LinearRegressionWithSGD)

In [28]:
def linearRegression_MSE(data, regMode, regForce, numInteracoes, stepConvergence):
    model = LinearRegressionWithSGD.train(data
                                          ,iterations=numInteracoes
                                          , step=stepConvergence
                                          , miniBatchFraction=1.0
                                          , initialWeights=None
                                          , regParam=regForce
                                          , regType=regMode
                                          , intercept=True
                                          , validateData=True
                                          , convergenceTol=0.001)
   
    # Avaliacao do modelo
    # Usamos o .map() do parsedData para pegar os pares {train_data, predict_data}
    valuesAndPreds = parsedData.map(lambda p: (p.label, model.predict(p.features)))
    
    # Informations about model
    print "Regularizador = " + str(regMode)
    
    # Forca do Regularizador
    print "Parametro de Regularizacao = " + str(regForce)   
    
    # Predictions to using in metrics
    Preds = parsedData.map(lambda p: (p.label, float(model.predict(p.features))))
    
    # Get metrics
    metrics = RegressionMetrics(Preds)
    
    # Squared Error
    print("MSE = %s" % metrics.meanSquaredError)
    print("RMSE = %s" % metrics.rootMeanSquaredError)

    # R-squared
    print("R-squared = %s" % metrics.r2)

    # Mean absolute error
    print("MAE = %s" % metrics.meanAbsoluteError)

    # Explained variance
    print("Explained variance = %s" % metrics.explainedVariance)
        
    return

In [29]:
# Sem regularizacao
print '\n Modelo 1'
RegLinear_1 = linearRegression_MSE(trainingData, None, 0, 10000, 0.001)
print '\n Modelo 2'
RegLinear_2 = linearRegression_MSE(trainingData, None, 0, 100000, 0.001)
print '\n Modelo 3'
RegLinear_3 = linearRegression_MSE(trainingData, None, 0, 500000, 0.001)
print '\n Modelo 4'
RegLinear_4 = linearRegression_MSE(trainingData, None, 0, 10000, 0.01)
print '\n Modelo 5'
RegLinear_5 = linearRegression_MSE(trainingData, None, 0, 100000, 0.01)
print '\n Modelo 6'
RegLinear_6 = linearRegression_MSE(trainingData, None, 0, 500000, 0.01)
print '\n Modelo 7'
RegLinear_7 = linearRegression_MSE(trainingData, None, 0, 10000, 0.1)
print '\n Modelo 8'
RegLinear_8 = linearRegression_MSE(trainingData, None, 0, 100000, 0.1)
print '\n Modelo 9'
RegLinear_9 = linearRegression_MSE(trainingData, None, 0, 500000, 0.1)
print '\n Modelo 10'
RegLinear_10 = linearRegression_MSE(trainingData, None, 0, 10000, 1.0)
print '\n Modelo 11'
RegLinear_11 = linearRegression_MSE(trainingData, None, 0, 100000, 1.0)
print '\n Modelo 12'
RegLinear_12 = linearRegression_MSE(trainingData, None, 0, 500000, 1.0)



Regularizador = None
Parametro de Regularizacao = 0
MSE = 311153.633152
RMSE = 557.81146739
R-squared = -1857.01389131
MAE = 263.419184866
Explained variance = 319599.107676

 Modelo
Regularizador = None
Parametro de Regularizacao = 0
MSE = 311153.633152
RMSE = 557.81146739
R-squared = -1857.01389131
MAE = 263.419184866
Explained variance = 319599.107676

 Modelo
Regularizador = None
Parametro de Regularizacao = 0
MSE = 311153.633152
RMSE = 557.81146739
R-squared = -1857.01389131
MAE = 263.419184866
Explained variance = 319599.107676

 Modelo
Regularizador = None
Parametro de Regularizacao = 0
MSE = 213966.788636
RMSE = 462.565442544
R-squared = -20.5264279018
MAE = 202.79982463
Explained variance = 274539.42919

 Modelo
Regularizador = None
Parametro de Regularizacao = 0
MSE = 213966.788636
RMSE = 462.565442544
R-squared = -20.5264279018
MAE = 202.79982463
Explained variance = 274539.42919

 Modelo
Regularizador = None
Parametro de Regularizacao = 0
MSE = 213966.788636
RMSE = 462.5654

In [30]:
# Regularizacao L1 (Lasso)
print '\n Modelo 1'
LASSO_1 = linearRegression_MSE(trainingData, "l1", 0.0, 10000, 0.001)
print '\n Modelo 2'
LASSO_2 = linearRegression_MSE(trainingData, "l1", 0.0, 100000, 0.001)
print '\n Modelo 3'
LASSO_3 = linearRegression_MSE(trainingData, "l1", 0.0, 500000, 0.001)
print '\n Modelo 4'
LASSO_4 = linearRegression_MSE(trainingData, "l1", 0.01, 10000, 0.001)
print '\n Modelo 5'
LASSO_5 = linearRegression_MSE(trainingData, "l1", 0.01, 100000, 0.001)
print '\n Modelo 6'
LASSO_6 = linearRegression_MSE(trainingData, "l1", 0.01, 500000, 0.001)
print '\n Modelo 7'
LASSO_7 = linearRegression_MSE(trainingData, "l1", 0.025, 10000, 0.01)
print '\n Modelo 8'
LASSO_8 = linearRegression_MSE(trainingData, "l1", 0.025, 100000, 0.01)
print '\n Modelo 9'
LASSO_9 = linearRegression_MSE(trainingData, "l1", 0.025, 500000, 0.01)
print '\n Modelo 10'
LASSO_10 = linearRegression_MSE(trainingData, "l1", 0.05, 10000, 0.1)
print '\n Modelo 11'
LASSO_11 = linearRegression_MSE(trainingData, "l1", 0.05, 100000, 0.1)
print '\n Modelo 12'
LASSO_12 = linearRegression_MSE(trainingData, "l1", 0.05, 500000, 0.1)
print '\n Modelo 13'
LASSO_13 = linearRegression_MSE(trainingData, "l1", 0.1, 10000, 1.0)
print '\n Modelo 14'
LASSO_14 = linearRegression_MSE(trainingData, "l1", 0.1, 100000, 1.0)
print '\n Modelo 15'
LASSO_15 = linearRegression_MSE(trainingData, "l1", 0.1, 500000, 1.0)
print '\n Modelo 16'
LASSO_16 = linearRegression_MSE(trainingData, "l1", 10.0, 10000, 1.0)
print '\n Modelo 17'
LASSO_17 = linearRegression_MSE(trainingData, "l1", 10.0, 100000, 1.0)
print '\n Modelo 18'
LASSO_18 = linearRegression_MSE(trainingData, "l1", 10.0, 500000, 1.0)
print '\n Modelo 19'
LASSO_19 = linearRegression_MSE(trainingData, "l1", 100.0, 10000, 1.0)
print '\n Modelo 20'
LASSO_20 = linearRegression_MSE(trainingData, "l1", 100.0, 100000, 1.0)
print '\n Modelo 21'
LASSO_21 = linearRegression_MSE(trainingData, "l1", 100.0, 500000, 1.0)
print '\n Modelo 22'
LASSO_22 = linearRegression_MSE(trainingData, "l1", 1000.0, 10000, 1.0)
print '\n Modelo 23'
LASSO_23 = linearRegression_MSE(trainingData, "l1", 1000.0, 100000, 1.0)
print '\n Modelo 24'
LASSO_24 = linearRegression_MSE(trainingData, "l1", 1000.0, 500000, 1.0)


 Modelo 1
Regularizador = l1
Parametro de Regularizacao = 0.0
MSE = 311153.633152
RMSE = 557.81146739
R-squared = -1857.01389131
MAE = 263.419184866
Explained variance = 319599.107676

 Modelo 2
Regularizador = l1
Parametro de Regularizacao = 0.0
MSE = 311153.633152
RMSE = 557.81146739
R-squared = -1857.01389131
MAE = 263.419184866
Explained variance = 319599.107676

 Modelo 3
Regularizador = l1
Parametro de Regularizacao = 0.0
MSE = 311153.633152
RMSE = 557.81146739
R-squared = -1857.01389131
MAE = 263.419184866
Explained variance = 319599.107676

 Modelo 4
Regularizador = l1
Parametro de Regularizacao = 0.01
MSE = 311154.803838
RMSE = 557.812516745
R-squared = -1857.22127405
MAE = 263.420528702
Explained variance = 319599.878099

 Modelo 5
Regularizador = l1
Parametro de Regularizacao = 0.01
MSE = 311154.803838
RMSE = 557.812516745
R-squared = -1857.22127405
MAE = 263.420528702
Explained variance = 319599.878099

 Modelo 6
Regularizador = l1
Parametro de Regularizacao = 0.01
MSE = 3

In [31]:
# Regularizacao L2 (Ridge)
print '\n Modelo 1'
RIDGE_1 = linearRegression_MSE(trainingData, "l2", 0.0, 10000, 0.001)
print '\n Modelo 2'
RIDGE_2 = linearRegression_MSE(trainingData, "l2", 0.0, 100000, 0.001)
print '\n Modelo 3'
RIDGE_3 = linearRegression_MSE(trainingData, "l2", 0.0, 500000, 0.001)
print '\n Modelo 4'
RIDGE_4 = linearRegression_MSE(trainingData, "l2", 0.01, 10000, 0.001)
print '\n Modelo 5'
RIDGE_5 = linearRegression_MSE(trainingData, "l2", 0.01, 100000, 0.001)
print '\n Modelo 6'
RIDGE_6 = linearRegression_MSE(trainingData, "l2", 0.01, 500000, 0.001)
print '\n Modelo 7'
RIDGE_7 = linearRegression_MSE(trainingData, "l2", 0.1, 10000, 1.0)
print '\n Modelo 8'
RIDGE_8 = linearRegression_MSE(trainingData, "l2", 0.1, 100000, 1.0)
print '\n Modelo 9'
RIDGE_9 = linearRegression_MSE(trainingData, "l2", 0.1, 500000, 1.0)
print '\n Modelo 10'
RIDGE_10 = linearRegression_MSE(trainingData, "l2", 1.0, 10000, 1.0)
print '\n Modelo 11'
RIDGE_11 = linearRegression_MSE(trainingData, "l2", 1.0, 100000, 1.0)
print '\n Modelo 12'
RIDGE_12 = linearRegression_MSE(trainingData, "l2", 1.0, 500000, 1.0)
print '\n Modelo 13'
RIDGE_13 = linearRegression_MSE(trainingData, "l2", 5.0, 10000, 1.0)
print '\n Modelo 14'
RIDGE_14 = linearRegression_MSE(trainingData, "l2", 5.0, 100000, 1.0)
print '\n Modelo 15'
RIDGE_15 = linearRegression_MSE(trainingData, "l2", 5.0, 500000, 1.0)
print '\n Modelo 16'
RIDGE_16 = linearRegression_MSE(trainingData, "l2", 10.0, 10000, 1.0)
print '\n Modelo 17'
RIDGE_17 = linearRegression_MSE(trainingData, "l2", 10.0, 100000, 1.0)
print '\n Modelo 18'
RIDGE_18 = linearRegression_MSE(trainingData, "l2", 10.0, 500000, 1.0)
print '\n Modelo 19'
RIDGE_19 = linearRegression_MSE(trainingData, "l2", 20.0, 10000, 1.0)
print '\n Modelo 20'
RIDGE_20 = linearRegression_MSE(trainingData, "l2", 20.0, 100000, 1.0)
print '\n Modelo 21'
RIDGE_21 = linearRegression_MSE(trainingData, "l2", 20.0, 500000, 1.0)


 Modelo 1
Regularizador = l2
Parametro de Regularizacao = 0.0
MSE = 311153.633152
RMSE = 557.81146739
R-squared = -1857.01389131
MAE = 263.419184866
Explained variance = 319599.107676

 Modelo 2
Regularizador = l2
Parametro de Regularizacao = 0.0
MSE = 311153.633152
RMSE = 557.81146739
R-squared = -1857.01389131
MAE = 263.419184866
Explained variance = 319599.107676

 Modelo 3
Regularizador = l2
Parametro de Regularizacao = 0.0
MSE = 311153.633152
RMSE = 557.81146739
R-squared = -1857.01389131
MAE = 263.419184866
Explained variance = 319599.107676

 Modelo 4
Regularizador = l2
Parametro de Regularizacao = 0.01
MSE = 311157.64097
RMSE = 557.815059827
R-squared = -1857.82319897
MAE = 263.423132852
Explained variance = 319601.358708

 Modelo 5
Regularizador = l2
Parametro de Regularizacao = 0.01
MSE = 311157.64097
RMSE = 557.815059827
R-squared = -1857.82319897
MAE = 263.423132852
Explained variance = 319601.358708

 Modelo 6
Regularizador = l2
Parametro de Regularizacao = 0.01
MSE = 311

In [None]:
# Mais experimentos com arvore de decisao 
help(DecisionTree.trainRegressor)

In [None]:
# Mais experimentos com arvore de decisao 
dt_model = DecisionTree.trainRegressor(parsedData
                                       ,categoricalFeaturesInfo = {}
                                       , impurity='variance'
                                       , maxDepth=5
                                       , maxBins=32
                                       , minInstancesPerNode=1
                                       , minInfoGain=0.0)

In [None]:
# Evaluate model on test instances and compute test error
predictions = dt_model.predict(testData.map(lambda x: x.features))

In [None]:
labelsAndPredictions = testData.map(lambda lp: lp.label).zip(predictions)

In [None]:
testMSE = labelsAndPredictions.map(lambda lp: (lp[0] - lp[1]) * (lp[0] - lp[1])).sum() /\
    float(testData.count())

In [None]:
print('Test Mean Squared Error = ' + str(testMSE))
print('Learned regression tree model:')
print(model.toDebugString())

In [None]:
# Save and load model
#model.save(sc, "target/tmp/myDecisionTreeRegressionModel")
#sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeRegressionModel")

In [None]:
# Paramgrid: https://github.com/MingChen0919/learning-apache-spark/blob/master/linear-regression.ipynb
# Paramgrid: https://mapr.com/blog/churn-prediction-pyspark-using-mllib-and-ml-packages/


# Treinamento do modelo linear

In [None]:
linear_model = LinearRegressionWithSGD.train(data
                                             ,iterations=10
                                             ,step=0.1
                                             ,intercept=False)

In [None]:
# Aqui vamos usar o .map para o nosso conjunto de dados data, e usando a funcao lambda vamos pegar a coluna com os valores
# que no caso é o p.label e o resultado das predicoes no .peatures
true_vs_predicted = data.map(lambda p: (p.label, linear_model.predict(p.features)))

In [None]:
print "Linear Model predictions: " + str(true_vs_predicted.take(5))

In [None]:
# Agora vamos usar o regressor da arvore de decisao regressora. No caso quando se a algum tipo de variavel categorica
# e necessario passar como argumento categoricalFeaturesInfo. No caso vai ficar as...is
dt_model = DecisionTree.trainRegressor(data_dt
                                       ,{})

# A diferenca fundamental que precisa ser enendida e que no modelo linear, obrigatoriamente as variavels dummy precisam
# estar criadas no vetor de features, enquanto no modelo de arvore de decisao, como o algoritmo tem propriedades
# de quebra da arvore, nem sempre isso e necessario, mas e recomendado

In [None]:
preds = dt_model.predict(data_dt.map(lambda p: p.features))

In [None]:
actual = data.map(lambda p: p.label)

In [None]:
true_vs_predicted_dt = actual.zip(preds)

In [None]:
print "Decision Tree predictions: " + str(true_vs_predicted_dt.take(5))

In [None]:
print "Decision Tree depth: " + str(dt_model.depth())

In [None]:
print "Decision Tree number of nodes: " + str(dt_model.numNodes())

# Tunning do modelo 

In [None]:
# Atribuicao de index para todos os campos via chave e valor
data_with_idx = data.zipWithIndex().map(lambda (k, v): (v, k))

In [None]:
# Nesse caso a base de teste sera criada usando 20% de todo o conjunto de dados
test = data_with_idx.sample(False, 0.2, 42)

In [None]:
# O subtractByKey() faz o trabalho de remover da base inicial as instancias que tem overlap
# ou seja, esses serao os 80% da base de testes
train = data_with_idx.subtractByKey(test)

In [None]:
train_data = train.map(lambda (idx, p): p)
test_data = test.map(lambda (idx, p) : p)
train_size = train_data.count()
test_size = test_data.count()

In [None]:
print "Training data size: %d" % train_size
print "Test data size: %d" % test_size
print "Total data size: %d " % num_data
print "Train + Test size : %d" % (train_size + test_size)

In [None]:
data_with_idx_dt = data_dt.zipWithIndex().map(lambda (k, v): (v, k))
test_dt = data_with_idx_dt.sample(False, 0.2, 42)
train_dt = data_with_idx_dt.subtractByKey(test_dt)
train_data_dt = train_dt.map(lambda (idx, p): p)
test_data_dt = test_dt.map(lambda (idx, p) : p)

In [None]:
def evaluate(train, test, iterations, step, regParam, regType,intercept):
    model = LinearRegressionWithSGD.train(train
                                          ,iterations
                                          ,step
                                          ,regParam=regParam
                                          ,regType=regType
                                          ,intercept=intercept)
    
    tp = test.map(lambda p: (p.label, model.predict(p.features)))
    
    rmsle = np.sqrt(tp.map(lambda (t, p): squared_log_error(t, p)).mean())
    
    return rmsle

In [None]:
# Numero de interacoes como parametros
params = [1, 5, 10, 20, 50, 100]
metrics = [evaluate(train_data, test_data, param, 0.01, 0.0,'l2',False) for param in params]

print 'Parametros escolhidos:', params
print 'RMSLE dos parametros', metrics

In [None]:
matplotlib.pyplot.plot(params, metrics)
matplotlib.pyplot.show()

In [None]:
# Step size
params = [0.01, 0.025, 0.05, 0.1, 1.0]
metrics = [evaluate(train_data, test_data, 10, param, 0.0, 'l2',False) for param in params]

print 'Parametros escolhidos:', params
print 'RMSLE dos parametros', metrics

In [None]:
# Regularizacao Ridge (L2)
params = [0.0, 0.01, 0.1, 1.0, 5.0, 10.0, 20.0]
metrics = [evaluate(train_data, test_data, 10, 0.1, param, 'l2',False) for param in params]

print 'Parametros escolhidos:', params
print 'RMSLE dos parametros', metrics

In [None]:
matplotlib.pyplot.plot(params, metrics)
matplotlib.pyplot.xscale('log')
matplotlib.pyplot.show()

In [None]:
# Regularizacao L! (Lasso)
params = [0.0, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
metrics = [evaluate(train_data, test_data, 10, 0.1, param, 'l1',False) for param in params]

print 'Parametros escolhidos:', params
print 'RMSLE dos parametros', metrics

In [None]:
matplotlib.pyplot.plot(params, metrics)
matplotlib.pyplot.xscale('log')
matplotlib.pyplot.show()

In [None]:
model_l1 = LinearRegressionWithSGD.train(train_data, 10, 0.1,regParam=1.0, regType='l1', intercept=False)
model_l1_10 = LinearRegressionWithSGD.train(train_data, 10, 0.1,regParam=10.0, regType='l1', intercept=False)
model_l1_100 = LinearRegressionWithSGD.train(train_data, 10, 0.1,regParam=100.0, regType='l1', intercept=False)

In [None]:
print "L1 (1.0) number of zero weights: " + str(sum(model_l1.weights.array == 0))
print "L1 (10.0) number of zeros weights: " + str(sum(model_l1_10.weights.array == 0))
print "L1 (100.0) number of zeros weights: " + str(sum(model_l1_100.weights.array == 0))

In [None]:
# Como a regularizacao e mais agressiva o numero de numeros zeros e maior quanto mais se aumenta a regularizacao

In [None]:
# Intercepto
params = [False, True]
metrics = [evaluate(train_data, test_data, 10, 0.1, 1.0, 'l2', param)for param in params]
print 'Parametros escolhidos:', params
print 'RMSLE dos parametros', metrics

In [None]:
# Put in bar
matplotlib.pyplot.bar(params, metrics)
matplotlib.pyplot.xscale('log')
matplotlib.pyplot.show()

# Impacto dos parametros na arvore de decisao regressora

In [None]:
def evaluate_dt(train, test, maxDepth, maxBins):
    model = DecisionTree.trainRegressor(train
                                        , {}
                                        ,impurity='variance'
                                        ,maxDepth=maxDepth
                                        ,maxBins=maxBins)
    
    preds = model.predict(test.map(lambda p: p.features))
    
    actual = test.map(lambda p: p.label)
    
    tp = actual.zip(preds)
    
    rmsle = np.sqrt(tp.map(lambda (t, p): squared_log_error(t,p)).mean())
    
    return rmsle

In [None]:
# Tree Depth
params = [1, 2, 3, 4, 5, 10, 20]
metrics = [evaluate_dt(train_data_dt, test_data_dt, param, 32) for param in params]

print 'Parametros escolhidos:', params
print 'RMSLE dos parametros', metrics

plot(params, metrics)
fig = matplotlib.pyplot.gcf()

In [None]:
# Maximo de bins
params = [2, 4, 8, 16, 32, 64, 100]
metrics = [evaluate_dt(train_data_dt, test_data_dt, 5, param) for param in params]

print 'Parametros escolhidos:', params
print 'RMSLE dos parametros', metrics

plot(params, metrics)
fig = matplotlib.pyplot.gcf()

In [None]:
# Save and load model
#model.save(sc, "home/myDecisionTreeClassificationModel")


In [None]:
#sameModel = DecisionTreeModel.load(sc, "target/tmp/myDecisionTreeClassificationModel")