In [1]:
import os
import numpy as np

from pyspark import SparkContext
sc = SparkContext.getOrCreate()

from test_helper import Test
import os.path

fileName = os.path.join('D:/', 'IGCmodelo.csv')

numPartitions = 100
IGCraw = sc.textFile(fileName, numPartitions, 8)

In [2]:
numPoints = IGCraw.count()
print numPoints

2090


In [3]:
samplePoints = IGCraw.take(2094)
print samplePoints

[u'2.000,11.000,3.000,1.000,21.000,1.000,0.695,0.000,0.000,0.000,0.000,0.695', u'2.000,11.000,8.000,2.000,10.000,1.000,0.760,0.000,0.000,0.000,0.000,0.760', u'2.000,11.000,5.000,1.000,33.000,1.000,0.839,0.000,0.000,0.000,0.000,0.839', u'2.000,10.000,21.000,3.000,35.000,1.000,0.876,0.000,0.000,0.000,0.000,0.876', u'2.000,11.000,26.000,1.000,14.000,1.000,0.884,0.000,0.000,0.000,0.000,0.884', u'2.000,10.000,18.000,1.000,8.000,1.000,0.910,0.000,0.000,0.000,0.000,0.910', u'2.000,6.000,27.000,1.000,27.000,1.000,0.913,0.000,0.000,0.000,0.000,0.913', u'2.000,10.000,26.000,1.000,11.000,1.000,0.933,0.000,0.000,0.000,0.000,0.933', u'2.000,10.000,26.000,1.000,6.000,1.000,0.933,0.000,0.000,0.000,0.000,0.933', u'2.000,10.000,9.000,1.000,13.000,1.000,0.990,0.000,0.000,0.000,0.000,0.990', u'2.000,10.000,26.000,3.000,4.000,1.000,1.064,0.000,0.000,0.000,0.000,1.064', u'2.000,10.000,8.000,2.000,11.000,1.000,1.083,0.000,0.000,0.000,0.000,1.083', u'2.000,10.000,11.000,2.000,50.000,1.000,1.085,0.000,0.000,0

In [4]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import StandardScaler

In [5]:
IGCrdd = (IGCraw
        .map(lambda x: x.split(','),lambda y: float(y))
        )
print IGCrdd.take(2)

[[u'2.000', u'11.000', u'3.000', u'1.000', u'21.000', u'1.000', u'0.695', u'0.000', u'0.000', u'0.000', u'0.000', u'0.695'], [u'2.000', u'11.000', u'8.000', u'2.000', u'10.000', u'1.000', u'0.760', u'0.000', u'0.000', u'0.000', u'0.000', u'0.760']]


In [6]:
features = IGCrdd.map(lambda row: row[0:11])

standardizer = StandardScaler()
model = standardizer.fit(features)
features_transform = model.transform(features)
features_transform.take(2)

[DenseVector([3.4638, 3.954, 0.3952, 0.047, 0.0511, 11.63, 1.355, 0.0, 0.0, 0.0, 0.0]),
 DenseVector([3.4638, 3.954, 1.0539, 0.094, 0.0243, 11.63, 1.4817, 0.0, 0.0, 0.0, 0.0])]

In [7]:
lab = IGCrdd.map(lambda row: row[11])
print lab.take(3)

[u'0.695', u'0.760', u'0.839']


In [8]:
transformedData = lab.zip(features_transform)
transformedData = transformedData.map(lambda row: LabeledPoint(row[0],[row[1]]))
print transformedData.take(2)

[LabeledPoint(0.695, [3.46378349553,3.95399053085,0.395212652209,0.0470228736578,0.0510893363711,11.6300309624,1.35502008807,0.0,0.0,0.0,0.0]), LabeledPoint(0.76, [3.46378349553,3.95399053085,1.05390040589,0.0940457473155,0.0243282554148,11.6300309624,1.48174858552,0.0,0.0,0.0,0.0])]


In [9]:
trainingData, testingData = transformedData.randomSplit([0.8,0.2],seed=1234)

def getMSE(model):
    valuesAndPreds = testingData.map(lambda p: (p.label, model.predict(p.features[0])))
    MSE = valuesAndPreds.map(lambda (v, p): (v - p)**2).reduce(lambda x, y: x + y) / valuesAndPreds.count()
    return MSE

In [10]:
from pyspark.mllib.regression import LinearRegressionWithSGD
model = LinearRegressionWithSGD.train(trainingData, iterations=10, step=0.1)
print "Regressão linear: "
print("Mean Squared Error = " + str(getMSE(model)))



Regressão linear: 
Mean Squared Error = 2.23210734621e+18


In [11]:
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.linalg import SparseVector

model = DecisionTree.trainRegressor(trainingData, {})

In [12]:
print(model)

DecisionTreeModel regressor of depth 5 with 63 nodes


In [37]:
print testingData.take(5)

[LabeledPoint(1.064, [3.46378349553,3.59453684623,3.42517631914,0.141068620973,0.00973130216592,11.6300309624,2.07444801973,0.0,0.0,0.0,0.0]), LabeledPoint(1.266, [3.46378349553,3.59453684623,0.658687753682,0.0470228736578,0.06568628962,11.6300309624,2.46828119641,0.0,0.0,0.0,0.0]), LabeledPoint(1.28, [3.46378349553,3.23508316161,2.10780081178,0.0940457473155,0.136238230323,11.6300309624,2.49557656509,0.0,0.0,0.0,0.0]), LabeledPoint(1.311, [3.46378349553,1.79726842312,0.922162855154,0.0940457473155,0.0705519407029,11.6300309624,2.55601631002,0.0,0.0,0.0,0.0]), LabeledPoint(1.337, [3.46378349553,3.23508316161,2.10780081178,0.0940457473155,0.0754175917859,11.6300309624,2.606707709,0.0,0.0,0.0,0.0])]


In [44]:
print "Testando modelo arvore de decisao: \n"
print "Valor IGC previso para a entrada [3.46378349553,3.59453684623,3.42517631914,0.141068620973,0.00973130216592,11.6300309624,2.07444801973,0.0,0.0,0.0,0.0]:\n"
print model.predict([3.46378349553,3.59453684623,3.42517631914,0.141068620973,0.00973130216592,11.6300309624,2.07444801973,0.0,0.0,0.0,0.0])
print "\nValor IGC real: 1.064"

Testando modelo arvore de decisao: 

Valor IGC previso para a entrada [3.46378349553,3.59453684623,3.42517631914,0.141068620973,0.00973130216592,11.6300309624,2.07444801973,0.0,0.0,0.0,0.0]:

1.23296153846

Valor IGC real: 1.064


In [45]:
print "Testando modelo arvore de decisao: \n"
print "Valor IGC previso para a entrada [3.46378349553,3.59453684623,3.42517631914,0.141068620973,0.00973130216592,11.6300309624,2.07444801973,0.0,0.0,0.0,0.0]:\n"
print model.predict([3.46378349553,3.59453684623,0.658687753682,0.0470228736578,0.06568628962,11.6300309624,2.46828119641,0.0,0.0,0.0,0.0])
print "\nValor IGC real: 1.266"

Testando modelo arvore de decisao: 

Valor IGC previso para a entrada [3.46378349553,3.59453684623,3.42517631914,0.141068620973,0.00973130216592,11.6300309624,2.07444801973,0.0,0.0,0.0,0.0]:

1.23296153846

Valor IGC real: 1.266


In [46]:
print "Testando modelo arvore de decisao: \n"
print "Valor IGC previso para a entrada [3.46378349553,3.59453684623,3.42517631914,0.141068620973,0.00973130216592,11.6300309624,2.07444801973,0.0,0.0,0.0,0.0]:\n"
print model.predict([3.46378349553,3.23508316161,2.10780081178,0.0940457473155,0.136238230323,11.6300309624,2.49557656509,0.0,0.0,0.0,0.0])
print "\nValor IGC real: 1.28"

Testando modelo arvore de decisao: 

Valor IGC previso para a entrada [3.46378349553,3.59453684623,3.42517631914,0.141068620973,0.00973130216592,11.6300309624,2.07444801973,0.0,0.0,0.0,0.0]:

1.23296153846

Valor IGC real: 1.28
