In [34]:
import os
import numpy as np

from pyspark import SparkContext
sc = SparkContext.getOrCreate()

from test_helper import Test
import os.path

fileName = os.path.join('D:/', 'IGCFINAL.csv')

numPartitions = 4
IGCraw = sc.textFile(fileName, numPartitions, 8)

In [35]:
numPoints = IGCraw.count()
print numPoints

2119


In [36]:
samplePoints = IGCraw.take(2)
print samplePoints

[u'5.0,8.0,84.0,0.8274,2.7727,0.1201,4.2806,0.0525,4.6019,2240.0,1284.0,642.0,2240.0,1.0000,35259.0,3.049821', u'5.0,8.0,63.0,0.5425,3.3987,0.2337,4.5397,0.2238,4.7079,3034.0,2497.0,480.0,2874.0,0.9473,44274.0,3.958344']


In [37]:
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.feature import StandardScaler

In [38]:
IGCrdd = (IGCraw
        .map(lambda x: x.split(','),lambda y: float(y))
        )
print IGCrdd.take(2)

[[u'5.0', u'8.0', u'84.0', u'0.8274', u'2.7727', u'0.1201', u'4.2806', u'0.0525', u'4.6019', u'2240.0', u'1284.0', u'642.0', u'2240.0', u'1.0000', u'35259.0', u'3.049821'], [u'5.0', u'8.0', u'63.0', u'0.5425', u'3.3987', u'0.2337', u'4.5397', u'0.2238', u'4.7079', u'3034.0', u'2497.0', u'480.0', u'2874.0', u'0.9473', u'44274.0', u'3.958344']]


In [39]:
features = IGCrdd.map(lambda row: row[0:14])

standardizer = StandardScaler()
model = standardizer.fit(features)
features_transform = model.transform(features)
features_transform.take(2)

[DenseVector([7.519, 3.3099, 3.9732, 9.6856, 5.3817, 2.1911, 2.8244, 1.48, 3.4977, 5.4795, 4.9566, 4.5409, 5.6773, 7.5224]),
 DenseVector([7.519, 3.3099, 2.9799, 6.3505, 6.5968, 4.2636, 2.9954, 6.309, 3.5782, 7.4218, 9.6391, 3.395, 7.2842, 7.126])]

In [40]:
lab = IGCrdd.map(lambda row: row[15])
print lab.take(3)

[u'3.049821', u'3.958344', u'3.108116']


In [41]:
transformedData = lab.zip(features_transform)
transformedData = transformedData.map(lambda row: LabeledPoint(row[0],[row[1]]))
print transformedData.take(2)

[LabeledPoint(3.049821, [7.51897954649,3.30990452554,3.97318036511,9.68559447268,5.38172174332,2.1911019644,2.8244270402,1.47999885681,3.49766650099,5.47950519146,4.95661141064,4.5408550471,5.67731579915,7.52239710004]), LabeledPoint(3.958344, [7.51897954649,3.30990452554,2.97988527383,6.35053783107,6.59676765933,4.26361806063,2.99538649591,6.30902369819,3.57823162607,7.42179408522,9.63914228379,3.39503181091,7.28419893159,7.12596677287])]


In [42]:
trainingData, testingData = transformedData.randomSplit([0.8,0.2],seed=1234)

def getMSE(model):
    valuesAndPreds = testingData.map(lambda p: (p.label, model.predict(p.features[0])))
    MSE = valuesAndPreds.map(lambda (v, p): (v - p)**2).reduce(lambda x, y: x + y) / valuesAndPreds.count()
    return MSE

In [43]:
from pyspark.mllib.regression import LinearRegressionWithSGD
model = LinearRegressionWithSGD.train(trainingData, iterations=10, step=0.1)
print "Regressão linear: "
print("Mean Squared Error = " + str(getMSE(model)))

Regressão linear: 
Mean Squared Error = 1.5152675667e+21


In [44]:
from pyspark.mllib.tree import DecisionTree
from pyspark.mllib.linalg import SparseVector

model = DecisionTree.trainRegressor(trainingData, {})

In [45]:
print(model)

DecisionTreeModel regressor of depth 5 with 63 nodes


In [46]:
print testingData.take(5)

[LabeledPoint(2.688041, [7.51897954649,4.55111872261,1.18249415628,10.6829508288,4.89803253842,1.06544841566,2.87622293859,0.817523178049,3.42021757414,1.15705623016,0.80294016621,1.55605624667,1.17601541554,7.37947155514]), LabeledPoint(2.605417, [7.51897954649,4.55111872261,0.945995325025,11.5948529432,5.03098884073,0.173317807343,2.63928144672,0.0,0.0,0.567520180544,0.154411570425,0.841684969789,0.588007707769,7.52239710004]), LabeledPoint(2.58, [4.5113877279,3.72364259123,0.0472997662513,11.7060605181,5.00769722573,0.0,0.0,0.0,0.0,0.146772460486,0.142830702643,0.134386675849,0.149536442924,7.39677306847]), LabeledPoint(2.639234, [3.0075918186,3.72364259123,0.520297428764,11.3548787026,5.0410818739,0.547319391608,2.63928144672,0.0,0.0,0.337576659117,0.196874752292,0.480962839879,0.34469417352,7.41332234209]), LabeledPoint(1.944021, [4.5113877279,3.72364259123,0.236498831256,11.7060605181,3.77324163055,0.0,0.0,0.0,0.0,0.173680744908,0.0270220248244,0.134386675849,0.177416118723,7.416

In [47]:
print "Testando modelo arvore de decisao: \n"
print "Valor IGC previso para a entrada [3.46378349553,3.59453684623,3.42517631914,0.141068620973,0.00973130216592,11.6300309624,2.07444801973,0.0,0.0,0.0,0.0]:\n"
print model.predict([7.51897954649,4.55111872261,1.18249415628,10.6829508288,4.89803253842,1.06544841566,2.87622293859,0.817523178049,3.42021757414,1.15705623016,0.80294016621,1.55605624667,1.17601541554,7.37947155514])
print "\nValor IGC real: 2.688041"

Testando modelo arvore de decisao: 

Valor IGC previso para a entrada [3.46378349553,3.59453684623,3.42517631914,0.141068620973,0.00973130216592,11.6300309624,2.07444801973,0.0,0.0,0.0,0.0]:

2.58489669128

Valor IGC real: 2.688041


In [51]:
print "Testando modelo arvore de decisao: \n"
print "Valor IGC previso para a entrada [3.46378349553,3.59453684623,3.42517631914,0.141068620973,0.00973130216592,11.6300309624,2.07444801973,0.0,0.0,0.0,0.0]:\n"
print model.predict([7.51897954649,4.55111872261,0.945995325025,11.5948529432,5.03098884073,0.173317807343,2.63928144672,0.0,0.0,0.567520180544,0.154411570425,0.841684969789,0.588007707769,7.52239710004])
print "\nValor IGC real: 2.605417"

Testando modelo arvore de decisao: 

Valor IGC previso para a entrada [3.46378349553,3.59453684623,3.42517631914,0.141068620973,0.00973130216592,11.6300309624,2.07444801973,0.0,0.0,0.0,0.0]:

2.58489669128

Valor IGC real: 2.605417


In [52]:
print "Testando modelo arvore de decisao: \n"
print "Valor IGC previso para a entrada [3.46378349553,3.59453684623,3.42517631914,0.141068620973,0.00973130216592,11.6300309624,2.07444801973,0.0,0.0,0.0,0.0]:\n"
print model.predict([3.0075918186,3.72364259123,0.520297428764,11.3548787026,5.0410818739,0.547319391608,2.63928144672,0.0,0.0,0.337576659117,0.196874752292,0.480962839879,0.34469417352,7.41332234209])
print "\nValor IGC real: 2.639234"

Testando modelo arvore de decisao: 

Valor IGC previso para a entrada [3.46378349553,3.59453684623,3.42517631914,0.141068620973,0.00973130216592,11.6300309624,2.07444801973,0.0,0.0,0.0,0.0]:

2.58489669128

Valor IGC real: 2.639234


In [55]:
print "\nDicionario\n"
print "0 Tipo Org. Acadêmica"
print "1 Categ. Administrativa"
print "2 Nr. de Cursos com CPC no Triênio"
print "3 alfa (Proporção de Graduandos)"
print "4 Conceito médio da Graduação"
print "5 beta (Proporção de Mestrandos - Equivalente)"
print "6 Conceito Médio do Mestrado"
print "7 gama (Proporção de Doutorandos - Equivalente)"
print "8 Conceito Médio do doutorado"
print "9 Total de Docentes"
print "10 Total de Docentes com Doutorado"
print "11 Total de Docentes com Mesttrado"
print "12 Total de Docentes Ativos"
print "13 Razao de Docentes Ativos"
print "14 Total de Alunos da IES"
print "15 IGC (Contínuo) Objetivo"


Dicionario

0 Tipo Org. Acadêmica
1 Categ. Administrativa
2 Nr. de Cursos com CPC no Triênio
3 alfa (Proporção de Graduandos)
4 Conceito médio da Graduação
5 beta (Proporção de Mestrandos - Equivalente)
6 Conceito Médio do Mestrado
7 gama (Proporção de Doutorandos - Equivalente)
8 Conceito Médio do doutorado
9 Total de Docentes
10 Total de Docentes com Doutorado
11 Total de Docentes com Mesttrado
12 Total de Docentes Ativos
13 Razao de Docentes Ativos
14 Total de Alunos da IES
15 IGC (Contínuo) Objetivo


In [57]:
print "Regras da arvore de decisao"
print(model.toDebugString())

Regras da arvore de decisao
DecisionTreeModel regressor of depth 5 with 63 nodes
  If (feature 4 <= 4.8821166014999005)
   If (feature 4 <= 3.847386604974796)
    If (feature 4 <= 3.216183838383229)
     If (feature 4 <= 2.8039222528234236)
      If (feature 3 <= 8.728038722297562)
       Predict: 2.860333
      Else (feature 3 > 8.728038722297562)
       Predict: 1.22206225
     Else (feature 4 > 2.8039222528234236)
      If (feature 11 <= 0.049510880575815076)
       Predict: 1.531737789473684
      Else (feature 11 > 0.049510880575815076)
       Predict: 1.5765907714285716
    Else (feature 4 > 3.216183838383229)
     If (feature 4 <= 3.502476606133094)
      If (feature 13 <= 7.3591610829713625)
       Predict: 1.7522984736842107
      Else (feature 13 > 7.3591610829713625)
       Predict: 1.7285519411764714
     Else (feature 4 > 3.502476606133094)
      If (feature 4 <= 3.689585913326904)
       Predict: 1.856317037735849
      Else (feature 4 > 3.689585913326904)
       Predict: