# Caso de uso - Regresión logística aplicada al saldo comercial

In [1]:
from pyspark.mllib.classification import NaiveBayes, NaiveBayesModel
from pyspark.mllib.linalg import Vectors
from pyspark.mllib.regression import LabeledPoint

In [21]:
# Load the trade-balance CSV (carried over from use case 1).
saldo_comercial = (
    spark.read
    .option("header", True)          # treat the first row as column names
    .option("inferSchema", "true")   # let Spark infer each column's type
    .csv("saldo_comercial_nacional.csv")
)

                                                                                

In [32]:
# Register the DataFrame as a temporary view named "saldo_label" so it can be
# queried with spark.sql(); an existing view with the same name is replaced.
saldo_comercial.createOrReplaceTempView("saldo_label")

In [72]:
# Concatenate the gestion, departamento and descripcion columns into a single
# string column and keep the import/export amounts plus the outcome column.
# NOTE: concat() inserts no separator between the three values.
# NOTE: the alias is spelled `descrpcion` (sic); the RFormula and the final
# select() in this notebook reference that exact spelling.
# NOTE: the Python variable shares its name with the "saldo_label" temp view.
saldo_label = spark.sql("""
SELECT concat(gestion, departamento, descripcion) descrpcion, import_millones, export_millones, outcome
FROM saldo_label
""")

In [73]:
from pyspark.ml.feature import RFormula

In [84]:
# R-style model formula: `outcome` is the label, `.` takes every other column
# as a feature, and each `a:b` term adds an interaction between two columns.
supervised = RFormula(
    formula="outcome ~. + descrpcion:import_millones + descrpcion:export_millones"
)

In [85]:
# RFormula is an Estimator, so it has to be fitted to the data with fit().
fittedRF = supervised.fit(saldo_label)

                                                                                

In [86]:
# The fitted RFormula model is a Transformer: transform() returns the input
# rows with the `features` and `label` columns appended.
preparedDF = fittedRF.transform(saldo_label)

In [87]:
# Print the prepared rows without truncating long cell values
# (show() prints the first 20 rows by default).
preparedDF.show(truncate=False)

+------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+---------------+-------+-------------------------------------------------------------------+-----+
|descrpcion                                                                                                                                                        |import_millones|export_millones|outcome|features                                                           |label|
+------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------+---------------+-------+-------------------------------------------------------------------+-----+
|2016SANTA CRUZFabricación de otros artículos de papel y cartón.                                                                                                   

In [88]:
# Split the prepared data roughly 70% / 30% into training and test sets.
# A fixed seed keeps the split repeatable across kernel restarts; without it
# every run samples different rows, so results cannot be reproduced.
train, test = preparedDF.randomSplit([0.7, 0.3], seed=42)

In [89]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression estimator reading the `features` / `label` columns
# produced by the RFormula step.
lr = LogisticRegression(featuresCol="features", labelCol="label")

In [90]:
# Print every LogisticRegression parameter with its description, default
# value and, where set, its current value.
print(lr.explainParams())

aggregationDepth: suggested depth for treeAggregate (>= 2). (default: 2)
elasticNetParam: the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty. (default: 0.0)
family: The name of family which is a description of the label distribution to be used in the model. Supported options: auto, binomial, multinomial (default: auto)
featuresCol: features column name. (default: features, current: features)
fitIntercept: whether to fit an intercept term. (default: True)
labelCol: label column name. (default: label, current: label)
lowerBoundsOnCoefficients: The lower bounds on coefficients if fitting under bound constrained optimization. The bound matrix must be compatible with the shape (1, number of features) for binomial regression, or (number of classes, number of features) for multinomial regression. (undefined)
lowerBoundsOnIntercepts: The lower bounds on intercepts if fitting under bound constrained optimization. The

In [91]:
# Train the logistic regression model on the training split.
fittedLR = lr.fit(train)

2021-08-17 21:11:06,405 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 2.5 MiB
2021-08-17 21:11:07,846 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 2.5 MiB
2021-08-17 21:11:08,034 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 2.5 MiB
2021-08-17 21:11:08,780 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 2.5 MiB
2021-08-17 21:11:08,939 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 2.5 MiB
2021-08-17 21:11:09,143 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 2.5 MiB
2021-08-17 21:11:09,314 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 2.5 MiB
2021-08-17 21:11:09,467 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 2.5 MiB
2021-08-17 21:11:09,638 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 2.5 MiB
2021-08-17 21:11:09,879 WARN scheduler.DAGScheduler: Broadcasting large task binar

In [92]:
# Score the rows with the fitted model; transform() appends the
# rawPrediction, probability and prediction columns.
# NOTE(review): this scores the *training* split; the held-out `test` split is
# not used in the cells visible here, so these predictions do not measure
# generalisation. Consider scoring `test` as well.
outputDF = fittedLR.transform(train)

In [95]:
# Evaluating the bare name displays the DataFrame's column names and types,
# not its rows.
outputDF

DataFrame[descrpcion: string, import_millones: double, export_millones: double, outcome: int, features: vector, label: double, rawPrediction: vector, probability: vector, prediction: double]

In [99]:
# Show the first 10 scored rows: the concatenated description, the true label,
# the per-class probabilities and the predicted class, without truncation.
(
    outputDF
    .select("descrpcion", "label", "probability", "prediction")
    .show(10, truncate=False)
)

2021-08-17 21:18:24,951 WARN scheduler.DAGScheduler: Broadcasting large task binary with size 2.6 MiB


+----------------------------------------------------------------------------------------------------------------------+-----+------------------------------------------+----------+
|descrpcion                                                                                                            |label|probability                               |prediction|
+----------------------------------------------------------------------------------------------------------------------+-----+------------------------------------------+----------+
|2016CHUQUISACAFabricación de aparatos de distribución y control de la energía eléctrica.                              |0.0  |[0.9999999958953978,4.104602213672592E-9] |0.0       |
|2016CHUQUISACAFabricación de artículos de cuchillería, herramientas de mano y artículos de ferretería.                |0.0  |[0.99999999589574,4.104260042936403E-9]   |0.0       |
|2016CHUQUISACAFabricación de instrumentos de óptica y equipo fotográfico.                     

# Conclusiones
Según los resultados, el modelo muestra lo siguiente:

- En Chuquisaca, es muy probable que la actividad «Fabricación de aparatos de distribución y control de la energía eléctrica» se exporte.
- En cambio, en Cochabamba es muy probable que la actividad «Fabricación de bombas, compresores, grifos y válvulas» se importe.