# CASO DE USO: IDENTIFICACIÓN DE GRUPOS HOMOGÉNEOS EN LAS IMPORTACIONES (CLUSTERING)

In [101]:
import pyspark
# Display the SparkContext. `sc` is not created anywhere in this notebook, so it
# is presumably injected by the PySpark kernel/shell — TODO confirm.
sc

In [102]:
from pyspark.sql import functions
from pyspark.sql import SparkSession


## Importación de datasets

In [103]:
# Load the imports dataset from CSV: the first row is treated as the header and
# the column types are inferred by Spark.
importaciones = (
    spark.read
    .option("header", True)
    .option("inferSchema", "true")
    .csv("import_post3.csv")
)

                                                                                

In [104]:
# Preview the first 10 rows of the raw imports table.
importaciones.show(n=10)

+-------+---+-------+-------+-------+------+----------+------------+
|gestion|mes|  kilos|    fro|    fob|ciiur3|   nandina|departamento|
+-------+---+-------+-------+-------+------+----------+------------+
|   2020|  4|12600.0|13134.0|11100.0|  3410|8408209000|      LA PAZ|
|   2020|  4|  300.0|  320.0|  270.0|  3430|8708302900|      LA PAZ|
|   2020|  6|  284.0| 8017.0| 7872.0|  3430|8708992900|  SANTA CRUZ|
|   2020|  4|   10.0|   26.0|   25.0|  3430|8708299000|      LA PAZ|
|   2020|  4|13934.0|28933.0|27262.0|  3430|8708299000|      LA PAZ|
|   2020|  4| 1137.0| 1051.0|  925.0|  3430|8708302900|      LA PAZ|
|   2020|  4|  500.0|  598.0|  540.0|  3430|8708401000|      LA PAZ|
|   2020|  4| 2208.0| 4725.0| 4260.0|  3430|8708501100|      LA PAZ|
|   2020|  4| 6686.0|17714.0|16745.0|  3430|8708501900|      LA PAZ|
|   2020|  4| 1382.0| 2665.0| 2380.0|  3430|8708100000|      LA PAZ|
+-------+---+-------+-------+-------+------+----------+------------+
only showing top 10 rows



In [105]:
# Register the DataFrame as a temporary view so it can be queried with Spark SQL.
importaciones.createOrReplaceTempView("importaciones")

In [115]:
# Aggregate imports by year (gestion), activity code (ciiur3) and department,
# summing kilos, fro and fob. The query covers every year and department in the
# view; it is not restricted to a single department or year.
_importaciones_agrupado = spark.sql("""
SELECT gestion,  ciiur3 ,departamento, sum(kilos) kilos, sum(fro) fro, sum(fob) fob
FROM importaciones
GROUP BY gestion, ciiur3, departamento
ORDER BY gestion, ciiur3, departamento
""")

In [107]:
# Inspect the column names and types of the aggregated DataFrame.
_importaciones_agrupado.printSchema()

root
 |-- gestion: integer (nullable = true)
 |-- ciiur3: string (nullable = true)
 |-- departamento: string (nullable = true)
 |-- kilos: double (nullable = true)
 |-- fro: double (nullable = true)
 |-- fob: double (nullable = true)



In [119]:
# Build the feature vector from gestion (year), kilos (volume) and fob (price).
# The month column is not part of the features.
# NOTE(review): the columns are assembled unscaled, so their very different
# magnitudes feed directly into the distance-based clustering — confirm this
# is intended.
from pyspark.ml.feature import VectorAssembler

v_assembler = VectorAssembler(
    inputCols=["gestion", "kilos", "fob"],
    outputCol="features",
)

_importaciones = v_assembler.transform(_importaciones_agrupado)

In [120]:
# Show up to 100 aggregated rows together with their assembled feature vectors.
(
    _importaciones
    .select("gestion", "ciiur3", "departamento", "kilos", "fob", "features")
    .show(100)
)



+-------+------+------------+------------+-----------+--------------------+
|gestion|ciiur3|departamento|       kilos|        fob|            features|
+-------+------+------------+------------+-----------+--------------------+
|   2016|  0111|        BENI|     22590.0|     2396.0|[2016.0,22590.0,2...|
|   2016|  0111|  CHUQUISACA|      4752.0|     1168.0|[2016.0,4752.0,11...|
|   2016|  0111|  COCHABAMBA|    269824.0|   156292.0|[2016.0,269824.0,...|
|   2016|  0111|      LA PAZ|1.33092082E8|1.9987496E7|[2016.0,1.3309208...|
|   2016|  0111|       ORURO|   1410313.0|  1757926.0|[2016.0,1410313.0...|
|   2016|  0111|      POTOSI|   9057246.0|  1491505.0|[2016.0,9057246.0...|
|   2016|  0111|  SANTA CRUZ|   8032462.0|1.8404285E7|[2016.0,8032462.0...|
|   2016|  0111|      TARIJA|1.45041639E8|2.7028639E7|[2016.0,1.4504163...|
|   2016|  0112|  COCHABAMBA|    473782.0|   869847.0|[2016.0,473782.0,...|
|   2016|  0112|      LA PAZ|  1.941103E7|  1744818.0|[2016.0,1.941103E...|
|   2016|  0

                                                                                

In [121]:
# Configure a k-means estimator with 5 clusters; every other parameter keeps
# its default value.
from pyspark.ml.clustering import KMeans

k_means = KMeans(k=5)

In [122]:
# Print every KMeans parameter with its description, default and current value.
print(k_means.explainParams())

distanceMeasure: the distance measure. Supported options: 'euclidean' and 'cosine'. (default: euclidean)
featuresCol: features column name. (default: features)
initMode: The initialization algorithm. This can be either "random" to choose random points as initial cluster centers, or "k-means||" to use a parallel variant of k-means++ (default: k-means||)
initSteps: The number of steps for k-means|| initialization mode. Must be > 0. (default: 2)
k: The number of clusters to create. Must be > 1. (default: 2, current: 5)
maxIter: max number of iterations (>= 0). (default: 20)
predictionCol: prediction column name. (default: prediction)
seed: random seed. (default: 7969353092125344463)
tol: the convergence tolerance for iterative algorithms (>= 0). (default: 0.0001)
weightCol: weight column name. If this is not set or empty, we treat all instance weights as 1.0. (undefined)


In [123]:
# Fit the k-means model on the assembled feature vectors.
k_means_model = k_means.fit(_importaciones)

                                                                                

In [125]:
# Training summary of the fitted model; it exposes the per-cluster sizes.
summary = k_means_model.summary

In [126]:
# The cluster sizes show a heavy concentration of observations in the first cluster.
print(summary.clusterSizes)

[5329, 12, 42, 2, 185]


In [127]:
# Cluster centers (centroids): one array per cluster, with components in the
# feature order gestion, kilos, fob.
centers = k_means_model.clusterCenters() 
centers

[array([2.01846425e+03, 2.13425814e+06, 3.80515825e+06]),
 array([2.01791667e+03, 4.36621299e+08, 2.61079824e+08]),
 array([2.01826190e+03, 2.00144779e+08, 1.02914995e+08]),
 array([2.01900000e+03, 7.38093248e+08, 5.12983084e+08]),
 array([2.01803784e+03, 2.50893011e+07, 9.03818327e+07])]

In [128]:
# Apply the fitted model: each row gets its assigned cluster in the
# "prediction" column.
clustered_importaciones = k_means_model.transform(_importaciones)

In [136]:

# List up to 100 rows with their assigned cluster, highest cluster index first.
# NOTE(review): the cluster index is only a label, not a ranking by magnitude;
# compare the centroids to judge which clusters hold the largest imports.
(
    clustered_importaciones
    .select("gestion", "ciiur3", "departamento", "features", "prediction")
    .orderBy("prediction", ascending=False)
    .show(100)
)



+-------+------+----------------+--------------------+----------+
|gestion|ciiur3|    departamento|            features|prediction|
+-------+------+----------------+--------------------+----------+
|   2018|  2924|      SANTA CRUZ|[2018.0,2.1374989...|         4|
|   2018|  2811|           ORURO|[2018.0,2.0138683...|         4|
|   2018|  2924|      COCHABAMBA|[2018.0,1.166156E...|         4|
|   2018|  2429|           ORURO|[2018.0,2.5404595...|         4|
|   2018|  2520|          LA PAZ|[2018.0,3.1861604...|         4|
|   2018|  2924|          LA PAZ|[2018.0,1.3724712...|         4|
|   2017|  2921|      SANTA CRUZ|[2017.0,1.0166515...|         4|
|   2018|  2413|      SANTA CRUZ|[2018.0,5.6691349...|         4|
|   2018|  2424|      SANTA CRUZ|[2018.0,4.2180017...|         4|
|   2017|  2413|           ORURO|[2017.0,8.7178678...|         4|
|   2018|  2520|      SANTA CRUZ|[2018.0,2.3732285...|         4|
|   2018|  2911|      SANTA CRUZ|[2018.0,1.0106195...|         4|
|   2018| 

                                                                                

### Conclusiones 
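Podemos observar, dadas las cinco agrupaciones creadas, que el grupo 4 (185 observaciones) reúne importaciones significativas en cuanto a valor y volumen, muy por encima de las del grupo 0. Cabe notar que el número de grupo es solo una etiqueta: según los centroides, los valores y volúmenes más altos corresponden a los grupos 3, 1 y 2 (2, 12 y 42 observaciones, respectivamente). Entre las importaciones del grupo 4 están los productos de las siguientes actividades (ciiur3; las descripciones están disponibles en el caso de uso 1):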

    2924|   
    2811|  
    2924| 
    2429|
    2520|

Por lo tanto, los productos de las actividades mencionadas son candidatos a nichos de mercado factibles,
y la mayoría de las importaciones regulares, llamadas pequeñas, están agrupadas en el grupo 0 (5329 de las 5570 observaciones).

In [137]:
# Evaluate the clustering quality on the labelled rows.
# NOTE(review): no metric is set explicitly, so the evaluator's default is used
# (presumably the silhouette score — confirm against the PySpark docs).
from pyspark.ml.evaluation import ClusteringEvaluator

clustering_evaluator = ClusteringEvaluator(predictionCol="prediction")
clustering_evaluator.evaluate(clustered_importaciones)

                                                                                

0.946498651359283

In [76]:
# Persist the fitted model. The path is relative, so it resolves against Spark's
# configured default filesystem (HDFS according to the original note — confirm).
# overwrite() keeps this cell re-runnable: a plain save() raises an error when
# the target path already exists.
k_means_model.write().overwrite().save("models/model-cluster-import")