In [1]:
import findspark

# Register the PostgreSQL JDBC driver jar before initializing findspark.
# Presumably this is what lets the JDBC writes further down load
# org.postgresql.Driver -- confirm the jar path exists in the runtime image.
findspark.add_jars('/app/postgresql-42.1.4.jar')
# Make the local Spark installation importable as `pyspark`; must run before
# any `pyspark` import below.
findspark.init()

In [2]:
# Import packages.
# NOTE(review): numpy, matplotlib, sys, StringType, F and Window are not
# referenced by any cell visible in this notebook -- confirm before pruning.
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import sys
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType
from pyspark.sql import functions as F
from pyspark.sql.window import Window

In [3]:
from pyspark.sql import SparkSession

# Resource and shuffle settings for the session, applied in this order.
spark_settings = [
    ("spark.driver.memory", "512m"),
    ("spark.driver.cores", "1"),
    ("spark.executor.memory", "512m"),
    ("spark.executor.cores", "1"),
    ("spark.sql.shuffle.partitions", "2"),
]

# Start from a named builder and layer each setting on top of it.
session_builder = SparkSession.builder.appName("Exports:ETL")
for setting_name, setting_value in spark_settings:
    session_builder = session_builder.config(setting_name, setting_value)

# Reuse an already-running session if there is one, otherwise create it.
spark = session_builder.getOrCreate()

In [4]:
# Load the CSV into a pandas DataFrame and inspect the first rows.
df = pd.read_csv("/dataset/exports/dataset_exports.csv")
df.head()

Unnamed: 0,Pais,Alimentos y Bienes de Consumo,Combustibles,Bs Capital y Eq Transporte,Servicios
0,AFG,0.263417,0.015144,0.0,0.561113
1,ALB,0.205064,0.04181,0.014504,0.505531
2,ATG,0.438246,0.011037,0.351556,0.0
3,ARG,0.31908,0.027268,0.112044,0.196831
4,ARM,0.164575,0.030888,0.012032,0.504127


In [5]:
# Check which dtype pandas inferred for each column.
df.dtypes

Pais                              object
Alimentos y Bienes de Consumo    float64
Combustibles                     float64
Bs Capital y Eq Transporte       float64
Servicios                        float64
dtype: object

In [6]:
# Descriptive statistics for the numeric columns.
df.describe()

Unnamed: 0,Alimentos y Bienes de Consumo,Combustibles,Bs Capital y Eq Transporte,Servicios
count,134.0,134.0,134.0,134.0
mean,0.201606,0.104912,0.130454,0.311298
std,0.15012,0.184281,0.146813,0.234506
min,6.6e-05,0.0,0.0,0.0
25%,0.094992,0.009068,0.02257,0.155531
50%,0.189705,0.030297,0.069291,0.23168
75%,0.267029,0.092149,0.191982,0.4278
max,0.922326,0.903018,0.660417,0.993419


In [7]:
# Every column except the first one (`Pais`), i.e. the numeric columns only.
# Note: `X` is rebound to a Spark DataFrame in the following cell.
numeric_columns = df.columns[1:]
X = df[numeric_columns]
X.head()

Unnamed: 0,Alimentos y Bienes de Consumo,Combustibles,Bs Capital y Eq Transporte,Servicios
0,0.263417,0.015144,0.0,0.561113
1,0.205064,0.04181,0.014504,0.505531
2,0.438246,0.011037,0.351556,0.0
3,0.31908,0.027268,0.112044,0.196831
4,0.164575,0.030888,0.012032,0.504127


In [8]:
# Convert the full pandas frame (`df`, including `Pais`) into a Spark DataFrame.
# This rebinds `X`: from here on it is a Spark DataFrame, not the pandas
# subset built in the previous cell.
X = spark.createDataFrame(df)

In [9]:
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

# The four numeric columns that are packed into the clustering feature vector.
feature_columns = [
    "Alimentos y Bienes de Consumo",
    "Combustibles",
    "Bs Capital y Eq Transporte",
    "Servicios",
]

# Assemble the columns above into a single vector column named `features`.
vecAssembler = VectorAssembler(inputCols=feature_columns, outputCol="features")
new_df = vecAssembler.transform(X)
new_df.show()



+----+-----------------------------+--------------------+--------------------------+-------------------+--------------------+
|Pais|Alimentos y Bienes de Consumo|        Combustibles|Bs Capital y Eq Transporte|          Servicios|            features|
+----+-----------------------------+--------------------+--------------------------+-------------------+--------------------+
| AFG|                  0.263417141|         0.015143603|                       0.0|        0.561112985|[0.263417141,0.01...|
| ALB|                  0.205063818|         0.041810141|               0.014504183| 0.5055307139999999|[0.205063818,0.04...|
| ATG|                  0.438245802|         0.011037168|                0.35155603|                0.0|[0.438245802,0.01...|
| ARG|                  0.319079542|         0.027268256|               0.112044424|        0.196830582|[0.319079542,0.02...|
| ARM|                  0.164574629|0.030888277000000002|               0.012032132| 0.5041272720000001|[0.164574629,0

In [10]:
# k-means with 4 clusters and a fixed seed.
kmeans = KMeans(k=4, seed=1)
# Fit on the assembled feature vectors only.
features_only = new_df.select('features')
pred_clusters = kmeans.fit(features_only)
# The cluster centroids are useful for characterizing each cluster.

In [11]:
# Apply the fitted model to the assembled data; the output shown below gains a
# `prediction` column holding each row's cluster index.
transformed = pred_clusters.transform(new_df)
# Preview the result.
transformed.show()    

+----+-----------------------------+--------------------+--------------------------+-------------------+--------------------+----------+
|Pais|Alimentos y Bienes de Consumo|        Combustibles|Bs Capital y Eq Transporte|          Servicios|            features|prediction|
+----+-----------------------------+--------------------+--------------------------+-------------------+--------------------+----------+
| AFG|                  0.263417141|         0.015143603|                       0.0|        0.561112985|[0.263417141,0.01...|         3|
| ALB|                  0.205063818|         0.041810141|               0.014504183| 0.5055307139999999|[0.205063818,0.04...|         3|
| ATG|                  0.438245802|         0.011037168|                0.35155603|                0.0|[0.438245802,0.01...|         2|
| ARG|                  0.319079542|         0.027268256|               0.112044424|        0.196830582|[0.319079542,0.02...|         2|
| ARM|                  0.164574629|0.030

In [12]:
from pyspark.sql.functions import col

# Build the export table directly from the model output instead of from
# `transformed` itself. The original `transformed = transformed.select(...)`
# could only run once: after the first run the source column names ("Pais",
# "prediction", ...) no longer exist, so re-running the cell failed. Deriving
# the frame from `pred_clusters` and `new_df` makes the cell idempotent while
# producing the same result on a top-to-bottom run.
#
# The `features` vector column is dropped here by not selecting it, and the
# remaining columns get lowercase snake_case names.
# NOTE(review): "capital_tranporte" is misspelled, but it is the column name
# used by the SQL in a later cell and written to the database, so it is kept.
transformed = pred_clusters.transform(new_df).select(
    col("Pais").alias("pais"),
    col("Alimentos y Bienes de Consumo").alias("alimentos_consumo"),
    col("Combustibles").alias("combustibles"),
    col("Bs Capital y Eq Transporte").alias("capital_tranporte"),
    col("Servicios").alias("servicios"),
    col("prediction").alias("kmeans"),
)
transformed.show()


+----+--------------------+--------------------+--------------------+-------------------+------+
|pais|   alimentos_consumo|        combustibles|   capital_tranporte|          servicios|kmeans|
+----+--------------------+--------------------+--------------------+-------------------+------+
| AFG|         0.263417141|         0.015143603|                 0.0|        0.561112985|     3|
| ALB|         0.205063818|         0.041810141|         0.014504183| 0.5055307139999999|     3|
| ATG|         0.438245802|         0.011037168|          0.35155603|                0.0|     2|
| ARG|         0.319079542|         0.027268256|         0.112044424|        0.196830582|     2|
| ARM|         0.164574629|0.030888277000000002|         0.012032132| 0.5041272720000001|     3|
| ABW|         0.028001825|7.829999999999999E-5|         0.003523283| 0.9633641959999999|     3|
| AUS|          0.14148959| 0.20000406399999998|0.060076437999999996|         0.20943325|     0|
| AUT|         0.134270552|   

In [13]:
import os

# JDBC connection settings for the `workshop.exports` table.
# The URL, user and password can be supplied through environment variables so
# real credentials do not have to live in the notebook. The literal fallbacks
# keep the notebook runnable exactly as before when the variables are unset.
# NOTE(review): remove the fallback password once the environment provides
# WORKSHOP_DB_PASSWORD.
jdbc_options = {
    "url": os.environ.get("WORKSHOP_JDBC_URL", "jdbc:postgresql://postgres/workshop"),
    "dbtable": "workshop.exports",
    "user": os.environ.get("WORKSHOP_DB_USER", "workshop"),
    "password": os.environ.get("WORKSHOP_DB_PASSWORD", "w0rkzh0p"),
    "driver": "org.postgresql.Driver",
}

# Write the per-country table with its cluster label to PostgreSQL.
# mode("overwrite") replaces any existing contents of the target table.
(
    transformed.write
    .format("jdbc")
    .options(**jdbc_options)
    .mode("overwrite")
    .save()
)

In [14]:
# Expose `transformed` to Spark SQL under the name `exports`, replacing any
# existing temp view with that name.
transformed.createOrReplaceTempView("exports")

In [15]:
# Unpivot the four metric columns of the `exports` view into long format:
# stack(4, ...) emits one (metric, value) row per metric for each input row,
# while `pais` and `kmeans` are repeated on every emitted row.
unpivot_query = "Select pais, kmeans, stack(4, 'alimentos_consumo',alimentos_consumo, 'combustibles',combustibles,'capital_tranporte',capital_tranporte,'servicios',servicios) as (metric, value) from exports"
transposed = spark.sql(unpivot_query)

In [16]:
# Preview the long-format (pais, kmeans, metric, value) rows.
transposed.show()

+----+------+-----------------+--------------------+
|pais|kmeans|           metric|               value|
+----+------+-----------------+--------------------+
| AFG|     3|alimentos_consumo|         0.263417141|
| AFG|     3|     combustibles|         0.015143603|
| AFG|     3|capital_tranporte|                 0.0|
| AFG|     3|        servicios|         0.561112985|
| ALB|     3|alimentos_consumo|         0.205063818|
| ALB|     3|     combustibles|         0.041810141|
| ALB|     3|capital_tranporte|         0.014504183|
| ALB|     3|        servicios|  0.5055307139999999|
| ATG|     2|alimentos_consumo|         0.438245802|
| ATG|     2|     combustibles|         0.011037168|
| ATG|     2|capital_tranporte|          0.35155603|
| ATG|     2|        servicios|                 0.0|
| ARG|     2|alimentos_consumo|         0.319079542|
| ARG|     2|     combustibles|         0.027268256|
| ARG|     2|capital_tranporte|         0.112044424|
| ARG|     2|        servicios|         0.1968

In [17]:
import os

# JDBC connection settings for the `workshop.exports_transposed` table.
# The URL, user and password can be supplied through environment variables so
# real credentials do not have to live in the notebook. The literal fallbacks
# keep the notebook runnable exactly as before when the variables are unset.
# NOTE(review): remove the fallback password once the environment provides
# WORKSHOP_DB_PASSWORD.
jdbc_options_transposed = {
    "url": os.environ.get("WORKSHOP_JDBC_URL", "jdbc:postgresql://postgres/workshop"),
    "dbtable": "workshop.exports_transposed",
    "user": os.environ.get("WORKSHOP_DB_USER", "workshop"),
    "password": os.environ.get("WORKSHOP_DB_PASSWORD", "w0rkzh0p"),
    "driver": "org.postgresql.Driver",
}

# Write the long-format (pais, kmeans, metric, value) table to PostgreSQL.
# mode("overwrite") replaces any existing contents of the target table.
(
    transposed.write
    .format("jdbc")
    .options(**jdbc_options_transposed)
    .mode("overwrite")
    .save()
)