## Leemos CSV, infiriendo el schema

In [1]:
df = sqlContext.read.csv('../../data/big-mac-source-data.csv', header=True, inferSchema=True)

In [2]:
df.schema

StructType(List(StructField(name,StringType,true),StructField(iso_a3,StringType,true),StructField(currency_code,StringType,true),StructField(local_price,DoubleType,true),StructField(dollar_ex,DoubleType,true),StructField(GDP_dollar,DoubleType,true),StructField(date,TimestampType,true)))

In [3]:
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- iso_a3: string (nullable = true)
 |-- currency_code: string (nullable = true)
 |-- local_price: double (nullable = true)
 |-- dollar_ex: double (nullable = true)
 |-- GDP_dollar: double (nullable = true)
 |-- date: timestamp (nullable = true)



### Definimos el schema a mano

In [4]:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType

In [5]:
Schema = StructType([ StructField("name", StringType(), True),
                      StructField("iso_a3", StringType(), True),
                      StructField("currency_code", StringType(), True),
                      StructField("local_price", DoubleType(), True),
                      StructField("dollar_ex", DoubleType(), True),
                      StructField("GDP_dollar", DoubleType(), True),
                     StructField("date", TimestampType(), True)
                    ])

In [6]:
df = sqlContext.read.csv('../../data/big-mac-source-data.csv', header=True, schema=Schema)

In [7]:
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- iso_a3: string (nullable = true)
 |-- currency_code: string (nullable = true)
 |-- local_price: double (nullable = true)
 |-- dollar_ex: double (nullable = true)
 |-- GDP_dollar: double (nullable = true)
 |-- date: timestamp (nullable = true)



## Información general sobre los datos

In [8]:
df.count()

1514

In [10]:
df.head(5)

[Row(name='Argentina', iso_a3='ARG', currency_code='ARS', local_price=2.5, dollar_ex=1.0, GDP_dollar=None, date=datetime.datetime(2000, 4, 1, 0, 0)),
 Row(name='Australia', iso_a3='AUS', currency_code='AUD', local_price=2.59, dollar_ex=1.68, GDP_dollar=None, date=datetime.datetime(2000, 4, 1, 0, 0)),
 Row(name='Brazil', iso_a3='BRA', currency_code='BRL', local_price=2.95, dollar_ex=1.79, GDP_dollar=None, date=datetime.datetime(2000, 4, 1, 0, 0)),
 Row(name='Britain', iso_a3='GBR', currency_code='GBP', local_price=1.9, dollar_ex=0.632911392, GDP_dollar=None, date=datetime.datetime(2000, 4, 1, 0, 0)),
 Row(name='Canada', iso_a3='CAN', currency_code='CAD', local_price=2.85, dollar_ex=1.47, GDP_dollar=None, date=datetime.datetime(2000, 4, 1, 0, 0))]

In [11]:
df.show()

+--------------+------+-------------+-----------+-----------+----------+-------------------+
|          name|iso_a3|currency_code|local_price|  dollar_ex|GDP_dollar|               date|
+--------------+------+-------------+-----------+-----------+----------+-------------------+
|     Argentina|   ARG|          ARS|        2.5|        1.0|      null|2000-04-01 00:00:00|
|     Australia|   AUS|          AUD|       2.59|       1.68|      null|2000-04-01 00:00:00|
|        Brazil|   BRA|          BRL|       2.95|       1.79|      null|2000-04-01 00:00:00|
|       Britain|   GBR|          GBP|        1.9|0.632911392|      null|2000-04-01 00:00:00|
|        Canada|   CAN|          CAD|       2.85|       1.47|      null|2000-04-01 00:00:00|
|         Chile|   CHL|          CLP|     1260.0|      514.0|      null|2000-04-01 00:00:00|
|         China|   CHN|          CNY|        9.9|       8.28|      null|2000-04-01 00:00:00|
|Czech Republic|   CZE|          CZK|      54.37|       39.1|      nul

In [12]:
df.describe().show()

+-------+---------+------+-------------+------------------+-----------------+------------------+
|summary|     name|iso_a3|currency_code|       local_price|        dollar_ex|        GDP_dollar|
+-------+---------+------+-------------+------------------+-----------------+------------------+
|  count|     1514|  1514|         1514|              1514|             1514|               901|
|   mean|     null|  null|         null| 9058.013232345338| 3450.15151106356|27514.171355266484|
| stddev|     null|  null|         null|173614.45356185062|66303.78425411126|21072.286557425992|
|    min|Argentina|   ARE|          AED|               0.0|              0.0|          1049.749|
|    max|  Vietnam|   ZAF|          ZAR|         4000000.0|        1600500.0|        100578.968|
+-------+---------+------+-------------+------------------+-----------------+------------------+



In [13]:
df.summary().show()

+-------+---------+------+-------------+------------------+-----------------+------------------+
|summary|     name|iso_a3|currency_code|       local_price|        dollar_ex|        GDP_dollar|
+-------+---------+------+-------------+------------------+-----------------+------------------+
|  count|     1514|  1514|         1514|              1514|             1514|               901|
|   mean|     null|  null|         null| 9058.013232345338| 3450.15151106356|27514.171355266484|
| stddev|     null|  null|         null|173614.45356185062|66303.78425411126|21072.286557425992|
|    min|Argentina|   ARE|          AED|               0.0|              0.0|          1049.749|
|    25%|     null|  null|         null|              4.56|           1.2112|          9565.962|
|    50%|     null|  null|         null|              14.9|          5.75305|          21558.78|
|    75%|     null|  null|         null|              85.0|           33.135|         43724.031|
|    max|  Vietnam|   ZAF|    

In [14]:
df.columns

['name',
 'iso_a3',
 'currency_code',
 'local_price',
 'dollar_ex',
 'GDP_dollar',
 'date']

In [15]:
df.local_price

Column<b'local_price'>

## Agrupando y agregando

### Usando agg y un dict

In [19]:
df.groupby('name').agg({'local_price': 'max', 'dollar_ex': 'max'}).show()

+-----------+-----------------+----------------+
|       name|   max(dollar_ex)|max(local_price)|
+-----------+-----------------+----------------+
|     Russia|           74.655|           137.0|
|     Sweden|             10.4|            52.0|
|Philippines|             55.9|           163.0|
|   Malaysia|           4.4715|            9.05|
|  Singapore|             1.82|             5.8|
|    Britain|       0.82877507|            3.29|
|     Turkey|        1600500.0|       4000000.0|
|    Germany|      0.955246692|            4.14|
|     Jordan|          0.71005|             2.3|
|     France|      0.955246692|             4.2|
|     Greece|      0.955246692|            3.35|
|  Sri Lanka|          182.245|           640.0|
|     Taiwan|             34.8|            79.0|
|   Slovakia|0.892339267389462|             3.2|
|  Argentina|          41.8045|           120.0|
|    Belgium|      0.955246692|             4.2|
|      Qatar|          3.64175|            13.0|
|    Finland|      0

### Usando las funciones para cada operación

In [17]:
df.groupby('name').mean('local_price').show()

+-----------+------------------+
|       name|  avg(local_price)|
+-----------+------------------+
|     Russia| 79.88870967741936|
|     Sweden| 39.60491358870968|
|Philippines|112.00399999999999|
|   Malaysia| 6.878709677419354|
|  Singapore| 4.442258064516129|
|    Britain|  2.51516129032258|
|     Turkey|403455.45827586204|
|    Germany|3.7529411764705882|
|     Jordan|2.0666666666666664|
|     France|3.9470588235294124|
|     Greece|3.1819411764705885|
|  Sri Lanka| 350.5769230769231|
|     Taiwan| 73.54838709677419|
|   Slovakia|3.1333333333333333|
|  Argentina|27.982580645161292|
|    Belgium| 3.900000000000001|
|      Qatar|12.333333333333334|
|    Finland| 4.227058823529412|
|  Nicaragua|105.33333333333333|
|       Peru| 9.696142857142856|
+-----------+------------------+
only showing top 20 rows



In [18]:
df.groupby('name').max('local_price').show()

+-----------+----------------+
|       name|max(local_price)|
+-----------+----------------+
|     Russia|           137.0|
|     Sweden|            52.0|
|Philippines|           163.0|
|   Malaysia|            9.05|
|  Singapore|             5.8|
|    Britain|            3.29|
|     Turkey|       4000000.0|
|    Germany|            4.14|
|     Jordan|             2.3|
|     France|             4.2|
|     Greece|            3.35|
|  Sri Lanka|           640.0|
|     Taiwan|            79.0|
|   Slovakia|             3.2|
|  Argentina|           120.0|
|    Belgium|             4.2|
|      Qatar|            13.0|
|    Finland|            4.75|
|  Nicaragua|           110.0|
|       Peru|            11.0|
+-----------+----------------+
only showing top 20 rows



### Con agg y usando las funciones de spark

In [20]:
from pyspark.sql.functions import count, avg, min, max, round

In [21]:
df.groupby('name').agg(avg('local_price'), min('local_price'), max('local_price')).show()

+-----------+------------------+----------------+----------------+
|       name|  avg(local_price)|min(local_price)|max(local_price)|
+-----------+------------------+----------------+----------------+
|     Russia| 79.88870967741936|            35.0|           137.0|
|     Sweden| 39.60491358870968|            24.0|            52.0|
|Philippines|112.00399999999999|            59.0|           163.0|
|   Malaysia| 6.878709677419354|            4.52|            9.05|
|  Singapore| 4.442258064516129|             3.2|             5.8|
|    Britain|  2.51516129032258|            1.88|            3.29|
|     Turkey|403455.45827586204|             4.0|       4000000.0|
|    Germany|3.7529411764705882|             3.4|            4.14|
|     Jordan|2.0666666666666664|            1.95|             2.3|
|     France|3.9470588235294124|             3.5|             4.2|
|     Greece|3.1819411764705885|             2.6|            3.35|
|  Sri Lanka| 350.5769230769231|           140.0|           64

### Con agg y usando UDF

In [22]:
from pyspark.sql.functions import pandas_udf, PandasUDFType

In [23]:
@pandas_udf('double', PandasUDFType.GROUPED_AGG)
def max_udf(v):
    return v.max()

In [24]:
df.groupby('name').agg(max_udf(df.local_price)).show()

+-----------+--------------------+
|       name|max_udf(local_price)|
+-----------+--------------------+
|     Russia|               137.0|
|     Sweden|                52.0|
|Philippines|               163.0|
|   Malaysia|                9.05|
|  Singapore|                 5.8|
|    Britain|                3.29|
|     Turkey|           4000000.0|
|    Germany|                4.14|
|     Jordan|                 2.3|
|     France|                 4.2|
|     Greece|                3.35|
|  Sri Lanka|               640.0|
|     Taiwan|                79.0|
|   Slovakia|                 3.2|
|  Argentina|               120.0|
|    Belgium|                 4.2|
|      Qatar|                13.0|
|    Finland|                4.75|
|  Nicaragua|               110.0|
|       Peru|                11.0|
+-----------+--------------------+
only showing top 20 rows



## Funciones de agregación sin grupos

In [29]:
df.groupBy().agg({ 'local_price': 'mean' }).show()

+-----------------+
| avg(local_price)|
+-----------------+
|9058.013232345338|
+-----------------+



In [26]:
df.agg({ 'local_price': 'mean' }).show()

+-----------------+
| avg(local_price)|
+-----------------+
|9058.013232345338|
+-----------------+



## Filtrando filas

In [30]:
df.filter('name = "Argentina"').count()

31

In [31]:
df.filter('name = "Argentina"').show()

+---------+------+-------------+-----------+---------+----------+-------------------+
|     name|iso_a3|currency_code|local_price|dollar_ex|GDP_dollar|               date|
+---------+------+-------------+-----------+---------+----------+-------------------+
|Argentina|   ARG|          ARS|        2.5|      1.0|      null|2000-04-01 00:00:00|
|Argentina|   ARG|          ARS|        2.5|      1.0|      null|2001-04-01 00:00:00|
|Argentina|   ARG|          ARS|        2.5|     3.13|      null|2002-04-01 00:00:00|
|Argentina|   ARG|          ARS|        4.1|     2.88|      null|2003-04-01 00:00:00|
|Argentina|   ARG|          ARS|       4.36|     2.95|      null|2004-05-01 00:00:00|
|Argentina|   ARG|          ARS|       4.75|    2.897|      null|2005-06-01 00:00:00|
|Argentina|   ARG|          ARS|       4.75|   3.0638|      null|2006-01-01 00:00:00|
|Argentina|   ARG|          ARS|        7.0|   3.0565|      null|2006-05-01 00:00:00|
|Argentina|   ARG|          ARS|       8.25|  3.08875|

In [32]:
df.filter(df.name == 'Argentina').count()

31

In [33]:
df.filter(df.dollar_ex > 100).show()

+-----------+------+-------------+-----------+---------+----------+-------------------+
|       name|iso_a3|currency_code|local_price|dollar_ex|GDP_dollar|               date|
+-----------+------+-------------+-----------+---------+----------+-------------------+
|      Chile|   CHL|          CLP|     1260.0|    514.0|      null|2000-04-01 00:00:00|
|    Hungary|   HUN|          HUF|      339.0|    279.0|      null|2000-04-01 00:00:00|
|  Indonesia|   IDN|          IDR|    14500.0|   7945.0|      null|2000-04-01 00:00:00|
|      Japan|   JPN|          JPY|      294.0|    106.0|      null|2000-04-01 00:00:00|
|South Korea|   KOR|          KRW|     3000.0|   1108.0|      null|2000-04-01 00:00:00|
|      Chile|   CHL|          CLP|     1260.0|    601.0|      null|2001-04-01 00:00:00|
|    Hungary|   HUN|          HUF|      399.0|    303.0|      null|2001-04-01 00:00:00|
|  Indonesia|   IDN|          IDR|    14700.0|  10855.0|      null|2001-04-01 00:00:00|
|      Japan|   JPN|          JP

In [34]:
df.filter(df.name.isin("Argentina", "Uruguay")).show()

+---------+------+-------------+-----------+---------+----------+-------------------+
|     name|iso_a3|currency_code|local_price|dollar_ex|GDP_dollar|               date|
+---------+------+-------------+-----------+---------+----------+-------------------+
|Argentina|   ARG|          ARS|        2.5|      1.0|      null|2000-04-01 00:00:00|
|Argentina|   ARG|          ARS|        2.5|      1.0|      null|2001-04-01 00:00:00|
|Argentina|   ARG|          ARS|        2.5|     3.13|      null|2002-04-01 00:00:00|
|Argentina|   ARG|          ARS|        4.1|     2.88|      null|2003-04-01 00:00:00|
|Argentina|   ARG|          ARS|       4.36|     2.95|      null|2004-05-01 00:00:00|
|  Uruguay|   URY|          UYU|       29.8|   29.735|      null|2004-05-01 00:00:00|
|Argentina|   ARG|          ARS|       4.75|    2.897|      null|2005-06-01 00:00:00|
|  Uruguay|   URY|          UYU|       44.0|   24.125|      null|2005-06-01 00:00:00|
|Argentina|   ARG|          ARS|       4.75|   3.0638|

In [35]:
df[df.name.isin("Argentina", "Uruguay")].show()

+---------+------+-------------+-----------+---------+----------+-------------------+
|     name|iso_a3|currency_code|local_price|dollar_ex|GDP_dollar|               date|
+---------+------+-------------+-----------+---------+----------+-------------------+
|Argentina|   ARG|          ARS|        2.5|      1.0|      null|2000-04-01 00:00:00|
|Argentina|   ARG|          ARS|        2.5|      1.0|      null|2001-04-01 00:00:00|
|Argentina|   ARG|          ARS|        2.5|     3.13|      null|2002-04-01 00:00:00|
|Argentina|   ARG|          ARS|        4.1|     2.88|      null|2003-04-01 00:00:00|
|Argentina|   ARG|          ARS|       4.36|     2.95|      null|2004-05-01 00:00:00|
|  Uruguay|   URY|          UYU|       29.8|   29.735|      null|2004-05-01 00:00:00|
|Argentina|   ARG|          ARS|       4.75|    2.897|      null|2005-06-01 00:00:00|
|  Uruguay|   URY|          UYU|       44.0|   24.125|      null|2005-06-01 00:00:00|
|Argentina|   ARG|          ARS|       4.75|   3.0638|

## Descartando columnas

In [36]:
df.drop('name', 'iso_a3', 'currency_code', 'GDP_dollar').show()

+-----------+-----------+-------------------+
|local_price|  dollar_ex|               date|
+-----------+-----------+-------------------+
|        2.5|        1.0|2000-04-01 00:00:00|
|       2.59|       1.68|2000-04-01 00:00:00|
|       2.95|       1.79|2000-04-01 00:00:00|
|        1.9|0.632911392|2000-04-01 00:00:00|
|       2.85|       1.47|2000-04-01 00:00:00|
|     1260.0|      514.0|2000-04-01 00:00:00|
|        9.9|       8.28|2000-04-01 00:00:00|
|      54.37|       39.1|2000-04-01 00:00:00|
|      24.75|       8.04|2000-04-01 00:00:00|
|       2.56|1.075268817|2000-04-01 00:00:00|
|       10.2|       7.79|2000-04-01 00:00:00|
|      339.0|      279.0|2000-04-01 00:00:00|
|    14500.0|     7945.0|2000-04-01 00:00:00|
|       14.5|       4.05|2000-04-01 00:00:00|
|      294.0|      106.0|2000-04-01 00:00:00|
|       4.52|        3.8|2000-04-01 00:00:00|
|       20.9|       9.41|2000-04-01 00:00:00|
|        3.4|       2.01|2000-04-01 00:00:00|
|        5.5|        4.3|2000-04-0

## Evolución del precio del BigMac en Argentina

### En pesos

In [37]:
info_ar = df.filter('name = "Argentina"').drop('name', 'iso_a3', 'currency_code', 'GDP_dollar')
info_ar.show()

+-----------+---------+-------------------+
|local_price|dollar_ex|               date|
+-----------+---------+-------------------+
|        2.5|      1.0|2000-04-01 00:00:00|
|        2.5|      1.0|2001-04-01 00:00:00|
|        2.5|     3.13|2002-04-01 00:00:00|
|        4.1|     2.88|2003-04-01 00:00:00|
|       4.36|     2.95|2004-05-01 00:00:00|
|       4.75|    2.897|2005-06-01 00:00:00|
|       4.75|   3.0638|2006-01-01 00:00:00|
|        7.0|   3.0565|2006-05-01 00:00:00|
|       8.25|  3.08875|2007-01-01 00:00:00|
|       8.25|   3.0915|2007-06-01 00:00:00|
|       11.0|   3.0195|2008-06-01 00:00:00|
|       11.5|  3.81125|2009-07-01 00:00:00|
|        7.0|   3.7985|2010-01-01 00:00:00|
|       14.0|  3.93375|2010-07-01 00:00:00|
|       20.0|   4.1325|2011-07-01 00:00:00|
|       20.0|   4.3135|2012-01-01 00:00:00|
|       19.0|  4.56625|2012-07-01 00:00:00|
|       19.0|   4.9765|2013-01-01 00:00:00|
|       21.0|   5.4125|2013-07-01 00:00:00|
|       21.0|  6.92125|2014-01-0

In [38]:
import time
import datetime

In [39]:
data_ar = info_ar.rdd\
                 .map(lambda x: (time.mktime(x.date.timetuple()), x.local_price))\
                 .collect()

In [40]:
data_ar_list = list()
for data in data_ar:
    data_list = list(data)
    item = list()
    item.append(data_list[0]*1000)
    item.append(float(data_list[1]))
    data_ar_list.append(item)
#data_ar_list

In [41]:
from highcharts import Highchart

In [42]:
chart = Highchart()

chart.set_options('chart', {})

options = {
    'title': {
        'text': 'Precio BigMac en Pesos en Argentina'
    },
    'xAxis': {
        'type': 'datetime'
    },
    'yAxis': {
        'title': {
            'text': 'Precio'
        }
    },
    'legend': {
        'enabled': False
    },
    'tooltip': {
        'xDateFormat': '%m/%Y'
    }
}

chart.set_dict_options(options)

chart.add_data_set(data_ar_list, 'area', 'Precio ($)', marker={'enabled': False}) 

chart

### En dólares

In [44]:
data_ar_usd = df.filter('name = "Argentina"')\
                .drop('name', 'iso_a3', 'currency_code', 'GDP_dollar')\
                .rdd\
                .map(lambda x: (int(time.mktime(x.date.timetuple())), float(x.local_price)/float(x.dollar_ex)))\
                .collect()

In [45]:
data_ar_usd

[(954558000, 2.5),
 (986094000, 2.5),
 (1017630000, 0.7987220447284346),
 (1049166000, 1.423611111111111),
 (1083380400, 1.4779661016949153),
 (1117594800, 1.6396272005522956),
 (1136084400, 1.550362295188981),
 (1146452400, 2.290201210534925),
 (1167620400, 2.670983407527317),
 (1180666800, 2.668607472100922),
 (1212289200, 3.642987249544627),
 (1246417200, 3.0173827484421123),
 (1262314800, 1.842832697117283),
 (1277953200, 3.558945027009851),
 (1309489200, 4.839685420447671),
 (1325386800, 4.636606004404776),
 (1341111600, 4.160963591568573),
 (1357009200, 3.8179443383904355),
 (1372647600, 3.8799076212471135),
 (1388545200, 3.0341340075853354),
 (1404183600, 2.5707727620504977),
 (1420081200, 3.2520325203252036),
 (1435719600, 3.0651340996168583),
 (1451617200, 2.3897025544472004),
 (1467342000, 3.3478406427854033),
 (1483239600, 3.4683903515686585),
 (1498878000, 4.1255341093266535),
 (1514775600, 3.9603960396039604),
 (1530414000, 2.7051397655545535),
 (1546311600, 2.669870511280

In [46]:
data_ar_usd_list = list()
for data in data_ar_usd:
    data_list = list(data)
    item = list()
    item.append(data_list[0]*1000)
    item.append(data_list[1])
    data_ar_usd_list.append(item)
#data_ar_usd_list

In [47]:
chart = Highchart()

chart.set_options('chart', {})

options = {
    'title': {
        'text': 'Precio BigMac en USD en Argentina'
    },
    'xAxis': {
        'type': 'datetime'
    },
    'yAxis': {
        'title': {
            'text': 'Precio (USD)'
        }
    },
    'legend': {
        'enabled': False
    },
    'tooltip': {
        'xDateFormat': '%m/%Y'
    }
}

chart.set_dict_options(options)

chart.add_data_set(data_ar_usd_list, 'area', 'Precio (USD)', marker={'enabled': False}) 

chart

## Agregar una columna

In [48]:
df_usd = df.withColumn('usd_price', round((df.local_price / df.dollar_ex),2))

In [49]:
df_usd.show()

+--------------+------+-------------+-----------+-----------+----------+-------------------+---------+
|          name|iso_a3|currency_code|local_price|  dollar_ex|GDP_dollar|               date|usd_price|
+--------------+------+-------------+-----------+-----------+----------+-------------------+---------+
|     Argentina|   ARG|          ARS|        2.5|        1.0|      null|2000-04-01 00:00:00|      2.5|
|     Australia|   AUS|          AUD|       2.59|       1.68|      null|2000-04-01 00:00:00|     1.54|
|        Brazil|   BRA|          BRL|       2.95|       1.79|      null|2000-04-01 00:00:00|     1.65|
|       Britain|   GBR|          GBP|        1.9|0.632911392|      null|2000-04-01 00:00:00|      3.0|
|        Canada|   CAN|          CAD|       2.85|       1.47|      null|2000-04-01 00:00:00|     1.94|
|         Chile|   CHL|          CLP|     1260.0|      514.0|      null|2000-04-01 00:00:00|     2.45|
|         China|   CHN|          CNY|        9.9|       8.28|      null|2

In [50]:
df_usd.groupBy().avg('usd_price').show()

+------------------+
|    avg(usd_price)|
+------------------+
|3.3926437541308734|
+------------------+



## Info de todos los paises para la última medición

In [51]:
countries_df = df_usd.filter('date = "2019-07-09"')

In [52]:
countries_df.count()

72

In [53]:
countries_df.show()

+--------------------+------+-------------+-----------+-----------------+----------+-------------------+---------+
|                name|iso_a3|currency_code|local_price|        dollar_ex|GDP_dollar|               date|usd_price|
+--------------------+------+-------------+-----------+-----------------+----------+-------------------+---------+
|United Arab Emirates|   ARE|          AED|      14.75|          3.67315|  37732.66|2019-07-09 00:00:00|     4.02|
|           Argentina|   ARG|          ARS|      120.0|          41.8045|  14588.01|2019-07-09 00:00:00|     2.87|
|           Australia|   AUS|          AUD|       6.15|  1.4436263894904|  55957.72|2019-07-09 00:00:00|     4.26|
|             Austria|   AUT|          EUR|       3.81|0.892339267389462|  47383.87|2019-07-09 00:00:00|     4.27|
|          Azerbaijan|   AZE|          AZN|       3.95|           1.6965|   4200.33|2019-07-09 00:00:00|     2.33|
|             Belgium|   BEL|          EUR|        4.2|0.892339267389462|  43672

## Ordenando

In [55]:
countries_sorted_df = countries_df.orderBy(countries_df.usd_price.asc())

In [57]:
countries_sorted_df.explain(True)

== Parsed Logical Plan ==
Sort [usd_price#1306 ASC NULLS FIRST], true
+- Filter (date#30 = cast(2019-07-09 as timestamp))
   +- Project [name#24, iso_a3#25, currency_code#26, local_price#27, dollar_ex#28, GDP_dollar#29, date#30, round((local_price#27 / dollar_ex#28), 2) AS usd_price#1306]
      +- Relation[name#24,iso_a3#25,currency_code#26,local_price#27,dollar_ex#28,GDP_dollar#29,date#30] csv

== Analyzed Logical Plan ==
name: string, iso_a3: string, currency_code: string, local_price: double, dollar_ex: double, GDP_dollar: double, date: timestamp, usd_price: double
Sort [usd_price#1306 ASC NULLS FIRST], true
+- Filter (date#30 = cast(2019-07-09 as timestamp))
   +- Project [name#24, iso_a3#25, currency_code#26, local_price#27, dollar_ex#28, GDP_dollar#29, date#30, round((local_price#27 / dollar_ex#28), 2) AS usd_price#1306]
      +- Relation[name#24,iso_a3#25,currency_code#26,local_price#27,dollar_ex#28,GDP_dollar#29,date#30] csv

== Optimized Logical Plan ==
Sort [usd_price#1306 AS

In [56]:
countries_sorted_df.show()

+------------+------+-------------+-----------+-----------------+----------+-------------------+---------+
|        name|iso_a3|currency_code|local_price|        dollar_ex|GDP_dollar|               date|usd_price|
+------------+------+-------------+-----------+-----------------+----------+-------------------+---------+
|      Russia|   RUS|          RUB|      130.0|         63.83875|  10961.99|2019-07-09 00:00:00|     2.04|
|    Malaysia|   MYS|          MYR|       8.85|           4.1425|   9827.67|2019-07-09 00:00:00|     2.14|
|South Africa|   ZAF|          ZAR|       31.0|           14.175|   6182.25|2019-07-09 00:00:00|     2.19|
|     Romania|   ROU|          RON|        9.3|           4.2192|  10761.71|2019-07-09 00:00:00|      2.2|
|     Ukraine|   UKR|          UAH|       57.0|          25.6475|   2655.94|2019-07-09 00:00:00|     2.22|
|   Indonesia|   IDN|          IDR|    32000.0|          14130.0|   3884.72|2019-07-09 00:00:00|     2.26|
|      Taiwan|   TWN|          TWD|  

In [58]:
countries_info = countries_sorted_df.drop('iso_a3', 'currency_code', 'local_price', 'dollar_ex', 'GDP_dollar', 'date')\
                                    .collect()

In [59]:
countries_info

[Row(name='Russia', usd_price=2.04),
 Row(name='Malaysia', usd_price=2.14),
 Row(name='South Africa', usd_price=2.19),
 Row(name='Romania', usd_price=2.2),
 Row(name='Ukraine', usd_price=2.22),
 Row(name='Indonesia', usd_price=2.26),
 Row(name='Taiwan', usd_price=2.31),
 Row(name='Azerbaijan', usd_price=2.33),
 Row(name='Moldova', usd_price=2.41),
 Row(name='Turkey', usd_price=2.44),
 Row(name='Egypt', usd_price=2.53),
 Row(name='Hong Kong', usd_price=2.62),
 Row(name='Mexico', usd_price=2.65),
 Row(name='India', usd_price=2.67),
 Row(name='Slovenia', usd_price=2.69),
 Row(name='Philippines', usd_price=2.77),
 Row(name='Vietnam', usd_price=2.8),
 Row(name='Poland', usd_price=2.84),
 Row(name='Argentina', usd_price=2.87),
 Row(name='China', usd_price=3.05),
 Row(name='Pakistan', usd_price=3.05),
 Row(name='Hungary', usd_price=3.1),
 Row(name='Oman', usd_price=3.14),
 Row(name='Lithuania', usd_price=3.19),
 Row(name='Latvia', usd_price=3.19),
 Row(name='Peru', usd_price=3.19),
 Row(name=

In [60]:
countries = list()
for info in countries_info:
    countries.append(info.name)
#countries

In [61]:
countries_price = list()
for info in countries_info:
    countries_price.append(info.usd_price)
#countries_price

In [62]:
chart = Highchart()

chart.set_options('chart', {'inverted': True, 'height': 1500})

options = {
    'title': {
        'text': 'Precio BigMac en USD Enero 2019'
    },
    'xAxis': {
        'categories': countries
    },
    'plotOptions': {
        'series': {
            'pointWidth': 5
        } 
    },
    'yAxis': {
        'min': 0,
        'title': {
            'text': 'Precio (USD)'
        }
    },
    'legend': {
        'enabled': False
    },
    'tooltip': {
        'xDateFormat': '%m/%Y'
    }
}

chart.set_dict_options(options)

chart.add_data_set(countries_price, 'bar', 'Precio (USD)', marker={'enabled': False}) 

chart

In [63]:
countries_price_usa = list()
for info in countries_info:
    countries_price_usa.append(info.usd_price-5.74)
#countries_price_usa

In [64]:
chart = Highchart()

chart.set_options('chart', {'inverted': True, 'height': 1500})

options = {
    'title': {
        'text': 'Precio BigMac en USD con USA como referencia (Enero 2019)'
    },
    'xAxis': {
        'categories': countries
    },
    'plotOptions': {
        'series': {
            'pointWidth': 5
        } 
    },
    'yAxis': {
        'title': {
            'text': 'Precio (USD)'
        }
    },
    'legend': {
        'enabled': False
    },
    'tooltip': {
        'xDateFormat': '%m/%Y'
    }
}

chart.set_dict_options(options)

chart.add_data_set(countries_price_usa, 'bar', 'Precio (USD)', marker={'enabled': False}) 

chart

## Evolución del precio del BigMac en USA

In [65]:
data_us = df.filter('name = "United States"')\
            .drop('name', 'iso_a3', 'currency_code', 'GDP_dollar')\
            .rdd\
            .map(lambda x: (time.mktime(x.date.timetuple()), x.local_price))\
            .collect()

In [66]:
data_us_list = list()
for data in data_us:
    data_list = list(data)
    item = list()
    item.append(data_list[0]*1000)
    item.append(float(data_list[1]))
    data_us_list.append(item)
#data_us_list

In [67]:
chart = Highchart()

chart.set_options('chart', {})

options = {
    'title': {
        'text': 'Precio BigMac USA'
    },
    'xAxis': {
        'type': 'datetime'
    },
    'yAxis': {
        'title': {
            'text': 'Precio (USD)'
        }
    },
    'legend': {
        'enabled': False
    },
    'tooltip': {
        'xDateFormat': '%m/%Y'
    }
}

chart.set_dict_options(options)

chart.add_data_set(data_us_list, 'area', 'Precio (USD)', marker={'enabled': False}) 

chart

## Eliminar duplicados

In [68]:
df.dropDuplicates(['name']).count()

74

## Eliminar nulos

In [69]:
df.count()

1514

In [70]:
df.dropna().count()

901

Parameters
how – ‘any’ or ‘all’. If ‘any’, drop a row if it contains any nulls. If ‘all’, drop a row only if all its values are null.

thresh – int, default None If specified, drop rows that have less than thresh non-null values. This overwrites the how parameter.

subset – optional list of column names to consider.

In [71]:
df.dropna(thresh=7).count()

901

In [72]:
df.dropna(subset=['name']).count()

1514

## Completar nulos

In [73]:
df.show()

+--------------+------+-------------+-----------+-----------+----------+-------------------+
|          name|iso_a3|currency_code|local_price|  dollar_ex|GDP_dollar|               date|
+--------------+------+-------------+-----------+-----------+----------+-------------------+
|     Argentina|   ARG|          ARS|        2.5|        1.0|      null|2000-04-01 00:00:00|
|     Australia|   AUS|          AUD|       2.59|       1.68|      null|2000-04-01 00:00:00|
|        Brazil|   BRA|          BRL|       2.95|       1.79|      null|2000-04-01 00:00:00|
|       Britain|   GBR|          GBP|        1.9|0.632911392|      null|2000-04-01 00:00:00|
|        Canada|   CAN|          CAD|       2.85|       1.47|      null|2000-04-01 00:00:00|
|         Chile|   CHL|          CLP|     1260.0|      514.0|      null|2000-04-01 00:00:00|
|         China|   CHN|          CNY|        9.9|       8.28|      null|2000-04-01 00:00:00|
|Czech Republic|   CZE|          CZK|      54.37|       39.1|      nul

In [74]:
df.fillna(0).show()

+--------------+------+-------------+-----------+-----------+----------+-------------------+
|          name|iso_a3|currency_code|local_price|  dollar_ex|GDP_dollar|               date|
+--------------+------+-------------+-----------+-----------+----------+-------------------+
|     Argentina|   ARG|          ARS|        2.5|        1.0|       0.0|2000-04-01 00:00:00|
|     Australia|   AUS|          AUD|       2.59|       1.68|       0.0|2000-04-01 00:00:00|
|        Brazil|   BRA|          BRL|       2.95|       1.79|       0.0|2000-04-01 00:00:00|
|       Britain|   GBR|          GBP|        1.9|0.632911392|       0.0|2000-04-01 00:00:00|
|        Canada|   CAN|          CAD|       2.85|       1.47|       0.0|2000-04-01 00:00:00|
|         Chile|   CHL|          CLP|     1260.0|      514.0|       0.0|2000-04-01 00:00:00|
|         China|   CHN|          CNY|        9.9|       8.28|       0.0|2000-04-01 00:00:00|
|Czech Republic|   CZE|          CZK|      54.37|       39.1|       0.

value:

* It will take a dictionary to specify which column will replace with which value.
* A value (int , float, string) for all columns.
 
subset: Specify some selected columns.

## Reemplazar valores

In [75]:
df.replace({'Argentina': 'AR'}, subset=['name']).show()

+--------------+------+-------------+-----------+-----------+----------+-------------------+
|          name|iso_a3|currency_code|local_price|  dollar_ex|GDP_dollar|               date|
+--------------+------+-------------+-----------+-----------+----------+-------------------+
|            AR|   ARG|          ARS|        2.5|        1.0|      null|2000-04-01 00:00:00|
|     Australia|   AUS|          AUD|       2.59|       1.68|      null|2000-04-01 00:00:00|
|        Brazil|   BRA|          BRL|       2.95|       1.79|      null|2000-04-01 00:00:00|
|       Britain|   GBR|          GBP|        1.9|0.632911392|      null|2000-04-01 00:00:00|
|        Canada|   CAN|          CAD|       2.85|       1.47|      null|2000-04-01 00:00:00|
|         Chile|   CHL|          CLP|     1260.0|      514.0|      null|2000-04-01 00:00:00|
|         China|   CHN|          CNY|        9.9|       8.28|      null|2000-04-01 00:00:00|
|Czech Republic|   CZE|          CZK|      54.37|       39.1|      nul

Parameters
to_replace – bool, int, long, float, string, list or dict. Value to be replaced. If the value is a dict, then value is ignored or can be omitted, and to_replace must be a mapping between a value and a replacement.

value – bool, int, long, float, string, list or None. The replacement value must be a bool, int, long, float, string or None. If value is a list, value should be of the same length and type as to_replace. If value is a scalar and to_replace is a sequence, then value is used as a replacement for each item in to_replace.

subset – optional list of column names to consider. Columns specified in subset that do not have matching data type are ignored. For example, if value is a string, and subset contains a non-string column, then the non-string column is simply ignored.

## Columna a partir de expresión SQL

In [77]:
df.selectExpr("local_price * dollar_ex as usd_price").show()

+------------------+
|         usd_price|
+------------------+
|               2.5|
|4.3511999999999995|
| 5.280500000000001|
|      1.2025316448|
|            4.1895|
|          647640.0|
|            81.972|
|          2125.867|
|198.98999999999998|
|     2.75268817152|
|            79.458|
|           94581.0|
|        1.152025E8|
|58.724999999999994|
|           31164.0|
|            17.176|
|196.66899999999998|
| 6.833999999999999|
|             23.65|
|           1125.75|
+------------------+
only showing top 20 rows



## Renombrar columna

In [78]:
df.withColumnRenamed('local_price','price').show()

+--------------+------+-------------+-------+-----------+----------+-------------------+
|          name|iso_a3|currency_code|  price|  dollar_ex|GDP_dollar|               date|
+--------------+------+-------------+-------+-----------+----------+-------------------+
|     Argentina|   ARG|          ARS|    2.5|        1.0|      null|2000-04-01 00:00:00|
|     Australia|   AUS|          AUD|   2.59|       1.68|      null|2000-04-01 00:00:00|
|        Brazil|   BRA|          BRL|   2.95|       1.79|      null|2000-04-01 00:00:00|
|       Britain|   GBR|          GBP|    1.9|0.632911392|      null|2000-04-01 00:00:00|
|        Canada|   CAN|          CAD|   2.85|       1.47|      null|2000-04-01 00:00:00|
|         Chile|   CHL|          CLP| 1260.0|      514.0|      null|2000-04-01 00:00:00|
|         China|   CHN|          CNY|    9.9|       8.28|      null|2000-04-01 00:00:00|
|Czech Republic|   CZE|          CZK|  54.37|       39.1|      null|2000-04-01 00:00:00|
|       Denmark|   DN

## Pivoteando

In [79]:
df.groupBy("name").pivot("date").min('local_price').show()

+-----------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+-------------------+
|       name|2000-04-01 00:00:00|2001-04-01 00:00:00|2002-04-01 00:00:00|2003-04-01 00:00:00|2004-05-01 00:00:00|2005-06-01 00:00:00|2006-01-01 00:00:00|2006-05-01 00:00:00|2007-01-01 00:00:00|2007-06-01 00:00:00|2008-06-01 00:00:00|2009-07-01 00:00:00|2010-01-01 00:00:00|2010-07-01 00:00:00|2011-07-01 00:00:00|2012-01-01 00:00:00|2012-07-01 00:00:00|2013-01-01 00

## Obtener Panda DataFrame

In [80]:
df.toPandas()

Unnamed: 0,name,iso_a3,currency_code,local_price,dollar_ex,GDP_dollar,date
0,Argentina,ARG,ARS,2.50,1.000000,,2000-04-01
1,Australia,AUS,AUD,2.59,1.680000,,2000-04-01
2,Brazil,BRA,BRL,2.95,1.790000,,2000-04-01
3,Britain,GBR,GBP,1.90,0.632911,,2000-04-01
4,Canada,CAN,CAD,2.85,1.470000,,2000-04-01
5,Chile,CHL,CLP,1260.00,514.000000,,2000-04-01
6,China,CHN,CNY,9.90,8.280000,,2000-04-01
7,Czech Republic,CZE,CZK,54.37,39.100000,,2000-04-01
8,Denmark,DNK,DKK,24.75,8.040000,,2000-04-01
9,Euro area,EUZ,EUR,2.56,1.075269,,2000-04-01


## Join

In [None]:
#df.join(other, on=None, how=None)

## SQL

In [81]:
df.registerTempTable('bigmac')

In [86]:
sqlCtx.sql('select name, max(local_price) from bigmac group by name order by max(local_price) desc limit 10 ').show()

+-----------+----------------+
|       name|max(local_price)|
+-----------+----------------+
|     Turkey|       4000000.0|
|    Vietnam|         65000.0|
|  Indonesia|         35750.0|
|   Colombia|         11900.0|
|  Venezuela|         10950.0|
|    Lebanon|          6500.0|
|South Korea|          4500.0|
|      Chile|          2640.0|
| Costa Rica|          2290.0|
|    Hungary|           900.0|
+-----------+----------------+



## PySpark DataFrame Vs Pandas DataFrame

* Operation on Pyspark DataFrame run parallel on different nodes in cluster but, in case of pandas it is not possible.
* Operations in PySpark DataFrame are lazy in nature but, in case of pandas we get the result as soon as we apply any operation.
* In PySpark DataFrame, we can’t change the DataFrame due to it’s immutable property, we need to transform it. But in pandas it is not the case.
* Pandas API support more operations than PySpark DataFrame. Still pandas API is more powerful than Spark.
* Complex operations in pandas are easier to perform than Pyspark DataFrame