<a href="https://colab.research.google.com/github/freddyduitama/GVD/blob/master/0_3_dataframes_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Configuración plataforma

In [0]:
# instala el ambiente de spark..solo se corre una vez
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q http://apache.osuosl.org/spark/spark-2.4.0/spark-2.4.0-bin-hadoop2.7.tgz
!tar xf spark-2.4.0-bin-hadoop2.7.tgz
!pip install -q findspark

In [0]:
#Configura variables de ambiente en sistema operativo
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-2.4.0-bin-hadoop2.7"

In [0]:
#import  librerias a usar
import findspark
findspark.init()
from pyspark.sql import SparkSession , SQLContext
from pyspark import SparkContext,SparkConf

In [0]:
# monta el google drive para usar sus archivos
from google.colab import drive
drive.mount('/gdrive', force_remount=True)

Mounted at /gdrive


In [0]:
# lista archivos en el drive usado por colaborative
!ls -l /gdrive/'My Drive'/'Colab Notebooks'

total 109
-rw------- 1 root root 26178 Mar  6 20:17 0.1.wordcount.ipynb
-rw------- 1 root root 48280 Mar  6 20:18 0.2.Introduction-To-Spark.ipynb
-rw------- 1 root root 13996 Mar  6 20:20 0.3.dataframes-1.ipynb
-rw------- 1 root root 13485 Mar  6 20:18 0.4.dataframes-2.ipynb
drwx------ 2 root root  4096 Mar  5 20:40 casos-ejemplo
drwx------ 2 root root  4096 Mar  6 20:16 Data


In [0]:
# define variables de ambiente de la sesion SPARK
conf = SparkConf().setAppName("ejemplo").setMaster("local[*]")
sc = SparkContext(conf=conf)
spark = SQLContext(sc)

In [0]:
#inicia sesion en el cluster. 
spark = SparkSession.builder.master("local[*]").getOrCreate()

# **Ejemplo 1**  Leer archivo json y deducir el esquema del dataframe

In [0]:
#  INST A: Opcional..si quiere subir archivos al ambiente de trabajo
from google.colab import files
datafile = files.upload()

In [0]:
# lee datos de archivo json y crea DataFrame
df = spark.read.format("json").load("/gdrive/My Drive/Colab Notebooks/Data/datos.json")

In [0]:
df.show()

+--------------------+-------------------+-----+
|   DEST_COUNTRY_NAME|ORIGIN_COUNTRY_NAME|count|
+--------------------+-------------------+-----+
|       United States|            Romania|   15|
|       United States|            Croatia|    1|
|       United States|            Ireland|  344|
|               Egypt|      United States|   15|
|       United States|              India|   62|
|       United States|          Singapore|    1|
|       United States|            Grenada|   62|
|          Costa Rica|      United States|  588|
|             Senegal|      United States|   40|
|             Moldova|      United States|    1|
|       United States|       Sint Maarten|  325|
|       United States|   Marshall Islands|   39|
|              Guyana|      United States|   64|
|               Malta|      United States|    1|
|            Anguilla|      United States|   41|
|             Bolivia|      United States|   30|
|       United States|           Paraguay|    6|
|             Algeri

In [0]:
df.select("firstName", df["age"] +1).show()  

+---------+---------+
|firstName|(age + 1)|
+---------+---------+
|   Rajesh|       25|
|    Peter|       21|
|     Mary|       18|
+---------+---------+



In [0]:
df.select("firstName", "age").filter(df["age"] > 18).show()

+---------+---+
|firstName|age|
+---------+---+
|   Rajesh| 24|
|    Peter| 20|
+---------+---+



In [0]:
# importa librerias para manejar esquemas y tipos de datos con dataframe
from pyspark.sql.types import StructField, StructType, StringType, LongType

In [0]:
# define esquema a ser usado por el dataframe
myManualSchema = StructType([
  StructField("DEST_COUNTRY_NAME", StringType(), True),
  StructField("ORIGIN_COUNTRY_NAME", StringType(), True),
  StructField("count", LongType(), False, metadata={"hello":"world"})
])

In [0]:
df1 = spark.read.format("json").load("/gdrive/My Drive/Colab Notebooks/Data/2015-summary.json")

In [0]:
df1.printSchema

<bound method DataFrame.printSchema of DataFrame[DEST_COUNTRY_NAME: string, ORIGIN_COUNTRY_NAME: string, count: bigint]>

In [0]:
# lee archivo usando esquema previamente definido
df = spark.read.format("json").schema(myManualSchema).load("/gdrive/My Drive/Colab Notebooks/Data/2015-summary.json")

In [0]:
df.head(5)

[Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Romania', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Croatia', count=1),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='Ireland', count=344),
 Row(DEST_COUNTRY_NAME='Egypt', ORIGIN_COUNTRY_NAME='United States', count=15),
 Row(DEST_COUNTRY_NAME='United States', ORIGIN_COUNTRY_NAME='India', count=62)]

In [0]:
df.select("DEST_COUNTRY_NAME", "count").show(10) 

+-----------------+-----+
|DEST_COUNTRY_NAME|count|
+-----------------+-----+
|    United States|   15|
|    United States|    1|
|    United States|  344|
|            Egypt|   15|
|    United States|   62|
|    United States|    1|
|    United States|   62|
|       Costa Rica|  588|
|          Senegal|   40|
|          Moldova|    1|
+-----------------+-----+
only showing top 10 rows



In [0]:
from pyspark.sql.functions import expr, col, column
df.select(expr("DEST_COUNTRY_NAME AS destination")).show(2)

+-------------+
|  destination|
+-------------+
|United States|
|United States|
+-------------+
only showing top 2 rows



In [0]:
df.selectExpr("avg(count)", "count(distinct(DEST_COUNTRY_NAME))").show()

+-----------+---------------------------------+
| avg(count)|count(DISTINCT DEST_COUNTRY_NAME)|
+-----------+---------------------------------+
|1770.765625|                              132|
+-----------+---------------------------------+



In [0]:
df.select("ORIGIN_COUNTRY_NAME", "DEST_COUNTRY_NAME").distinct().count()

256