# Uma Introdução ao PySpark


## Instalando o PySpark


In [1]:
# !pip install pyspark py4j

## Inicializando uma sessão no Spark

In [2]:
from pyspark.sql import SparkSession
spark = SparkSession \
          .builder \
          .appName("CDIA4-22-PySpark") \
          .getOrCreate()


spark

22/09/18 11:04:42 WARN Utils: Your hostname, gustavo-tessitore resolves to a loopback address: 127.0.1.1; using 192.168.15.23 instead (on interface enp5s0)
22/09/18 11:04:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/09/18 11:04:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/09/18 11:04:43 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Obtendo o contexto do spark
sc = spark.sparkContext
sc

## Criando um `DataFrame` a partir de uma `RDD`

In [4]:
# Creates a random list of something
rdd = sc.parallelize([
  ("XS", 2018, 5.65, 2.79, 6.24), 
  ("XR", 2018, 5.94, 2.98, 6.84), 
  ("X10", 2017, 5.65, 2.79, 6.13), 
  ("8Plus", 2017, 6.23, 3.07, 7.12)
])

# Use these names as the column names
cols = ['Model', 'Year', 'Height', 'Width', 'Weight']

# Create a DataFrame
df = spark.createDataFrame(rdd, schema=cols)
df.show()

+-----+----+------+-----+------+
|Model|Year|Height|Width|Weight|
+-----+----+------+-----+------+
|   XS|2018|  5.65| 2.79|  6.24|
|   XR|2018|  5.94| 2.98|  6.84|
|  X10|2017|  5.65| 2.79|  6.13|
|8Plus|2017|  6.23| 3.07|  7.12|
+-----+----+------+-----+------+



## Criando um `DataFrame` de arquivos

In [8]:
#TODO pedir o arquivo p Jeff
df = spark.read.csv("content/people.csv", header=True, inferSchema=True)
df.show()

AnalysisException: Path does not exist: file:/home/gustavo/Desktop/PUC/PySpark/content/people.csv

## Inspecionando `DataFrames`

In [None]:
df.dtypes

In [None]:
df.head(3)

In [None]:
df.first()

In [None]:
df.take(3)

In [None]:
df.schema

In [None]:
df.describe().show()

In [None]:
df.columns

In [None]:
df.count()

In [None]:
df.distinct().count()

In [None]:
df.printSchema()

In [None]:
df.explain()

## Queries

### `select()`

In [None]:
df.select("firstname").show()

In [None]:
df.select('firstname', 'age').show()

In [None]:
df.select(df.firstname, df.age + 1).show()

In [None]:
df.select(df.firstname, df.age > 30).show()

### `.alias()`

In [None]:
df.select(
    df.firstname,
    (df.age + 1).alias("age")
).show()

### `groupBy()`

In [None]:
grouped = df.groupBy("age").count()
grouped.show()

### `filter()`

In [None]:
filtered = df.filter(
    df.age > 30
).show()

### `substring`

In [None]:
df.select(
    df.firstname.substr(1,3).alias('name')
).show()

### `like()`

In [None]:
from pyspark.sql.functions import col

df.select(
    col("lastname"),
    col("lastname").like("%Tay%")
).show()

### `startswith()`

In [None]:
df.select(
    col("firstname"),
    col("firstname").startswith("M")
).show()

### `endswith()`

In [None]:
df.select(
    col("firstname"),
    col("firstname").endswith("a")
).show()

### `between()`

In [None]:
df.select(
    col("age"),
    col("age").between(22,30)
).show()

### Explodindo as colunas

In [None]:
# Motivação do problema: Situação inicial
# nome, idade, cursos em que a pessoa está matriculada
data = [('Jaya', '20', ['SQL','Data Science']),
        ('Milan', '21', ['ML','AI']),
        ('Rohit', '19', ['Programming', 'DSA']),
        ('Maria', '20', ['DBMS', 'Networking']),
        ('Jay', '22', ['Data Analytics','ML'])]

columns = ['Name', 'Age', 'Courses_enrolled']

df = spark.createDataFrame(data, columns)
df.printSchema()
df.show()

In [None]:
from pyspark.sql.functions import explode

exploded_df = df.select(
    df.Name,
    explode(df.Courses_enrolled)
)
exploded_df.printSchema()
exploded_df.show()

### Soluções dos Exercícios 

Considerando o dataset `movies.csv`, responda às questões do Practice 04.

In [9]:
# Importando o movies.csv
df_movies = spark.read.csv("data/movies.csv", header=True, inferSchema=True)
df_movies.show()

+--------------------+---------+--------------------+--------------+-------------+---------------+---------------+----+
|                film|    genre|         lead-studio|audience-score|profitability|rotten-tomatoes|worldwide-gross|year|
+--------------------+---------+--------------------+--------------+-------------+---------------+---------------+----+
|Zack and Miri Mak...|  Romance|The Weinstein Com...|            70|  1.747541667|             64|          41.94|2008|
|     Youth in Revolt|   Comedy|The Weinstein Com...|            52|         1.09|             68|          19.62|2010|
|You Will Meet a T...|   Comedy|         Independent|            35|  1.211818182|             43|          26.66|2010|
|        When in Rome|   Comedy|              Disney|            44|          0.0|             15|          43.04|2010|
|What Happens in V...|   Comedy|                 Fox|            72|  6.267647029|             28|         219.37|2008|
| Water For Elephants|    Drama|    20th

In [10]:
df_movies.printSchema()

root
 |-- film: string (nullable = true)
 |-- genre: string (nullable = true)
 |-- lead-studio: string (nullable = true)
 |-- audience-score: integer (nullable = true)
 |-- profitability: double (nullable = true)
 |-- rotten-tomatoes: integer (nullable = true)
 |-- worldwide-gross: double (nullable = true)
 |-- year: integer (nullable = true)



Importação dos módulos para a solução dos exercícios

In [11]:
from pyspark.sql.functions import col

In [12]:
# P01. Obtendo uma descrição estatística para os campos numéricos

df_movies.describe().show()

+-------+--------------------+-------+----------------+------------------+-----------------+------------------+------------------+------------------+
|summary|                film|  genre|     lead-studio|    audience-score|    profitability|   rotten-tomatoes|   worldwide-gross|              year|
+-------+--------------------+-------+----------------+------------------+-----------------+------------------+------------------+------------------+
|  count|                  74|     74|              74|                74|               74|                74|                74|                74|
|   mean|                null|   null|            null| 63.91891891891892|4.549382372391891| 46.91891891891892|136.35081081081074| 2009.054054054054|
| stddev|                null|   null|            null|13.683062979972128|8.174251914350178|26.332310876099108|157.06578623243522|1.3537557284756396|
|    min|(500) Days of Summer| Action|20th Century Fox|                35|              0.0|        

In [13]:
# P02. Que filmes tem audiência maior que 70?

df_movies.filter(
    col("audience-score") > 70
).show()

+--------------------+---------+--------------------+--------------+-------------+---------------+---------------+----+
|                film|    genre|         lead-studio|audience-score|profitability|rotten-tomatoes|worldwide-gross|year|
+--------------------+---------+--------------------+--------------+-------------+---------------+---------------+----+
|What Happens in V...|   Comedy|                 Fox|            72|  6.267647029|             28|         219.37|2008|
| Water For Elephants|    Drama|    20th Century Fox|            72|  3.081421053|             60|         117.09|2011|
|              WALL-E|Animation|              Disney|            89|  2.896019067|             96|         521.28|2008|
|            Twilight|  Romance|              Summit|            82|  10.18002703|             49|         376.66|2008|
|The Twilight Saga...|    Drama|              Summit|            78|      14.1964|             27|         709.82|2009|
|        The Proposal|   Comedy|        

In [14]:
# P03. Conte quantos filmes tem para cada ano

df_movies.groupBy(
  col("year")
).count().show()

+----+-----+
|year|count|
+----+-----+
|2007|   11|
|2009|   12|
|2010|   19|
|2011|   13|
|2008|   19|
+----+-----+



In [15]:
# P04. Que filmes começam com a palavra "The" nos títulos?

df_movies.select(
    col("film"),
    col("film").startswith("The")
).show()

+--------------------+---------------------+
|                film|startswith(film, The)|
+--------------------+---------------------+
|Zack and Miri Mak...|                false|
|     Youth in Revolt|                false|
|You Will Meet a T...|                false|
|        When in Rome|                false|
|What Happens in V...|                false|
| Water For Elephants|                false|
|              WALL-E|                false|
|            Waitress|                false|
| Waiting For Forever|                false|
|     Valentine's Day|                false|
|Tyler Perry's Why...|                false|
|Twilight: Breakin...|                false|
|            Twilight|                false|
|      The Ugly Truth|                 true|
|The Twilight Saga...|                 true|
|The Time Traveler...|                 true|
|        The Proposal|                 true|
|The Invention of ...|                 true|
|  The Heartbreak Kid|                 true|
|         

In [16]:
# P05. Que filmes tem receita entre 100 e 200?

df_movies.select(
    col("worldwide-gross"),
    col("worldwide-gross").between(100, 200)
).show()

+---------------+-------------------------------------------------------+
|worldwide-gross|((worldwide-gross >= 100) AND (worldwide-gross <= 200))|
+---------------+-------------------------------------------------------+
|          41.94|                                                  false|
|          19.62|                                                  false|
|          26.66|                                                  false|
|          43.04|                                                  false|
|         219.37|                                                  false|
|         117.09|                                                   true|
|         521.28|                                                  false|
|          22.18|                                                  false|
|           0.03|                                                  false|
|         217.57|                                                  false|
|          55.86|                     