#### This notebook tests basic operations on a Spark DataFrame (select, describe, show, dtypes, ...)

In [1]:
# Check the PySpark installation.
# Prerequisite: findspark, installable with `conda install -c conda-forge findspark`.
# Behind a corporate proxy, make sure the conda-forge channel is whitelisted.
import os

import findspark

findspark.init()
findspark.find()

# The hadoop-aws package version must be compatible with the Hadoop version already installed.
# Adjacent string literals are concatenated, so the value (trailing space included) is unchanged.
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    '--packages org.apache.hadoop:hadoop-aws:3.2.0 '
    '--conf spark.dynamicAllocation.enabled=true '
    'pyspark-shell '
)

In [2]:
from pyspark.sql import SparkSession

# Configure the application name first, then obtain the session from the builder.
session_builder = SparkSession.builder.appName("Dataframe")
spark = session_builder.getOrCreate()

# Last expression of the cell: display the session object.
spark


In [5]:
# Load the reviews CSV into a Spark DataFrame.
# header=True treats the first line as column names; inferSchema=True lets Spark
# derive column types from the data instead of reading everything as strings.
pdf = spark.read.csv('../data/reviews.csv', header=True, inferSchema=True)

# Preview the first 5 rows.
pdf.show(5)

+----------+----------+
|listing_id|      date|
+----------+----------+
|      2818|      null|
|      2818|2009-04-24|
|      2818|2009-05-03|
|      2818|2009-05-18|
|      2818|2009-05-25|
+----------+----------+
only showing top 5 rows



In [23]:
# Return the first 3 rows as a list of Row objects.
pdf.head(3)

[Row(listing_id=2818, date='2009-03-30'),
 Row(listing_id=2818, date='2009-04-24'),
 Row(listing_id=2818, date='2009-05-03')]

In [4]:
# List the column names of the DataFrame.
pdf.columns

['listing_id', 'date']

In [35]:
# Project the two named columns and print the first 2 rows.
pdf.select("listing_id","date").show(2)

+----------+----------+
|listing_id|      date|
+----------+----------+
|      2818|2009-03-30|
|      2818|2009-04-24|
+----------+----------+
only showing top 2 rows



In [38]:
# Indexing by name yields a Column object (a reference to the column), not its values.
pdf["date"]

Column<'date'>

In [39]:
# List (column name, type name) pairs for every column.
pdf.dtypes

[('listing_id', 'int'), ('date', 'string')]

In [40]:
# Print summary statistics (count, mean, stddev, min, max) for each column.
pdf.describe().show()

+-------+--------------------+----------+
|summary|          listing_id|      date|
+-------+--------------------+----------+
|  count|              272056|    272056|
|   mean|1.3869881695500191E7|      null|
| stddev|1.2648607608005749E7|      null|
|    min|                2818|2009-03-30|
|    max|            53294643|2021-12-05|
+-------+--------------------+----------+



In [52]:
# Add a column to the DataFrame: listing_id_2 holds listing_id shifted by 2.
shifted_listing_id = pdf["listing_id"] + 2
pdf = pdf.withColumn("listing_id_2", shifted_listing_id)


In [54]:
# Preview 5 rows to check that the listing_id_2 column is present.
pdf.show(5)

+----------+----------+------------+
|listing_id|      date|listing_id_2|
+----------+----------+------------+
|      2818|2009-03-30|        2820|
|      2818|2009-04-24|        2820|
|      2818|2009-05-03|        2820|
|      2818|2009-05-18|        2820|
|      2818|2009-05-25|        2820|
+----------+----------+------------+
only showing top 5 rows



In [56]:
## Drop Columns
# Remove the listing_id_2 column added earlier and rebind pdf to the result.

pdf = pdf.drop("listing_id_2")

In [58]:
# Preview 5 rows to check that listing_id_2 is gone.
pdf.show(5)

+----------+----------+
|listing_id|      date|
+----------+----------+
|      2818|2009-03-30|
|      2818|2009-04-24|
|      2818|2009-05-03|
|      2818|2009-05-18|
|      2818|2009-05-25|
+----------+----------+
only showing top 5 rows



In [59]:
## Rename Columns
# Show the DataFrame with listing_id renamed to id.
# The result is not assigned, so pdf itself keeps the listing_id column name.
pdf.withColumnRenamed("listing_id", "id").show(5)

+----+----------+
|  id|      date|
+----+----------+
|2818|2009-03-30|
|2818|2009-04-24|
|2818|2009-05-03|
|2818|2009-05-18|
|2818|2009-05-25|
+----+----------+
only showing top 5 rows



In [None]:
# Stop the Spark session now that the notebook is finished.
spark.stop()