# Spark Dataframe Example

In [None]:
# Pandas related libraries also used with Spark
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Add the following 3 cells to run Spark in a Jupyter Notebook

In [None]:
# Set up the environment for using pyspark.
# findspark.init() has to run before the pyspark imports in the next cell.
# Presumably it locates the local Spark installation and makes pyspark
# importable from this kernel -- confirm against the findspark documentation.
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.ml.linalg import Vectors

In [None]:
# Create the application context.
# getOrCreate() hands back the session that is already running in this kernel,
# if there is one; otherwise it starts a new one named "Dataframe Example".
spark = SparkSession.builder.appName("Dataframe Example").getOrCreate()
sc = spark.sparkContext
# Only report errors so that Spark's routine log output does not flood the
# notebook. The level is spelled in upper case, as in the documented list of
# valid values (ALL, DEBUG, ERROR, FATAL, INFO, OFF, TRACE, WARN).
sc.setLogLevel("ERROR")

## Create Dataframe from csv file

In [None]:
# Load the comma-separated dataset into a Spark dataframe.
# header: take the column names from the first line of the file
# inferSchema: let Spark work out the column types from the data
csv_options = {'header': 'true', 'inferSchema': 'true'}
sdf = spark.read.format('csv').options(**csv_options).load('../datasets/Mall_Customers.csv')

In [None]:
# Print the Python type of sdf to confirm what the reader returned
print(type(sdf))

## Dataframe Manipulations

In [None]:
# Display the schema: each column name together with its data type
sdf.printSchema()

In [None]:
# Fetch the first 5 rows of data. Unlike show(), head(n) returns the rows
# to Python (as a list of Row objects) instead of printing a table.
sdf.head(5)

In [None]:
# show() prints the first 20 rows by default; here only the first 5 are
# requested, and truncate = False prints long cell values in full
sdf.show(5, truncate = False)

In [None]:
# Report the shape of the dataframe: number of rows, then number of columns.
# Spark dataframes have no .shape attribute, so the two are obtained separately.
row_total = sdf.count()
column_total = len(sdf.columns)
print('Rows: {} Columns: {}'.format(row_total, column_total))

In [None]:
# Show the data type of each column as (column name, type name) pairs
sdf.dtypes

## Statistical information

In [None]:
# Summary statistics computed over all columns of the dataframe
sdf.describe().show()

In [None]:
# Summary statistics for a single column, here 'Age'
sdf.describe('Age').show()

## Accessing information in dataframe

In [None]:
# Display only the Age and Genre columns for the first 5 rows
sdf.select('Age', 'Genre').show(5)

In [None]:
# Count how many distinct Age values occur in the data
sdf.select('Age').distinct().count()

In [None]:
# Display the distinct values that occur in the Genre column
sdf.select('Genre').distinct().show()

In [None]:
# Number of rows for each Genre value, listed from the smallest count upwards
genre_counts = sdf.groupBy('Genre').count()
genre_counts.orderBy('count').show()

In [None]:
# Cross-tabulate Age against Genre: how many rows there are for every
# combination of an Age value and a Genre value
age_by_genre = sdf.crosstab('Age', 'Genre')
# Print the complete table. The row limit is taken from the table that is
# being displayed rather than from the source dataframe, whose row count is
# only an upper bound for it.
age_by_genre.show(age_by_genre.count())

In [None]:
# Count the rows whose Age is greater than 50
older_than_50 = sdf.Age > 50
sdf.filter(older_than_50).count()

In [None]:
# Group the rows by Age and compute the mean annual income within each age.
# groupBy is spelled the same way as in the Genre count cell of this notebook.
sdf.groupBy('Age').agg({'Annual Income (k$)': 'mean'}).show()

In [None]:
# Correlation - a quantitative measure of the statistical dependence between
# two variables, here Age and annual income
sdf.stat.corr('Age', 'Annual Income (k$)')

## Convert to Pandas dataframe

In [None]:
# Convert the Spark dataframe into a pandas dataframe.
# NOTE(review): this is expected to bring every row into the local Python
# process, so it is only suitable for data that fits in memory -- confirm
# before reusing on a larger dataset.
df_pd = sdf.toPandas()

In [None]:
# head() on a pandas dataframe shows the first 5 rows by default
df_pd.head()

In [None]:
# Tail shows the last n elements of dataframe. NO SUPPORT FOR tail IN SPARK DATAFRAME
# df_pd.tail()     # will show last 5 rows

df_pd.tail(10)

In [None]:
# For each column, report whether it contains at least one missing value
df_pd.isnull().any()

In [None]:
# pandas counterpart of printing the schema of a Spark dataframe
df_pd.info()

In [None]:
# describe() without parameters summarises only the numeric features
df_pd.describe()

In [None]:
# To summarise the non-numeric features, pass the include parameter;
# 'O' selects the columns of object dtype
df_pd.describe(include = 'O')

#### Convert the Spark DataFrame into a pandas DataFrame to plot - matplotlib's pyplot works with pandas DataFrames, not directly with Spark DataFrames

In [None]:
# Scatter plot of annual income against age, drawn from the pandas dataframe
x_col, y_col = 'Age', 'Annual Income (k$)'
df_pd.plot.scatter(x=x_col, y=y_col)
plt.show()