# Descriptive Statistics Example with Pandas and Spark
## Overview


In [None]:
# Set up the environment for using pyspark.
# findspark.init() locates the local Spark installation and adds pyspark to
# sys.path, so it has to run before the `import pyspark` cell below.
import findspark
findspark.init()

In [None]:
import pyspark
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.ml.linalg import Vectors

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
import pandas as pd

import warnings
# Suppress every warning category for the rest of the session
# (presumably to keep the cell output uncluttered).
warnings.filterwarnings('ignore')

In [None]:
# Start (or reuse) the Spark session for this notebook and keep a handle on
# its underlying SparkContext.
session_builder = SparkSession.builder.appName("Descriptive Statistics")
spark = session_builder.getOrCreate()
sc = spark.sparkContext

In [None]:
# Read the CSV with its first row as the header and let Spark infer the
# column types from the data.
csv_reader = spark.read.format('csv').options(header='true', inferSchema='true')
sdf = csv_reader.load('../datasets/pima_diabetes.csv')

In [None]:
# Preview the first 3 rows of the Spark DataFrame.
sdf.show(n=3)

#### Convert to Pandas

In [None]:
# toPandas() collects the whole Spark DataFrame into driver memory as a pandas
# DataFrame, so this is only appropriate for data that fits on one machine.
df_pd = sdf.toPandas()

### Print/Display the shape of dataset

In [None]:
# (number of rows, number of columns); kept as the cell's last expression so
# the notebook displays it.
df_pd.shape

### Print/Display the datatypes of all the columns

In [None]:
# Prints each column's name, non-null count and dtype, plus memory usage.
df_pd.info()

### Display the following statistics for the dataset

Count<br>
Mean<br>
Standard Deviation<br>
Minimum Value<br>
25th Percentile<br>
50th Percentile (Median)<br>
75th Percentile<br>
Maximum Value<br>

In [None]:
# Summary statistics (count, mean, std, min, 25%/50%/75% percentiles, max)
# for the numeric columns.
df_pd.describe()

### Class Distribution - display the counts with and without diabetes

In [None]:
# Number of rows for each distinct value of the 'diabetes' column, most
# frequent value first.
df_pd['diabetes'].value_counts()

### Display the Correlations in the data
Correlation refers to the relationship between two variables and how they may or may not change together.

In [None]:
# Pairwise correlation matrix of the columns (Pearson by default).
df_pd.corr()

### Display the skew in the data
Skew measures the asymmetry of a distribution: how far a distribution that is assumed Gaussian (normal, bell-shaped) is shifted or squashed to one side.<br>
Skew can be positive (right) or negative (left). Values closer to zero indicate less skew.

In [None]:
# Unbiased sample skewness of each column.
df_pd.skew()

## Spark Dataframe Descriptive Statistics

<font color = 'red'>
<h3> Print the schema</h3>
</font>

In [None]:
# Prints the schema as a tree: column name, data type and nullability.
sdf.printSchema()

<font color = 'red'>
    <h3>Display the Spark Dataframe</h3>
    Show 10 rows
</font>

In [None]:
# Print the first 10 rows without shortening long cell values.
sdf.show(n=10, truncate=False)

<font color = 'red'>
    <h3>Print the shape of the Spark Dataframe</h3>
    number of rows and number of columns
</font>

In [None]:
# A Spark DataFrame has no .shape: the row count requires a job (count()),
# while the column count comes from the schema.
n_rows = sdf.count()
n_cols = len(sdf.columns)
print('rows: {} columns: {}'.format(n_rows, n_cols))

<font color = 'red'>
    <h3>Display the statistics of the Spark Dataframe</h3>
</font>

In [None]:
# describe() returns a new DataFrame of summary statistics; print it in full.
summary_sdf = sdf.describe()
summary_sdf.show(truncate=False)

In [None]:
# Same summary, converted to pandas so the notebook renders it as a table.
# describe() yields five rows (count, mean, stddev, min, max), so head(10)
# shows all of them.
sdf.describe().toPandas().head(10)

<font color = 'red'>
    <h3>Print Correlation between 'age' and 'bmi'</h3>
</font>

In [None]:
# Pearson correlation coefficient between the two columns, returned as a float.
sdf.stat.corr('age', 'bmi')

<font color = 'red'>
    <h3>Filter on num_preg greater than 5 and display all columns</h3>
</font>

In [None]:
# Keep only rows with more than 5 pregnancies; show() prints at most 20 rows
# by default.
sdf.where('num_preg > 5').show()