# Descriptive Statistics Lab
## Overview


In [None]:
# Set up the environment for using pyspark.
# findspark.init() is called here, before the pyspark imports in the
# following cells, so that the pyspark package can be located and imported.
import findspark
findspark.init()

In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf, SparkContext
from pyspark.ml.linalg import Vectors

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns
import pandas as pd

# Suppress all Python warnings for the rest of the session so that cell
# output stays uncluttered.
# NOTE(review): this also hides deprecation warnings from pandas/pyspark.
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Create the application context: obtain the Spark session for this lab
# (an existing session is reused if there is one), keep a handle to its
# SparkContext, and restrict Spark's log output to the error level.
spark = (
    SparkSession.builder
    .appName("Descriptive Statistics")
    .getOrCreate()
)
sc = spark.sparkContext
sc.setLogLevel("Error")

In [None]:
# Read the Pima diabetes CSV into a Spark DataFrame.
# The first row is treated as the header and column types are inferred
# from the data rather than all being read as strings.
sdf = (
    spark.read
    .format('csv')
    .options(header='true', inferSchema='true')
    .load('../datasets/pima_diabetes.csv')
)

In [None]:
# Preview the first 3 rows of the Spark DataFrame.
sdf.show(3)

## Convert to Pandas

In [None]:
# Convert the Spark DataFrame into a pandas DataFrame (df_pd); this pulls
# the whole dataset into local memory for the pandas-based statistics.
df_pd = sdf.toPandas()

## Print/Display the shape of dataset

In [None]:
# (number of rows, number of columns) of the pandas DataFrame.
df_pd.shape

## Print/Display the datatypes of all the columns

In [None]:
# Print each column's name, non-null count and dtype.
df_pd.info()

## Display the following statistics for the dataset
<ul>
<li>Count</li>
<li>Mean</li>
<li>Standard Deviation</li>
<li>Minimum Value</li>
<li>25th Percentile</li>
<li>50th Percentile (Median)</li>
<li>75th Percentile</li>
<li>Maximum Value</li>
</ul>

In [None]:
# Summary statistics per numeric column: count, mean, std, min,
# 25th/50th/75th percentiles and max.
df_pd.describe()

## Class Distribution - display the counts with and without diabetes

In [None]:
# Count how many rows fall under each distinct value of the 'diabetes' column.
df_pd['diabetes'].value_counts()

## Display the Correlations in the data
Correlation refers to the relationship between two variables and how they may or may not change together.

In [None]:
# Pairwise correlation matrix between the DataFrame's columns
# (pandas uses the Pearson coefficient by default).
df_pd.corr()

## Display the skew in the data
Skew describes how asymmetric a distribution is compared with a Gaussian (normal, or bell-curve) distribution, i.e. how far it is shifted or squashed in one direction or the other.<br>
Skew can be positive (right-skewed) or negative (left-skewed). Values closer to zero indicate less skew.

In [None]:
# Skewness of each column; values near zero indicate a roughly symmetric distribution.
df_pd.skew()

<font color = 'tomato'>
    <h1>Spark Dataframe Descriptive Statistics</h1>
</font>

<font color = 'tomato'>
<h2> Print the schema</h2>
</font>

<font color = 'tomato'>
    <h2>Display the Spark Dataframe</h2>
    Show 10 rows
</font>

<font color = 'tomato'>
    <h2>Print the shape of the Spark Dataframe</h2>
    number of rows and number of columns
</font>

<font color = 'tomato'>
    <h2>Display the statistics of the Spark Dataframe</h2>
</font>

<font color = 'tomato'>
    <h2>Print Correlation between 'age' and 'bmi'</h2>
</font>

<font color = 'tomato'>
    <h2>Filter on num_preg greater than 5 and display all columns</h2>
</font>