# **EGCO 425: Chapter 2 (Data Exploration & Visualization)**

## Google Colab



In [None]:
## If using Colab: mount Google Drive and switch to the course folder.
## Outside Colab the google.colab package cannot be imported, so the mount is
## skipped instead of aborting a "Run All".

import os

COLAB_WORKDIR = '/content/drive/MyDrive/Workspace/425'    ## replace Workspace/425 with your folder

try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available -- skipping Google Drive mount")
else:
    #drive.mount('/content/drive')
    drive.mount('/content/drive', force_remount=True)

    ## The folder is defined once above; a single chdir replaces the former os.chdir + %cd pair
    os.chdir(COLAB_WORKDIR)
    print(os.getcwd())

In [None]:
import pandas as pd
import matplotlib.pyplot as plt             ## to work with visualization (backend)
import seaborn as sns                       ## to work with high-level visualization

## (1) Titanic Data

In [None]:
### Load the Titanic workbook; header = 0 (the default) takes the column names from row 0
TitanicDF = pd.read_excel(io = './data/TitanicExcel.xlsx', header = 0)
TitanicDF.head(n = 5)

In [None]:
### Per-column overview (names, non-null counts, dtypes) -- a quick check for missing values
TitanicDF.info()

In [None]:
### Rename column names
### Columns are addressed by position: the mapping is built from whatever names
### the frame currently has at indices 1, 3, 4 and 5.

newNames = {1: 'PassengerClass', 3: 'SiblingsSpouses', 4: 'ParentsChildren', 5: 'Fare'}
mapping = {TitanicDF.columns[pos]: name for pos, name in newNames.items()}

### rename() returns a new frame; rebinding the name replaces the inplace = True mutation
TitanicDF = TitanicDF.rename(columns = mapping)

TitanicDF.head()

### Descriptive Statistics (numeric attributes)
**[Manual: pandas.DataFrame.describe](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.describe.html)**

Try
- Individual statistics --> **mean()**, **min()**, **max()**, **std()**
- Use **groupby(..)** for statistics by categories

In [None]:
### Summary statistics table; switch to the commented line to include non-numeric attributes too
TitanicDF.describe()                      ### only numeric attributes by default
#TitanicDF.describe(include = 'all')       ### all attributes

In [None]:
numericCols = ['Age', 'Fare']

### Group the numeric attributes by survival outcome once, then report each statistic
bySurvived = TitanicDF.groupby('Survived')[numericCols]

meanTable = bySurvived.mean()
print("\n\n----- Mean by survived -----")
print(meanTable.to_markdown())

sdTable = bySurvived.std()
print("\n\n----- SD by survived -----")
print(sdTable.to_markdown())

### Descriptive Statistics (nominal attributes)
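**[Manual: pandas.Series.value_counts](https://pandas.pydata.org/docs/reference/api/pandas.Series.value_counts.html)** (used below on a single column), **[Manual: pandas.DataFrame.value_counts](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.value_counts.html)**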

In [None]:
nominalCols = ['Sex', 'PassengerClass']

### One frequency table per nominal attribute
for col in nominalCols:
    counts = TitanicDF[col].value_counts()
    print("\n\n----- Frequencies of ", col, " -----", sep = "")
    print(counts.to_markdown())
    ### Alternative: relative frequencies instead of raw counts
    #print(TitanicDF[col].value_counts(normalize = True).to_markdown())

    ### Alternative: frequencies broken down by survival outcome
    #print(TitanicDF.groupby('Survived')[col].value_counts().to_markdown())

## (2) Iris Data

In [None]:
### Load the Iris workbook; header = 0 (the default) takes the column names from row 0
IrisDF = pd.read_excel(io = './data/IrisFromWeka.xlsx', header = 0)
IrisDF.head(n = 5)

In [None]:
### Peek at 5 randomly chosen rows. No seed is given, so the rows differ between runs
### (pass random_state = <int> to make the sample repeatable).
IrisDF.sample(5)

## Basic plots by Seaborn
- Check data distribution (histogram, KDE plot) --> **[Manual: seaborn.histplot](https://seaborn.pydata.org/generated/seaborn.histplot.html)**, **[Manual: seaborn.kdeplot](https://seaborn.pydata.org/generated/seaborn.kdeplot.html)**
- Check data distribution & outliers (boxplot) --> **[Manual: seaborn.boxplot](https://seaborn.pydata.org/generated/seaborn.boxplot.html)**
- Check correlation (scatter plot) --> **[Manual: seaborn.pairplot](https://seaborn.pydata.org/generated/seaborn.pairplot.html)**, **[Manual: seaborn.scatterplot](https://seaborn.pydata.org/generated/seaborn.scatterplot.html)**
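- Subplots --> **[Manual: matplotlib.pyplot.subplot](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.subplot.html)**, **[Manual: matplotlib.pyplot.subplots](https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.subplots.html)**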

In [None]:
import seaborn as sns

In [None]:
### Distribution plots to try -- uncomment ONE line at a time, then run the cell

### Histograms: whole frame, a single attribute, a single attribute with a KDE curve overlaid
#snsplot = sns.histplot(data = IrisDF)
#snsplot = sns.histplot(data = IrisDF, x = 'sepallength')
#snsplot = sns.histplot(data = IrisDF, x = 'sepallength', kde = True)

### KDE plots: whole frame, or a single attribute split by the 'class' column
#snsplot = sns.kdeplot(data = IrisDF)
#snsplot = sns.kdeplot(data = IrisDF, x = 'sepallength', hue = 'class')

In [None]:
selectedCols = ['sepallength', 'sepalwidth', 'petallength', 'petalwidth']
featureCol = selectedCols[0]              # the attribute shown in all four panels

# Build the 2x2 grid up front and hand each Axes to seaborn explicitly,
# instead of relying on pyplot's implicit "current axes" after plt.subplot(rci).
fig, axes = plt.subplots(nrows = 2, ncols = 2, figsize = (8,6))

# Top row: histogram | KDE
sns.histplot(data = IrisDF, x = featureCol, ax = axes[0, 0]).set_title("Histogram")
sns.kdeplot(data = IrisDF, x = featureCol, ax = axes[0, 1]).set_title("KDE")

# Bottom row: histogram with KDE overlay | boxplot
sns.histplot(data = IrisDF, x = featureCol, kde = True, ax = axes[1, 0]).set_title("Histogram with KDE")
sns.boxplot(data = IrisDF, x = featureCol, width = 0.2, ax = axes[1, 1]).set_title("Boxplot")

fig.tight_layout()                        # keep titles and tick labels from overlapping
plt.show()

In [None]:
### Scatter plot of selectedCols[0] (sepallength) against selectedCols[2] (petallength).
### Uncomment ONE line to try it; the second variant colours the points by 'class'.
#snsplot = sns.relplot(data = IrisDF, x = selectedCols[0], y = selectedCols[2])
#snsplot = sns.relplot(data = IrisDF, x = selectedCols[0], y = selectedCols[2], hue = 'class')

In [None]:
### Histograms and scatter plots for all pairs of attributes in a single grid.
### Uncomment ONE line to try it; the second variant colours the points by 'class'.

#snsplot = sns.pairplot(data = IrisDF)
#snsplot = sns.pairplot(data = IrisDF, hue = 'class')

In [None]:
### Wide-form call (no x / y given): the whole frame is passed, so the boxes share one axis
### NOTE(review): presumably only the numeric columns are drawn -- confirm for your seaborn version
snsplot = sns.boxplot(data = IrisDF)

In [None]:
selectedCols = ['sepallength', 'sepalwidth', 'petallength', 'petalwidth']

# One panel per attribute: pair each Axes of the 2x2 grid with a column name
# rather than computing subplot codes (221 + i) from a hard-coded range.
fig, axes = plt.subplots(nrows = 2, ncols = 2, figsize = (8,8))

for ax, col in zip(axes.flat, selectedCols):
    # Distribution of this attribute within each class, drawn on its own Axes
    sns.boxplot(data = IrisDF, x = 'class', y = col, ax = ax)

fig.tight_layout()                        # keep axis labels from overlapping
plt.show()