In [None]:
# Importing Jupyter Black Formatter.
# `jupyter_black` is a third-party package; it has to be installed in the kernel's
# environment before this cell can run.
import jupyter_black

# Activate the extension for this notebook session.
# NOTE(review): presumably this makes every executed code cell get auto-formatted with Black — confirm against the jupyter_black documentation.
jupyter_black.load()

# ICS 214 IT Workshop III (Python) | IIIT Kottayam
# Session 8 - Let's Talk Data: Python, Numpy, and Visualizations | Wednesday, December 14, 2022
#### **Author:** Anmol Krishan Sachdeva (@greatdevaks)

**Note:** Session inspired by https://aaltoscicomp.github.io/python-for-scicomp/ and Real Python.

## Common Tools

- Jupyter - Interactive Analysis
- NumPy and SciPy - Numerical Analysis
- Matplotlib - Visualization

## NumPy

- A third-party Python library.
- Provides a data structure - the N-dimensional (Multidimensional) Array.
- Used in Data Science and Scientific Computing.

### Pre-requisites

- Matrix Operations in Mathematics

### Why use NumPy over traditional code which uses Loops, Flat Files, etc.?

- **Performance:** NumPy is performance-optimized; its core is written in C.
- **Less Looping:** Fewer explicit loops are required, because operations apply to whole Arrays at once.
- **Clean Code:** More aligned with scientific representations.

### Important Terminologies

- **Scalars:** Single Element stored in a variable.
- **Vectors:** Single-dimensional Arrays.
- **Shape:** The size of each dimension of the Array.
    - Critical when using functions because Arrays need to have the right shape in order to be fed to a function or operated on.

In [None]:
# Installing Python packages from within Jupyter.
!python -m pip install numpy matplotlib

In [None]:
# Example: NumPy usage.

import numpy as np

# Build a one-dimensional NumPy Array (a Vector) from a Python list.
scores = np.array([51, 20, 34, 79, 82, 99, 10, 30])

# A Scalar: the value the average score should be lifted to.
score_threshold = 85

# Average of all scores, and how far that average sits from the threshold.
mean_score = np.mean(scores)
score_diff = score_threshold - mean_score

# Vectorization: Performing same operation on every element of the Array.
# Broadcasting: Performing Vectorization on two Arrays of different shapes.
# Here the Scalar `score_diff` is added to each element of `scores`.
vectorized_scores = np.add(scores, score_diff)

print(f"Mean Score: {mean_score}")
print(f"Score Diff: {score_diff}")
print(f"Original Scores: {scores}")
print(f"Vectorized Scores: {vectorized_scores}")

# Using Broadcasting for Clipping.
# The lower bound is the `scores` Array (applied element by element); the upper
# bound is the Scalar 100, which is applied to every element.
print(
    f"Clipped Scores: {np.clip(vectorized_scores, a_min=scores, a_max=100)}"
)

In [None]:
# Example: Multi-dimensional Arrays.

# A two-dimensional Array (Matrix): each inner list becomes one row.
num_matrix = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

# Show the Array, then its shape (size of each dimension), then its total element count.
print(num_matrix, num_matrix.shape, num_matrix.size, sep="\n")

# Largest element of the entire Array (no axis is given).
print(np.max(num_matrix))

In [None]:
# Example: Shape of Arrays.

# Twelve values arranged as 2 planes, each holding 2 rows of 3 columns.
boiling_point = np.reshape(
    [10.30, 42.1, 18.8, 16.1, 38.0, 12.5, 12.6, 49.9, 38.6, 31.3, 20.21, 1.10],
    (2, 2, 3),
)

# Show the Array followed by the size of each of its dimensions.
print(boiling_point, boiling_point.shape, sep="\n")

In [None]:
# Example: Swapping the axes.

# Exchange axis 1 and axis 2 of `boiling_point`; axis 0 stays where it is.
swapped_axes = boiling_point.swapaxes(1, 2)

# Show the result followed by its new shape.
print(swapped_axes, swapped_axes.shape, sep="\n")

In [None]:
# Example: Max Element along Axis.

print(f"Array:\n{num_matrix}\n")

# axis=0 reduces across the rows, leaving the maximum of each column;
# axis=1 reduces across the columns, leaving the maximum of each row.
print(f"Axis 0: {np.max(num_matrix, axis=0)}")
print(f"Axis 1: {np.max(num_matrix, axis=1)}")

# Note: If no Axis is specified when an Array is fed to a function, the operation is performed over the entire Array.

In [None]:
# Example: Array arrangement with shapes followed by broadcasting.

# arange() produces consecutive integers starting at 0; reshape lays them out in the given shape.
A = np.reshape(np.arange(12), (1, 3, 4))  # 1 Plane with 3 Rows and 4 Columns.
B = np.reshape(np.arange(20), (5, 1, 4))  # 5 Planes with 1 Row and 4 Columns.

print(f"A:\n {A}\n")
print(f"B:\n {B}\n")

# Note: Two Arrays can be broadcast together when, on every axis, their sizes are equal or one of them is 1.
# Here the axes pair up as (1 vs 5), (3 vs 1) and (4 vs 4), so the sum has shape (5, 3, 4).
# Exercise: explore what happens when the axis sizes do not match.
print(f"A + B:\n {np.add(A, B)}\n")

In [None]:
# Example: Performance Test.

# Plain Python lists, used as the baseline for the loop-based timing.
b = [0] * 100_000  # List of 100000 zeros, overwritten with the squares by the timed loop.
a = [*range(100_000)]  # List from 0 to 99999.

In [None]:
%%timeit
# Baseline measurement: square every element with an explicit Python-level loop.
# `%%timeit` executes this cell body repeatedly and reports timing statistics.
for i in range(len(a)):
    b[i] = a[i]**2 # Square `a` and put in `b`.

In [None]:
import numpy as np

# NumPy counterparts of the lists used in the loop-based test:
# `a` holds the integers 0..99999 and `b` is an Array of zeros with the same shape.
a = np.arange(100_000)
b = np.zeros(a.shape)

In [None]:
%%timeit
# Vectorized counterpart of the loop: one expression squares every element of `a`.
b = a ** 2

In [None]:
# Example: Some other Array creation methods.

print(f"{np.zeros(shape=(2, 3))}\n")  # 2x3 array with all elements 0.
print(f"{np.ones(shape=(1, 2))}\n")  # 1x2 array with all elements 1.
print(f"{np.full(shape=(2, 2), fill_value=7)}\n")  # 2x2 array with all elements 7.
print(f"{np.eye(N=2)}\n")  # 2x2 identity matrix.
print(f"{np.arange(0, 10)}\n")  # Evenly spaced values in an interval.
print(f"{np.linspace(start=0, stop=9, num=5)}\n")  # Similar to arange, but takes the number of samples instead of a step size.
print(f"{np.ones(shape=(3, 2), dtype=bool)}\n")  # 3x2 boolean array.

In [None]:
# Example: Storing Arrays in Files.

# Write the Array `num_matrix` to the file `num_matrix.npy`.
np.save(file="num_matrix.npy", arr=num_matrix)

# Read the Array back from `num_matrix.npy` and store it in the variable `x`.
x = np.load(file="num_matrix.npy")

In [None]:
# Example: Common Arithmetic Operations.

a = np.array(
    [
        [1, 2],
        [3, 4],
    ]
)
b = np.array(
    [
        [5, 6],
        [7, 8],
    ]
)

# Addition: the `+` operator and np.add() both add element by element.
c, d = a + b, np.add(a, b)

# Matrix multiplication: the `@` operator and np.dot() both compute the matrix product of `a` and `b`.
e, f = a @ b, np.dot(a, b)

In [None]:
# Example: Indexing and Slicing.

a = np.reshape(np.arange(16), (4, 4))  # 4x4 matrix from 0 to 15.
print(a)
print(a[0, :])  # First row.
print(a[:, 0])  # First column.
print(a[:, 1])  # Second column.
print(a[1:3, 1:3])  # Middle 2x2 array.

# Row indices (0, 1) are paired with column indices (1, 1), selecting a[0, 1] and a[1, 1].
print(a[[0, 1], [1, 1]])  # Second element of first and second row as array.

In [None]:
# Example: Masking.

# 24 evenly spaced values from 5 to 50, converted to integers and arranged as 4 rows of 6.
numbers = np.linspace(start=5, stop=50, num=24, dtype=int).reshape(4, 6)
print(numbers)

# Vectorization: the divisibility test runs on every element and yields a boolean Array of the same shape.
mask = np.mod(numbers, 4) == 0
print(mask)

# Indexing with the boolean mask keeps only the elements where the mask is True.
print(numbers[mask])
# Similar to numbers[numbers % 4 == 0].

In [None]:
# Example: Record/Structured Arrays.

import numpy as np

# Every record has three named fields: a name (string of length 10) and two integers.
data = np.array(
    [
        ("wang", 32, 6),
        ("john", 15, 20),
        ("christian", 80, 100),
        ("daniel", 38, 9001),
    ],
    dtype=np.dtype([("name", str, 10), ("age", int), ("power", int)]),
)

# Indexing by position returns one whole record.
print(data[0])

# Indexing by field name returns that field for every record.
print(data["name"])

In [None]:
# Example: Matrix Transpose.

import numpy as np
# 10000 x 20000 = 200 million random values. At 8 bytes per float that is roughly
# 1.6 GB of memory, so make sure the machine has that much available before running.
a = np.random.rand(10000, 20000)
print(a)
# `nbytes` is the size of the Array's data in bytes; dividing by 10**6 expresses it in MB.
print(f'Matrix `a` takes up {a.nbytes / 10**6} MB')

In [None]:
%%timeit
# NOTE(review): transpose() is expected to return a view instead of copying the data, which would make this fast despite the size of `a` — confirm with the timing output.
b = a.transpose()

## Pandas

- High-performance data structures along with analysis capabilities.


In [None]:
# Installing Pandas library.
!python -m pip install pandas

In [None]:
# Example: Pandas.

import pandas as pd

# Load the CSV file into a Pandas DataFrame (a tabular data structure),
# using the "Name" column as the row index.
# The file is expected next to the notebook; it can be downloaded from:
# URL: https://raw.githubusercontent.com/pandas-dev/pandas/master/doc/data/titanic.csv
file_path = "titanic.csv"
titanic = pd.read_csv(filepath_or_buffer=file_path, index_col="Name")

# Printing the first few rows.
print(f"Head:\n{titanic.head()}\n")

# Summary statistics of the columns.
print(f"Description:\n{titanic.describe()}")

In [None]:
# Example: Print the average age of the people who survived and of those who didn't.
# Rows are grouped by the value in the "Survived" column, then "Age" is averaged per group.
print(
    titanic
    .groupby(by="Survived")["Age"]
    .mean()
)

In [None]:
# One histogram of "Age" per value of "Survived", laid out as 2 rows x 1 column
# with a shared x-axis. The trailing `;` suppresses the textual cell output.
titanic.hist(
    column="Age",
    by="Survived",
    bins=25,
    figsize=(8, 10),
    layout=(2, 1),
    zorder=2,
    sharex=True,
    rwidth=0.9,
);

## Visualizations

- Libraries:
    - Matplotlib
    - Plotly
    - Bokeh
    - Seaborn
    - Many more
- Should feel familiar to MATLAB users.

In [None]:
# This line tells Jupyter to display matplotlib figures in the notebook.
%matplotlib inline

import matplotlib.pyplot as plt

# This is dataset 1 from
# https://en.wikipedia.org/wiki/Anscombe%27s_quartet
data_x = [10.0, 8.0, 13.0, 9.0, 11.0, 14.0, 6.0, 4.0, 12.0, 7.0, 5.0]
data_y = [8.04, 6.95, 7.58, 8.81, 8.33, 9.96, 7.24, 4.26, 10.84, 4.82, 5.68]

fig, ax = plt.subplots()

ax.scatter(x=data_x, y=data_y, c="#E69F00")

ax.set_xlabel("X Axis")
ax.set_ylabel("Y Axis")
ax.set_title("Graph Title")

# Saving the graph.
fig.savefig("sample_plot.png")

## Pandas and Matplotlib Together

In [None]:
# Example: Pandas and Matplotlib Together.

import pandas as pd
import matplotlib.pyplot as plt

# The file is expected next to the notebook; it can be downloaded from:
# URL: https://raw.githubusercontent.com/plotly/datasets/master/gapminder_with_codes.csv
file_path = "gapminder_with_codes.csv"
data = pd.read_csv(file_path)

# Keep only the rows whose "year" column equals 2007.
data_2007 = data.loc[data["year"].eq(2007)]

# A bare expression on the last line makes Jupyter display the DataFrame.
data_2007

In [None]:
fig, ax = plt.subplots()

# Plot "lifeExp" against "gdpPercap" for the 2007 rows.
# Alpha handles the transparency of the markers, so overlapping points stay visible.
ax.scatter(
    x=data_2007["gdpPercap"],
    y=data_2007["lifeExp"],
    alpha=0.5,
)

ax.set_xlabel("GDP (USD) per capita")
ax.set_ylabel("life expectancy (years)")