## **Matplotlib**

If you use Python, then at some point you will definitely want to visualize things. In this part of the seminar we will guide you through some of the main plotting libraries, starting with the most basic and fundamental one, `matplotlib`. If you are curious to discover more packages, you can find a whole bunch of them [here](https://pyviz.org/tools.html).

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Draw a single line through the points (x, y)
x_values = [0, 1, 2, 3, 4, 5]
y_values = [0, 1, 4, 9, 16, 25]
plt.plot(x_values, y_values)
plt.show()

In [None]:
# Plot several y-series against the same x values.
# Each series is passed as an explicit (x, y) pair: a bare third positional
# array would be plotted against its own index (0, 1, 2, ...) instead of x,
# which matches x here only because x happens to be 0..5.
x_shared = [0, 1, 2, 3, 4, 5]
plt.plot(x_shared, [0, 1, 4, 9, 16, 25],
         x_shared, [0, 1, 3, 5, 10, 30])

In [None]:
# Scatter plot followed by a line plot of the same data

x = np.arange(5)
squares = x**2
plt.scatter(x, squares)

# Rendering here finishes the scatter figure, so the line below starts a
# new one; try commenting this call out to overlay both on a single plot.
plt.show()
plt.plot(x, squares)

In [None]:
# Two scatter series with per-point colours and different marker styles
first_x, first_y = [1, 1, 2, 3, 4, 4.5], [3, 2, 2, 5, 15, 24]
first_colours = ["red", "blue", "orange", "green", "cyan", "gray"]
plt.scatter(first_x, first_y, c=first_colours, marker=",")

second_x, second_y = [0.0, 0.1, 0.2], [2, 4, 16]
plt.scatter(second_x, second_y, c=["red", "blue", "orange"], marker="o")

# No plt.show() in between, so this line is drawn on top of the scatters
plt.plot([0, 1, 2, 3, 4, 5], [0, 1, 4, 9, 16, 25], c="blue")

# Annotate the figure
plt.title("Title")
plt.xlabel("Text 1")
plt.ylabel("Text 2")

In [None]:
# Histograms show how densely the values are distributed
samples = [0, 1, 1, 1, 2, 2, 3, 3, 3, 3, 3, 4, 4, 5, 5, 5, 6, 7, 7, 8, 9, 10]

# Default binning
plt.hist(samples)
plt.show()

# Same data grouped into five bins
plt.hist(samples, bins=5)
plt.show()

In [None]:
# Load the passenger table, using the PassengerId column as the row index
train_csv_path = "./data/train.csv"
data = pd.read_csv(train_csv_path, index_col='PassengerId')

In [None]:
# Histogram of the "Age" column via the pandas plotting interface
age_column = data["Age"]
age_column.plot(kind='hist')

In [None]:
# Histogram of the "Fare" column with grid lines switched on
fare_column = data["Fare"]
fare_column.plot(kind='hist')
plt.grid()
plt.show()

In [None]:
# Both histograms are drawn onto the same axes.
# Can you tell where 'Age' ended up?
age_column, fare_column = data["Age"], data["Fare"]
age_column.plot(kind='hist')
fare_column.plot(kind='hist')

In [None]:
# Age and fare histograms on separate subplots, stacked in a 2x1 grid

plt.subplot(2, 1, 1)  # (nrows, ncols, index): first panel
plt.hist(data["Age"])
plt.subplot(2, 1, 2)  # second panel, added to the same figure
plt.hist(data["Fare"])
plt.show()

In [None]:
# Age and fare histograms arranged on a 2x2 grid of subplots

panel_columns = ["Age", "Fare", "Fare", "Age"]
for panel_index, column_name in enumerate(panel_columns, start=1):
    plt.subplot(2, 2, panel_index)  # nrows, ncols, index
    plt.hist(data[column_name])

plt.show()

In [None]:
# Scatter plot of passenger age against ticket fare, coloured by sex

is_male = data["Sex"] == "male"
is_female = data["Sex"] == "female"
m_data = data[is_male]
f_data = data[is_female]

plt.scatter(m_data["Age"], m_data["Fare"], c='r', label='male')
plt.scatter(f_data["Age"], f_data["Fare"], c='g', label='female')

# The legend picks up the label= of each scatter call
plt.legend()
plt.grid()
plt.show()

In [None]:
# Render a sphere of radius 10 as a 3D surface
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')  # add_subplot hands back the new axes

# Parametrise the sphere with 100 samples per angle
u = np.linspace(0, 2 * np.pi, 100)
v = np.linspace(0, np.pi, 100)
sin_v, cos_v = np.sin(v), np.cos(v)
x = 10 * np.outer(np.cos(u), sin_v)  # the outer product spans the (u, v) grid
y = 10 * np.outer(np.sin(u), sin_v)
z = 10 * np.outer(np.ones(np.size(u)), cos_v)

# Draw the surface
ax.plot_surface(x, y, z)

plt.show()

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Same sphere, sampled with only 10 points per angle
n_points = 10
u = np.linspace(0, 2 * np.pi, n_points)
v = np.linspace(0, np.pi, n_points)
sin_v, cos_v = np.sin(v), np.cos(v)
x = 10 * np.outer(np.cos(u), sin_v)
y = 10 * np.outer(np.sin(u), sin_v)
z = 10 * np.outer(np.ones(np.size(u)), cos_v)

# Draw the surface
ax.plot_surface(x, y, z)

plt.show()

In [None]:
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Same sphere again, now with just 5 points per angle
n_points = 5
u = np.linspace(0, 2 * np.pi, n_points)
v = np.linspace(0, np.pi, n_points)
sin_v, cos_v = np.sin(v), np.cos(v)
x = 10 * np.outer(np.cos(u), sin_v)
y = 10 * np.outer(np.sin(u), sin_v)
z = 10 * np.outer(np.ones(np.size(u)), cos_v)

# Draw the surface
ax.plot_surface(x, y, z)

plt.show()

In [None]:
# catch it

def lorenz(x, y, z, s=10, r=28, b=2.667):
    """
    Evaluate the right-hand side of the Lorenz system at a single point.

    Given:
       x, y, z: coordinates of the point in three-dimensional space
       s, r, b: parameters of the Lorenz system
    Returns:
       (x_dot, y_dot, z_dot): the time derivatives of x, y and z
           at the given point
    """
    return (
        s * (y - x),
        r * x - y - x * z,
        x * y - b * z,
    )


dt = 0.01
num_steps = 10000

# Need one more for the initial values.
# np.empty allocates without initialising; every entry is written below.
xs = np.empty(num_steps + 1)
ys = np.empty(num_steps + 1)
zs = np.empty(num_steps + 1)

# Set initial values
xs[0], ys[0], zs[0] = (0., 1., 1.05)

# Step through "time" (explicit Euler): evaluate the derivatives at the
# current point and use them to estimate the next point
for i in range(num_steps):
    x_dot, y_dot, z_dot = lorenz(xs[i], ys[i], zs[i])
    xs[i + 1] = xs[i] + (x_dot * dt)
    ys[i + 1] = ys[i] + (y_dot * dt)
    zs[i + 1] = zs[i] + (z_dot * dt)

# Plot
fig = plt.figure()
# Figure.gca() no longer accepts keyword arguments (deprecated in
# Matplotlib 3.4, removed in 3.6), so the 3D axes are created explicitly.
ax = fig.add_subplot(111, projection='3d')

ax.plot(xs, ys, zs, lw=0.5)
ax.set_xlabel("X Axis")
ax.set_ylabel("Y Axis")
ax.set_zlabel("Z Axis")
ax.set_title("Lorenz Attractor")

plt.show()

### Problem №5

Read toothpaste sales per month (in `data/company_sales_data.csv`) and show it as a scatter plot.

Also, add a grid to the plot. The gridline style should be dashed (`--`).
The scatter plot should look like this:

<center>
<img src="images/matplotlib_and_pandas_exercise_4_show_scatter_plot.png" width="500" height="600">
</center>


## **Plotly**

The next library we will have a look at is [plotly](https://plotly.com/python/) - an interactive, open-source plotting Python library that supports over 40 unique chart types covering a wide range of statistical, financial, geographic, scientific, and 3-dimensional use-cases.

"Interactive" means that you can hover over the plot and interact with it by dragging, zooming in, clicking and selecting things around! We will showcase just a tiny bit of plotly, but you can always explore more on their [website](https://plotly.com/python/).

In [None]:
import plotly.express as px

In [None]:
# Load the "tips" example dataset from px.data and display it
df = px.data.tips()
df

In [None]:
# In a density heatmap, rows of data_frame are grouped into coloured
# rectangular tiles to visualize the 2D distribution of an aggregate
# function histfunc (e.g. the count or sum) of the value z.
fig = px.density_heatmap(data_frame=df, x="total_bill", y="tip")
fig.show()

In [None]:
# Same heatmap, with a marginal histogram along each axis
fig = px.density_heatmap(
    df, x="total_bill", y="tip",
    marginal_x="histogram", marginal_y="histogram",
)
fig.show()

In [None]:
# One heatmap panel per combination of "sex" (rows) and "smoker" (columns)
fig = px.density_heatmap(
    df, x="total_bill", y="tip",
    facet_row="sex", facet_col="smoker",
)
fig.show()

In [None]:
# Grouped bars of total bill by sex and smoker, faceted by time and day.
# category_orders fixes the order in which the facet values are laid out.
facet_orders = {
    "day": ["Thur", "Fri", "Sat", "Sun"],
    "time": ["Lunch", "Dinner"],
}
fig = px.bar(
    df, x="sex", y="total_bill",
    color="smoker", barmode="group",
    facet_row="time", facet_col="day",
    category_orders=facet_orders,
)
fig.show()

In [None]:
# Rebind df to the "wind" example dataset from px.data and display it
df = px.data.wind()
df

In [None]:
# Polar scatter: radius taken from "frequency", angle from "direction"
fig = px.scatter_polar(data_frame=df, r="frequency", theta="direction")
fig.show()

In [None]:
# In a polar line plot, each row of data_frame is represented as a vertex
# of a polyline mark in polar coordinates.
polar_style = dict(
    color="strength",
    line_close=True,
    color_discrete_sequence=px.colors.sequential.Plasma_r,
    template="plotly_dark",
)
fig = px.line_polar(df, r="frequency", theta="direction", **polar_style)

fig.show()

In [None]:
import plotly.figure_factory as ff

# Regular grid over [0, 2) with step 0.2 along both axes
grid_axis = np.arange(0, 2, .2)
x, y = np.meshgrid(grid_axis, grid_axis)  # coordinate matrices from coordinate vectors

# Components of the vector field at every grid node
u = np.cos(x) * y
v = np.sin(x) * y

fig = ff.create_quiver(x, y, u, v)  # builds the quiver-plot figure
fig.show()

In [None]:
# plotly handles simple, basic plots as well
df = px.data.iris()  # the iris dataset - have you come across it before?
fig = px.scatter(
    df,
    x="sepal_width", y="sepal_length",
    color="species", size='petal_length',
    hover_data=['petal_width'],
)
fig.show()

## **Seaborn**

[Seaborn](https://seaborn.pydata.org/) is a library for making statistical graphics in Python. It builds on top of matplotlib and integrates closely with pandas data structures. What is important is that seaborn helps you explore and better understand your data - its dataset-oriented, declarative API lets you focus on what the different elements of your plots mean, rather than on the details of how to draw them. And last but definitely not least, it has a very elegant design :)

In [None]:
import seaborn as sns

In [None]:
# Load the example flights dataset (long-form) and reshape it into a
# month x year table of passenger counts.
flights_long = sns.load_dataset("flights")
# DataFrame.pivot only accepts keyword arguments since pandas 2.0, so the
# index / columns / values roles are spelled out explicitly.
flights = flights_long.pivot(index="month", columns="year", values="passengers")

In [None]:
# Display the long-form table as loaded
flights_long

In [None]:
# Display the pivoted table (months as rows, years as columns)
flights

In [None]:
# Heatmap of the pivoted table with the numeric value printed in each cell.
f, ax = plt.subplots(figsize=(9, 6))
heatmap_options = dict(annot=True, fmt="d", linewidths=.5, cmap='PuBu')
# sns.heatmap plots rectangular data as a colour-encoded matrix
sns.heatmap(flights, ax=ax, **heatmap_options)

In [None]:
# Switch seaborn to the "ticks" style, then load the tips example dataset
sns.set(style="ticks")  # Set aesthetic parameters in one step.
tips = sns.load_dataset("tips")

In [None]:
# Tip against total bill, one colour per day
sns.relplot(
    data=tips,
    x="total_bill", y="tip",
    hue="day",
)

In [None]:
# Same relation split into a grid of panels: columns by time, rows by sex
g = sns.relplot(
    data=tips,
    x="total_bill", y="tip",
    hue="day",
    row="sex", col="time",
)

In [None]:
# Colour by time, marker size from the "size" column, one panel per time
g = sns.relplot(
    data=tips,
    x="total_bill", y="tip",
    col="time",
    hue="time", palette=["b", "r"],
    size="size", sizes=(10, 100),
)

In [None]:
sns.set(style="whitegrid")
# A single box plot of the bill amounts - do you know what it represents?
bill_amounts = tips["total_bill"]
ax = sns.boxplot(x=bill_amounts)

In [None]:
# Box plots of the total bill per day, split by smoker status
ax = sns.boxplot(
    data=tips,
    x="day", y="total_bill",
    hue="smoker",
    palette="Set3",
)

In [None]:
# Violin plots of the total bill per day; with split=True the two
# "smoker" levels share one violin, one half each
ax = sns.violinplot(
    data=tips,
    x="day", y="total_bill",
    hue="smoker", split=True,
    palette="muted",
)

In [None]:
# Scatter of tip vs. total bill together with a linear regression model fit
sns.regplot(data=tips, x="total_bill", y="tip")

In [None]:
# Back to the "ticks" style (with colour codes enabled), then load the iris example dataset
sns.set(style="ticks", color_codes=True)
iris = sns.load_dataset("iris")

In [None]:
# Pairwise relationships in the dataset, coloured by species.
# You might want to read what KDE is to understand the plots on the diagonal:
# https://en.wikipedia.org/wiki/Kernel_density_estimation
sns.pairplot(data=iris, hue="species")

In [None]:
# kind="reg" fits a regression to each pair of columns -
# there is actually a bit of Machine Learning happening here
sns.pairplot(data=iris, kind="reg")

## **SciPy**

Finally, it is important to mention [SciPy](https://docs.scipy.org/doc/scipy/reference/tutorial/index.html) - an open-source Python library used for scientific computing and technical computing. It contains modules for optimization, linear algebra, integration, interpolation, special functions, FFT, signal and image processing, ODE solvers and other tasks common in science and engineering. SciPy builds on the NumPy array object and is part of the NumPy stack which includes tools like Matplotlib, pandas and SymPy, and an expanding set of scientific computing libraries. 

### Linear Algebra

In [None]:
from scipy import linalg

In [None]:
# 2x2 example matrix and its determinant
rows = [[4, 5], [3, 2]]
matrix = np.array(rows)
linalg.det(matrix)

In [None]:
# Invert the example matrix with scipy.linalg and display the result
inv_matrix = linalg.inv(matrix)  # Compute the inverse of a matrix.
inv_matrix

In [None]:
# Coefficient matrix and right-hand side (as a column vector) of a linear system
A = np.array([[1, 2],
              [3, 4]])
b = np.array([5, 6]).reshape(2, 1)

In [None]:
# Solve the equation Ax = b for x.
# scipy.linalg.solve is used instead of np.linalg.solve so that the whole
# Linear Algebra section relies on the same scipy.linalg module.
x = linalg.solve(A, b)
x

### Statistics

In [None]:
from scipy import stats
from scipy.stats import norm

In [None]:
# Draw five random variates from the normal distribution (default parameters)
sample_count = 5
a = norm.rvs(size=sample_count)
a

In [None]:
# Mean, standard deviation and variance of the sample `a`
a.mean(), a.std(), a.var()

In [None]:
# A larger sample: 1000 random variates from the same distribution
sample_size = 1000
r = norm.rvs(size=sample_size)

In [None]:
# Normalised (density=True) histogram of the sample r
fig, ax = plt.subplots(nrows=1, ncols=1)
hist_style = dict(density=True, histtype='stepfilled', alpha=0.2)
ax.hist(r, **hist_style)
plt.show()

In [None]:
from scipy.stats import chisquare
# One-way chi-square test on a list of observed frequencies
observed_counts = [16, 18, 16, 14, 12, 12]
a = chisquare(observed_counts)
a

### Optimisation

In [None]:
from scipy.optimize import minimize

In [None]:
def rosen(x):
    """The Rosenbrock function.

    Sums 100 * (x[i+1] - x[i]**2)**2 + (1 - x[i])**2 over all consecutive
    pairs of coordinates.

    Parameters:
        x: one-dimensional array-like of coordinates (ndarray, list or tuple).
    Returns:
        The function value at x; it is 0 when every coordinate equals 1.
    """
    # Accept any array-like: plain lists/tuples do not support the
    # element-wise arithmetic used below.
    x = np.asarray(x)
    head, tail = x[:-1], x[1:]
    return np.sum(100.0 * (tail - head**2.0)**2.0 + (1 - head)**2.0)

In [None]:
# Minimise the scalar function rosen with the Nelder-Mead method,
# starting the search from x0
x0 = np.array([1.3, 0.7, 0.8, 1.9, 1.2])
solver_options = {'xatol': 1e-8, 'disp': True}
res = minimize(rosen, x0, method='nelder-mead', options=solver_options)

In [None]:
# The solution array returned by the optimiser
res.x

### Fitting

In [None]:
# Fix the seed of NumPy's global random generator so the run is reproducible
np.random.seed(123)

# Normal sample (loc 10.0, scale 2.5) with extra standard-normal noise on top
base_sample = norm.rvs(loc=10.0, scale=2.5, size=500)
extra_noise = np.random.normal(size=500)
data = base_sample + extra_noise

# Estimate the parameters of a normal distribution from the sample
mu, std = norm.fit(data)

In [None]:
# Create the figure first so that figsize applies to the plot below;
# calling plt.figure() after plotting would open a second, empty figure.
plt.figure(figsize=(6, 4))

# Plot the histogram
plt.hist(data, bins=25, density=True, alpha=0.6, color='g')

# Plot the PDF of the fitted distribution across the current x-range
xmin, xmax = plt.xlim()
x = np.linspace(xmin, xmax, 100)
p = norm.pdf(x, mu, std)  # Probability density function.
plt.plot(x, p, 'k', linewidth=2)
title = "Fit results: mu = %.2f,  std = %.2f" % (mu, std)
plt.title(title)

plt.show()

In [None]:
from scipy.optimize import curve_fit

In [None]:
def func(x, a, b, c):
    """Exponential model a * exp(-b * x) + c, evaluated at x."""
    decay = np.exp(-b * x)
    return a * decay + c

In [None]:
# Synthetic data: the model curve for known parameters plus Gaussian noise
np.random.seed(123)  # reproducible noise
true_params = (2.5, 1.3, 0.5)
noise_scale = 0.2

xdata = np.linspace(0, 4, 50)
y = func(xdata, *true_params)
y_noise = noise_scale * np.random.normal(size=xdata.size)
ydata = y + y_noise

plt.plot(xdata, ydata, 'b-', label='data')
plt.show()

In [None]:
# Non-linear least-squares fit of func to the noisy data:
# popt holds the fitted parameters, pcov their estimated covariance
popt, pcov = curve_fit(f=func, xdata=xdata, ydata=ydata)
popt

In [None]:
# Fitted curve, labelled with the estimated parameters
fitted_curve = func(xdata, *popt)
fit_label = 'fit: a=%5.3f, b=%5.3f, c=%5.3f' % tuple(popt)
plt.plot(xdata, fitted_curve, 'r-', label=fit_label)

plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()

In [None]:
# Overlay the data, the unconstrained fit and a bounded fit on one figure
plt.plot(xdata, ydata, 'b-', label='data')

# Unconstrained fit
popt, pcov = curve_fit(func, xdata, ydata)
plt.plot(xdata,
         func(xdata, *popt),
         'r-',
         label='fit: a=%5.3f, b=%5.3f, c=%5.3f' % tuple(popt))

# Constrain the optimization to the region of 0 <= a <= 3, 0 <= b <= 1 and 0 <= c <= 0.5
popt, pcov = curve_fit(func, xdata, ydata, bounds=(0, [3., 1., 0.5]))
plt.plot(xdata,
         func(xdata, *popt),
         'g--',
         label='fit: a=%5.3f, b=%5.3f, c=%5.3f' % tuple(popt))

# Axis labels and the legend are set once, after all curves have been added,
# and the figure is rendered explicitly.
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()

### Problem №6

Fit the following data with a function $f(x) = a\cdot\sin(bx)$, where $a$ and $b$ are the fit parameters, and plot the corresponding results:

In [None]:
# Noisy sinusoidal samples for the exercise, shown as a scatter plot
n_samples = 50
x_data = np.linspace(-5, 5, num=n_samples)
clean_signal = 2.9 * np.sin(1.5 * x_data)
y_data = clean_signal + np.random.normal(size=n_samples)

plt.figure(figsize=(6, 4))
plt.scatter(x_data, y_data)
plt.show()

<center>
<img src="images/mememe_cb8e239ef97eb73a7d04ecf46ed4bf5c-1.jpg" alt="YOU-ROCK" width="500" height="600">
</center>