# Statistics

<!-- SUMMARY: This script presents the basic notions of descriptive statistics.  -->

<!-- CATEGORY: Courses -->

This notebook presents the elementary notions needed to compute basic statistics.

In [None]:
import gstlearn.document as gdoc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import gstlearn as gl 
from IPython.display import Markdown

# Notebook display set-up through the gstlearn.document helper
# (presumably disables scrolling of long cell outputs — confirm in gstlearn).
gdoc.setNoScroll()

We will illustrate the notions on the Scotland temperatures data set.

In [None]:
# Resolve the location of the Scotland temperatures CSV through gdoc.loadData,
# read it into a DataFrame and show the table as the cell output.
csv_file = gdoc.loadData("Scotland", "Scotland_Temperatures.csv")
data = pd.read_csv(csv_file)
data

In [None]:
# Working variable of the univariate sections: the "Elevation" column
# converted to a NumPy array.
elevation_column = data["Elevation"]
z = elevation_column.to_numpy()

### 1) Position

In [None]:
# Show the course note "Statistics_mean.md" as the cell output
# (gdoc.loadDoc presumably returns its Markdown text; helper not visible here).
Markdown(gdoc.loadDoc("Statistics_mean.md"))

In [None]:
# Arithmetic mean of z, printed with two decimals.
m = z.mean()
rounded = np.round(m, 2)
print(f"Mean = {rounded!s}")

In [None]:
# Show the course note "Statistics_median.md" as the cell output
# (gdoc.loadDoc presumably returns its Markdown text; helper not visible here).
Markdown(gdoc.loadDoc("Statistics_median.md"))

In [None]:
# Median of z.
print(f"Median = {np.median(z)!s}")

In [None]:
# Show the course note "Statistics_quartiles.md" as the cell output
# (gdoc.loadDoc presumably returns its Markdown text; helper not visible here).
Markdown(gdoc.loadDoc("Statistics_quartiles.md"))

In [None]:
# Minimum, the three quartiles and maximum of z, i.e. the quantiles of
# order 0, 0.25, 0.5, 0.75 and 1.
print(f"Quartiles = {np.quantile(z, [0, 0.25, 0.5, 0.75, 1])!s}")

In [None]:
# Show the course note "Statistics_quantiles.md" as the cell output
# (gdoc.loadDoc presumably returns its Markdown text; helper not visible here).
Markdown(gdoc.loadDoc("Statistics_quantiles.md"))

### 2) Dispersion

In [None]:
# Show the course note "Statistics_range.md" as the cell output
# (gdoc.loadDoc presumably returns its Markdown text; helper not visible here).
Markdown(gdoc.loadDoc("Statistics_range.md"))

In [None]:
# Range of z: largest value minus smallest value.
print(f"Range= {(z.max() - z.min())!s}")

In [None]:
# Show the course note "Statistics_interq.md" as the cell output
# (gdoc.loadDoc presumably returns its Markdown text; helper not visible here).
Markdown(gdoc.loadDoc("Statistics_interq.md"))

In [None]:
# Inter-quartile distance: third quartile minus first quartile.
# The two quantiles are unpacked and subtracted so that a scalar is printed;
# np.diff would return a one-element array, shown between brackets.
q1, q3 = np.quantile(z, [0.25, 0.75])
print("Inter-quartiles distance = " + str(q3 - q1))

In [None]:
# Show the course note "Statistics_variance.md" as the cell output
# (gdoc.loadDoc presumably returns its Markdown text; helper not visible here).
Markdown(gdoc.loadDoc("Statistics_variance.md"))

In [None]:
# Variance of z obtained in three ways: mean of the squared deviations from
# the mean, mean of the squares minus the squared mean, and np.var.
n = len(z)
sq_deviations = (z - np.mean(z)) ** 2
print(f"Variance (First formula) = {np.mean(sq_deviations)!s}")
mean_of_squares = np.mean(z ** 2)
print(f"Variance (Sec.  formula) = {(mean_of_squares - np.mean(z) ** 2)!s}")
print(f"Variance (numpy version) = {np.var(z)!s}")

In [None]:
# Show the course note "Statistics_std.md" as the cell output
# (gdoc.loadDoc presumably returns its Markdown text; helper not visible here).
Markdown(gdoc.loadDoc("Statistics_std.md"))

In [None]:
# Standard deviation of z (square root of the variance). The label names the
# quantity actually printed: np.std, not the variance.
print("Standard deviation (numpy version) = " + str(np.std(z)))

### 3) Distribution

In [None]:
# Show the course note "Statistics_histo.md" as the cell output
# (gdoc.loadDoc presumably returns its Markdown text; helper not visible here).
Markdown(gdoc.loadDoc("Statistics_histo.md"))

In [None]:
# Histogram of z (counts per class) using nbin classes.
nbin = 20
ax = plt.hist(x=z, bins=nbin)

In [None]:
# Normalized histogram of z: density=True rescales the bars to a density.
ax = plt.hist(x=z, bins=nbin, density=True)

In [None]:
# Show the course note "Statistics_histocum.md" as the cell output
# (gdoc.loadDoc presumably returns its Markdown text; helper not visible here).
Markdown(gdoc.loadDoc("Statistics_histocum.md"))

In [None]:
# Cumulative histogram: sorted values on the abscissa, cumulated frequencies
# (from 1/N up to 1) on the ordinate. The quantile of order p is marked in red.
p = 0.8
x = np.sort(z)
y = np.linspace(start=1/len(z), stop=1, num=len(x))
plt.plot(x, y)
quantile_p = np.quantile(z, p)
a = plt.scatter(quantile_p, p, c="r")

In [None]:
# Show the course note "Statistics_quantileF.md" as the cell output
# (gdoc.loadDoc presumably returns its Markdown text; helper not visible here).
Markdown(gdoc.loadDoc("Statistics_quantileF.md"))

In [None]:
# Quantile function: same curve as the cumulative histogram with the axes
# swapped (frequency on the abscissa, value on the ordinate).
p = 0.8
plt.plot(y, x)
quantile_p = np.quantile(z, p)
a = plt.scatter(p, quantile_p, c="r")

In [None]:
# Show the course note "Statistics_Ore.md" as the cell output
# (gdoc.loadDoc presumably returns its Markdown text; helper not visible here).
Markdown(gdoc.loadDoc("Statistics_Ore.md"))

In [None]:
# Ore: complement to 1 of the cumulated frequency, plotted against the
# sorted values.
ore = np.subtract(1., y)
a = plt.plot(x, ore)

In [None]:
# Show the course note "Statistics_Metal.md" as the cell output
# (gdoc.loadDoc presumably returns its Markdown text; helper not visible here).
Markdown(gdoc.loadDoc("Statistics_Metal.md"))

In [None]:
# Metal (normalized): for each sorted value, the sum of the values strictly
# after it in the sorted order, divided by the number of samples.
sum_above = np.sum(x) - np.cumsum(x)
metal = 1/len(x) * sum_above
a = plt.plot(x, metal)

In [None]:
# Show the course note "Statistics_Grade.md" as the cell output
# (gdoc.loadDoc presumably returns its Markdown text; helper not visible here).
Markdown(gdoc.loadDoc("Statistics_Grade.md"))

In [None]:
# Grade: metal divided by ore. The last point is left out (the slice stops
# one element before the end of each array).
all_but_last = slice(None, -1)
grade = metal[all_but_last] / ore[all_but_last]
a = plt.plot(x[all_but_last], grade)

In [None]:
# Show the course note "Statistics_QT.md" as the cell output
# (gdoc.loadDoc presumably returns its Markdown text; helper not visible here).
Markdown(gdoc.loadDoc("Statistics_QT.md"))

In [None]:
# Q(T) curve: metal, rescaled by its first value, as a function of ore.
# The dashed first bisector is drawn as a reference.
metal_rescaled = metal / metal[0]
plt.plot(ore, metal_rescaled)
a = plt.plot([0, 1], [0, 1], "--")

In [None]:
# Show the course note "Statistics_Benefit.md" as the cell output
# (gdoc.loadDoc presumably returns its Markdown text; helper not visible here).
Markdown(gdoc.loadDoc("Statistics_Benefit.md"))

In [None]:
# Benefit: metal minus (value times ore), plotted against the sorted values.
benefit = metal - x * ore
a = plt.plot(x, benefit)

### 4) Bivariate Statistics

In [None]:
# Show the course note "Statistics_Bivariate.md" as the cell output
# (gdoc.loadDoc presumably returns its Markdown text; helper not visible here).
Markdown(gdoc.loadDoc("Statistics_Bivariate.md"))

In [None]:
# Build the two paired variables, keeping only the samples whose January
# temperature is not the "MISS" marker:
#   z1 = elevation, z2 = temperature converted to float.
elev = data["Elevation"].to_numpy()
temp = data["January_temp"].to_numpy()
sel = temp!="MISS"
z1 = elev[sel]
z2 = temp[sel].astype(float)

In [None]:
# Show the course note "Statistics_Covariance.md" as the cell output
# (gdoc.loadDoc presumably returns its Markdown text; helper not visible here).
Markdown(gdoc.loadDoc("Statistics_Covariance.md"))

In [None]:
# Covariance between z1 and z2: off-diagonal term of the 2x2 matrix
# returned by np.cov.
print(f"Covariance = {np.cov(z1, z2)[0][1]!s}")

In [None]:
# Show the course note "Statistics_Correlation.md" as the cell output
# (gdoc.loadDoc presumably returns its Markdown text; helper not visible here).
Markdown(gdoc.loadDoc("Statistics_Correlation.md"))

In [None]:
# Correlation coefficient between z1 and z2: off-diagonal term of the 2x2
# matrix returned by np.corrcoef.
print("Correlation coefficient", np.corrcoef(z1, z2)[0][1])

In [None]:
# Show the course note "Statistics_CovarianceM.md" as the cell output
# (gdoc.loadDoc presumably returns its Markdown text; helper not visible here).
Markdown(gdoc.loadDoc("Statistics_CovarianceM.md"))

In [None]:
# Covariance matrix of (z1, z2) with np.cov default settings.
print(f"Covariance matrix = \n{np.cov(z1, z2)!s}")

In [None]:
# Variance of z1 rescaled by N/(N-1), for comparison with the first diagonal
# term of the covariance matrix returned by np.cov.
print("Variance", np.var(z1) * z1.size / (z1.size - 1))

In [None]:
# Covariance matrix of (z1, z2) computed with bias=True.
print(f"Covariance matrix = \n{np.cov(z1, z2, bias=True)!s}")

In [None]:
# Show the course note "Statistics_Scatter.md" as the cell output
# (gdoc.loadDoc presumably returns its Markdown text; helper not visible here).
Markdown(gdoc.loadDoc("Statistics_Scatter.md"))

In [None]:
# Scatter plot of z2 against z1, drawn with small markers.
a = plt.scatter(x=z1, y=z2, s=1)

In [None]:
# Show the course note "Statistics_Regr.md" as the cell output
# (gdoc.loadDoc presumably returns its Markdown text; helper not visible here).
Markdown(gdoc.loadDoc("Statistics_Regr.md"))

In [None]:
# Linear regression of z2 on z1:
#   slope     ahat = Cov(z1, z2) / Var(z1)
#   intercept bhat = mean(z2) - ahat * mean(z1)
# The fitted line is drawn over the scatter plot between min(z1) and max(z1).
cov_z1_z2 = np.cov(z1, z2, bias=True)[1, 0]
ahat = cov_z1_z2 / np.var(z1)
bhat = np.mean(z2) - ahat * np.mean(z1)
plt.scatter(z1, z2, s=1)
z1_min, z1_max = np.min(z1), np.max(z1)
a = plt.plot([z1_min, z1_max], [bhat + ahat * z1_min, bhat + ahat * z1_max])

In [None]:
# Show the course note "Statistics_MRegr.md" as the cell output
# (gdoc.loadDoc presumably returns its Markdown text; helper not visible here).
Markdown(gdoc.loadDoc("Statistics_MRegr.md"))

In [None]:
# Show the course note "Statistics_hist2d.md" as the cell output
# (gdoc.loadDoc presumably returns its Markdown text; helper not visible here).
Markdown(gdoc.loadDoc("Statistics_hist2d.md"))

In [None]:
# Two-dimensional histogram of (z1, z2) with nbin classes along each axis.
# The returned tuple is kept in ax; its item 1 is indexed by later cells to
# get the class boundaries along z1.
nbin = 15
ax = plt.hist2d(z1, z2, bins=nbin)

In [None]:
# Show the course note "Statistics_histcond.md" as the cell output
# (gdoc.loadDoc presumably returns its Markdown text; helper not visible here).
Markdown(gdoc.loadDoc("Statistics_histcond.md"))

In [None]:
# Conditional histogram: normalized histogram of z2 restricted to the samples
# whose z1 lies strictly between the boundaries ax[1][1] and ax[1][2].
in_class = (z1 > ax[1][1]) & (z1 < ax[1][2])
axc = plt.hist(z2[in_class], bins=nbin, density=True)

In [None]:
# Show the course note "Statistics_meancond.md" as the cell output
# (gdoc.loadDoc presumably returns its Markdown text; helper not visible here).
Markdown(gdoc.loadDoc("Statistics_meancond.md"))

In [None]:
## Conditional mean
# Mean of z2 inside each z1 class of the 2D histogram, drawn in red over the
# histogram and the scatter plot.
ax = plt.hist2d(z1,z2,15)
plt.scatter(z1,z2,s=1)
edges = ax[1]  # class boundaries along z1: one more entry than classes
centers = 0.5 * (edges[:-1] + edges[1:])
# One value per class (not per boundary), so no entry is left uninitialised.
# NaN marks an empty class.
m = np.full(len(centers), np.nan)
last = len(centers) - 1
for i, (left, right) in enumerate(zip(edges[:-1], edges[1:])):
    # Classes are [left, right) and the last one is closed on the right, so
    # samples lying exactly on a boundary (e.g. min and max of z1) are kept.
    below_right = (z1 <= right) if i == last else (z1 < right)
    inside = (z1 >= left) & below_right
    if inside.any():
        m[i] = np.mean(z2[inside])
# Each conditional mean is plotted at the centre of its class.
ax = plt.plot(centers, m, c="r")