In [2]:
# import the modules we need
# using 'as' allows us to use an alias
import numpy as np
import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt
import pandas as pd
import statistics as s

In [None]:
# Build the sample data as a one-dimensional array (a vector).
# NumPy's core type is the ndarray -- an n-dimensional array.
x = np.array([
    1, 2, 3, 7, 4, 5, 4, 6,
    8, 10, 3, 4, 4, 3, 4, 5,
])

In [None]:
# variance of x via the NumPy function; called without ddof this is the
# population variance (see the ddof=1 cells below for the n-1 version)
np.var(x) # population variance

In [None]:
# The statistics module has no var() -- s.var(x) raises AttributeError.
# Its variance functions are pvariance() (population, divides by n, like
# np.var above) and variance() (sample, divides by n-1).
# tolist() hands statistics plain Python ints, which are among the element
# types the module documents support for.
s.pvariance(x.tolist())

In [None]:
# display the mean -- three ways, in this cell and the next two:
# either use the mean() method on the array instance x,
# or call np.mean() explicitly, passing x as a parameter,
# or use mean() from the standard-library statistics module
x.mean()

In [None]:
# function form of the mean: NumPy called explicitly with x as the argument
np.mean(x)

In [None]:
# mean via the standard-library statistics module.
# statistics documents support for int, float, Decimal and Fraction only, so
# convert to a list of plain Python ints first; handed NumPy integers directly
# it may coerce the result back to an integer type and drop the fractional
# part (behaviour is version-dependent).
s.mean(x.tolist())

In [None]:
# weighted mean: one weight per element of x (16 weights, summing to 1).
# NOTE(review): the seventh weight was previously 0.025, which made the
# weights total 1.0225; assumed to be a typo for 0.0025 (the same value as
# the fourth weight), which makes them sum to exactly 1 -- confirm.
# np.average divides by the sum of the weights either way.
np.average(x, weights=[0.0625, 0.0625, 0.1225, 0.0025, 0.0625, 0.1225, 0.0025, 0.0625,
                       0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625, 0.0625])

In [None]:
# standard deviation
# we can use print with a format string to control the output (4 decimals)
# x.std() and np.std(x) use NumPy's default divisor of n (population).
print('%.4f' % x.std())
print('%.4f' % np.std(x))
# statistics.stdev() is the *sample* standard deviation (divides by n-1), so
# it differs from the two values above; statistics.pstdev() is the population
# version. tolist() passes plain Python ints, which statistics documents
# support for -- NumPy integers are not among its supported types and can
# give a truncated intermediate variance on some versions.
print('%.4f' % s.stdev(x.tolist()))

In [None]:
# the population standard deviation again, this time spelled out step by step:
# y holds the squared deviations of each element from the mean,
# z is the square root of their average (sum divided by n)
y = np.square(x - x.mean())
z = (y.sum() / x.size) ** 0.5
print('%.4f' % z)

In [None]:
# np.std() divides by n unless told otherwise, which gives the population
# standard deviation; passing ddof=1 switches the divisor to n-1 for the
# sample standard deviation (method form first, then function form)
for sample_sd in (x.std(ddof=1), np.std(x, ddof=1)):
    print('%.4f' % sample_sd)

In [None]:
# verify the n-1 result 'by hand': y is the array of squared deviations
# computed earlier, summed and divided by n-1 before taking the square root
w = (y.sum() / (x.size - 1)) ** 0.5
print('%.4f' % w)

In [None]:
# the median is the middle value of the sorted data;
# shown once via NumPy and once via the statistics module, one per line
print(np.median(x), s.median(x), sep='\n')

In [None]:
# and the mode -- the most frequently occurring value
# NumPy does not have a mode() (np.mode(x) raises AttributeError);
# scipy.stats provides one, returning the modal value and its count
stats.mode(x)

In [None]:
# the statistics module, unlike NumPy, does have a mode()
s.mode(x)

In [None]:
# limit the number of decimal places NumPy shows when printing arrays
np.set_printoptions(precision=4)

# a two-dimensional array: two rows by five columns
x2 = np.array([
    [1, 2, 3, 4, 6],
    [7, 8, 10, 9, 12],
])

# summary stats over every element of the array (no axis given):
# mean, median and sample standard deviation, tab-separated
print(x2.mean(), np.median(x2), '%.4f' % x2.std(ddof=1), sep='\t')

In [None]:
# summary stats along one axis at a time:
#   axis=0 -> column wise, axis=1 -> row wise
# each line shows mean, median and sample standard deviation (ddof=1);
# the arrays print with the precision set via np.set_printoptions
for axis in (0, 1):
    print(np.mean(x2, axis=axis), np.median(x2, axis=axis), np.std(x2, ddof=1, axis=axis), sep='\t')

In [None]:
# Create a pandas Series -- a one dimensional array (aka a vector)
# then display the summary statistics.
# A fresh name is used rather than y: y already holds the squared deviations
# that the 'by hand' standard deviation cell reads, and overwriting it would
# make that cell give a wrong answer if it were re-run afterwards.
x_series = pd.Series(x)
x_series.describe()

In [None]:
# scipy.stats offers its own describe() function as well;
# it returns summary statistics for x
stats.describe(x)

In [None]:
# re-create the two dimensional array used by the correlation and t-test
# cells below, and show the summary stats for the entire array again
# (numpy is already imported as np in the first cell, so no re-import here)
x2 = np.array([[1, 2, 3, 4, 6], [7, 8, 10, 9, 12]])
print(np.mean(x2), np.median(x2), '%.4f' % np.std(x2, ddof=1), sep='\t')

In [None]:
# the correlation can be calculated in several ways as well
# np.corrcoef returns an np array: here the 2x2 correlation matrix
# of the first row of x2 against the second
np.corrcoef(x2[0], x2[1])

In [None]:
# calculate Pearson's correlation coefficient (rho) and its p-value
# for the two rows of x2
# the p-value suggests the correlation is statistically significant
# (scipy.stats is already imported as `stats` in the first cell; a bare
# `import scipy` does not by itself guarantee that scipy.stats is loaded)
rho, pstat = stats.pearsonr(x2[0], x2[1])
print('rho = %.4f' % rho, 'p-val = %.4f' % pstat)

In [None]:
# two-sample t-test comparing the means of the two rows of x2
# (unpacking x2 passes its first and second row as the two samples)
tstat, pval = stats.ttest_ind(*x2)
print('t-stat = %.3f ' % tstat, 'p-value = %.3f' % pval)

In [3]:
# linear regression -- we didn't cover it in the lectures,
# but here's how to fit an ordinary least squares (OLS) model with statsmodels
# statsmodels.api is the package's public entry point; reaching through the
# bare package (statsmodels.tools.tools) only works when some other import
# happens to have loaded that submodule already
import statsmodels.api as sm
import statsmodels.regression.linear_model as lm

# Y is the response, X the single predictor (20 observations each)
Y = np.array([1, 3, 4, 5, 2, 3, 4, 8, 12, 14, 11, 12, 23, 17, 14, 17, 18, 21, 25, 30])
X = np.array([1, 2, 3, 4, 8, 6, 7, 9, 4, 17, 14, 15, 20, 31, 23, 25, 17, 20, 29, 30])
# add a column of ones to the data so the model can estimate an intercept
# (reported as `const` in the summary)
X = sm.add_constant(X)

# fit the model and display results
model = lm.OLS(Y, X)
results = model.fit()
results.summary()

0,1,2,3
Dep. Variable:,y,R-squared:,0.769
Model:,OLS,Adj. R-squared:,0.756
Method:,Least Squares,F-statistic:,59.9
Date:,"Fri, 29 Jun 2018",Prob (F-statistic):,3.91e-07
Time:,08:18:03,Log-Likelihood:,-55.961
No. Observations:,20,AIC:,115.9
Df Residuals:,18,BIC:,117.9
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.5225,1.667,0.913,0.373,-1.980,5.025
x1,0.7493,0.097,7.740,0.000,0.546,0.953

0,1,2,3
Omnibus:,0.209,Durbin-Watson:,1.625
Prob(Omnibus):,0.901,Jarque-Bera (JB):,0.376
Skew:,0.182,Prob(JB):,0.829
Kurtosis:,2.436,Cond. No.,30.7
