In [1]:
# Set-up statements that we will nearly always use
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as pl
%matplotlib inline

# A very quick look at some real data

In [2]:
# Load the Taranaki St wharf readings. The file has no header row, so columns
# are addressed by position; column 8 is parsed as dates.
ecoli_csv = '~/Dropbox/Transfer/Teaching/DATA201/Datasets/TaranakiStWharf.csv'
ecoli = pd.read_csv(ecoli_csv, skiprows=0, header=None, parse_dates=[8])
ecoli.head()

FileNotFoundError: [Errno 2] No such file or directory: '/Users/macOS/Dropbox/Transfer/Teaching/DATA201/Datasets/TaranakiStWharf.csv'

How do we plot the ecoli data?

In [None]:
# Column 9 holds the E. coli readings; with no x values given they are drawn
# as dots against the row index.
ecoli_counts = ecoli.iloc[:, 9]
pl.plot(ecoli_counts, '.')

It would be more useful with a better $x$ axis

In [None]:
# Same readings, now against the parsed date column (column 8) on the x axis.
sample_dates = ecoli.iloc[:, 8]
ecoli_counts = ecoli.iloc[:, 9]
pl.plot(sample_dates, ecoli_counts, '.')

There are colours associated with these, to say if you can swim -- red if ecoli > 350, amber if it is > 150, green otherwise

In [None]:
# Traffic-light plot of the E. coli readings (column 9) against date (column 8):
# red above 350, amber above 150 (up to 350), green otherwise.
RED_THRESHOLD = 350
AMBER_THRESHOLD = 150

sample_dates = ecoli.iloc[:, 8]
ecoli_counts = ecoli.iloc[:, 9]

# Mutually exclusive boolean masks, so each reading is drawn exactly once and
# no integer index arrays are needed.
is_red = ecoli_counts > RED_THRESHOLD
is_amber = (ecoli_counts > AMBER_THRESHOLD) & ~is_red
is_green = ~(is_red | is_amber)

pl.plot(sample_dates[is_green], ecoli_counts[is_green], '.g', markersize=10)
# The amber band was previously drawn with '.k' (black); use an amber colour.
pl.plot(sample_dates[is_amber], ecoli_counts[is_amber], '.', color='orange', markersize=10)
pl.plot(sample_dates[is_red], ecoli_counts[is_red], '.r', markersize=10)

And if you think some of those numbers are bad...
https://www.lawa.org.nz/explore-data/wellington-region/coastal/wellington-harbour-at-taranaki-st-dive-platform/

# In case that was too cheery

In [None]:
# Mauna Loa flask CO2 record: skip the first 70 lines of the file, treat the
# rest as header-less data, and parse column 0 as dates.
mlo_csv = '~/Dropbox/Transfer/Teaching/DATA201/Datasets/MLO/mlo_station/daily_flask_co2_mlo.csv'
co2 = pd.read_csv(mlo_csv, skiprows=70, header=None, parse_dates=[0])

This is the CO2 (in ppm) at Mauna Loa on Hawaii over the last 60 or so years. Let's plot it.

In [None]:
# Plot the last column against the date column, starting from row 500.
mlo_tail = co2.iloc[500:]
pl.plot(mlo_tail.iloc[:, 0], mlo_tail.iloc[:, -1])

Does it match the NZ data? We should plot them both.

In [None]:
# NZ flask CO2 record, read with the same options as the Mauna Loa file.
nz_csv = '~/Dropbox/Transfer/Teaching/DATA201/Datasets/daily_flask_co2_nzd.csv'
co2nz = pd.read_csv(nz_csv, skiprows=70, header=None, parse_dates=[0])

In [None]:
# Overlay the last column of each record, plotted against row index only
# (no dates are passed as x values here).
nz_co2 = co2nz.iloc[:, -1]
mlo_co2 = co2.iloc[:, -1]
pl.plot(nz_co2)
pl.plot(mlo_co2)

Why don't they match?

In [None]:
# Overlay the two records again, this time using each file's own date column
# (column 0) as the x axis.
nz_dates, nz_co2 = co2nz.iloc[:, 0], co2nz.iloc[:, -1]
mlo_dates, mlo_co2 = co2.iloc[:, 0], co2.iloc[:, -1]
pl.plot(nz_dates, nz_co2)
pl.plot(mlo_dates, mlo_co2)

In [None]:
# NZ record on its own, from row 500 onwards, against its date column.
nz_tail = co2nz.iloc[500:]
pl.plot(nz_tail.iloc[:, 0], nz_tail.iloc[:, -1])

# Image and Sound Data

Inside a computer all data is numbers.

## Images...
record the intensity of light at each point on the sensor chip behind the camera lens. Each point can hold one value (greyscale), three values (e.g., red, green, blue), or many (multi-spectral: lots of other features such as the infra-red spectrum, etc.)

We can load and process images using Python.

In [None]:
# Read the image file into an array, then print its pixel values and shape.
# The path is home-relative like the other data paths in this notebook (it was
# an absolute path under one user's home directory); '~' is expanded
# explicitly before the path is handed to imread.
cute_path = os.path.expanduser('~/Dropbox/Transfer/Teaching/DATA201/Labs/cute.jpg')
c = pl.imread(cute_path)
print(c)
print(np.shape(c))

In [None]:
# Render the image array `c`, requesting the grey colormap.
grey_map = 'gray'
pl.imshow(c, cmap=grey_map)

In [None]:
# Second image: load it, print the pixel array and then its shape, and display it.
d = pl.imread('hakatere.jpg')
for summary in (d, np.shape(d)):
    print(summary)
pl.imshow(d, cmap='gray')

## Sound
is a wave of sound pressure. We turn it into a computational representation by sampling the wave at regular time points, such as 36,000 times a second (36 kHz). 

In [None]:
import wavio
# Read the recording; later cells use its samples via `s.data`.
wav_file = 'tril1.wav'
s = wavio.read(wav_file)

In [None]:
# Print the first 100 samples (transposed for printing), then draw the whole
# waveform against sample index.
first_samples = s.data[:100]
print(first_samples.T)
pl.plot(s.data)

We can turn sound into a histogram of power (a spectrogram) using the Fourier transform. This can be treated like an image.

In [None]:
# Spectrogram of the recording. np.squeeze drops length-1 axes from the sample
# array before it is passed to specgram. The file's sample rate is passed as
# Fs so the axes are in Hz and seconds; without it matplotlib uses its default
# Fs=2.
pl.specgram(np.squeeze(s.data), Fs=s.rate)

# Anscombe's dataset

This has two purposes here:
 - to explore a dataset and see how important plotting can be
 - to see some basic NumPy syntax 

In [None]:
# Anscombe's dataset as an 11 x 8 array: one row per observation, and the four
# datasets stored side by side as (x, y) column pairs -- columns (0, 1),
# (2, 3), (4, 5) and (6, 7), which is how the statistics cells index it.
data = np.array([
[10.0    ,8.04   ,10.0   ,9.14   ,10.0   ,7.46   ,8.0    ,6.58 ],
[8.0     ,6.95   ,8.0    ,8.14   ,8.0    ,6.77   ,8.0    ,5.76 ],
[13.0    ,7.58   ,13.0   ,8.74   ,13.0   ,12.74  ,8.0    ,7.71 ],
[9.0     ,8.81   ,9.0    ,8.77   ,9.0    ,7.11   ,8.0    ,8.84 ],
[11.0    ,8.33   ,11.0   ,9.26   ,11.0   ,7.81   ,8.0    ,8.47 ],
[14.0    ,9.96   ,14.0   ,8.10   ,14.0   ,8.84   ,8.0    ,7.04 ],
[6.0     ,7.24   ,6.0    ,6.13   ,6.0    ,6.08   ,8.0    ,5.25 ],
[4.0     ,4.26   ,4.0    ,3.10   ,4.0    ,5.39   ,19.0   ,12.50],
[12.0    ,10.84  ,12.0   ,9.13   ,12.0   ,8.15   ,8.0    ,5.56 ],
[7.0     ,4.82   ,7.0    ,7.26   ,7.0    ,6.42   ,8.0    ,7.91 ],
[5.0     ,5.68   ,5.0    ,4.74   ,5.0    ,5.73   ,8.0    ,6.89 ],
])

What are the basic statistics of these datasets?

In [None]:
# Per-column mean and standard deviation (axis=0 reduces over the rows).
column_means = data.mean(axis=0)
column_stds = data.std(axis=0)
print(column_means)
print(column_stds)

In [None]:
# Correlation matrix for each (x, y) column pair: (0, 1), (2, 3), (4, 5), (6, 7).
for x_col in (0, 2, 4, 6):
    y_col = x_col + 1
    print(np.corrcoef(data[:, x_col], data[:, y_col]))

In [None]:
# Degree-1 polynomial fit (slope, intercept) for each (x, y) column pair.
for x_col in (0, 2, 4, 6):
    y_col = x_col + 1
    print(np.polyfit(data[:, x_col], data[:, y_col], 1))

So are they all the same then?

In [None]:
# One scatter panel per dataset on a 2 x 2 grid. Each dataset is a pair of
# COLUMNS (x, y) of `data` -- the same pairs the statistics cells use -- so
# the points must be taken as data[:, col]. Indexing rows (data[0, :] against
# data[1, :], ...) runs without error but plots observations against each
# other instead of x against y.
for panel, x_col in enumerate((0, 2, 4, 6), start=1):
    pl.subplot(2, 2, panel)
    pl.plot(data[:, x_col], data[:, x_col + 1], '.', markersize=12)

In [None]:
# Print the straight-line fit coefficients for the four (x, y) column pairs again.
pairs = [(0, 1), (2, 3), (4, 5), (6, 7)]
for x_col, y_col in pairs:
    fit = np.polyfit(data[:, x_col], data[:, y_col], 1)
    print(fit)