# Data Science Introduction

We will be using two additional packages - `numpy` and `pandas`. They both need to be imported and usually they are given an alias.

In [1]:
import pandas as pd
import numpy as np

## Numpy array

Example of numpy `array` structure.

In [2]:
# Build a one-dimensional array from a plain Python list and display it.
a = np.array([1, 2, 3])
print(a)

[1 2 3]


The array `a` has a special `ndarray` type.

In [3]:
# Show the Python type of the array object itself.
print(type(a))

<class 'numpy.ndarray'>


In [4]:
# Show the type of a single element: it is a NumPy scalar, not a Python int.
# NOTE(review): the integer width printed (int32 in the saved output) is
# presumably platform dependent - confirm on your machine.
print(type(a[0]))

<class 'numpy.int32'>


Check the dimension and size of the array.

In [5]:
# Tuple holding the length of each dimension of `a`.
a.shape

(3,)

Create a multidimensional array.

In [6]:
# Nested lists produce a 2-D array; dtype forces 64-bit floating point values.
b = np.array(
    [
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
    ],
    dtype=np.float64,
)
print(b)

[[1. 2. 3.]
 [4. 5. 6.]
 [7. 8. 9.]]


In [7]:
# A multidimensional array is still reported as the same array type.
print(type(b))

<class 'numpy.ndarray'>


In [8]:
# Shape tuple of `b`: one entry per dimension.
b.shape

(3, 3)

In [9]:
# Type of a single element of `b`; it follows the dtype requested when `b`
# was created.
print(type(b[0][0]))

<class 'numpy.float64'>


The `strides` attribute gives us the number of bytes to step in each direction when traversing the array.

In [10]:
# Byte steps per dimension. For the 3x3 float64 array `b` this is (24, 8):
# 24 bytes to reach the next row and 8 bytes to reach the next column.
b.strides

(24, 8)

Zero-dimensional array - scalar.

In [11]:
# Wrapping a bare scalar yields an array with zero dimensions and an empty shape.
x = np.array(42)
print(x, type(x), np.ndim(x), x.shape, sep='\n')

42
<class 'numpy.ndarray'>
0
()


In [12]:
# An array holds a single dtype, so mixed input is coerced to a common type -
# here every element becomes a string.
arr = np.array([1, 'martina', 7.8])
print(arr, type(arr[0]), sep='\n')

['1' 'martina' '7.8']
<class 'numpy.str_'>


### Special arrays

Arrays of ones and zeroes

In [None]:
# 2x3 array filled with floating point ones.
ones = np.ones(shape=(2, 3), dtype=float)
print(ones)

In [None]:
# 2x3 array filled with integer zeros.
zeroes = np.zeros(shape=(2, 3), dtype=int)
print(zeroes)

Identity arrays

In [13]:
# Square 3x3 identity array: ones on the main diagonal, zeros elsewhere.
ident = np.identity(3, dtype=float)
print(ident)

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


Non-square identity matrix, the `k` parameter defines the position of the diagonal, default 0 - main diagonal.

In [14]:
# 3 rows by 4 columns, with the ones placed on the second diagonal above the
# main one (k=2).
ident2 = np.eye(N=3, M=4, k=2, dtype=int)
print(ident2)

[[0 0 1 0]
 [0 0 0 1]
 [0 0 0 0]]


### Numerical operations

Use standard mathematical notation for adding, subtracting and comparing entire arrays.

**NO NEED TO LOOP OVER EACH VALUE**

In [15]:
# Adding a scalar applies it to every element - no explicit loop required.
a = np.array([1, 2, 3, 4, 5, 6, 7, 8])
a += 1
print(a)

[2 3 4 5 6 7 8 9]


In [None]:
# Add two arrays element by element. `b` is a single row of eight integer
# ones; `a` comes from the previous cell.
b = np.ones(shape=(1, 8), dtype=int)
print(b)

c = np.add(a, b)
print(c)

Matrix multiplication

In [None]:
# (2x3) times (3x2) gives a 2x2 matrix product.
mat1 = np.array([[1, 2, 3],
                 [4, 5, 6]])
mat2 = np.array([[1, 2],
                 [3, 4],
                 [5, 6]])
print(mat1 @ mat2)

Equality operator

In [None]:
mat1 = np.array([[1, 2, 4],
                 [3, 4, 5],
                 [6, 7, 8]])
mat2 = np.array([[1, 2, 6],
                 [3, 7, 5],
                 [6, 4, 5]])

# Element-wise comparison: one boolean per position.
print(np.equal(mat1, mat2))

# Whole-array comparison: a single boolean for the complete arrays.
print(np.array_equal(mat1, mat2))

Mathematical comparison

In [None]:
arr1 = np.array([1, 2, 3, 4, 5])
# Second operand for the comparison and boolean-operator examples. Without it,
# every expression that mentions arr2 raises NameError on a fresh kernel.
arr2 = np.array([20, 25, 30, 35, 40])

# Comparisons are applied element by element and return a boolean array.
# Uncomment any of these to try it:
#arr1 > arr2
#arr1 < arr2
#arr1 <= arr2
#arr1 >= arr2
#arr1 == arr2
#arr1 != arr2

# and also more complex expressions
(arr1 + 12) >= 15
#(arr2 - arr1) == 27

Boolean operators

In [None]:
# Element-wise OR of two boolean masks. Expects arr1 and arr2 to be
# equal-length integer arrays already defined in the session.
np.logical_or(  (arr1 % 2) == 0, (arr2 % 20) == 0 )

# Element-wise AND of the same two masks:
#np.logical_and( (arr1 % 2) == 0, (arr2 % 20) == 0 )

# Element-wise negation of a single mask:
#np.logical_not( (arr1 % 2) == 0 )

Structured arrays - heterogeneous data

In [21]:
# Record layout: a byte string of up to 20 characters plus a 4-byte integer.
dt = np.dtype([
    ('country', 'S20'),
    ('population', 'i4'),
])

# One tuple per record, matching the field order declared in `dt`.
datatable = np.array(
    [('UK', 65640000), ('USA', 323100000), ('Russia', 144300000)],
    dtype=dt,
)

print(datatable)

[(b'UK',  65640000) (b'USA', 323100000) (b'Russia', 144300000)]


In [22]:
# Select one field of the structured array by its name.
print(datatable['country'])

[b'UK' b'USA' b'Russia']


In [None]:
# The integer field is selected by name in the same way.
print(datatable['population'])

In [None]:
# Index by position to get one record, then index the record to get its
# first field.
ctry = datatable[0]
print(ctry)
print(ctry[0])

NumPy matrices are strictly 2-D and support matrix operations (for example, `*` performs matrix multiplication).

In [None]:
# np.asmatrix replaces the np.mat alias, which was removed in NumPy 2.0.
# For matrix objects, * means matrix multiplication (not element-wise).
MA = np.asmatrix([[1, 2, 3], [4, 5, 6]])
MB = np.asmatrix([[7, 8], [9, 12], [10, 11]])

print(MA * MB)

## Numpy arange

Example of the `arange()` function. When used with integers it is almost equivalent to Python's built-in `range()` function. The values are generated within a half-open interval (the stop value is excluded), similar to slicing.

In [None]:
# Integers from 1 up to, but not including, 5.
a = np.arange(1, 5)
print(a)

Using optional step-size parameter.

In [None]:
# Start at 0.5 and step by 0.5; the stop value 2.5 itself is excluded.
a = np.arange(0.5, 2.5, 0.5)
print(a, type(a), sep='\n')

## Pandas DataFrames

 - DataFrames are rectangular tables of data
 - Contain rows and columns
 - Columns have headings
 - Rows have index

In [17]:
# Create a DataFrame by reading the 'AMZN' sheet of an Excel workbook.
# The path is relative, so FinData.xls must be in the current working directory.
df = pd.read_excel(io='FinData.xls', sheet_name='AMZN')
df

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2020-10-26,3198.739990,3282.979980,3153.300049,3207.040039,3207.040039,5901200
1,2020-10-27,3224.939941,3291.659912,3211.300049,3286.330078,3286.330078,4291000
2,2020-10-28,3249.300049,3264.020020,3162.469971,3162.780029,3162.780029,5588300
3,2020-10-29,3201.270020,3257.250000,3164.000000,3211.010010,3211.010010,6596500
4,2020-10-30,3157.750000,3167.000000,3019.000000,3036.149902,3036.149902,8386400
...,...,...,...,...,...,...,...
247,2021-10-19,3434.290039,3454.689941,3422.000000,3444.149902,3444.149902,2386100
248,2021-10-20,3452.659912,3462.860107,3400.370117,3415.060059,3415.060059,2139800
249,2021-10-21,3414.250000,3440.280029,3403.000000,3435.010010,3435.010010,1881400
250,2021-10-22,3421.000000,3429.840088,3331.300049,3335.550049,3335.550049,3133800


View first few rows or last few rows.

In [18]:
# First rows of the table (five when no count is given).
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
0,2020-10-26,3198.73999,3282.97998,3153.300049,3207.040039,3207.040039,5901200
1,2020-10-27,3224.939941,3291.659912,3211.300049,3286.330078,3286.330078,4291000
2,2020-10-28,3249.300049,3264.02002,3162.469971,3162.780029,3162.780029,5588300
3,2020-10-29,3201.27002,3257.25,3164.0,3211.01001,3211.01001,6596500
4,2020-10-30,3157.75,3167.0,3019.0,3036.149902,3036.149902,8386400


In [19]:
# Last ten rows of the table.
df.tail(10)

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume
242,2021-10-12,3257.0,3267.530029,3236.280029,3247.330078,3247.330078,1819600
243,2021-10-13,3269.709961,3288.379883,3261.090088,3284.280029,3284.280029,2420100
244,2021-10-14,3302.449951,3312.600098,3290.780029,3299.860107,3299.860107,2109500
245,2021-10-15,3311.419922,3410.419922,3304.0,3409.02002,3409.02002,5175100
246,2021-10-18,3388.360107,3449.169922,3385.100098,3446.73999,3446.73999,3174100
247,2021-10-19,3434.290039,3454.689941,3422.0,3444.149902,3444.149902,2386100
248,2021-10-20,3452.659912,3462.860107,3400.370117,3415.060059,3415.060059,2139800
249,2021-10-21,3414.25,3440.280029,3403.0,3435.01001,3435.01001,1881400
250,2021-10-22,3421.0,3429.840088,3331.300049,3335.550049,3335.550049,3133800
251,2021-10-25,3335.0,3347.800049,3297.699951,3320.370117,3320.370117,2222700


Access certain columns

In [None]:
# Single brackets with one column name select that column.
df['Open']

In [None]:
# Single-bracket selection returns a pandas Series.
print(type(df['Open']))

In [None]:
# A list of column names (double brackets) selects a sub-table, even when the
# list holds a single name.
df[['Open']]

In [None]:
# Double-bracket selection returns a DataFrame.
print(type(df[['Open']]))

In [None]:
# Multiple columns: pass a list with the column names to keep.
df[['Open', 'Close', 'Adj Close']]

Using filters

In [None]:
# Boolean mask: True for every row whose 'Open' value is above 3000.
# It is bound to a name because a bare expression in the middle of a cell is
# evaluated and then discarded - only the last expression is displayed.
open_above_3000 = df['Open'] > 3000

# Indexing with the mask keeps only the rows where it is True.
df[open_above_3000]

Getting rows

In [None]:
# Label-based lookup of the row with index label 1; a single label returns
# that row as a Series.
df.loc[1]

In [None]:
# Passing a list of labels keeps the result a DataFrame, even for one row.
df.loc[[1]]

In [None]:
# Label slice: with .loc both end points (5 and 90) are included.
df.loc[5:90]

Functions on columns of data:
- min
- max
- mean
- median
- count

Min and max also work on string columns.

In [None]:
# Mean of the 'Open' column. Swap in one of the commented lines below to try
# the other aggregations.
df['Open'].mean()
#df['Open'].min()
#df['Open'].max()
#df['Open'].median()
#df['Open'].count()

In [None]:
# Aggregating several columns at once yields one value per column.
df[['Open', 'Close', 'Adj Close']].mean()

Array arithmetic with DataFrames

In [None]:
# Row-by-row difference between two columns.
df['Open'] - df['Close']

In [None]:
# Boolean Series: True where the 'High' value exceeds 3020.
arr = df['High'] > 3020
# Show only a slice of the result (entries 6 up to, not including, 90).
print(arr[6:90])

## Numpy where

In [None]:
# Choose a label per element: 'Even' where the value is divisible by 2,
# 'Odd' otherwise.
a = np.array([1, 2, 3, 4, 5])
np.where(np.mod(a, 2) == 0, 'Even', 'Odd')

In [20]:
# Label every row 'High' or 'Low' depending on whether its 'High' value
# exceeds 3100; the result is a NumPy array of strings.
np.where(df['High'] > 3100 , 'High', 'Low')

array(['High', 'High', 'High', 'High', 'High', 'Low', 'Low', 'High',
       'High', 'High', 'High', 'High', 'High', 'High', 'High', 'High',
       'High', 'High', 'High', 'High', 'High', 'High', 'High', 'High',
       'High', 'High', 'High', 'High', 'High', 'High', 'High', 'High',
       'High', 'High', 'High', 'High', 'High', 'High', 'High', 'High',
       'High', 'High', 'High', 'High', 'High', 'High', 'High', 'High',
       'High', 'High', 'High', 'High', 'High', 'High', 'High', 'High',
       'High', 'High', 'High', 'High', 'High', 'High', 'High', 'High',
       'High', 'High', 'High', 'High', 'High', 'High', 'High', 'High',
       'High', 'High', 'High', 'High', 'High', 'High', 'High', 'High',
       'High', 'High', 'High', 'High', 'High', 'High', 'High', 'High',
       'Low', 'Low', 'Low', 'Low', 'High', 'High', 'Low', 'Low', 'High',
       'High', 'High', 'Low', 'High', 'High', 'High', 'High', 'Low',
       'Low', 'Low', 'High', 'High', 'High', 'High', 'High', 'High',
       'Hi

## Time Series

In [None]:
# Load the 'DARK.L' sheet, parsing the 'Date' column as dates and using it as
# the row index.
# NOTE: this rebinds `df`, replacing the 'AMZN' frame used by the cells above.
df = pd.read_excel(io='FinData.xls', sheet_name='DARK.L', index_col = 'Date', parse_dates = True)
df

In [None]:
# Summary statistics for the columns of the frame.
df.describe()

In [None]:
# Select every row dated October 2021. Partial-date strings must go through
# .loc: plain df['2021-10'] is treated as a column lookup in pandas 2.x and
# raises KeyError.
df.loc['2021-10']

In [None]:
# Label slice on the date index: rows from May 2021 through September 2021.
df.loc['2021-May':'2021-Sep']


In [None]:
# Same date range, keeping only every 30th row.
df.loc['2021-May':'2021-Sep':30]

In [None]:
# Line plot of the 'Open' column against the row index.
df['Open'].plot()

In [None]:
# Keep the 'Open' column as a one-column DataFrame and plot May-August 2021.
# The frame is not called `open`, because that would shadow Python's built-in
# open() for the rest of the session.
open_prices = df[['Open']]
open_prices.loc['2021-May':'2021-Aug'].plot()

In [None]:
# Plot the 'Open' and 'Close' columns together for May-August 2021.
df2 = df[['Open', 'Close']]
df2.loc['2021-May':'2021-Aug'].plot()