CSC461 Machine Learning
=======================

Addendum: Matrices
------------------------

### Getting the example data

In [1]:
import numpy as np
import pandas as pd # Remember, pandas is built on top of numpy: a DataFrame stores its data in numpy arrays (it is not an ndarray subclass)
dataframe = pd.read_csv('~/data/pima/diabetes.csv') # Read the diabetes CSV file into a DataFrame

In [2]:
# Off the shelf function for displaying dataframes side-by-side
from IPython.display import display, HTML
# From: https://python.plainenglish.io/displaying-multiple-dataframes-side-by-side-in-jupyter-lab-notebook-9a4649a4940
def side_by_side(*dfs):
    """Show any number of DataFrames next to each other in one output cell.

    Every positional argument must offer a ``to_html()`` method. Each rendered
    table is wrapped in its own <div> with a right margin, and all of them sit
    inside a single flex container that is handed to IPython's ``display``.
    Nothing is returned.
    """
    # One margin-separated wrapper per frame, kept in call order.
    wrapped_tables = [
        '<div style="margin-right: 2em">' + frame.to_html() + '</div>'
        for frame in dfs
    ]
    markup = '<div style="display:flex">' + ''.join(wrapped_tables) + '</div>'
    display(HTML(markup))

In [3]:
# Preview the first 6 rows of the full dataset (all columns)
dataframe.head(6)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
5,5,116,74,0,0,25.6,0.201,30,0


In [4]:
NUMBER_DATA_POINTS = 6  # Cut our dataset down to 6 rows (easier to view)

# Small working slice: the first NUMBER_DATA_POINTS rows, every column.
example_df = dataframe.iloc[:NUMBER_DATA_POINTS]

# The labels in two forms: a one-column DataFrame (2-D) and a plain numpy array (1-D).
example_labels = example_df.loc[:, ['Outcome']]
example_labels_values = example_df['Outcome'].to_numpy()

# Cut our data down to 3 features.
example_features = example_df.loc[:, ['Age', 'BMI', 'Glucose']]

side_by_side(example_labels, example_features)

Unnamed: 0,Outcome
0,1
1,0
2,1
3,0
4,1
5,0

Unnamed: 0,Age,BMI,Glucose
0,50,33.6,148
1,31,26.6,85
2,32,23.3,183
3,21,28.1,89
4,33,43.1,137
5,30,25.6,116


#### Quick note on axes

In [5]:
# A one-column DataFrame is 2-D, while the array taken from a single column is 1-D.
print("example_labels.shape: {} | type: {}".format(example_labels.shape, type(example_labels)))
print(example_labels.to_numpy())
print("example_labels_values.shape: {} | type: {}".format(example_labels_values.shape, type(example_labels_values)))
print(example_labels_values)
print("After reshape of example_labels_values:")
# -1 lets numpy infer the number of rows, so the cell keeps working if NUMBER_DATA_POINTS changes.
print(example_labels_values.reshape((-1, 1)))
print("example_features.shape: {}".format(example_features.shape))

example_labels.shape: (6, 1) | type: <class 'pandas.core.frame.DataFrame'>
[[1]
 [0]
 [1]
 [0]
 [1]
 [0]]
example_labels_values.shape: (6,) | type: <class 'numpy.ndarray'>
[1 0 1 0 1 0]
After reshape of example_labels_values:
[[1]
 [0]
 [1]
 [0]
 [1]
 [0]]
example_features.shape: (6, 3)


In [6]:
# Convert the one-column labels DataFrame to a plain 2D numpy array
lables_np_2d = example_labels.to_numpy()
print("2D array of shape {}:".format(lables_np_2d.shape))
print(lables_np_2d)
# A single integer index selects one row and drops the first axis, leaving a 1D array
lables_np_1d = lables_np_2d[0]
print("1D array of shape {}:".format(lables_np_1d.shape))
print(lables_np_1d)
print("Item in array:")
# [row, column] indexes both axes in one step ...
print(lables_np_2d[0,0])

# ... while [row][column] first extracts the row and then indexes into it; the value is the same
print(lables_np_2d[0][0])

2D array of shape (6, 1):
[[1]
 [0]
 [1]
 [0]
 [1]
 [0]]
1D array of shape (1,):
[1]
Item in array:
1
1


### Scalar Operations

#### Example: Subtraction

In [7]:
# A scalar is applied to every element. Subtraction is not commutative, so operand order matters.
example_labels_sub_one = example_labels - 1 # each label minus 1
sub_one_example_labels = 1 - example_labels # 1 minus each label (turns 0 into 1 and 1 into 0)
print("example_labels, example_labels_sub_one, sub_one_example_labels")
side_by_side(example_labels, example_labels_sub_one, sub_one_example_labels)

example_labels, example_labels_sub_one, sub_one_example_labels


Unnamed: 0,Outcome
0,1
1,0
2,1
3,0
4,1
5,0

Unnamed: 0,Outcome
0,0
1,-1
2,0
3,-1
4,0
5,-1

Unnamed: 0,Outcome
0,0
1,1
2,0
3,1
4,0
5,1


#### Example: Addition

In [8]:
# Adding a scalar to the whole DataFrame changes every column
example_features_plus_one = example_features + 1
# To change a single column, work on a copy so example_features itself stays untouched
example_features_plus_one_glucose = example_features.copy()
example_features_plus_one_glucose["Glucose"] = example_features["Glucose"] + 1
print("example_features, example_features_plus_one, example_features_plus_one_glucose")
side_by_side(example_features, example_features_plus_one, example_features_plus_one_glucose)

example_features, example_features_plus_one, example_features_plus_one_glucose


Unnamed: 0,Age,BMI,Glucose
0,50,33.6,148
1,31,26.6,85
2,32,23.3,183
3,21,28.1,89
4,33,43.1,137
5,30,25.6,116

Unnamed: 0,Age,BMI,Glucose
0,51,34.6,149
1,32,27.6,86
2,33,24.3,184
3,22,29.1,90
4,34,44.1,138
5,31,26.6,117

Unnamed: 0,Age,BMI,Glucose
0,50,33.6,149
1,31,26.6,86
2,32,23.3,184
3,21,28.1,90
4,33,43.1,138
5,30,25.6,117


### Matrix x Vector Operations

<u>Note</u>: I am using the term "vector" to mean a matrix where one of the axes has size 1.

When multiplying matrices and vectors, you have to make sure that the **sizes of one of the axes match**. In the example below, `example_features` and `example_labels` describe the same sample data points (i.e., they have the same number of rows).

This makes use of a concept called **broadcasting**, where a vector can be "broadcast" across a row or a column. (More details [here](https://numpy.org/doc/stable/user/basics.broadcasting.html)).

#### Example: Multiplication

In [9]:
# Note: At least one of our dataframes must be converted to a numpy array.
# When both operands are DataFrames, pandas aligns them on column names; the names
# do not match here, so the pure-pandas product below comes out as all NaN.
example_features_multiplied_row = example_features * example_labels.to_numpy() # (6, 1) array is broadcast across the 3 feature columns
example_features_multiplied_row_pandas = example_features * example_labels # DataFrame * DataFrame: aligned by column name
side_by_side(example_features, example_labels, example_features_multiplied_row)
display(example_features_multiplied_row_pandas)

Unnamed: 0,Age,BMI,Glucose
0,50,33.6,148
1,31,26.6,85
2,32,23.3,183
3,21,28.1,89
4,33,43.1,137
5,30,25.6,116

Unnamed: 0,Outcome
0,1
1,0
2,1
3,0
4,1
5,0

Unnamed: 0,Age,BMI,Glucose
0,50,33.6,148
1,0,0.0,0
2,32,23.3,183
3,0,0.0,0
4,33,43.1,137
5,0,0.0,0


Unnamed: 0,Age,BMI,Glucose,Outcome
0,,,,
1,,,,
2,,,,
3,,,,
4,,,,
5,,,,


In [10]:
# One multiplier per feature column; the names A, B, C are dropped by to_numpy(), so only position matters
column_multiplier = pd.DataFrame(data={'A':[1], 'B':[0], 'C': [2]})
# The (1, 3) array is broadcast down the rows: Age * 1, BMI * 0, Glucose * 2
example_features_multiplied_col = example_features * column_multiplier.to_numpy()
# The same calculation written with the DataFrame.multiply method
example_features_multiplied_col_fun = example_features.multiply(column_multiplier.to_numpy(), axis=1)
side_by_side(example_features, column_multiplier, example_features_multiplied_col, example_features_multiplied_col_fun)

Unnamed: 0,Age,BMI,Glucose
0,50,33.6,148
1,31,26.6,85
2,32,23.3,183
3,21,28.1,89
4,33,43.1,137
5,30,25.6,116

Unnamed: 0,A,B,C
0,1,0,2

Unnamed: 0,Age,BMI,Glucose
0,50,0.0,296
1,31,0.0,170
2,32,0.0,366
3,21,0.0,178
4,33,0.0,274
5,30,0.0,232

Unnamed: 0,Age,BMI,Glucose
0,50,0.0,296
1,31,0.0,170
2,32,0.0,366
3,21,0.0,178
4,33,0.0,274
5,30,0.0,232


### Aggregation functions

#### Example: Sum
[https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sum.html](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sum.html)

In [11]:
# axis=0 adds the values down the rows, giving one total per column
example_features_sum_rows = example_features.sum(axis=0).to_frame(name="SumRows")
# axis=1 adds the values across the columns, giving one total per row
example_features_sum_cols = example_features.sum(axis=1).to_frame(name="SumCols")
example_features_sum_no_axis = example_features.sum().to_frame(name="SumNoAxis") # no axis given -> default is 0, same result as SumRows
side_by_side(example_features, example_features_sum_rows, example_features_sum_cols, example_features_sum_no_axis)

Unnamed: 0,Age,BMI,Glucose
0,50,33.6,148
1,31,26.6,85
2,32,23.3,183
3,21,28.1,89
4,33,43.1,137
5,30,25.6,116

Unnamed: 0,SumRows
Age,197.0
BMI,180.3
Glucose,758.0

Unnamed: 0,SumCols
0,231.6
1,142.6
2,238.3
3,138.1
4,213.1
5,171.6

Unnamed: 0,SumNoAxis
Age,197.0
BMI,180.3
Glucose,758.0


### Logical Operators

#### Example: Logical Not
[https://numpy.org/doc/stable/reference/generated/numpy.logical_not.html](https://numpy.org/doc/stable/reference/generated/numpy.logical_not.html)

In [12]:
# logical_not treats 0 as False and non-zero as True, then inverts; the result holds booleans
example_labels_not = np.logical_not(example_labels)
example_labels_not_times_one = example_labels_not * 1 # Implicit cast to int
example_labels_not_exp_cast = example_labels_not.astype(int) # Explicit cast to int
print("example_labels, example_labels_not, example_labels_not_times_one, example_labels_not_exp_cast")
side_by_side(example_labels, example_labels_not, example_labels_not_times_one, example_labels_not_exp_cast)

example_labels, example_labels_not, example_labels_not_times_one, example_labels_not_exp_cast


Unnamed: 0,Outcome
0,1
1,0
2,1
3,0
4,1
5,0

Unnamed: 0,Outcome
0,False
1,True
2,False
3,True
4,False
5,True

Unnamed: 0,Outcome
0,0
1,1
2,0
3,1
4,0
5,1

Unnamed: 0,Outcome
0,0
1,1
2,0
3,1
4,0
5,1


### Element-wise functions

#### Example: Absolute
[https://numpy.org/doc/stable/reference/generated/numpy.absolute.html](https://numpy.org/doc/stable/reference/generated/numpy.absolute.html)

In [13]:
# Take the absolute value of every element of the (labels - 1) frame from the subtraction example
example_labels_sub_one_absolute = np.absolute(example_labels_sub_one)
# The caption lists, in order, the names of the three frames displayed below
print("example_labels, example_labels_sub_one, example_labels_sub_one_absolute")
side_by_side(example_labels, example_labels_sub_one, example_labels_sub_one_absolute)

example_labels, example_labels_sub_one, sub_one_example_labels_absolute


Unnamed: 0,Outcome
0,1
1,0
2,1
3,0
4,1
5,0

Unnamed: 0,Outcome
0,0
1,-1
2,0
3,-1
4,0
5,-1

Unnamed: 0,Outcome
0,0
1,1
2,0
3,1
4,0
5,1


#### Example: Square root
[https://numpy.org/doc/stable/reference/generated/numpy.sqrt.html](https://numpy.org/doc/stable/reference/generated/numpy.sqrt.html)

In [14]:
# np.sqrt is applied element-wise; the result is a DataFrame with the same rows and columns
example_features_sqrt = np.sqrt(example_features)
print("example_features, example_features_sqrt")
side_by_side(example_features, example_features_sqrt)

example_features, example_features_sqrt


Unnamed: 0,Age,BMI,Glucose
0,50,33.6,148
1,31,26.6,85
2,32,23.3,183
3,21,28.1,89
4,33,43.1,137
5,30,25.6,116

Unnamed: 0,Age,BMI,Glucose
0,7.071068,5.796551,12.165525
1,5.567764,5.157519,9.219544
2,5.656854,4.827007,13.527749
3,4.582576,5.300943,9.433981
4,5.744563,6.565059,11.7047
5,5.477226,5.059644,10.77033


### Filtering functions

#### Example: Where
[https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.where.html](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.where.html)

In [15]:
THRESHOLD=31
# where() keeps the rows whose condition is True and fills every other row with NaN,
# so the shape is unchanged. The comparison is strict: Age == THRESHOLD is filtered out.
example_features_filtered = example_features.where(example_features["Age"] > THRESHOLD)
# dropna() then removes the NaN rows, leaving only the rows that passed the filter
example_features_filtered_nonan = example_features_filtered.dropna()
print("example_features, example_features_filtered, example_features_filtered_nonan")
side_by_side(example_features, example_features_filtered, example_features_filtered_nonan)

example_features, example_features_filtered, example_features_filtered_nonan


Unnamed: 0,Age,BMI,Glucose
0,50,33.6,148
1,31,26.6,85
2,32,23.3,183
3,21,28.1,89
4,33,43.1,137
5,30,25.6,116

Unnamed: 0,Age,BMI,Glucose
0,50.0,33.6,148.0
1,,,
2,32.0,23.3,183.0
3,,,
4,33.0,43.1,137.0
5,,,

Unnamed: 0,Age,BMI,Glucose
0,50.0,33.6,148.0
2,32.0,23.3,183.0
4,33.0,43.1,137.0


### Joining matrices
[https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.join.html](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.join.html)

In [16]:
# join() appends the columns of example_labels to those of example_features, matching rows by index
combined = example_features.join(example_labels)
side_by_side(example_labels, example_features, combined)

Unnamed: 0,Outcome
0,1
1,0
2,1
3,0
4,1
5,0

Unnamed: 0,Age,BMI,Glucose
0,50,33.6,148
1,31,26.6,85
2,32,23.3,183
3,21,28.1,89
4,33,43.1,137
5,30,25.6,116

Unnamed: 0,Age,BMI,Glucose,Outcome
0,50,33.6,148,1
1,31,26.6,85,0
2,32,23.3,183,1
3,21,28.1,89,0
4,33,43.1,137,1
5,30,25.6,116,0
