In [1]:
from os.path import exists
import requests
from typing import *

import numpy as np
import pandas as pd
from tqdm.notebook import trange, tqdm

# Notebook 1: Linear Algebra and Linear Regression

## Problem 1

Determine whether the columns of an $(n \times m)$ matrix $X$ form a basis for $\mathbb{R}^n$, where $n \geq 1$ and $m \geq 1$.

Example 1:
$$
A = \begin{pmatrix}
0.5 & 0 \\
0 & -0.2 
\end{pmatrix}
$$

```
check_basis(A) = true
```

Example 2:
$$
B = \begin{pmatrix}
0.5 \\
0 
\end{pmatrix}
$$

```
check_basis(B) = false
```

Example 3:
$$
C = \begin{pmatrix}
0.5 & 0 \\
0.2 & -0.2 \\
0.3 & -0.4 
\end{pmatrix}
$$

```
check_basis(C) = false
```

Example 4:
$$
D = \begin{pmatrix}
0.5 & 0 & 1 \\
0.2 & -0.2 & 1 \\
\end{pmatrix}
$$

```
check_basis(D) = true
```

In [2]:
# Example matrices for trying out check_basis. The trailing comment on each
# line gives the value check_basis returns for it, followed by shape and rank.

A = np.array([[0.5, 0], [0, -0.2]]) # True: 2x2, rank 2
B = np.array([[0.5, 0]]) # False: 1x2, rank 1
C = np.array([[0.5, 0.2, 0.3], [0, -0.2, 0.4]]) # False: 2x3, rank 2
D = np.array([[0.5, 0.2], [0, -0.2], [0.3, -0.4]]) # True: 3x2, rank 2
E = np.array([[1, 1], [1, 0]]) # True: 2x2, rank 2
F = np.array([[1, 1], [2, 2], [3, 3]]) # False: 3x2, rank 1 (identical columns)
G = np.array([[0, 1], [1, 0]]) # True: 2x2, rank 2

# NOTE(review): B, C and D do not have the shapes of Examples 2-4 in the
# problem statement (B is 1x2 here but 2x1 there, C is 2x3 vs 3x2, D is 3x2
# vs 2x3) -- confirm which orientation is intended.

In [3]:
def check_basis(X: np.ndarray) -> bool:
    """Report whether the columns of ``X`` are linearly independent.

    The numerical rank of ``X`` is compared with its number of columns.

    Parameters
    ----------
    X : np.ndarray
        Two-dimensional matrix to inspect.

    Returns
    -------
    bool
        ``False`` when ``rank(X) < X.shape[1]``, ``True`` otherwise.

    Notes
    -----
    Only the column count enters the test; the number of rows is never
    looked at, so spanning of the row space dimension is not verified here.
    NOTE(review): confirm this matches the orientation used in the problem
    statement's examples.
    """
    rank = np.linalg.matrix_rank(X)
    n_columns = X.shape[1]

    # Full column rank <=> the rank is not below the number of columns.
    return not rank < n_columns

In [4]:
# Spot-check on D from the examples cell (3x2, rank 2).
check_basis(D)

True

## Problem 2

We saw that we could solve
$$
Ax = b
$$
where $A$ is a $n \times n$ matrix, $x$ is a $n \times 1$ vector, and $b$ is a $n \times 1$ vector.

### Problem 2a

Write a function that solves the matrix equation for $n \times n$ $X$
$$
AX = B
$$
where $A$ is a $n \times n$ matrix and $B$ is a $n \times n$ matrix.

In [5]:
# 2x2 test matrices for Problem 2a.
# NOTE: A and B are rebound here and replace the Problem 1 matrices of the
# same names.
A = np.array([[1, 2], [2, 3]])
B = np.array([[3, 5], [1, 4]])

# 3x3 test matrices, used for the A1 A2 X = B example in Problem 2b.
P = np.array([[1, 2, 1], [1, -3, 1], [1, -1, -1]])
Q = np.array([[5, 0, -2], [0, 1, 0], [-2, 0, 3]])

In [6]:
# method 1

def solve_matrix(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    """Solve ``A X = B`` for ``X`` in the least-squares sense.

    Parameters
    ----------
    A : np.ndarray
        Coefficient matrix of shape ``(m, n)``. It may be square (the
        Problem 2a case) or tall, as when it holds regression features.
    B : np.ndarray
        Right-hand side with ``m`` rows; either a vector of shape ``(m,)``
        or a matrix of shape ``(m, k)``.

    Returns
    -------
    np.ndarray
        The ``X`` minimising ``||A X - B||``. For an invertible square ``A``
        this is the exact solution of ``A X = B``.
    """
    # Pass A itself to lstsq instead of the normal equations
    # (A^T A) X = A^T B. Both describe the same minimiser, but forming
    # A^T A squares the condition number and loses accuracy (or the
    # numerical rank) on ill-conditioned inputs.
    solution, _residuals, _rank, _singular_values = np.linalg.lstsq(A, B, rcond=None)
    return solution

# method 2: not ideal

def solve_matrixinv(A: np.ndarray, B: np.ndarray) -> np.ndarray:
    """Solve ``A X = B`` through the explicitly inverted normal equations.

    Evaluates ``inv(A^T A) @ A^T @ B``. This is kept as the "not ideal"
    counterpart for comparison: it builds an explicit matrix inverse.

    Parameters
    ----------
    A : np.ndarray
        Coefficient matrix.
    B : np.ndarray
        Right-hand side.

    Returns
    -------
    np.ndarray
        The product ``inv(A^T A) @ A^T @ B``.
    """
    A_t = A.transpose()
    gram_inverse = np.linalg.inv(A_t @ A)
    return gram_inverse @ A_t @ B

# Run both methods on the 2x2 example so the two results can be compared.
solve_matrix(A, B), solve_matrixinv(A, B)

(array([[-7., -7.],
        [ 5.,  6.]]),
 array([[-7., -7.],
        [ 5.,  6.]]))

### Problem 2b

Write a function that solves the matrix equation for $n \times n$ $X$
$$
A_1 A_2 X = B
$$
where $A_1$, $A_2$, and $B$ are $n \times n$ matrices.

In [7]:
# method 1

def solve_matrix2(A1: np.ndarray, A2: np.ndarray, B: np.ndarray) -> np.ndarray:
    """Solve ``A1 A2 X = B`` for ``X``.

    The two coefficient matrices are multiplied together and the resulting
    single linear system is handed to ``np.linalg.solve``.

    Parameters
    ----------
    A1, A2 : np.ndarray
        Square coefficient matrices; their product must be invertible.
    B : np.ndarray
        Right-hand side.

    Returns
    -------
    np.ndarray
        The solution ``X``.
    """
    coefficients = A1 @ A2
    return np.linalg.solve(coefficients, B)

#method 2: not ideal

def solve_matrix2inv(A1: np.ndarray, A2: np.ndarray, B: np.ndarray) -> np.ndarray:
    """Solve ``A1 A2 X = B`` by explicitly inverting the product ``A1 A2``.

    Kept as the "not ideal" counterpart of the ``np.linalg.solve`` based
    version, for comparison.

    Parameters
    ----------
    A1, A2 : np.ndarray
        Square coefficient matrices; their product must be invertible.
    B : np.ndarray
        Right-hand side.

    Returns
    -------
    np.ndarray
        ``inv(A1 @ A2) @ B``.
    """
    product_inverse = np.linalg.inv(A1 @ A2)
    return product_inverse @ B

# Run both methods with A1 = A2 = P and B = Q so the results can be compared.
solve_matrix2(P, P, Q), solve_matrix2inv(P, P, Q)

(array([[ 1.50000000e+00,  1.70000000e-01, -7.10000000e-01],
        [ 2.53765263e-17,  6.00000000e-02,  2.20000000e-01],
        [-5.00000000e-01, -1.90000000e-01,  9.70000000e-01]]),
 array([[ 1.50000000e+00,  1.70000000e-01, -7.10000000e-01],
        [-2.77555756e-17,  6.00000000e-02,  2.20000000e-01],
        [-5.00000000e-01, -1.90000000e-01,  9.70000000e-01]]))

## Problem 3

Now we'll get to practice using linear regression on a dataset.

# Dataset

Site: https://archive.ics.uci.edu/ml/datasets/Physicochemical+Properties+of+Protein+Tertiary+Structure

### Attributes Information:

- RMSD - Size of the residue. 
- F1 - Total surface area. 
- F2 - Non polar exposed area. 
- F3 - Fractional area of exposed non polar residue. 
- F4 - Fractional area of exposed non polar part of residue. 
- F5 - Molecular mass weighted exposed area. 
- F6 - Average deviation from standard exposed area of residue. 
- F7 - Euclidean distance. 
- F8 - Secondary structure penalty. 
- F9 - Spatial distribution constraints (N,K Value).

In [8]:
# Download the CASP dataset once; later runs reuse the local copy.
if not exists('casp.csv'):

    url = "https://archive.ics.uci.edu/ml/machine-learning-databases/00265/CASP.csv"

    # The timeout makes a stalled connection fail instead of hanging the
    # notebook; the context manager releases the connection when done.
    with requests.get(url, stream=True, timeout=60) as response:
        # Stop on HTTP errors before anything is written, so an error page
        # is never saved as casp.csv and silently reused on the next run.
        response.raise_for_status()

        with open("casp.csv", "wb") as f:
            # iter_content() defaults to 1-byte chunks; request larger ones
            # so the file is not written one byte at a time.
            for data in tqdm(response.iter_content(chunk_size=64 * 1024)):
                f.write(data)

In [9]:
# Load the CSV written by the download cell and display it.
# `df` is also read as a global inside split_dataset.
df = pd.read_csv('casp.csv')
df

Unnamed: 0,RMSD,F1,F2,F3,F4,F5,F6,F7,F8,F9
0,17.284,13558.30,4305.35,0.31754,162.1730,1.872791e+06,215.3590,4287.87,102,27.0302
1,6.021,6191.96,1623.16,0.26213,53.3894,8.034467e+05,87.2024,3328.91,39,38.5468
2,9.275,7725.98,1726.28,0.22343,67.2887,1.075648e+06,81.7913,2981.04,29,38.8119
3,15.851,8424.58,2368.25,0.28111,67.8325,1.210472e+06,109.4390,3248.22,70,39.0651
4,7.962,7460.84,1736.94,0.23280,52.4123,1.021020e+06,94.5234,2814.42,41,39.9147
...,...,...,...,...,...,...,...,...,...,...
45725,3.762,8037.12,2777.68,0.34560,64.3390,1.105797e+06,112.7460,3384.21,84,36.8036
45726,6.521,7978.76,2508.57,0.31440,75.8654,1.116725e+06,102.2770,3974.52,54,36.0470
45727,10.356,7726.65,2489.58,0.32220,70.9903,1.076560e+06,103.6780,3290.46,46,37.4718
45728,9.791,8878.93,3055.78,0.34416,94.0314,1.242266e+06,115.1950,3421.79,41,35.6045


### Problem 3a

Write a function that splits a data matrix `X` into two halves according to `train`, which gives the percentage of the dataset to put in the first half. Each row of the data matrix corresponds to a point and each column corresponds to a feature.

1. A number of 50 means 50% in the first half and 50% in the second half.
2. A number of 20 means 20% in the first half and 80% in the second half.

In [10]:
def split_dataset(X: Union[np.ndarray, pd.DataFrame], train: int) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Split a dataset row-wise into a leading and a trailing part.

    Parameters
    ----------
    X : np.ndarray or pd.DataFrame
        Data matrix with one row per point and one column per feature.
        Its row count determines where the cut falls.
    train : int
        Percentage (0 to 100) of the rows that go into the first part.

    Returns
    -------
    (first, second) : tuple of pd.DataFrame
        The first ``int(train / 100 * n_rows)`` rows and the remaining rows.

    Raises
    ------
    ValueError
        If ``train`` is below 0 or above 100.
    """
    if train < 0 or train > 100:
        raise ValueError("Train should be between 0 and 100")

    train_size = int(train / 100 * X.shape[0])

    if isinstance(X, pd.DataFrame):
        # A DataFrame argument is split directly.
        source = X
    else:
        # FIXME(review): legacy path. The notebook calls this with
        # ``df.values`` but later indexes the result by column name, so a
        # plain array is answered with slices of the notebook-level ``df``
        # (as before) rather than of ``X``. Pass the DataFrame itself at the
        # call site and drop this fallback.
        source = df

    return source.iloc[:train_size, :], source.iloc[train_size:, :]

In [11]:
# 80% of the rows go to `train`, the rest to `test`.
# NOTE(review): `df.values` is passed in, yet the two parts that come back
# are DataFrames (later cells index them by column name).
train, test = split_dataset(df.values, 80)

# np.size counts elements; both parts have the same number of columns, so
# this ratio equals the fraction of rows assigned to the test part.
print("Fraction assigned to test set: " + str(np.size(test)/(np.size(train)+np.size(test))))

np.shape(train)

Fraction assigned to test set: 0.2


(36584, 10)

### Problem 3b

Write a function that takes a Pandas dataframe `df`, a list of features `feature_keys` which are columns in a Pandas dataframe, and a Pandas key `y_key` that is a column corresponding to the regression value, and produces the weights of the linear regression.
1. The constant weight should be the $0$-th index.
2. The weights should be given in the order of the features.

In [12]:
def fit_regression(df: pd.DataFrame, feature_keys: List[str], y_key: str,
                   fit_intercept: bool = False) -> np.ndarray:
    """Fit a linear regression by least squares and return its weights.

    Parameters
    ----------
    df : pd.DataFrame
        Table holding the feature columns and the target column.
    feature_keys : list of str
        Names of the feature columns, in the order the weights are returned.
    y_key : str
        Name of the column with the regression target.
    fit_intercept : bool, default False
        When True, a column of ones is prepended to the features so that a
        constant term is fitted; it is returned at index 0, followed by one
        weight per feature. The default keeps the previous behaviour (no
        constant term, one weight per feature).

    Returns
    -------
    np.ndarray
        Weights of length ``len(feature_keys)``, or ``len(feature_keys) + 1``
        with the constant first when ``fit_intercept`` is True.
    """
    # Work in floating point so integer-typed columns cannot overflow or
    # truncate inside the solver.
    features = df[feature_keys].to_numpy(dtype=float)
    y = df[y_key].to_numpy(dtype=float)

    if fit_intercept:
        # The weight belonging to the all-ones column is the constant term.
        ones = np.ones((features.shape[0], 1))
        features = np.hstack([ones, features])

    return solve_matrix(features, y)

# The nine feature columns F1..F9 are the inputs; RMSD is the target.
feature_columns = ['F1', 'F2', 'F3', 'F4', 'F5', 'F6', 'F7', 'F8', 'F9']
y_column = 'RMSD'
# Fit on the training part only. As called here the result has one weight
# per feature (nine values) and no constant term.
weights = fit_regression(train, feature_columns, y_column)

weights

array([ 7.82256287e-04,  3.11624224e-03,  8.26217493e-04, -9.36701930e-02,
       -1.16342246e-06, -2.40088008e-02, -9.17442805e-05,  1.59056106e-02,
        1.34188182e-01])

### Problem 3c

Write a prediction function that takes the weights of a linear regression, a data point, and produces a prediction.

In [13]:
def prediction(weights: np.ndarray, x: np.ndarray) -> Union[float, np.ndarray]:
    """Evaluate a linear regression on one data point or a batch of points.

    Parameters
    ----------
    weights : np.ndarray
        Regression weights. Either one weight per feature, or one constant
        term at index 0 followed by one weight per feature.
    x : np.ndarray
        A single point of shape ``(n_features,)``, or a batch of shape
        ``(n_features, n_points)`` with one point per column.

    Returns
    -------
    float or np.ndarray
        ``weights @ x`` when there is one weight per feature;
        ``weights[0] + weights[1:] @ x`` when ``weights`` has exactly one
        more entry than ``x`` has features. A batch yields one value per
        point.
    """
    has_constant_term = (
        np.ndim(weights) == 1
        and np.ndim(x) >= 1
        and len(weights) == np.shape(x)[0] + 1
    )
    if has_constant_term:
        # Index 0 is the constant; the remaining weights line up with x.
        return weights[0] + weights[1:] @ x

    return weights @ x

In [14]:
# prediction() multiplies weights @ x, so the test features are transposed to
# put the feature axis first: every column is then one data point and the
# result holds one prediction per test row.
predictions = prediction(weights, test[feature_columns].values.transpose())
predictions

array([1.41550863, 6.8691442 , 6.5959128 , ..., 8.86931689, 8.565424  ,
       7.2730752 ])

In [15]:
# Display the test part; its RMSD column holds the true target values.
test

Unnamed: 0,RMSD,F1,F2,F3,F4,F5,F6,F7,F8,F9
36584,2.096,16930.50,3898.60,0.23027,203.1190,2.390636e+06,264.4190,5602.09,98,23.3512
36585,1.257,5596.07,1246.98,0.22283,51.7444,8.104006e+05,70.9034,2840.34,59,40.3889
36586,7.008,5648.33,1763.51,0.31221,62.8576,7.395403e+05,87.5732,3270.77,41,38.6058
36587,10.930,8352.28,1664.81,0.19932,80.1154,1.153948e+06,118.5080,3847.18,41,35.7656
36588,2.634,10632.80,3645.47,0.34285,124.4450,1.437999e+06,190.1710,3740.54,33,33.2004
...,...,...,...,...,...,...,...,...,...,...
45725,3.762,8037.12,2777.68,0.34560,64.3390,1.105797e+06,112.7460,3384.21,84,36.8036
45726,6.521,7978.76,2508.57,0.31440,75.8654,1.116725e+06,102.2770,3974.52,54,36.0470
45727,10.356,7726.65,2489.58,0.32220,70.9903,1.076560e+06,103.6780,3290.46,46,37.4718
45728,9.791,8878.93,3055.78,0.34416,94.0314,1.242266e+06,115.1950,3421.79,41,35.6045


### Problem 3d

Write a function that computes the root mean square error (rmse) on a data matrix $X$ with rows $X_1, \dots X_n$, and corresponding values $y = (y_1 \dots y_n)$ given a prediction function $f$. The rmse is defined as
$$
\sqrt{\frac{1}{n}\sum_{i=1}^n \lVert f(X_i) - y_i \rVert^2}
$$

In [16]:
def rmse(predictions: Union[Callable[[np.ndarray], float], np.ndarray], X: np.ndarray, y: np.ndarray) -> float:
    """Root mean square error between predictions and target values.

    Parameters
    ----------
    predictions : callable or np.ndarray
        Either a prediction function ``f`` that maps one row of ``X`` to a
        scalar prediction, or an array of predictions that were already
        computed (one per entry of ``y``).
    X : np.ndarray
        Data matrix with one point per row. Only used when ``predictions``
        is a function.
    y : np.ndarray
        Target values ``y_1 .. y_n``.

    Returns
    -------
    float
        ``sqrt(mean((f(X_i) - y_i) ** 2))``.
    """
    if callable(predictions):
        # Apply the prediction function to every row of the data matrix.
        predicted = np.array([predictions(row) for row in X])
    else:
        # Predictions were supplied directly; X is not needed.
        predicted = predictions

    return np.sqrt(np.mean((predicted - y) ** 2))

In [17]:
# `predictions` is the array computed earlier from the test features. The
# mean of the true RMSD values is printed alongside to give the error a scale.
print ("RMSE: " + str(rmse(predictions, test[feature_columns].values, test[y_column].values)))
print("Average value of the RMSD: " + str(np.mean(test[y_column].values)))

RMSE: 5.255331673714851
Average value of the RMSD: 7.734758582987098


- We see here that the RMSE is large relative to the average of the RMSD (y) values.
- We see below that `np.linalg.lstsq()` and `np.linalg.solve()` (the latter applied to the normal equations) give almost, if not exactly, the same weight coefficients.

In [18]:
# Least-squares weights computed directly with lstsq.
# NOTE(review): this fit uses the test part, not the training part, so the
# values differ from the `weights` array obtained earlier.
np.linalg.lstsq(test[feature_columns], test[y_column], rcond=None)[0]

array([ 1.97300198e-03,  6.94203811e-04,  2.51239863e+01, -1.12558359e-01,
       -4.01309190e-06, -2.53056743e-02, -1.47530136e-04,  1.77020162e-02,
       -3.67679184e-02])

In [19]:
# The same fit (again on the test part) via the normal equations
# (X^T X) w = X^T y, solved with np.linalg.solve.
np.linalg.solve(test[feature_columns].transpose() @ test[feature_columns], test[feature_columns].transpose() @ test[y_column])

array([ 1.97300198e-03,  6.94203811e-04,  2.51239863e+01, -1.12558359e-01,
       -4.01309190e-06, -2.53056743e-02, -1.47530136e-04,  1.77020162e-02,
       -3.67679184e-02])