# Singular Value Decomposition

Example from Richard Smith's analysis of the BAA Marathon data

If $A$ is an $m\times n$ real matrix, the *singular value decomposition* is a factorization of the form:

$$
A = U\;S\;V'
$$

Where:

* $U$ is an $m\times m$ orthogonal matrix (i.e., $U'U = UU' = I$)
* $S$ is an $m\times n$ diagonal matrix
* $V$ is an $n\times n$ orthogonal matrix

In [None]:
# Import numpy, pandas, svd (singular value decomposition) and pinv (Moore-Penrose pseudoinverse)
import numpy as np
import pandas as pd
from numpy.linalg import svd, pinv

## Read the data from TIM1.txt

In [None]:
# Read the whitespace-delimited table into a DataFrame.
# sep=r'\s+' is the documented equivalent of delim_whitespace=True, which
# recent pandas releases deprecate.
fTim1 = pd.read_csv('../../rls_baa2013/TIM1.txt', sep=r'\s+')
# Last expression of the cell: displays the first rows.
fTim1.head()

## Create a dataframe with only the runners with NO missing split times

In [None]:
# Keep only the runners whose split columns 'K0-5' .. 'K40-Fin' are all present.
# .loc is used for the label slice because the .ix indexer was removed in
# pandas 1.0; label slices are inclusive of both end points in either case.
fTim1Data = fTim1[fTim1.loc[:, 'K0-5':'K40-Fin'].notnull().all(axis=1)]
print(fTim1Data.shape)
fTim1Data.head()

## Create a dataframe with just (complete) split times as columns

In [None]:
# Select just the split columns 'K0-5' .. 'K40-Fin' of the complete runners.
# .loc replaces the .ix indexer, which was removed in pandas 1.0.
frameFull = fTim1Data.loc[:, 'K0-5':'K40-Fin']
print(frameFull.shape)
frameFull.head()

## Compute the singular value decomposition of the complete split-time matrix

In [None]:
# Reduced ("thin") SVD of the complete split-time matrix.
# With full_matrices=False, numpy returns u with shape (m, k), s as a 1-D
# array of k singular values, and vt with shape (k, n), where k = min(m, n);
# this differs from the full m x m / m x n / n x n shapes of a full SVD.
u,s,vt = svd(frameFull.values, full_matrices=False)

## Determine the type of the u object

In [None]:
# u is the first of the three values returned by svd() above.
print(type(u))

## List the attributes of numpy.ndarray

In [None]:
# dir() lists the attribute and method names available on the object u.
print(dir(u))

## List the dimensions of u

In [None]:
# Bare last expression: the notebook displays the shape tuple of u.
u.shape

## Show the first few rows of u

In [None]:
# Rows 0 through 4 of u, all columns.
print(u[0:5,])

## Determine the type of the s object

In [None]:
# s is the second of the three values returned by svd() above.
print(type(s))

## List the dimensions of s

In [None]:
# Bare last expression: the notebook displays the shape tuple of s.
s.shape

## Show s

In [None]:
# Print the singular values returned by svd().
print(s)

## Determine the type of the vt object

In [None]:
# vt is the third of the three values returned by svd() above.
print(type(vt))

## List the dimensions of vt

In [None]:
# Bare last expression: the notebook displays the shape tuple of vt.
vt.shape

## Show vt

In [None]:
# Print the full vt array.
print(vt)

## Get the transpose of vt

In [None]:
# v is vt with its two axes swapped; print both shapes to confirm the swap.
v = np.transpose(vt)
print(vt.shape)
print(v.shape)

## Compute vt' times the diagonal matrix constructed from s

In [None]:
# Multiply the first 9 columns of v by the diagonal matrix built from the
# first 9 singular values, i.e. scale column j of v by s[j].
# NOTE(review): 9 is hard-coded; presumably the number of split columns
# 'K0-5' .. 'K40-Fin' -- confirm against frameFull.shape.
vs = v[:, :9].dot(np.diag(s[:9]))

## Make a dataframe from the result

In [None]:
# Wrap vs in a DataFrame whose row labels are the split column names of
# frameFull, so rows can later be looked up by split name.
vsdf = pd.DataFrame(data=vs, index=frameFull.columns)
print(vsdf.shape)
vsdf

## Create a dataframe with only runners WITH missing split times

In [None]:
# Keep only the runners with at least one missing value in the split columns
# 'K0-5' .. 'K40-Fin'.
# .loc replaces the .ix indexer, which was removed in pandas 1.0.
fTim1Test = fTim1[fTim1.loc[:, 'K0-5':'K40-Fin'].isnull().any(axis=1)]
print(fTim1Test.shape)
fTim1Test.head()

## Make a dataframe with only splits for runners with missing values

In [None]:
# Select just the split columns 'K0-5' .. 'K40-Fin' of the incomplete runners.
# .loc replaces the .ix indexer, which was removed in pandas 1.0.
frameTest = fTim1Test.loc[:, 'K0-5':'K40-Fin']
print(frameTest.shape)
frameTest.head()

## Create a copy of frameTest to hold the predicted values

In [None]:
# Independent copy of frameTest, so writing predictions into framePredict
# leaves frameTest (still holding the missing values) unchanged.
framePredict = frameTest.copy()

## Now fill in the missing splits

In [None]:
# Debug aid: print the intermediate quantities of the prediction for a small
# window of runners so each step of the fill-in computation can be inspected.
# Nothing is written to framePredict here.
ndisp = 4     # number of runners to print
skip = 100    # number of leading runners to pass over before printing

for i, (_ind, row) in enumerate(frameTest.iterrows()):
    if i < skip:
        continue                                 # not yet inside the window
    if i >= skip + ndisp:
        break                                    # window exhausted
    row_part = row.dropna()                      # observed (non-missing) splits only
    vs_part = vsdf.loc[row_part.index]           # rows of vsdf for the observed splits
    vs_part_pinv = pinv(vs_part)
    # Coefficients are computed inside this cell so that it does not depend on
    # an `x` left behind by a cell that runs later in the notebook.
    x = np.dot(vs_part_pinv, row_part)
    print("\n******************\nrow: {}\n\n".format(row))
    print("row_part: {}\n\n".format(row_part))
    print("row_part.index: {}\n\n".format(row_part.index))
    print("vsdf.loc[row_part.index]: {}\n\n".format(vs_part))
    print("pinv: {}\n\n".format(vs_part_pinv))
    print("np.dot(pinv(vsdf.loc[row_part.index]), row_part): {}\n\n".format(x))
    print("np.dot(vsdf, x): {},\n\n".format(np.dot(vsdf, x)))

In [None]:
# For every runner in frameTest, fit coefficients to the observed splits and
# overwrite that runner's whole row in framePredict with the reconstruction.
# .loc replaces the .ix indexer, which was removed in pandas 1.0.
for ind, row in frameTest.iterrows():
    row_part = row.dropna()                      # select only non-missing cells
    # Solve vsdf[observed rows] @ x ~= observed splits via the pseudoinverse.
    x = np.dot(pinv(vsdf.loc[row_part.index]), row_part)
    # Apply the coefficients to all rows of vsdf to get a value for every split.
    framePredict.loc[ind] = np.dot(vsdf, x)

## Show a few predicted values

In [None]:
# Print the dimensions, then display the first rows of the filled-in frame.
print(framePredict.shape)
framePredict.head()

## Create a column for predicted finish time

In [None]:
# Predicted finish time = row-wise sum of the split columns.
# The sum is restricted to 'K0-5' .. 'K40-Fin' so that re-running this cell
# does not add an already-present SVDTime column into the new total.
framePredict['SVDTime'] = framePredict.loc[:, 'K0-5':'K40-Fin'].sum(axis=1)
framePredict.head()

## Save the completed times to a .csv file

In [None]:
# Write framePredict (all of its columns, plus its index as the first CSV
# column, which is to_csv's default) to PredictSVD.csv in the working directory.
framePredict.to_csv('PredictSVD.csv')