In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import preprocessing
%matplotlib inline

# Capstone Project: Midvale Video Compression


In [None]:
# Begin with the measurements from one camera only.
A3 = pd.read_csv('data/A3.csv')
print(A3.head(n=5))

## Feature Engineering
- Resolution: take product of dimensions to get total number of pixels
- Status:
- Message:
- Test: Compression -> 1, anything else -> 0
- Detail: high -> 1, low -> 0
- Motion: high -> 1, low -> 0

In [None]:
# resolutions
def res2int(res):
    """Convert 'WIDTHxHEIGHT' resolution strings into total pixel counts.

    Parameters
    ----------
    res : sequence of str
        Resolution strings such as ``'640x480'``.

    Returns
    -------
    numpy.ndarray
        Float array of the same length as ``res`` holding ``width * height``.

    Raises
    ------
    ValueError
        If an entry has no ``'x'`` separator or a side is not numeric.

    Notes
    -----
    A new array is built and ``res`` itself is never written to, so the
    caller's data stays intact and the call can be repeated on the same
    input with the same result.
    """
    pixels = np.empty(len(res), dtype=float)
    for ii, entry in enumerate(res):
        ind = entry.index('x')  # str.index raises ValueError when 'x' is absent
        pixels[ii] = float(entry[:ind]) * float(entry[ind + 1:])
    return pixels
# Total pixel counts of the primary and secondary streams.
p_res, s_res = (
    res2int(A3[column].values)
    for column in ('PrimaryResolution', 'SecondaryResolution')
)

In [None]:
# Detail and Motion
def highlow2int(arr):
    """Encode a categorical level column as a binary flag.

    Parameters
    ----------
    arr : sequence
        Level labels; only the exact string ``'high'`` is treated as set.

    Returns
    -------
    numpy.ndarray
        Integer array of the same length: 1 where the entry equals
        ``'high'``, 0 for every other value.

    Notes
    -----
    The result is a new array and ``arr`` is never written to. Overwriting
    the input would make a second call compare integers against ``'high'``
    and return all zeros.
    """
    return np.array([int(level == 'high') for level in arr], dtype=int)
# Binary 'high' flags for the Motion and Detail columns.
motion, detail = (
    highlow2int(A3[column].values)
    for column in ('Motion', 'Detail')
)

# Test
def test2int(arr):
    for ii in range(len(arr)):
        arr[ii]=int(arr[ii]=='Compression')
    return arr
# Flag rows whose 'Test' entry is 'Compression' (1) versus anything else (0).
test=test2int(A3['Test'].values)

In [None]:
# Plots (features vs primary bitrate)
# Two stacked panels sharing the same y quantity: natural log of the primary bitrate.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(17, 8))

ax1.scatter(p_res, np.log(A3['PrimaryBitsPerSecond']))
ax1.set(xlabel='Primary Resolution', ylabel='log(Primary Bitrate)')

ax2.scatter(s_res, np.log(A3['PrimaryBitsPerSecond']))
ax2.set(xlabel='Secondary Resolution', ylabel='log(Primary Bitrate)')


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression

# NOTE(review): datadir is an absolute, user-specific path, so this cell only
# runs on a machine where that directory exists. Consider pointing it at a
# relative or configurable data directory.
datadir = '/home/curtd/var/data/'
datafile = 'A3.csv'
localdatadir = '~/var/data/'
localdatafile = 'TotalBytes.csv'

data = pd.read_csv(datadir + datafile)

# Output columns, one array per stream.
# `.values` replaces Series.as_matrix(), which was removed in pandas 1.0.
PrimaryBits = data['PrimaryBitsPerSecond'].values
SecondaryBits = data['SecondaryBitsPerSecond'].values
TertiaryBits = data['TertiaryBitsPerSecond'].values

# Features that are read as raw column values and encoded in a later cell.
StrFeatKeys = ['PrimaryResolution','SecondaryResolution','TertiaryResolution','Flicker','Nonlinear','Compression','Test','Detail','Motion']
StrFeats = {key: data[key].values for key in StrFeatKeys}

# Features that are taken as plain numeric arrays.
NumFeatKeys = ['Keyframe','ImageRate','Quality','KbpsLimit','WaitSeconds','CollectSeconds','TotalBytes']
NumFeats = {key: data[key].values for key in NumFeatKeys}

# Number of rows in the file (before any cleaning).
m = PrimaryBits.shape[0]
print('\n Number of samples:', m)

In [None]:
def res_to_number(r):
    """Return the pixel count for every 'WIDTHxHEIGHT' string in ``r``.

    ``r`` must expose ``.shape`` (e.g. a 1-D NumPy array of strings). The
    result is a float array of length ``r.shape[0]`` where each element is
    the product of the two integers on either side of the first ``'x'``.
    """
    pixel_counts = np.zeros(r.shape[0])
    for row in range(pixel_counts.size):
        text = r[row]
        sep = str.find(text, 'x')
        width, height = text[:sep], text[sep + 1:]
        pixel_counts[row] = int(width) * int(height)
    return pixel_counts

def categ_unordered_to_one_hot(r, vals):
    """One-hot encode ``r`` against the list of allowed levels ``vals``.

    Each entry of ``r`` is converted with ``str()`` and looked up in
    ``vals``; the matching column of that row is set to 1. Returns a float
    array of shape ``(len(r), len(vals))``. ``list.index`` raises
    ``ValueError`` for an entry that is not in ``vals``.
    """
    encoded = np.zeros((len(r), len(vals)))
    for row, raw in enumerate(r):
        column = vals.index(str(raw))
        encoded[row, column] = 1
    return encoded

def categ_ordered_to_num(r, vals):
    """Replace each entry of ``r`` by its position in the ordered list ``vals``.

    Entries are compared after ``str()`` conversion. Returns a float array
    of length ``len(r)``; ``list.index`` raises ``ValueError`` for an entry
    that is not in ``vals``.
    """
    positions = [vals.index(str(raw)) for raw in r]
    return np.array(positions, dtype=float)
def res_func(x):
    """Forward transform: base-10 logarithm of ``x``."""
    return np.log10(x)


def res_func_inv(x):
    """Inverse transform: 10 raised to the power ``x``."""
    return 10**x

# Resolutions: pixel count passed through res_func.
PrimRes_num = res_func(res_to_number(StrFeats['PrimaryResolution']))
SecRes_num = res_func(res_to_number(StrFeats['SecondaryResolution']))
#TerRes_num = res_func(res_to_number(StrFeats['TertiaryResolution']))

# Numeric columns are taken over unchanged.
Kbpslim_num, Waitsec_num, Collectsec_num, Totalbytes_num = (
    NumFeats[key]
    for key in ('KbpsLimit', 'WaitSeconds', 'CollectSeconds', 'TotalBytes')
)

# Categorical columns: one indicator column per listed level.
Flicker_num = categ_unordered_to_one_hot(StrFeats['Flicker'], ['50', '60'])
Nonlinear_num = categ_unordered_to_one_hot(StrFeats['Nonlinear'], ['0', '1'])
Compr_num = categ_unordered_to_one_hot(StrFeats['Compression'], ['Off', 'Low', 'Medium', 'High'])
Test_num = categ_unordered_to_one_hot(StrFeats['Test'], ['Base', 'Idle', 'Compression', 'HDR'])
Detail_num = categ_unordered_to_one_hot(StrFeats['Detail'], ['low', 'medium', 'high'])
Motion_num = categ_unordered_to_one_hot(StrFeats['Motion'], ['none', 'low', 'high'])

# Feature blocks in design-matrix column order (Totalbytes_num is not included).
AllCols = [
    PrimRes_num, SecRes_num, Flicker_num, Nonlinear_num, Compr_num,
    Kbpslim_num, Waitsec_num, Collectsec_num, Test_num, Detail_num, Motion_num,
]
# Turn every 1-D feature vector into a single-column matrix so all blocks are 2-D.
AllCols = [col[:, np.newaxis] if col.ndim == 1 else col for col in AllCols]

In [None]:
# Design matrix (feature blocks side by side) and log10-scaled target.
X = np.concatenate(AllCols, axis=1)
Y = np.log10(PrimaryBits)

## Clean the data
# Drop every sample that has at least one negative feature value.
Iremove = np.where((X < 0).any(axis=1))
X = np.delete(X, Iremove, axis=0)
Y = np.delete(Y, Iremove, axis=0)

m, n = X.shape

## Shuffle the data
np.random.seed(857493)
shuffle = np.random.permutation(np.arange(m))
X, Y = X[shuffle], Y[shuffle]

# Split points: first 60% train, next 20% dev, remainder test.
p = 0.6
pdev = 0.8
train_end, dev_end = int(m*p), int(m*pdev)

Xtrain, Ytrain = X[:train_end, :], Y[:train_end]
Xdev, Ydev = X[train_end:dev_end, :], Y[train_end:dev_end]
Xtest, Ytest = X[dev_end:, :], Y[dev_end:]

# Train and dev together, used to refit once hyper-parameters are chosen.
Xtrain_final, Ytrain_final = X[:dev_end, :], Y[:dev_end]

print('# total samples', m, 'with',n, 'features')
for split_name, split_X in (('train', Xtrain), ('dev', Xdev), ('test', Xtest)):
    print('# ' + split_name + ' samples',split_X.shape[0])

In [None]:
# Target against the first feature column, drawn in the leftmost slot of a 1x3 grid.
fig = plt.figure(figsize=(17, 5))
ax1 = fig.add_subplot(1, 3, 1)
ax1.plot(X[:, 0], Y, 'o')
ax1.set(xlabel='Feature 0', ylabel='Output')

plt.show()

In [1]:
from sklearn import preprocessing
# Relative tolerance used by PERCENT_HITS.
pdeviation = 0.3

# Scaler fitted on the training split only; applied to train and dev.
scaler = preprocessing.StandardScaler().fit(Xtrain)
Xtrain_scaled = scaler.transform(Xtrain)
Xdev_scaled = scaler.transform(Xdev)

# Second scaler fitted on train + dev; applied to that set and to the test split.
scaler_final = preprocessing.StandardScaler().fit(Xtrain_final)
Xtrain_final_scaled = scaler_final.transform(Xtrain_final)
Xtest_scaled = scaler_final.transform(Xtest)

# Ordinary least squares on the scaled train + dev data.
lr = LinearRegression().fit(Xtrain_final_scaled, Ytrain_final)

def RMSE(output, prediction):
    """Root-mean-square error after mapping both arguments through ``res_func_inv``."""
    residual = res_func_inv(output) - res_func_inv(prediction)
    return np.sqrt(np.mean(residual**2))
def NRMSE(output, prediction):
    """``RMSE`` divided by the range (max - min) of ``res_func_inv(output)``."""
    error = RMSE(output, prediction)
    actual = res_func_inv(output)
    spread = np.max(actual) - np.min(actual)
    return error / spread
def PERCENT_HITS(output,prediction):
    """Fraction of samples whose back-transformed prediction is close to the truth.

    A sample counts as a hit when the absolute difference between
    ``res_func_inv(output)`` and ``res_func_inv(prediction)`` is strictly
    below ``pdeviation`` times the absolute back-transformed ``output``.
    ``pdeviation`` is read from module scope at call time.
    """
    actual = res_func_inv(output)
    predicted = res_func_inv(prediction)
    is_hit = np.abs(actual - predicted) < pdeviation * abs(actual)
    return np.mean(is_hit)


# Predictions on the data the model was fitted on (train + dev) and on the test split.
predict_train = lr.predict(Xtrain_final_scaled)
prediction = lr.predict(Xtest_scaled)

# Both scores are normalised RMSE values (see NRMSE).
lr_train_rmse = NRMSE(Ytrain_final, predict_train)
lr_test_rmse = NRMSE(Ytest, prediction)

print('Train err equals',lr_train_rmse)
print('Test error equals: ', lr_test_rmse)

print('Percent of samples that deviate from the true predictions by less than ', pdeviation, ' (relatively)')
print(PERCENT_HITS(Ytest,prediction))

print(lr.coef_)



NameError: name 'Xtrain' is not defined

In [None]:
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso

# Candidate regularisation strengths; the one with the lowest dev-split NRMSE wins.
alphas = [1e-4,1e-3,1e-2,1e-1,1,1e1,1e2]
min_dev_rmse = 1e10
min_alpha = 0
for alpha in alphas:
    rm = Ridge(alpha=alpha)
    rm.fit(Xtrain_scaled,Ytrain)

    predict_ridge = rm.predict(Xdev_scaled)
    rm_dev_rmse = NRMSE(Ydev,predict_ridge)
    # Strict '<' keeps the earliest alpha in the list when scores tie.
    if rm_dev_rmse < min_dev_rmse:
        min_dev_rmse = rm_dev_rmse
        min_alpha = alpha

print('Best alpha ',min_alpha)
# Refit with the selected alpha on train + dev before scoring on the test split.
rm = Ridge(alpha=min_alpha)
rm.fit(Xtrain_final_scaled,Ytrain_final)

# This score is computed on the fitting data, so it is reported as the train error.
prediction = rm.predict(Xtrain_final_scaled)
rm_train_rmse = NRMSE(Ytrain_final, prediction)
print('Train error equals: ', rm_train_rmse)

prediction = rm.predict(Xtest_scaled)
rm_test_rmse = NRMSE(Ytest, prediction)
print('Test error equals: ', rm_test_rmse)


print('Percent of samples that deviate from the true predictions by less than ', pdeviation, ' (relatively)')
print(PERCENT_HITS(Ytest,prediction))


print(rm.coef_)

In [None]:
# Same alpha grid as for ridge; selection is again by dev-split NRMSE.
alphas = [1e-4,1e-3,1e-2,1e-1,1,1e1,1e2]
min_dev_rmse, min_alpha = 1e10, 0
for candidate in alphas:
    las = Lasso(alpha=candidate).fit(Xtrain_scaled, Ytrain)

    # Training-split score for this candidate (not used for the selection).
    prediction = las.predict(Xtrain_scaled)
    rm_train_rmse = NRMSE(Ytrain, prediction)

    # Dev-split score; a strictly lower value replaces the current best.
    predict_ridge = las.predict(Xdev_scaled)
    rm_dev_rmse = NRMSE(Ydev, predict_ridge)
    if rm_dev_rmse < min_dev_rmse:
        min_dev_rmse, min_alpha = rm_dev_rmse, candidate

print('Best alpha ',min_alpha)
# Refit with the selected alpha on train + dev.
las = Lasso(alpha=min_alpha).fit(Xtrain_final_scaled, Ytrain_final)

prediction = las.predict(Xtrain_final_scaled)
rm_train_rmse = NRMSE(Ytrain_final, prediction)
print('Train error equals: ', rm_train_rmse)

prediction = las.predict(Xtest_scaled)
rm_test_rmse = NRMSE(Ytest, prediction)
print('Test error equals: ', rm_test_rmse)

print('Percent of samples that deviate from the true predictions by less than ', pdeviation, ' (relatively)')
print(PERCENT_HITS(Ytest,prediction))

print(las.coef_)