# BC Data Workshop - Video Compression Project

In [None]:
# Directory and file name of the CSV loaded with pd.read_csv further down.
# The two strings are joined with `+`, so datadir must keep its trailing '/'.
datadir = '/home/curtd/var/data/'
datafile = 'A3.csv'
# Second directory/file pair.
# NOTE(review): not referenced by any cell visible here - confirm it is still needed.
localdatadir = '~/var/data/'
localdatafile = 'TotalBytes.csv'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Let pandas display up to 50 columns of a frame.
pd.options.display.max_columns = 50
# Default font size for matplotlib figures.
plt.rcParams['font.size'] = 16

In [None]:
# Preprocessing data by Aaron
def checkColumn(df, colNum):
    """
    Report whether one column of ``df`` repeats a single value.

    The entry in the first row of the column at position ``colNum`` is
    compared with every later row of that column.  Because the test is a
    plain ``==``, a column made up only of NaN is reported as changing,
    and a frame with a single row is reported as constant.

    Used in removeUnchanging.
    """
    first_value = df.iloc[0, colNum]
    later_values = df.iloc[1:, colNum]
    return np.all(first_value == later_values)

# Preprocessing data by Aaron
def removeUnwantedCols(data):
    """
    Drop bookkeeping columns, keep the Mode 0 rows, and remove columns
    that do not vary.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame; must contain every column listed in ``drop_cols`` and
        a 'Mode' column.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` itself is not modified.

    Raises
    ------
    KeyError
        If one of the dropped columns or 'Mode' is missing, or if no row
        has Mode equal to 0.
    """
    drop_cols = ['Flicker', 'Index', 'Sharpening', 'TotalBytes', 'Status', 'Message']
    data = data.drop(drop_cols, axis=1)
    # get_group returns a new frame, so its result has to be bound;
    # evaluating it as a bare expression leaves every Mode in the data.
    data = data.groupby('Mode').get_group(0)
    data = removeUnchanging(data)
    return data

def removeUnchanging(data):
    """
    Return ``data`` without the columns that checkColumn reports as
    holding a single repeated value.  The input frame is left untouched.
    """
    constant_positions = []
    for position in range(data.columns.size):
        if checkColumn(data, position):
            constant_positions.append(position)
    # Positions are translated to labels because drop works on labels.
    constant_labels = data.columns[constant_positions]
    return data.drop(constant_labels, axis=1)

def fixMiscValues(data):
    """
    Tidy up placeholder values and return the cleaned copy.

    * missing 'TertiaryResolution' entries become the string 'NaN'
    * every cell equal to '-' (in any column) becomes 0
    * 'SecondaryBitsPerSecond' is cast to float64

    'TertiaryBitsPerSecond' is not cast here.
    """
    filled = data.fillna({'TertiaryResolution': 'NaN'})
    cleaned = filled.replace('-', value=0)
    secondary_rate = cleaned['SecondaryBitsPerSecond'].astype(np.float64)
    return cleaned.assign(SecondaryBitsPerSecond=secondary_rate)

def logTransformColumn(data, colname):
    """
    Tailor-made for the Midvale data.
    Add a base-10 log version of a bit-rate column.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the column to transform; it is not modified.
    colname : str
        Name of the column to transform.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with an extra column named ``'log' + colname``.
        Rows whose value is 0 (or already missing) hold NaN there.
    """
    rates = data[colname].astype(np.float64)
    # log10(0) is -inf, so zeros are blanked out before taking the log.
    # The result keeps the frame's own index row for row, which lets
    # assign() attach it positionally even when index labels repeat.
    logged = np.log10(rates.mask(rates == 0))
    return data.assign(**{'log' + colname: logged})

def addLogCols(data):
    """
    Run logTransformColumn for the primary and the secondary bit-rate
    column, in that order, and return the resulting frame.
    """
    bitrate_columns = ('PrimaryBitsPerSecond', 'SecondaryBitsPerSecond')
    augmented = data
    for name in bitrate_columns:
        augmented = logTransformColumn(augmented, name)
    return augmented

def preProcess(data):
    """
    Full preprocessing chain: removeUnwantedCols, then fixMiscValues,
    then addLogCols.  Returns the frame produced by the last step.
    """
    return addLogCols(fixMiscValues(removeUnwantedCols(data)))

# Read in data
# The path is built by plain string concatenation of datadir and datafile.
data = pd.read_csv(datadir + datafile)
# Replace the raw frame with its preprocessed version (see preProcess).
data = preProcess(data)
# Last expression of the cell: shows the first rows of the processed frame.
data.head()

In [None]:
# Data processing by Aaron
def setUpCategs(data, sparse=False):
    """
    One-hot encode the categorical part of ``data``.

    The object-dtype columns plus the 'Nonlinear' column are each
    label-encoded and the resulting codes are passed through a
    OneHotEncoder.

    With ``sparse=False`` (the default) the encoder output is converted
    with ``.toarray()``; otherwise it is returned as produced.
    """
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder

    label_encoder = LabelEncoder()
    one_hot_encoder = OneHotEncoder()

    object_columns = data.select_dtypes(include=['object'])
    categorical_frame = pd.concat((object_columns, data['Nonlinear']), axis=1)
    # The same LabelEncoder instance is re-fitted for every column.
    integer_codes = categorical_frame.apply(label_encoder.fit_transform)
    encoded = one_hot_encoder.fit_transform(integer_codes)
    if sparse:
        return encoded
    return encoded.toarray()
# Dense one-hot block (sparse defaults to False); stacked next to the scaled numerical columns with np.hstack in a later cell.
categoricals = setUpCategs(data)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Numerical predictors; they are standardised before being placed next to
# the one-hot block.  filter() silently skips names that are not present.
numerical_names = ['Keyframe', 'ImageRate', 'Quality', 'KbpsLimit', 'CollectSeconds']
numerical = data.filter(items=numerical_names)

# Log-transformed responses, in the order [primary, secondary].
response_names = ['logPrimaryBitsPerSecond', 'logSecondaryBitsPerSecond']
responses = data.filter(items=response_names)

# NOTE(review): the scaler is fitted on all rows, i.e. before the
# train/test split below.
scaler = StandardScaler()
X = np.hstack((scaler.fit_transform(numerical.values), categoricals))
y = responses.values
# "pbps" = primary bits per second.  The target is picked by name rather
# than by position: position 1 of `responses` is the *secondary* stream,
# and a by-name lookup fails loudly if the column is missing.
# NOTE(review): logTransformColumn leaves NaN where the bit-rate was 0 -
# confirm the target has no missing values before fitting.
y_pbps = responses['logPrimaryBitsPerSecond'].values

X_train, X_test, y_train, y_test = train_test_split(X, y_pbps)


In [None]:
# Output statistics for after regression
def res_func_inv(x):
    """Map a log10-scale value back to the original scale (10**x)."""
    return 10**x


# Relative tolerance used by percentHits (0.3 means 30 percent).
pdeviation = 0.3


def RMSE(output, prediction):
    """Root-mean-square error, computed after mapping both arguments
    through res_func_inv."""
    residual = res_func_inv(output) - res_func_inv(prediction)
    return np.sqrt(np.mean(residual**2))


def NRMSE(output, prediction):
    """RMSE divided by the range (max - min) of the back-transformed
    ``output``."""
    error = RMSE(output, prediction)
    actual = res_func_inv(output)
    return error/(np.max(actual)-np.min(actual))


def percentHits(output, prediction):
    """Fraction of entries whose back-transformed absolute error is
    strictly below ``pdeviation`` times the back-transformed ``output``."""
    actual = res_func_inv(output)
    miss = np.abs(actual - res_func_inv(prediction))
    tolerance = pdeviation*abs(actual)
    return np.mean(miss < tolerance)

def regressionSummary(method,Xtrain,Ytrain,Xtest,Ytest):
    """
    Print NRMSE, hit rate and R^2 of a fitted model for the training set
    and for the test set.

    ``method`` needs ``predict(X)`` and ``score(X, y)``.  All statistics
    are computed first; printing only starts once both sets have been
    evaluated.  Nothing is returned.
    """
    def evaluate(features, target):
        # Same order as the printed report: NRMSE, hit rate, R^2.
        predicted = method.predict(features)
        nrmse = NRMSE(target, predicted)
        hits = percentHits(target, predicted)
        rsq = method.score(features, target)
        return nrmse, hits, rsq

    train_stats = evaluate(Xtrain, Ytrain)
    test_stats = evaluate(Xtest, Ytest)

    report = (
        ('------Training data------', train_stats),
        ('------Test data----------', test_stats),
    )
    for banner, (nrmse, hits, rsq) in report:
        print(banner)
        print('NRMSE = {}'.format(nrmse))
        print('Percentage of values with <',pdeviation*100,'percent rel err =',hits)
        print('R^2 = {}'.format(rsq))

In [None]:
# Ordinary least-squares baseline
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X=X_train, y=y_train)

regressionSummary(method=lr, Xtrain=X_train, Ytrain=y_train,
                  Xtest=X_test, Ytest=y_test)

In [None]:
# Elastic net; the l1_ratio candidates below are handed to ElasticNetCV
from sklearn.linear_model import ElasticNetCV
en = ElasticNetCV(
    n_jobs=-1,
    l1_ratio=[.1, .5, .7, .9, .95, .99, 1],
)
en.fit(X=X_train, y=y_train)

regressionSummary(method=en, Xtrain=X_train, Ytrain=y_train,
                  Xtest=X_test, Ytest=y_test)

In [None]:
# Kernel ridge regression with an RBF kernel (fixed alpha and gamma)
from sklearn.kernel_ridge import KernelRidge
kr = KernelRidge(
    kernel='rbf',
    gamma=0.1,
    alpha=1,
)
kr.fit(X=X_train, y=y_train)
regressionSummary(method=kr, Xtrain=X_train, Ytrain=y_train,
                  Xtest=X_test, Ytest=y_test)