# BC Data Workshop - Video Compression Project

In [None]:
# Directory and file name of the input CSV; the two are joined by plain
# string concatenation when the data is read, so datadir must end in '/'.
datadir = '/home/curtd/var/data/'
datafile = 'AllData.csv'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
# Let pandas display up to 50 columns before truncating the output.
pd.options.display.max_columns = 50
# Default font size for all matplotlib text.
plt.rcParams['font.size'] = 16

In [None]:
# Preprocessing data by Aaron
def checkColumn(df, colNum):
    """
    Tell whether the column at position `colNum` never changes.

    Every row after the first is compared with the first row's value.
    The result is truthy when all of them match (including when there are
    no later rows) and falsy otherwise. NaN never equals NaN, so a column
    of NaNs counts as changing.

    Used in removeUnchanging.
    """
    column = df.iloc[:, colNum]
    first_value = column.iloc[0]
    later_values = column.iloc[1:]
    return np.all(later_values == first_value)

def removeUnwantedCols(data):
    """
    Drop the columns that are not used in the analysis.

    A fixed list of columns is removed by name, then every remaining
    column whose value never changes is removed via removeUnchanging.

    Returns a new DataFrame; `data` itself is not modified.
    Raises KeyError if any of the named columns is missing from `data`.
    """
    drop_cols = ['Flicker','Index','Sharpening','TotalBytes','Status','Message',
                 'TertiaryBitsPerSecond','TertiaryResolution']
    # A single drop call avoids building one intermediate frame per column.
    data = data.drop(drop_cols, axis=1)
    return removeUnchanging(data)

def removeUnchanging(data):
    """
    Return `data` without the columns that checkColumn reports as unchanging.

    Columns are tested by position and then dropped by label; the input
    frame is not modified.
    """
    constant_positions = []
    for position in range(data.columns.size):
        if checkColumn(data, position):
            constant_positions.append(position)
    constant_labels = data.columns[constant_positions]
    return data.drop(constant_labels, axis=1)

def fixMiscValues(data):
    """
    Clean up placeholder values and make SecondaryBitsPerSecond numeric.

    - missing entries in TertiaryResolution (if that column is present)
      become the string 'NaN'
    - every cell equal to '-' becomes 0
    - SecondaryBitsPerSecond is cast to float64

    Returns a new DataFrame; the input frame is left as it was.
    """
    cleaned = data.fillna({'TertiaryResolution' : 'NaN'}).replace('-', value=0)
    secondary_as_float = cleaned['SecondaryBitsPerSecond'].astype(np.float64)
    return cleaned.assign(SecondaryBitsPerSecond=secondary_as_float)

def appendFeatureName(data):
    """
    Prefix each categorical value with its column name: '<column>_<value>'.

    Applies to Compression, Test, Detail, Motion, CameraName,
    PrimaryResolution and SecondaryResolution. Values in those columns must
    be strings; anything else (including NaN) raises TypeError from the
    string concatenation.

    Returns a new DataFrame with the same column order. `data` itself is
    left untouched, like the other preprocessing steps, so calling this
    twice on the same input cannot double-prefix the caller's frame.
    Raises KeyError if one of the listed columns is missing.
    """
    cols = ['Compression','Test','Detail','Motion','CameraName','PrimaryResolution','SecondaryResolution']
    # The prefix is bound as a default argument so each lambda keeps its own
    # column name regardless of when it is evaluated.
    prefixed = {c: data[c].apply(lambda x, prefix=c + '_': prefix + x) for c in cols}
    return data.assign(**prefixed)

def logTransformColumn(data, colname):
    """
    Tailor-made for the Midvale data.
    Adds a base-10 log version of a bit-rate column as 'log<colname>'.

    Zeros are replaced by NaN before taking the log, so they show up as
    NaN in the new column rather than -inf.

    Returns a new DataFrame with the extra column appended; `data` itself
    is not modified. Raises KeyError if `colname` is not a column.
    """
    # The log series keeps the full index of `data`, so assign() can attach
    # it row-for-row without realigning (which misbehaves on duplicate
    # index labels).
    nonzero = data[colname].replace(0., np.nan)
    logged = nonzero.apply(lambda x: np.log10(x))
    return data.assign(**{'log' + colname: logged})

def addLogCols(data):
    """
    Add log10 columns for both bit-rate columns.

    Runs logTransformColumn for PrimaryBitsPerSecond and then for
    SecondaryBitsPerSecond, and returns the resulting frame.
    """
    bitrate_columns = ('PrimaryBitsPerSecond', 'SecondaryBitsPerSecond')
    result = data
    for name in bitrate_columns:
        result = logTransformColumn(result, name)
    return result

def preProcess(data):
    """
    Run the full preprocessing pipeline on the raw frame.

    Steps, in order: drop unused columns, fix placeholder values, add the
    log bit-rate columns, prefix categorical values with their column name.
    Each step receives the previous step's output.
    """
    pipeline = (removeUnwantedCols, fixMiscValues, addLogCols, appendFeatureName)
    for step in pipeline:
        data = step(data)
    return data

# Load the raw CSV and run it straight through the preprocessing pipeline
data = preProcess(pd.read_csv(datadir + datafile))
# Last expression of the cell: shows the first rows of the processed frame
data.head()

In [None]:
# Column groups: numerical inputs, categorical inputs, and regression targets
numerical_names = ['Keyframe', 'ImageRate', 'Quality', 'KbpsLimit']
categ_names = ['PrimaryResolution', 'SecondaryResolution', 'Detail', 'Test', 
               'Motion','CameraName','Nonlinear','Mode','Compression']
response_names = ['logPrimaryBitsPerSecond', 'logSecondaryBitsPerSecond']

# filter(items=...) keeps only those listed columns that exist in data
numerical = data.filter(items=numerical_names)
categ = data.filter(items=categ_names)
responses = data.filter(items=response_names)

# Anything outside the three groups is reported as excluded; the raw
# bit-rate columns are left out of that report.
excluded_names = [
    col for col in data.columns
    if col not in (numerical_names + categ_names + response_names
                   + ['PrimaryBitsPerSecond', 'SecondaryBitsPerSecond'])
]
print('Included numerical features',numerical_names)
print('Included categorical features',categ_names)
print('Excluded features',excluded_names)

# Data processing by Aaron
def categDF(data):
    """
    Return only the object-dtype columns of `data` as a new DataFrame.
    """
    return data.select_dtypes(include=['object'])

def setUpCategs(data, sparse=False):
    """
    One-hot encode the object-dtype columns of `data`.

    Each column is first label-encoded to integers, then the integer table
    is one-hot encoded. The same LabelEncoder instance is refit for every
    column, so it only retains the last column's classes afterwards.

    Returns the encoder's sparse output when `sparse` is true, otherwise a
    dense array obtained with toarray().
    """
    from sklearn.preprocessing import LabelEncoder, OneHotEncoder
    label_encoder = LabelEncoder()
    one_hot_encoder = OneHotEncoder()
    integer_codes = categDF(data).apply(label_encoder.fit_transform)
    encoded = one_hot_encoder.fit_transform(integer_codes)
    return encoded if sparse else encoded.toarray()

def unencodeOneHotLabelling(ohEnc, oh, lbl):
    """
    Map one-hot rows back to labels.

    `lbl.inverse_transform` is applied to `oh.active_features_` to get the
    label for each one-hot position, and each row of `ohEnc` picks the label
    at the position of its largest entry (argmax over the last axis).

    NOTE(review): `active_features_` appears to have been deprecated and
    later removed from scikit-learn's OneHotEncoder - confirm against the
    installed version before relying on this helper.
    """
    position_labels = lbl.inverse_transform(oh.active_features_)
    hot_positions = np.argmax(ohEnc, axis=-1)
    return position_labels[hot_positions]

# Object-dtype subset of the categorical features (still as labels)
categ_data = categDF(categ)
# Dense one-hot encoding of that same subset
categoricals = setUpCategs(categ)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def getCategNames(df):
    """
    List the distinct values of every column of `df` in one flat array.

    Values are sorted within each column (np.unique) and the per-column
    groups are concatenated in column order.
    """
    per_column = []
    for col in df.columns:
        per_column.append(np.unique(df[col].values))
    return np.concatenate(per_column)

def unencodeOneHot(ohArr, df):
    """
    Turn a one-hot encoded array back into category labels.

    Returns a list with one entry per column of `df`; entry j holds, for
    every row of `ohArr`, the label that is switched on for column j.

    The number of columns is taken from `df` instead of being fixed at two,
    so frames with any number of categorical columns decode correctly. For
    a two-column `df` the result is the same as before.

    Assumes the columns of `ohArr` are ordered like getCategNames(df) and
    that every row has exactly one non-zero entry per column of `df` -
    TODO confirm against the encoder that produced `ohArr`.
    """
    categNames = getCategNames(df)
    ncols = df.shape[1]
    # np.where lists the hot positions row by row, so each row contributes
    # ncols consecutive entries, one per original column.
    hot_names = categNames[np.where(ohArr)[1]]
    return [hot_names[j::ncols] for j in range(ncols)]

# Labels for the columns of X: numerical names first, then one label per
# one-hot column. Note that categ_names is rebound here from the list of
# categorical column names to the array of category values.
categ_names = getCategNames(categ_data)
data_names = np.concatenate((numerical_names, categ_names))

# Design matrix: standardized numerical features next to the one-hot block.
# NOTE(review): the scaler is fit on every row before the train/test split.
scaler = StandardScaler()
X = np.concatenate((scaler.fit_transform(numerical.values), categoricals), axis=1)

# Only the first response column is modelled here
y = responses.values
y_pbps = y[:, 0]

# Keep the rows whose response is not NaN
nonnanrows = (~np.isnan(y_pbps)).tolist()
X_nonnan = X[nonnanrows]
y_nonnan = y_pbps[nonnanrows]

# NOTE(review): no random_state is passed, so the split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(X_nonnan, y_nonnan)
ntrain, nfeat = X_train.shape[0], X_train.shape[1]
ntest = X_test.shape[0]
print('# of training samples {}'.format(ntrain))
print('# of test samples {}'.format(ntest))
print('# of features {}'.format(nfeat))


In [None]:
# Output statistics for after regression
def res_func_inv(x):
    """Undo the log10 transform of the responses (returns 10**x)."""
    return 10**x

# Relative error tolerance used when counting "hits" in the summary
pdeviation = 0.2

def RMSE(output, prediction):
    """Root mean squared error, measured after undoing the log transform."""
    actual = res_func_inv(output)
    predicted = res_func_inv(prediction)
    squared_error = (actual - predicted)**2
    return np.sqrt(np.mean(squared_error))

def NRMSE(output, prediction):
    """RMSE divided by the range (max - min) of the back-transformed output."""
    error = RMSE(output, prediction)
    actual = res_func_inv(output)
    spread = np.max(actual) - np.min(actual)
    return error/spread

def percentHits(output,prediction,pdeviation):
    """
    Fraction (between 0 and 1) of predictions whose absolute error, after
    undoing the log transform, is below pdeviation times the true value.
    """
    actual = res_func_inv(output)
    abs_error = np.abs(actual - res_func_inv(prediction))
    within_tolerance = abs_error < pdeviation*abs(actual)
    return np.mean(within_tolerance)

def regressionSummary(method,Xtrain,Ytrain,Xtest,Ytest):
    """
    Print NRMSE, hit rate and R^2 of a fitted model on train and test data.

    `method` must provide predict(X) and score(X, Y). The hit rate uses the
    module-level tolerance `pdeviation`. All metrics are computed before
    anything is printed. Returns None.
    """
    def _evaluate(X, Y):
        # Same order as the report: NRMSE, hit rate, then R^2
        predicted = method.predict(X)
        return (NRMSE(Y, predicted),
                percentHits(Y, predicted, pdeviation),
                method.score(X, Y))

    sections = [('------Training data------', _evaluate(Xtrain, Ytrain)),
                ('------Test data----------', _evaluate(Xtest, Ytest))]

    print('Regression results for ',method.__class__)
    for header, (nrmse, hits, rsq) in sections:
        print(header)
        print('NRMSE = {}'.format(nrmse))
        print('Percentage of values with <',pdeviation*100,'percent rel err =',hits)
        print('R^2 = {}'.format(rsq))

In [None]:
# Plain linear regression
from sklearn.linear_model import LinearRegression
# Ordinary least squares baseline; fit() returns the fitted estimator
lr = LinearRegression().fit(X_train, y_train)
regressionSummary(lr, X_train, y_train, X_test, y_test)

In [None]:
# Plain elastic net
# Only run on smaller training sets (presumably to keep the runtime manageable)
if ntrain < 5000:
    from sklearn.linear_model import ElasticNetCV
    # Cross-validated elastic net over several L1/L2 mixing ratios, all cores
    en = ElasticNetCV(l1_ratio=[.1, .5, .7, .9, .95, .99, 1], n_jobs=-1).fit(X_train, y_train)
    regressionSummary(en, X_train, y_train, X_test, y_test)

In [None]:
# Only run on smaller training sets (presumably to keep the runtime manageable)
if ntrain < 5000:
    # Kernel ridge regression with an RBF kernel
    from sklearn.kernel_ridge import KernelRidge
    kr = KernelRidge(alpha=1, kernel='rbf', gamma=0.1).fit(X_train, y_train)
    regressionSummary(kr, X_train, y_train, X_test, y_test)

In [None]:
# Random Forest Regression 
from sklearn.ensemble import RandomForestRegressor
# Forest of 10 trees; fit() returns the fitted estimator
rf = RandomForestRegressor(n_estimators=10).fit(X_train, y_train)
regressionSummary(rf, X_train, y_train, X_test, y_test)

In [None]:
# Extra Trees Regression
from sklearn.ensemble import ExtraTreesRegressor
# Extra-trees ensemble with default settings; fit() returns the fitted estimator
et = ExtraTreesRegressor().fit(X_train, y_train)
regressionSummary(et, X_train, y_train, X_test, y_test)

In [None]:
# Gradient Boosting - takes a while
from sklearn.ensemble import GradientBoostingRegressor
# 100 boosting stages; fit() returns the fitted estimator
gb = GradientBoostingRegressor(n_estimators=100).fit(X_train, y_train)
regressionSummary(gb, X_train, y_train, X_test, y_test)

In [None]:
# Feature importances for Extra Trees Regressor
# Importance of each feature, plus its spread across the individual trees
importances = et.feature_importances_
std = np.std([tree.feature_importances_ for tree in et.estimators_],axis=0)
# Feature positions sorted from most to least important
indices = np.argsort(importances)[::-1]

# Display features up to the first one whose importance is below 5% of the
# top feature's. If none falls below that cutoff, display all of them
# rather than failing with an IndexError on the empty search result.
below_cutoff = np.where(importances[indices] < 0.05*importances[indices][0])[0]
dispindex = below_cutoff[0] if below_cutoff.size else importances.size
importances_trunc = importances[indices][:dispindex]
names_trunc = data_names[indices][:dispindex]
std_trunc = std[indices][:dispindex]

# Bar chart of the retained importances with the per-tree std as error bars
plt.figure()
plt.bar(range(dispindex),importances_trunc,color="r",yerr=std_trunc)
plt.xticks(range(dispindex),names_trunc,rotation='vertical')
plt.show()