# Principle Component Analysis
Linear regression and initial building of machine learning model. For the purpose of testing, we have selected, 'vm': "Virtual Memory Statistics" as the variable we are trying to predict.

This notebook is based in part on the wonderful tutorial found [here](https://www.kaggle.com/miguelangelnieto/pca-and-regression) on Kaggle.

*Notes from the article:*
- Applying log transformation really increases the accuracy.
- Using PCA with 36 components makes the learning and testing much (much much) faster.
- There are outliers. Instead of removing them, using Huber seems to provide a good result. Huber is a model robust to outliers.

In [2]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import normalize
from sklearn import svm
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.metrics import make_scorer
from sklearn.metrics import r2_score
from sklearn.tree import DecisionTreeRegressor

import tensorflow as tf
import tflearn

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn

import warnings
from os import listdir
warnings.filterwarnings('ignore')

In [3]:
# collect file names for all jobs
train_files = [ './jobs/train/'+file for file in listdir('./jobs/train/') if file.startswith('comet') ]
test_files = [ './jobs/test/'+file for file in listdir('./jobs/test/') if file.startswith('comet')]

In [4]:
len(train_files), len(test_files)

(7425, 3192)

### Minor Data Cleanup

In [5]:
cols = {
    "amd64_pmc": "AMD Opteron performance counters (per core)",
        "intel_hsw": "Intel Haswell Processor (HSW) (per core)",
        "intel_hsw_ht": "Intel Haswell Processor - Hyper-threaded (per logical core)",
        "intel_nhm": "Intel Nehalem Processor (NHM) (per core)",
        "intel_uncore": "Westmere Uncore (WTM) (per socket)",
        "intel_snb": "Intel Sandy Brige (SNB) or Ivy Bridge (IVB) Processor (per core)",
        "intel_rapl": "Running average power limit",
        "intel_hsw_cbo": "Caching Agent (CBo) for SNB (HSW) (per socket)",
        "intel_hsw_pcu": "Power Control Unit for SNB (HSW) (per socket)",
        "intel_hsw_imc": "Integrated Memory Controller for SNB (HSW) (per socket)",
        "intel_hsw_qpi": "QPI Link Layer for SNB (HSW) (per socket)",
        "intel_hsw_hau": "Home Agent Unit for SNB (HSW) (per socket)",
        "intel_hsw_r2pci": "Ring to PCIe Agent for SNB (HSW) (per socket)",
        "ib": "Infiniband usage (default)",
        "ib_sw": "InfiniBand usage (sw)",
        "ib_ext": "Infiniband usage (ext)",
        "llite": "Lustre filesystem usage (per mount)",
        "lnet": "Lustre network usage (lnet)",
        "mdc": "Lustre network usage (mdc)",
        "mic": "MIC scheduler account (per hardware thread)",
        "osc": "Lustre filesystem usage (osc)",
        "block": "Block device statistics (per device)",
        "cpu": "Scheduler accounting (per CPU)",
        "mem": "Memory usage (per socket)",
        "net": "Network device usage (per device)",
        "nfs": "NFS system usage",
        "numa": "NUMA statistics (per socket)",
        "proc": "Process specific data (MaxRSS, executable name etc.)",
        "ps": "Process statistics",
        "sysv_shm": "SysV shared memory segment usage",
        "tmpfs": "Ram-backed filesystem usage (per mount)",
        "vfs": "Dentry/file/inode cache usage",
        "vm": "Virtual memory statistics"
       }

titles = [ value for value in cols.values() ]

In [6]:
# normalize all data values in DataFrame
def clean ( file ):
    df = pd.read_csv( file ).drop("Cycle", 1)
    
    for title in titles:
        try:
            df[[title]] = normalize( df[[title]], axis=0, norm='max', copy=False )
        except:
            next
        
    return df

In [7]:
focus = cols['vm']

# Begin Training

In [None]:
train = clean( train_files[0] )

In [None]:
data = train
ids = test[[focus]]
labels = data.index.values

In [None]:
#test = clean( test_files[0] )
#data = pd.concat([train,test],ignore_index=True).T
#ids = test[[cols['vm']]]
#labels = data.index.values

In [None]:
data.shape

## Feature reduction ##

Use PCA to reduce the number of components by identifying ones with strong relationships.

In [None]:
pca = PCA(n_components=data.shape[1], whiten=True )
pca_df = pd.DataFrame( pca.fit_transform( data ) )
variance = pd.DataFrame( pca.explained_variance_ratio_ )
np.cumsum( pca.explained_variance_ratio_ )

In [None]:
pca = PCA() # whiten=True
pca_data = pca.fit_transform( data )
pca_data = pd.DataFrame( pca_data )

In [None]:
pca_data

## Data Model Selection ##

Simple test to run multiple models against our data. First, with raw features. No PCA.

In [None]:
stats_dict = {
    "Linear": linear_model.LinearRegression(),
    "Ridge": linear_model.Ridge(),
    "Bayesian Ridge": linear_model.BayesianRidge(),
    "Huber": linear_model.HuberRegressor(),
    "Lasso": linear_model.Lasso(alpha=1e-4),
    "Bagging": BaggingRegressor(),
    "RandomForest": RandomForestRegressor(),
    "AdaBoost": AdaBoostRegressor(),
    "SVM RBF": svm.SVR()
    #"SVM Linear": svm.SVR(kernel="linear")        # too complex to calculate for this size df
}

In [None]:
def test_model ( df ):
    results = {}
    cv = KFold( n_splits=5, shuffle=True, random_state=45 )
    r2 = make_scorer( r2_score )
    
    for stat,func in stats_dict.items():
        results[ stat ] = [ cross_val_score( func, df, df[ focus ], cv=cv, scoring=r2 ).mean() ]
        
    res_df = pd.DataFrame( results ).T.rename( columns={ 0: "R Square Score" })
    res_df = res_df.sort_values( by=["R Square Score"], ascending=False )
    
    res_df.plot(kind="bar",title="Model Scores")
    axes = plt.gca()
    axes.set_ylim([ -1,1])
    
    return res_df

In [None]:
# Cut data at 2/3rds
total_val = data.shape[0]
cut = int(total_val * 2 / 3)

# Split training and test
train = data[ :cut ]
test = data[ cut: ]

test_model(data)

In [None]:
cv = KFold( n_splits=5, shuffle=True, random_state=45 )

parameters = {'alpha': [1000,100,10],
              'epsilon' : [1.2,1.25,1.50],
              'tol' : [1e-10]
             }

clf = linear_model.HuberRegressor()
r2 = make_scorer(r2_score)
grid_obj = GridSearchCV(clf, parameters, cv=cv,scoring=r2)
grid_fit = grid_obj.fit(train, labels)
best_clf = grid_fit.best_estimator_ 

best_clf.fit(train,labels)

## regression

In [None]:
positives = df[df['Decision'] >= 0]
negatives = df[df['Decision'] < 0]
 
plt.scatter(positives['x1'], positives['x2'], marker='+', s=500*abs(positives['Decision']), c='blue')
plt.scatter(negatives['x1'], negatives['x2'], marker='_', s=500*abs(negatives['Decision']), c='red')
plt.show()
This code block produces the following graph. As seen, true clas

tf.data
---------------------

In [17]:
df = clean( train_files[0] )

In [13]:
target = df.pop(focus)

In [14]:
dataset = tf.data.Dataset.from_tensor_slices((df.values, target.values))

In [15]:
tf.constant(df['thal'])

KeyError: 'thal'

In [None]:
train_dataset = dataset.shuffle(len(df)).batch(1)

In [None]:
def get_compiled_model():
    model = tf.keras.Sequential([
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(10, activation='relu'),
    tf.keras.layers.Dense(3, activation='sigmoid')
  ])

    model.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])
    return model

In [None]:
model = get_compiled_model()
model.fit(train_dataset, epochs=15)

In [None]:
inputs = {key: tf.keras.layers.Input(shape=(), name=key) for key in df.keys()}
x = tf.stack(list(inputs.values()), axis=-1)

x = tf.keras.layers.Dense(10, activation='relu')(x)
output = tf.keras.layers.Dense(3, activation='sigmoid')(x)

model_func = tf.keras.Model(inputs=inputs, outputs=output)

model_func.compile(optimizer='adam',
                   loss='binary_crossentropy',
                   metrics=['accuracy'])

In [None]:
dict_slices = tf.data.Dataset.from_tensor_slices((df.to_dict('list'), target.values)).batch(16)

In [None]:
for dict_slice in dict_slices.take(1):
    print (dict_slice)

In [None]:
model_func.fit(dict_slices, epochs=15)