# PS8

In [1]:
"""
Author: Chittaranjan
Email: chitt@umich.edu
"""

'\nAuthor: Chittaranjan\nEmail: chitt@umich.edu\n'

In [2]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GroupShuffleSplit
import tensorflow as tf
from tensorflow.keras import layers

## Q0 - Slurm

### Job Statistics Output

```
[chitt@gl-login2 logs]$ sacct --format="JobID,NCPUS,Elapsed,CPUTime,TotalCPU"
       JobID      NCPUS    Elapsed    CPUTime   TotalCPU
------------ ---------- ---------- ---------- ----------
29748671              5   00:09:22   00:46:50  37:10.743
```

### Execution Logs

```
[chitt@gl-login2 logs]$ cat superconduct-chitt-6-12-29748671.log
Mon Dec  6 00:27:02 EST 2021
[<Process name='Process-4' pid=88440 parent=88392 started>, <Process name='Process-1' pid=88436 parent=88392 started>, <Process name='Process-5' pid=88441 parent=88392 started>, <Process name='Process-2' pid=88438 parent=88392 started>, <Process name='Process-3' pid=88439 parent=88392 started>]
[('rf_fold1', 106.41390980365682), ('rf_fold4', 106.75332069887826), ('rf_fold2', 92.31671151882448), ('rf_fold0', 93.59101150017862), ('rf_fold3', 93.90551985631339), ('rf_fold5', 102.874885396315), ('rf_fold7', 91.82756354836837), ('rf_fold6', 99.54484369888097), ('rf_fold8', 105.07271273405858), ('rf_fold9', 109.70729285419108)]
100.20077716096657
Mon Dec  6 00:36:23 EST 2021
Done.
```

Note: Batch mode python script `superconductivity.py` submitted separately.

## Q1 - Tensorflow and Keras

#### Part (a)

In [3]:
def train_test_validation_split(data,
                                train_size,
                                test_size,
                                validation_size,
                                groups_col
                                ):
    """
    Split a dataset into training, testing, and validation sets

    Parameters
    ----------
    data : DataFrame
        The complete data which needs to be split.
    train_size : float
        Proportion of training data.
    test_size : float
        Proportion of testing data.
    validation_size : float
        Proportion of validation data.
    groups_col: str
        Name of column in data that contains group labels

    Returns
    -------
    tuple
        Three dataframes for train, test, validation as per proportions.

    """
    gss = GroupShuffleSplit(n_splits=1,
                            train_size=train_size,
                            random_state=12052021
                            )
    train_idx, test_and_valid_idx = next(gss.split(data,
                                                   groups=data[groups_col]
                                                   )
                                         )
    ts = (validation_size / (test_size + validation_size))
    gss = GroupShuffleSplit(n_splits=1,
                            train_size=ts,
                            random_state=12052021
                            )
    validation_idx, test_idx = next(
        gss.split(
            data.iloc[test_and_valid_idx],
            groups=data[groups_col].iloc[test_and_valid_idx]
        )
    )
    return (
        data.iloc[train_idx].reset_index(drop=True),
        data.iloc[test_idx].reset_index(drop=True),
        data.iloc[validation_idx].reset_index(drop=True)
    )

In [4]:
def split_x_y(data, y_col):
    """
    Splits a dataset in X and y as per regular definition of predictors 
    and response 

    Parameters
    ----------
    data : DataFrame
        The dataset which needs to be split into X and y.
    y_col : str
        Name of response column in `data`.

    Returns
    -------
    list
        X and y dataframes, split based on `y_col`.

    """
    return [data[y_col], data.drop(y_col, axis=1)][::-1]

In [5]:
def score(model, x, y):
    """
    Compute Mean Squared Error for a model 

    Parameters
    ----------
    model : object
        A model object which supports a predict method.
    x : DataFrame
        Predictor values for the model.
    y: DataFrame
        Response values for the model.

    Returns
    -------
    float
        MSE for the model based on x and y

    """
    return mean_squared_error(y, model.predict(x))

In [6]:
def train_neural_network(model, X, y, val_X=None, val_y=None):
    """
    Trains a neural network model by fitting the X and y

    Parameters
    ----------
    model : KerasModel
        Neural network model with any number of hidden layers
    X : DataFrame
        The predictor values to train with
    y : DataFrame
        The response values to train with
    val_X : DataFrame
        The predictor values to validate with
    val_y : DataFrame
        The response values to validate with
    Returns
    -------
    KerasModel, float
        Trained model, and its MSE value against the validation fold
    """
    model.compile(loss='mse',
                  optimizer='adam',
                  metrics=[
                       tf.keras.metrics.MeanSquaredError()
                  ]
                  )
    model.fit(X, y, epochs=15, batch_size=10, verbose=0)
    return model, (
        model.evaluate(
            val_X,
            val_y,
            verbose=0)[0] if val_X is not None and val_y is not None else 0
    )

In [7]:
def evaluate_cv(model, train_X, train_y, k=10):
    """
    Compute Cross Validated MSE for a neural network model

    Parameters
    ----------
    model : KerasModel
        Neural network model with any number of hidden layers
    train_X : DataFrame
        The predictor values to train with
    train_y : DataFrame
        The response values to train with
    k: int
        The number of folds with which to perform CV
    Returns
    -------
    float
        MSE value against the validation folds
    """
    gss = GroupShuffleSplit(n_splits=k)
    cv_mses = []
    for idx_train, idx_val in gss.split(train_X, groups=train_X["material"]):
        X = train_X.iloc[idx_train]
        y = train_y.iloc[idx_train]
        val_X = train_X.iloc[idx_val]
        val_y = train_y.iloc[idx_val]
        model, cv_mse = train_neural_network(model,
                                             X.drop("material", axis=1),
                                             y,
                                             val_X.drop("material", axis=1),
                                             val_y
                                             )
        cv_mses.append(cv_mse)
    kfold_mse = sum(cv_mses) / len(cv_mses)
    return kfold_mse

In [8]:
def evaluate(model, train_X, train_y, val_X, val_y):
    """
    Compute MSE for a neural network model against a validation set

    Parameters
    ----------
    model : KerasModel
        Neural network model with any number of hidden layers
    train_X : DataFrame
        The predictor values to train with
    train_y : DataFrame
        The response values to train with
    val_X : DataFrame
        The predictor values to validate against
    val_y : DataFrame
        The response values to validate against
    Returns
    -------
    float
        MSE value against the validation data
    """
    full_model, _ = train_neural_network(
        model,
        train_X.drop("material", axis=1),
        train_y
    )
    return full_model.evaluate(
        val_X.drop("material", axis=1),
        val_y, verbose=0
    )[0]

In [9]:
superconduct_df = pd.read_csv("./superconduct/train.csv")
superconduct_df["material"] = pd.read_csv(
    "./superconduct/unique_m.csv"
)["material"]
train, test, validation = train_test_validation_split(
    superconduct_df,
    0.8,
    0.1,
    0.1,
    "material"
)
train_X, train_y = split_x_y(train, "critical_temp")
test_X, test_y = split_x_y(test, "critical_temp")
validation_X, validation_y = split_x_y(validation, "critical_temp")

In [10]:
model1 = tf.keras.Sequential(
    [
        layers.Dense(65,
                     kernel_regularizer=tf.keras.regularizers.l2(0.001)
                     ),
        layers.Dense(40),
        layers.Dense(1),
    ]
)

model2 = tf.keras.Sequential([
    layers.Dense(128, activation='relu'),
    layers.Dropout(0.1),
    layers.Dense(1),
])

model3 = tf.keras.Sequential([
    layers.Dense(128, activation='relu', kernel_regularizer='l1'),
    layers.Dropout(0.3),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1),
])

model4 = tf.keras.Sequential([
    layers.Dense(64, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(1),
])

model5 = tf.keras.Sequential([
    layers.Dense(64, activation='relu'),
    layers.Dense(1),
])

2021-12-07 10:55:05.016761: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
nn_summary = pd.DataFrame({
    "MSE (Validation)": [
        evaluate(model1, train_X, train_y, validation_X, validation_y),
        evaluate(model2, train_X, train_y, validation_X, validation_y),
        evaluate(model3, train_X, train_y, validation_X, validation_y),
        evaluate(model4, train_X, train_y, validation_X, validation_y),
        evaluate(model5, train_X, train_y, validation_X, validation_y)
    ],
    "Cross Validated MSE": [
        evaluate_cv(model1, train_X, train_y),
        evaluate_cv(model2, train_X, train_y),
        evaluate_cv(model3, train_X, train_y),
        evaluate_cv(model4, train_X, train_y),
        evaluate_cv(model5, train_X, train_y)
    ]
}, index=["Model-1", "Model-2", "Model-3", "Model-4", "Model-5"])

2021-12-07 10:55:05.147531: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)


In [12]:
nn_summary.round(2)

Unnamed: 0,MSE (Validation),Cross Validated MSE
Model-1,645.68,376.89
Model-2,460.69,255.9
Model-3,918.58,456.28
Model-4,497.13,225.13
Model-5,509.64,255.2


Model-2, with three hidden layers of 64 nodes each performs best in terms of MSE

#### Part (b)

In [15]:
round(evaluate(model2, train_X, train_y, test_X, test_y), 2)

446.99

The best model has an MSE of `446.99` on the test data

## Q2 - PySpark and hdfs

Note: Script that computes these values is submitted separately as `q2.py`

In [14]:
pd.options.display.float_format = '{:.2f}'.format
pd.read_csv("./ps8_q2_chitt_results.csv")

Unnamed: 0,Shape,Total,Area Sum,Median,Mean
0,Triangle,8451,33961384.47,2653.58,4018.62
1,Rectangle,8431,17478227.44,1668.51,2073.09
