In [None]:
import pandas as pd #Pandas handles tabular data
pd.set_option('display.float_format', lambda x: '%.3f' % x) # turn off scientific notation and too much decimal blah
import matplotlib.pyplot as plt # standard plotting library
import numpy as np #Numpy for linear algebra & co
import seaborn as sns # For pretty dataviz
sns.set_style("whitegrid") # Define style for dataviz
import pickle
from scipy.stats import truncnorm

import warnings
warnings.filterwarnings('ignore')



In [None]:
# Load the whitespace-delimited demonstrator dataset.
# `sep=r'\s+'` is the documented replacement for the deprecated
# `delim_whitespace=True` flag and parses the file the same way.
data = pd.read_csv('DemonstratorDataset.csv', sep=r'\s+')

# Show every column when a frame is displayed.
pd.set_option('display.max_columns', None)

## Statistical Analysis

In [None]:
# Display pandas' summary statistics for the loaded frame.
data.describe()

In [None]:
# One histogram per column of `data`, written to dataHist.jpg.
hist_axes = data.hist(color="firebrick", figsize=(20, 15))
plt.savefig('dataHist.jpg')

In [None]:
# Create the BlockID column: the part of edgeID before the first ';',
# with its first character stripped.
# This is done with vectorised string methods, so no temporary 'pos' column is
# needed (the old `data.drop('pos', 1)` positional-axis call fails on pandas >= 2.0).
# NOTE(review): for an edgeID without ';' the previous find()-based slice also
# dropped the last character; here only the first character is removed.
data["BlockID"] = data["edgeID"].str.split(";", n=1).str[0].str[1:]

# Rescale both length columns by a factor of 10.
# NOTE: re-running this cell rescales them again; reload the data first.
data["WetLength"] = data["WetLength"] * 10
data["DryLength"] = data["DryLength"] * 10

print(data)

## Correlation analysis

In [None]:
# Numeric columns included in the correlation analysis.
corr_columns = [
    "Offset", "PrintHeight", "WetArea", "WetPerim",
    "DeltaHeight", "DeltaArea", "PlateAngY", "PlateAngX",
    "Humidity", "Temperature", "WetLength", "WetAngStart",
]
data_num = data[corr_columns]

In [None]:
# Annotated heatmap of the pairwise correlations between the selected columns,
# with the colour scale pinned to [-1, 1].
plt.figure(figsize=(12, 6))
corr_matrix = data_num.corr()
corr = sns.heatmap(corr_matrix, annot=True, cmap="rocket", vmin=-1, vmax=1)
corr.set_title('Numerical Value Correlation Heatmap')
plt.savefig('correlation_matrix.jpg')

## Delta Height as variable

In [None]:
# Palette reference:
# https://medium.com/@morganjonesartist/color-guide-to-seaborn-palettes-da849406d44f
sns.set_style("whitegrid")
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
from matplotlib.offsetbox import OffsetImage,AnnotationBbox

# Marker edge colour for each "FemMal?" code (0 / 1 / 2).
di = dict(zip((0, 1, 2), ('w', 'r', 'b')))

# Legend patches for the joint type, one per edge colour.
legend_elements = [
    Patch(facecolor='w', edgecolor=edge, label=label)
    for edge, label in (('w', 'None'), ('r', 'Male'), ('b', 'Female'))
]

#Add block thumbnail under xtick
#https://stackoverflow.com/questions/44246650/add-image-annotations-to-bar-plots

def get_thumb(name):
    """Read and return the image stored at ./thumb/<name>.png."""
    return plt.imread("./thumb/{}.png".format(name))


def offset_image(coord, name, ax):
    """Attach the thumbnail called `name` to `ax` at data position (coord, 20).

    The image is drawn at half size and shifted 50 points downwards from its
    anchor, without a frame.
    """
    thumb = OffsetImage(get_thumb(name), zoom=0.5)
    thumb.image.axes = ax

    box = AnnotationBbox(
        thumb,
        (coord, 20),
        xybox=(0., -50),
        xycoords='data',
        boxcoords="offset points",
        frameon=False,
        pad=0,
    )
    ax.add_artist(box)

In [None]:
# Scatter of DeltaHeight per block: hue = EdgesNum, marker style = Cross?,
# marker edge colour = joint type (FemMal? mapped through `di`).
f = sns.relplot(x="BlockID", y="DeltaHeight", hue = "EdgesNum", style = "Cross?", edgecolor = (data["FemMal?"].map(di)), palette = "Dark2", data = data, height = 12, s=150, linewidth = 1)
f.set_ylabels("Delta Height (mm)", clear_inner=False)
f.set_xlabels(" ", clear_inner=False)
# Move the seaborn-generated legend (accessed through the private `_legend` attribute).
leg = f._legend
leg.set_bbox_to_anchor([1, 0.6])

# Block ids in order of first appearance; their position in this list is used
# as the x coordinate of the thumbnail.
blocks = list(data.BlockID.unique())

for axes in f.axes.flat:
    axes.set_xticklabels(axes.get_xticklabels(), rotation=0)
    axes.set_title("Height Shrinkage")
    axes.set_yticks(np.arange(20,45,1))
    axes.grid(True, axis = "both")
    # Draw one thumbnail per block.
    for i, c in enumerate(blocks):
        offset_image(i, c, axes)
    # Second legend explaining the edge colours (joint type).
    axes.legend(handles=legend_elements,frameon=False, loc='best', bbox_to_anchor=(0.6, -0.035, 0.5, 0.5), title = "Joint")

f.savefig("Height Shrinkage.jpg")

In [None]:
# DeltaHeight against the first three geometric features, coloured by joint type.
pp = sns.pairplot(
    data=data,
    x_vars=["PrintHeight", "Offset", "WetLength"],
    y_vars=["DeltaHeight"],
    hue="FemMal?",
    height=5,
)
pp.fig.suptitle("Geometric feature correlation")
pp.savefig("Corr.jpg")

In [None]:
# DeltaHeight against the wet-area features, coloured by joint type.
ppp = sns.pairplot(
    data=data,
    x_vars=["WetArea", "WetPerim", "WetAngStart"],
    y_vars=["DeltaHeight"],
    hue="FemMal?",
    height=5,
)
ppp.fig.suptitle("Geometric feature correlation")
ppp.savefig("Corr2.jpg")

## CLUSTER DATA FOR TRAIN TEST SPLIT

In [None]:
# List the available column names before selecting the PCA inputs.
print(data.columns)

In [None]:
# Numerical features that feed the PCA.
pca_input_cols = [
    'Offset', 'EdgesNum', 'PrintHeight',
    'WetLength', 'WetAngStart', 'WetAngEnd', 'WetArea', 'WetPerim',
    'PlateAngY', 'PlateAngX', 'Humidity', 'Temperature',
]
data_num = data[pca_input_cols]
print(data_num.shape)

# Standardise the features before PCA.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
data_num_scaled = scaler.fit_transform(data_num)

# Fit a full PCA (all components kept) and project the scaled data.
from sklearn.decomposition import PCA
model_pca = PCA()
model_pca.fit(data_num_scaled)
data_num_reduced = model_pca.transform(data_num_scaled)

# Component scores plus the categorical columns and the target, for plotting.
data_cat = data[['FemMal?', 'Cross?', 'BlockID']]
data_num_reduced_df = pd.DataFrame(data_num_reduced).join(data_cat)
data_num_reduced_df["DeltaHeight"] = data["DeltaHeight"]

# Bar chart of the explained-variance ratio of each component.
features = range(model_pca.n_components_)
a = plt.bar(features, model_pca.explained_variance_ratio_, color='black')
plt.xticks(features)
plt.xlabel('PCA features')
plt.ylabel('variance %')
plt.savefig("PCAfeatures.jpg")

In [None]:
def myplot_df(df,coeff, scale, labels = None):
    """Draw a PCA biplot (scores plus loading arrows) and save it to PCA_plot.jpg.

    Parameters
    ----------
    df : DataFrame whose first two columns hold the scores to plot and which
        also contains the "BlockID", "Cross?" and "FemMal?" columns.
    coeff : 2-D array, one row per original feature; columns 0 and 1 are the
        arrow end points.
    scale : if truthy, each score axis is divided by its own range (max - min).
    labels : optional sequence of arrow labels; when None the names are taken
        from the notebook-level `data_num.columns`.

    The caller's frame is left untouched: the helper columns "xs"/"ys" are
    added to a private copy. Edge colours come from the "FemMal?" column of
    `df` itself (mapped through the notebook-level `di`), not from the global
    `data` frame.
    """
    plot_df = df.copy()
    plot_df["xs"] = plot_df[plot_df.columns[0]]
    plot_df["ys"] = plot_df[plot_df.columns[1]]

    n = coeff.shape[0]

    if scale:
        scalex = 1.0 / (plot_df["xs"].max() - plot_df["xs"].min())
        scaley = 1.0 / (plot_df["ys"].max() - plot_df["ys"].min())

        plot_df["xs"] = scalex * plot_df["xs"]
        plot_df["ys"] = scaley * plot_df["ys"]

    sns.set_style("whitegrid")
    sns.set(rc={'axes.facecolor':'#f0f0f0', 'figure.facecolor':'#f0f0f0'})

    g = sns.scatterplot(x='xs', y='ys', data=plot_df, s=100,
                        hue="BlockID", style="Cross?",
                        edgecolor=plot_df["FemMal?"].map(di),
                        linewidth=2, alpha=0.9, palette="magma")
    g.legend(title = "BlockID", loc='center left', bbox_to_anchor=(1, 0.6), ncol=1, frameon = False)

    # One arrow and one text label per original feature.
    for i in range(n):
        plt.arrow(0, 0, coeff[i,0], coeff[i,1],color = 'r',alpha = 0.5)
        # Default labels rely on the notebook-level `data_num` column order.
        text = data_num.columns[i] if labels is None else labels[i]
        plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, text, color = 'g', ha = 'center', va = 'center')

    plt.savefig("PCA_plot.jpg")

In [None]:
from matplotlib.patches import Patch
from matplotlib.lines import Line2D
from matplotlib.offsetbox import OffsetImage,AnnotationBbox

#dic for color mapping
# (same mapping and legend patches as defined in the Delta Height section)
di = {0 : 'w', 1: 'r', 2: 'b'}

legend_elements = [Patch(facecolor='w', edgecolor='w', label='None'),
                   Patch(facecolor='w', edgecolor='r', label='Male'),
                   Patch(facecolor='w', edgecolor='b', label='Female')]


# Configure the current pyplot figure/axes first; myplot_df then draws into them.
plt.xlim(-1,1)
plt.ylim(-1,1)
plt.xlabel("PC{}".format(1))
plt.ylabel("PC{}".format(2))
fig = plt.gcf()
fig.set_size_inches(12, 10)
fig.legend(handles=legend_elements,frameon=False, bbox_to_anchor=(0.51, -0.21, 0.5, 0.5), title = "Joint")
# Loadings of the first two components, transposed to one row per feature.
myplot_df(data_num_reduced_df, np.transpose(model_pca.components_[0:2, :]), True)

In [None]:
#helper function
from sklearn.cluster import KMeans

def findClustNumb(rng, data, x, random_state=42):
    """Plot the k-means inertia for each candidate cluster count (elbow plot).

    Parameters
    ----------
    rng : iterable of cluster counts to try, e.g. ``range(1, 10)``.
    data : DataFrame of reduced components; only the first `x` columns are used.
    x : number of leading columns/components to cluster on.
    random_state : seed passed to every KMeans fit so the curve is reproducible
        across runs (the fits were previously unseeded).

    The figure is saved to inertiaElbow.jpg and shown.
    """
    ks = rng
    samples = data.iloc[:, :x]

    # Fit one model per k and record its inertia.
    inertias = [
        KMeans(n_clusters=k, random_state=random_state).fit(samples).inertia_
        for k in ks
    ]

    plt.figure(figsize = (6, 6))
    plt.plot(ks, inertias, '-o', color='black')
    plt.xlabel('number of clusters, k')
    plt.ylabel('inertia')
    plt.xticks(ks)
    plt.savefig("inertiaElbow.jpg")
    plt.show()


def clusterPlot(n, reduced_data, x, i, show):
    """Cluster the first `x` columns of `reduced_data` with k-means.

    Parameters
    ----------
    n : number of k-means clusters.
    reduced_data : DataFrame of reduced components. A "cluster" column holding
        the assigned label is added to it IN PLACE.
    x : number of leading columns/components to cluster on.
    i : tag inserted into the plot title.
    show : if truthy, scatter columns 0 and 1 coloured by cluster, mark the
        centroids and save the figure to kmeansPCA.jpg.

    Returns
    -------
    (fitted KMeans model, reduced_data with the added "cluster" column)
    """
    features = reduced_data.iloc[:, :x]

    kmeans = KMeans(n_clusters=n, random_state=42)
    kmeans.fit(features)
    reduced_data["cluster"] = kmeans.predict(features)

    centroids = kmeans.cluster_centers_

    # Applied whether or not a plot is drawn.
    sns.set(rc={'axes.facecolor':'#ffffff', 'figure.facecolor':'#ffffff'})

    if show:
        # `palette` is the seaborn argument that controls hue colours; the
        # previously passed `cmap` is ignored when `hue` is given.
        g = sns.scatterplot(x=0, y=1, data=reduced_data, hue="cluster",
                            palette="BrBG", s=100, linewidth=1)
        sns.scatterplot(x=centroids[:, 0], y=centroids[:, 1], marker="X", s=200,
                        color="r", zorder=10).set(title="K-means clustering on %s data" % i)
        g.legend(title = "cluster", loc='right', bbox_to_anchor=(1.25, 0.5), ncol=1)
        fig = plt.gcf()
        fig.set_size_inches(10, 10)
        fig.savefig("kmeansPCA.jpg")

    return kmeans, reduced_data

In [None]:
# Elbow plot for k = 1..9 on the first six principal components.
findClustNumb(rng=range(1, 10), data=data_num_reduced_df, x=6)

In [None]:
# Cluster into 8 groups on the first six principal components and plot them.
kmeans_pca, clustered_PCAdata = clusterPlot(
    n=8, reduced_data=data_num_reduced_df, x=6, i="PCA-reduced", show=True
)

In [None]:
# Copy the k-means labels into both the numeric frame and the full frame.
cluster_labels = clustered_PCAdata["cluster"]
data_num["clusterPCA"] = cluster_labels
data["clusterPCA"] = cluster_labels

# Number of samples per cluster.
a = sns.catplot(data=data_num, x="clusterPCA", kind="count", palette="rocket")

Visualization of the cluster composition

In [None]:
# Joint-type counts, one panel per cluster.
b = sns.catplot(data=data, kind="count", x="FemMal?", hue="FemMal?",
                col="clusterPCA", height=3)

In [None]:
# "Cross?" counts, one panel per cluster.
b = sns.catplot(data=data, kind="count", x="Cross?", hue="Cross?",
                col="clusterPCA", height=3)

In [None]:
# WetLength distribution per cluster, stacked by block.
b = sns.displot(data=data, x="WetLength", hue="BlockID",
                col="clusterPCA", multiple="stack")

## Prepare data for augmentation

In [None]:
# ---------- Full-feature dataset ----------
# Numeric features (incl. clusterPCA) joined with the categorical columns.
data_p = data_num.join(data_cat)

# One-hot encode the joint type, dropping the first level; the two dummy
# columns end up last and are renamed positionally.
dummies = pd.get_dummies(data=data_p, columns=["FemMal?"], drop_first=True)
colnames = list(dummies.columns)[:-2] + ["maleJ", "femaleJ"]
dummies.columns = colnames
dummies["DeltaHeight"] = data["DeltaHeight"]

# The feature columns of `dummies` still need standard scaling.
print(dummies.shape)


# ---------- PCA-reduced dataset ----------
# Keep the first six columns (component scores) and re-attach the target.
data_num_reduced_df = data_num_reduced_df.iloc[:, :6]
data_num_reduced_df["DeltaHeight"] = data["DeltaHeight"]
print(data_num_reduced_df.shape)

In [None]:
# ---------- Train / test split ----------
# Samples per cluster: one eighth of the smallest cluster, but at least 2.
smallest_cluster = min(list(data.groupby("clusterPCA").size()))
spg = max(2, int(smallest_cluster / 8))

# Draw `spg` rows from every cluster; the fixed random_state keeps the split reproducible.
test_rows = dummies.groupby("clusterPCA").sample(n=spg, random_state=40)
data_test_idx = test_rows.index
print(" %s test samples" % len(data_test_idx), data_test_idx)

In [None]:
# Display the full frame, which now carries the clusterPCA column.
data

In [None]:
# Visualize the train/test split on a freshly loaded copy of the raw data.
# `sep=r'\s+'` replaces the deprecated `delim_whitespace=True`.
data_v = pd.read_csv('DemonstratorDataset.csv', sep=r'\s+')

# BlockID = text before the first ';' of edgeID, minus its first character.
# Built with vectorised string methods, so no temporary 'pos' column is needed
# (the positional `drop('pos', 1)` call fails on pandas >= 2.0).
data_v["BlockID"] = data_v["edgeID"].str.split(";", n=1).str[0].str[1:]
data_v["WetLength"] = data_v["WetLength"] * 10
data_v["DryLength"] = data_v["DryLength"] * 10

# data_test_idx holds row labels of a default integer index, so it is used
# both positionally (iloc) and by label (drop).
d_test = data_v.iloc[data_test_idx]
d_train = data_v.drop(data_test_idx)

fig, axes = plt.subplots(2, 2, figsize=(15, 10))

# Left column: training set, right column: test set.
h = sns.histplot(ax = axes[0,0], data=d_train, x="DeltaHeight", hue="FemMal?", multiple="stack")
g = sns.histplot(ax = axes[1,0], data=d_train, x="DeltaHeight", hue="Cross?", multiple="stack")
j = sns.histplot(ax = axes[0,1], data=d_test, x="DeltaHeight", hue="FemMal?", multiple="stack")
k = sns.histplot(ax = axes[1,1], data=d_test, x="DeltaHeight", hue="Cross?", multiple="stack")
fig.savefig("trainTest.jpg")

In [None]:
# Augmentation instructions, one entry per noisy column:
#   column -> [mean, sd, low, upp]
# These four values are passed in that order to createNoise.
instructionsDic = {
    "Offset":      [0, 1, -2, 2],
    "PrintHeight": [0, 1, -2, 2],
    "WetLength":   [0, 0.5, -1, 1],
    "WetAngStart": [0, 0.125, -0.25, 0.25],
    "WetArea":     [0, 0.0005, -0.001, 0.001],
    "DeltaHeight": [0, 0.5, -1, 1],
}

# Columns that are copied unchanged into every synthetic round.
catCols = ["EdgesNum", "clusterPCA", "Cross?", "BlockID", "maleJ", "femaleJ"]

# Number of synthetic rounds.
times = 5

In [None]:
def createDuplicate(df, colnames, times):
    """Return `df[colnames]` followed by `times` further copies of itself.

    Parameters
    ----------
    df : source DataFrame.
    colnames : list of column names to keep.
    times : number of extra copies to stack below the original rows.

    Returns
    -------
    DataFrame with ``len(df) * (times + 1)`` rows and a fresh 0..N-1 index.
    When `times` is not positive, the plain column selection is returned.

    `DataFrame.append` was removed in pandas 2.0, so all copies are stacked
    with a single `pd.concat` call.
    """
    base = df[colnames]
    if times <= 0:
        return base
    return pd.concat([base] * (times + 1), ignore_index=True)

In [None]:
def createNoise(df, colname, mean, sd, low, upp, times):
    """Return column `colname` followed by `times` noisy copies of it.

    Each round adds noise drawn from a normal distribution (`mean`, `sd`)
    truncated to the interval [low, upp]. NumPy's global seed is set to the
    round number before drawing, so every round is reproducible.

    Parameters
    ----------
    df : source DataFrame.
    colname : name of the column to augment.
    mean, sd : location and scale of the noise distribution.
    low, upp : lower / upper truncation bounds of the noise.
    times : number of noisy rounds.

    Returns
    -------
    Series named `colname` with a fresh 0..N-1 index: the original values
    first, then one noisy block per round.

    Also creates (or activates) a pyplot figure named after the column.
    `DataFrame.append` was removed in pandas 2.0, so the rounds are collected
    in a list and stacked once with `pd.concat`.
    """
    original = df[colname].dropna()
    rowsNum = df.shape[0]
    plt.figure(colname)

    # The frozen distribution is the same for every round.
    noiseSp = truncnorm((low - mean) / sd, (upp - mean) / sd, loc=mean, scale=sd)

    pieces = [df[colname]]
    for i in range(times):
        np.random.seed(seed=i)
        noiseArr = np.asarray(noiseSp.rvs(rowsNum))
        # One noise value per row, added to the original values.
        pieces.append(pd.Series(np.add(noiseArr, original)))

    return pd.concat(pieces, ignore_index=True).rename(colname)

In [None]:
########### Augment the training data
# Training rows only (test rows removed), re-indexed from 0.
data_train = dummies.drop(data_test_idx).reset_index(drop = True)
print(data_train)

if times > 1:

    # Numerical values: original column followed by `times` noisy copies.
    noisy = pd.DataFrame()
    for key, value in instructionsDic.items():
        noisy[key] = createNoise(data_train, key, value[0], value[1], value[2], value[3], times)

    # Categorical values: duplicated unchanged for every round.
    duplicateCats = createDuplicate(data_train, catCols,times)

    # Merge and drop the last row (as before); copy so the column
    # assignments below act on an independent frame.
    augmented = noisy.join(duplicateCats).iloc[:-1].copy()

    # Flag each row as real or synthetic. The first data_train.shape[0] rows
    # (index 0 .. n-1) are the originals, so synthetic rows start at index n;
    # the previous `>` comparison left that first synthetic row labelled "real".
    # `.loc` replaces the chained assignment, which is unreliable in pandas.
    n_real = data_train.shape[0]
    augmented["origin"] = "real"
    augmented.loc[augmented.index >= n_real, "origin"] = "synthetic"

else:
    # No augmentation requested: the training set itself, all rows flagged
    # "real". This keeps `augmented` defined for the cells that follow.
    augmented = data_train.assign(origin="real")

data = augmented

augmented.info()
print(augmented)

In [None]:
############# Visualize augmented data
# For every augmented column, compare the KDE of the real rows with the KDE of
# the synthetic rows and save the figure as a<column>.png.
for key, value in instructionsDic.items():
    print(key)

    is_real = augmented['origin'] == 'real'
    is_synth = augmented['origin'] == 'synthetic'
    df_real = augmented[is_real][key].reset_index(drop=True)
    df_synth = augmented[is_synth][key].reset_index(drop=True)

    # Side-by-side columns; the shorter one is padded with NaN.
    new_df = pd.concat([df_real, df_synth], axis=1)
    new_df.columns = ["real", "synthetic"]

    plt.figure(key)
    print(new_df.describe().transpose())
    a = sns.kdeplot(data=new_df, palette="PuRd", common_norm=False, legend=True).set(title=key)
    plt.gcf().set_size_inches(5, 5)
    plt.savefig("a%s.png" % key)
    plt.show()





In [None]:
################## PROJECT AUGMENTED INTO PCA

In [None]:
# Keep the real/synthetic flag aside for the PCA plot.
print(augmented.columns)
aug_cat = augmented.loc[:, ['origin']]

In [None]:
# Numeric columns of the augmented set (target included).
aug_pca_cols = ['Offset', 'PrintHeight', 'WetLength', 'WetAngStart', 'WetArea', 'DeltaHeight']
augmented_num = augmented[aug_pca_cols]

# Standardise, then fit a separate PCA on the augmented data.
from sklearn.preprocessing import StandardScaler
scalerAug = StandardScaler()
aug_num_scaled = scalerAug.fit_transform(augmented_num)

from sklearn.decomposition import PCA
model_pca_aug = PCA()
model_pca_aug.fit(aug_num_scaled)
aug_num_reduced = model_pca_aug.transform(aug_num_scaled)

# Component scores with the real/synthetic flag attached.
aug_num_reduced_df = pd.DataFrame(aug_num_reduced).join(aug_cat)
aug_num_reduced_df



In [None]:
def myplot_df_aug(df,coeff, scale, labels = None):
    """Draw a PCA biplot of the augmented data, coloured by real/synthetic origin.

    Parameters
    ----------
    df : DataFrame whose first two columns hold the scores to plot and which
        contains an "origin" column with the values "real" / "synthetic".
    coeff : 2-D array, one row per original feature; columns 0 and 1 are the
        arrow end points.
    scale : if truthy, each score axis is divided by its own range (max - min).
    labels : optional sequence of arrow labels; when None the names are taken
        from the notebook-level `augmented_num.columns`.

    The caller's frame is left untouched: the helper columns "xs"/"ys" are
    added to a private copy. The legend is titled "origin" to match the hue
    variable (it used to read "BlockID").
    """
    plot_df = df.copy()
    plot_df["xs"] = plot_df[plot_df.columns[0]]
    plot_df["ys"] = plot_df[plot_df.columns[1]]

    n = coeff.shape[0]

    if scale:
        scalex = 1.0 / (plot_df["xs"].max() - plot_df["xs"].min())
        scaley = 1.0 / (plot_df["ys"].max() - plot_df["ys"].min())

        plot_df["xs"] = scalex * plot_df["xs"]
        plot_df["ys"] = scaley * plot_df["ys"]

    # Real samples are drawn larger than synthetic ones.
    g = sns.scatterplot(x='xs', y='ys', data=plot_df,
                        s=plot_df["origin"].map({"real" : 200, "synthetic" : 50}),
                        hue="origin", linewidth=1, alpha=0.5)
    g.legend(title = "origin", loc='center left', bbox_to_anchor=(1, 0.5), ncol=1, frameon = False)

    # One arrow and one text label per original feature.
    for i in range(n):
        plt.arrow(0, 0, coeff[i,0], coeff[i,1],color = 'r',alpha = 0.5)
        if labels is None:
            plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, augmented_num.columns[i], color = 'g', ha = 'center', va = 'center', alpha = 0.5)
        else:
            plt.text(coeff[i,0]* 1.15, coeff[i,1] * 1.15, labels[i], color = 'g', ha = 'center', va = 'center')

# Configure the current pyplot figure/axes first; myplot_df_aug then draws into them.
plt.xlim(-0.5,0.7)
plt.ylim(-0.5,0.7)
plt.xlabel("PC{}".format(1))
plt.ylabel("PC{}".format(2))
fig = plt.gcf()
fig.set_size_inches(12, 12)
# Loadings of the first two components, transposed to one row per feature.
myplot_df_aug(aug_num_reduced_df, np.transpose(model_pca_aug.components_[0:2, :]), True)
fig.savefig("augmentedPCA.jpg")


## prepare data for training

In [None]:
dummies.info()
# Keep all 15 features: only the two bookkeeping columns are removed.
# errors="ignore" makes the cell safe to re-run after the columns are gone
# (a plain drop raises KeyError the second time).
dummies = dummies.drop(columns = ["clusterPCA", "BlockID"], errors="ignore")
dummies

## MODEL TRAINING

### 15-Feature data

In [None]:
def simplePredict(dummies,data_test_idx, model, tag):
    """Train `model` on the full-feature data and report train/test MAE.

    Parameters
    ----------
    dummies : DataFrame with the feature columns plus "DeltaHeight" (target).
    data_test_idx : row labels of the test samples; all other rows are used
        for training.
    model : estimator exposing `fit` / `predict`.
    tag : model name. It is used for the log line, the plot title and the
        output file name, and the value "Artificial Neural Network" selects
        the Keras-style `fit` call (400 epochs, 10 % validation split).

    Returns
    -------
    (model, mae_train, mae_test, y_pred_train, y_pred_test)

    Side effects: pickles the fitted scaler to stdScaler_15features.pk and
    saves the prediction-vs-truth scatter to rawData<tag>.png.

    The function now relies only on its `tag` argument; it previously read the
    notebook-level loop variable `key`, which only worked when called from a
    loop that happened to use that name.
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.metrics import mean_absolute_error

    dummies_x = dummies.drop(columns = ["DeltaHeight"])

    # NOTE(review): the scaler is fitted on train AND test rows together.
    scaler = StandardScaler()
    dummies_scaled = scaler.fit_transform(dummies_x)

    # Persist the scaler; the context manager closes the file handle.
    filename = 'stdScaler_15features.pk'
    with open(filename, 'wb') as fh:
        pickle.dump(scaler, fh)

    dummies_scaled_df = pd.DataFrame(dummies_scaled, columns=dummies_x.columns)
    dummies_scaled_df["DeltaHeight"] = dummies["DeltaHeight"]

    data_test = dummies_scaled_df.iloc[data_test_idx]
    data_train = dummies_scaled_df.drop(data_test_idx)

    x_train_scaled = data_train.drop(columns = ["DeltaHeight"])
    y_train = data_train[["DeltaHeight"]]

    x_test_scaled = data_test.drop(columns = ["DeltaHeight"])
    y_test = data_test[["DeltaHeight"]]

    print(x_train_scaled.shape)
    print(y_train.shape)

    print(x_test_scaled.shape)
    print(y_test.shape)

    # TRAIN MODEL
    np.random.seed(2)

    print("now training " + tag)
    if tag == "Artificial Neural Network":
        history = model.fit(x_train_scaled,y_train,epochs=400, validation_split=0.1, verbose = 0)
    else:
        model.fit(x_train_scaled,y_train)

    y_pred_train = model.predict(x_train_scaled)
    y_pred_test = model.predict(x_test_scaled)

    mae_train = round(mean_absolute_error(y_train, y_pred_train), 2)
    mae_test = round(mean_absolute_error(y_test, y_pred_test), 2)

    fig,ax = plt.subplots(figsize = (5,5))
    plt.suptitle ("%s \n mae train =  %smm \n mae test = %smm  " % (tag, mae_train,mae_test), y=1)

    # Dotted diagonal = perfect prediction.
    ax.plot([0, 1], [0, 1], transform=ax.transAxes, linewidth=1, color='black', ls = ":", alpha = 0.5)
    l1 = ax.scatter(y_pred_train, y_train, c= "m", alpha = 0.6, label = "train")
    l2= ax.scatter(y_pred_test, y_test, c= "r", alpha = 0.7, label = "test")
    ax.legend(handles = [l1, l2])
    ax.set_xlabel("y prediction (mm)")
    ax.set_ylabel("y truth (mm) ")
    plt.ylim(20, 45)
    plt.xlim(20,45)
    plt.savefig("rawData%s.png" % tag)

    return model, mae_train, mae_test, y_pred_train, y_pred_test


In [None]:
# Candidate regressors for the 15-feature data.
from sklearn.linear_model import LinearRegression
from sklearn.kernel_ridge import KernelRidge
import xgboost as xgb
import tensorflow as tf

model_lr = LinearRegression()
model_krr = KernelRidge(kernel='polynomial', degree=5, alpha=1.0)
model_xgb = xgb.XGBRegressor(n_estimators=20, max_depth=4)

# Small fully connected network: one hidden layer of 8 ReLU units, linear output.
tf.random.set_seed(42)
model_ann = tf.keras.models.Sequential()
n_cols = dummies.shape[1] - 1  # every column except the target
hidden_layer = tf.keras.layers.Dense(8, input_shape=(n_cols,), activation='relu')
model_ann.add(hidden_layer)
output_layer = tf.keras.layers.Dense(1, activation="linear")
model_ann.add(output_layer)
model_ann.compile(optimizer="adam", loss="mean_squared_error")


In [None]:
# Display name -> estimator; the name doubles as the `tag` of the predict helpers.
modelList = {
    "Linear Regression": model_lr,
    "Polynomial Kerner Ridge Regression deg 5": model_krr,
    "XGboost 20 trees": model_xgb,
    "Artificial Neural Network": model_ann,
}


In [None]:
# Train and evaluate every model on the 15-feature data.
for key, value in modelList.items():
    simplePredict(dummies=dummies, data_test_idx=data_test_idx, model=value, tag=key)

### PCA reduced data

In [None]:
# Display the PCA-reduced frame (six leading columns plus DeltaHeight).
data_num_reduced_df

In [None]:
def PCAPredict(data_num_reduced_df, data_test_idx, model, tag):
    """Train `model` on the PCA-reduced data and report train/test MAE.

    Parameters
    ----------
    data_num_reduced_df : DataFrame of component scores plus "DeltaHeight".
    data_test_idx : row labels of the test samples; all other rows are used
        for training.
    model : estimator exposing `fit` / `predict`.
    tag : model name. It is used for the log line, the plot title and the
        output file name, and the value "Artificial Neural Network" selects
        the Keras-style `fit` call (400 epochs, 10 % validation split).

    Returns
    -------
    (model, mae_train, mae_test, y_pred_train, y_pred_test)

    Side effect: saves the prediction-vs-truth scatter to PCAData<tag>.png.

    The function now relies only on its `tag` argument; it previously read the
    notebook-level loop variable `key`.
    """
    from sklearn.metrics import mean_absolute_error

    # Split by the precomputed test indices.
    data_test_PCA = data_num_reduced_df.iloc[data_test_idx]
    data_train_PCA = data_num_reduced_df.drop(data_test_idx)

    x_train_scaled = data_train_PCA.drop(columns = ["DeltaHeight"])
    y_train = data_train_PCA[["DeltaHeight"]]

    x_test_scaled = data_test_PCA.drop(columns =  ["DeltaHeight"])
    y_test = data_test_PCA[["DeltaHeight"]]

    print(x_train_scaled.shape)
    print(y_train.shape)

    print(x_test_scaled.shape)
    print(y_test.shape)

    # TRAIN MODEL
    np.random.seed(2)

    print("now training" + tag)

    if tag == "Artificial Neural Network":
        history = model.fit(x_train_scaled,y_train,epochs=400, validation_split=0.1, verbose = 0)
    else:
        model.fit(x_train_scaled,y_train)

    y_pred_train = model.predict(x_train_scaled)
    y_pred_test = model.predict(x_test_scaled)

    mae_train = round(mean_absolute_error(y_train, y_pred_train), 2)
    mae_test = round(mean_absolute_error(y_test, y_pred_test), 2)

    fig,ax = plt.subplots(figsize = (5,5))
    plt.suptitle ("%s \n mae train =  %smm \n mae test = %smm  " % (tag, mae_train,mae_test), y=1)

    # Dotted diagonal = perfect prediction.
    ax.plot([0, 1], [0, 1], transform=ax.transAxes, linewidth=1, color='black', ls = ":", alpha = 0.5)
    l1 = ax.scatter(y_pred_train, y_train, c= "m", alpha = 0.6, label = "train")
    l2= ax.scatter(y_pred_test, y_test, c= "r", alpha = 0.7, label = "test")
    ax.legend(handles = [l1, l2])
    ax.set_xlabel("y prediction (mm)")
    ax.set_ylabel("y truth (mm) ")
    plt.ylim(20, 45)
    plt.xlim(20,45)
    plt.savefig("PCAData%s.png" % tag)

    return model, mae_train, mae_test, y_pred_train, y_pred_test


In [None]:
# Candidate regressors for the PCA-reduced data.
from sklearn.linear_model import LinearRegression
model_lr = LinearRegression()

from sklearn.kernel_ridge import KernelRidge
model_krr = KernelRidge(alpha=1.0,kernel='polynomial', degree=5)

import xgboost as xgb
# `n_estimators` was misspelled as `n_estimaros`, so the 20-tree limit promised
# by the "XGboost 20 trees" label was never applied.
model_xgb = xgb.XGBRegressor(max_depth = 4, n_estimators = 20)

import tensorflow as tf
tf.random.set_seed(42)
model_ann = tf.keras.models.Sequential()
n_cols = data_num_reduced_df.shape[1] - 1  # every column except the target
# Two hidden ReLU layers (4 and 3 units) followed by a linear output unit.
model_ann.add(tf.keras.layers.Dense(4, input_shape=(n_cols,), activation='relu'))
model_ann.add(tf.keras.layers.Dense(3, activation='relu'))
model_ann.add(tf.keras.layers.Dense(1, activation= "linear"))
model_ann.compile(optimizer = "adam", loss = "mean_squared_error")

# Display name -> estimator; the name doubles as the `tag` of the predict helpers.
modelList = {"Linear Regression" : model_lr , "Polynomial Kerner Ridge Regression deg 5 " : model_krr, "XGboost 20 trees" : model_xgb, "Artificial Neural Network": model_ann}



In [None]:
# Train and evaluate every model on the PCA-reduced data.
for key, value in modelList.items():
    PCAPredict(data_num_reduced_df=data_num_reduced_df, data_test_idx=data_test_idx,
               model=value, tag=key)

## AUGMENTED DATA

In [None]:
# Keep only the noisy numeric columns for training.
# errors="ignore" makes the cell safe to re-run after the columns are gone
# (a plain drop raises KeyError the second time).
augmented = augmented.drop(
    columns=["EdgesNum", "clusterPCA", "BlockID", "origin", "Cross?", "maleJ", "femaleJ"],
    errors="ignore",
)
augmented

In [None]:
#NO SCALING OF FINAL VARIABLE
def augmentPredict(dummies,augmented, data_test_idx, model, times, instructionsDic, catcols, tag):
    #compose the dataset because augmented is just training set, you need to extract test set from dummies


    common_columns = dummies.columns.intersection(augmented.columns)
    dumdum = dummies[common_columns]

    #train scaler for original dataset "dummies" with 9 features
    dummies_x = dumdum.drop(columns = ["DeltaHeight"])

    from sklearn.preprocessing import StandardScaler
    scalerAug = StandardScaler()

    #scale test samples
    dummies_scaled = scalerAug.fit_transform(dummies_x)
    dummies_scaled_df = pd.DataFrame(dummies_scaled, columns=dummies_x.columns)
    dummies_scaled_df["DeltaHeight"] = dummies["DeltaHeight"]


    filename = 'stdScaler_8features.pk'
    pickle.dump(scaler, open(filename, 'wb'))


    aug_x = augmented.drop(columns = ["DeltaHeight"])

    #scale train samples
    aug_scaled = scalerAug.transform(aug_x)
    aug_scaled_df = pd.DataFrame(aug_scaled, columns=aug_x.columns)
    aug_scaled_df["DeltaHeight"] = augmented["DeltaHeight"]

    #cast data
    data_train = aug_scaled_df
    data_test = dummies_scaled_df.iloc[data_test_idx]


    x_train_scaled = data_train.drop(columns = ["DeltaHeight"])
    y_train = data_train[["DeltaHeight"]]

    x_test_scaled = data_test.drop(columns = ["DeltaHeight"])
    y_test = data_test[["DeltaHeight"]]




    #TRAIN MODEL
    from sklearn.metrics import mean_absolute_error
    np.random.seed(2)

    print("now training" + key)
    if tag == "Artificial Neural Network":
        history = model.fit(x_train_scaled,y_train,epochs=400, validation_split=0.1, verbose = 0)

    else:


        model.fit(x_train_scaled,y_train)


#     train_score = round(model.score(x_train_scaled, y_train),2)
#     test_score = round(model.score(x_test_scaled, y_test), 2)

    y_pred_train = model.predict(x_train_scaled)
    y_pred_test = model.predict(x_test_scaled)

    mae_train = round(mean_absolute_error(y_train, y_pred_train), 2)
    mae_test = round(mean_absolute_error(y_test, y_pred_test), 2)


    fig,ax = plt.subplots(figsize = (5,5))
    plt.suptitle ("%s \n mae train =  %smm \n mae test = %smm  " % (tag, mae_train,mae_test), y=1)

    ax.plot([0, 1], [0, 1], transform=ax.transAxes, linewidth=1, color='black', ls = ":", alpha = 0.5)
    l1 = ax.scatter(y_pred_train, y_train, c= "m", alpha = 0.6, label = "train")
    l2= ax.scatter(y_pred_test, y_test, c= "r", alpha = 0.7, label = "test")
    ax.legend(handles = [l1, l2])
    ax.set_xlabel("y prediction (mm)")
    ax.set_ylabel("y truth (mm) ")
    plt.ylim(20, 45)
    plt.xlim(20,45)
    plt.savefig("GAData%s.png" % tag)


    return model, mae_train, mae_test, y_pred_train, y_pred_test


In [None]:
# Candidate regressors for the augmented data.
from sklearn.linear_model import LinearRegression
from sklearn.kernel_ridge import KernelRidge
import xgboost as xgb
import tensorflow as tf

model_lr = LinearRegression()
model_krr = KernelRidge(kernel='polynomial', degree=5, alpha=1.0)
model_xgb = xgb.XGBRegressor(n_estimators=20, max_depth=4)

# Small fully connected network: one hidden layer of 4 ReLU units, linear output.
tf.random.set_seed(42)
model_ann = tf.keras.models.Sequential()
n_cols = augmented.shape[1] - 1  # every column except the target
hidden_layer = tf.keras.layers.Dense(4, input_shape=(n_cols,), activation='relu')
model_ann.add(hidden_layer)
output_layer = tf.keras.layers.Dense(1, activation="linear")
model_ann.add(output_layer)
model_ann.compile(optimizer="adam", loss="mean_squared_error")

# Display name -> estimator; the name doubles as the `tag` of the predict helpers.
modelList = {
    "Linear Regression": model_lr,
    "Polynomial Kerner Ridge Regression deg 5": model_krr,
    "XGboost 20 trees": model_xgb,
    "Artificial Neural Network": model_ann,
}

times = 5

In [None]:
# Train every model on the augmented set and test it on the real test rows.
for key, value in modelList.items():
    augmentPredict(
        dummies=dummies,
        augmented=augmented,
        data_test_idx=data_test_idx,
        model=value,
        times=times,
        instructionsDic=instructionsDic,
        catcols=catCols,
        tag=key,
    )

## Additional experiments

### TIGNA on whole dataset

In [None]:
def augmentPredict_WHOLE(data, model, times, instructionsDic, catcols, tag):
    """Augment the WHOLE dataset first, then cluster, split, train and evaluate.

    Unlike `augmentPredict`, the train/test split happens after augmentation,
    so both sets may contain synthetic rows.

    Parameters
    ----------
    data : raw dataset containing the columns named in `instructionsDic` and
        `catcols`. It is not modified.
    model : estimator exposing `fit` / `predict`.
    times : number of synthetic rounds; augmentation only runs when > 1.
    instructionsDic : column -> [mean, sd, low, upp] noise settings, passed to
        `createNoise`.
    catcols : columns duplicated unchanged for every round (must include
        "FemMal?", "BlockID" and "edgeID", which are used further down).
    tag : model name. It is used for the plot title and output file name, and
        the value "Artificial Neural Network" selects the Keras-style `fit`
        call (400 epochs, 10 % validation split).

    Returns
    -------
    (model, mae_train, mae_test, y_pred_train, y_pred_test)

    Side effect: saves the prediction-vs-truth scatter to WD_GAData<tag>.png.

    Fixes: the `catcols` argument is honoured (the notebook-level `catCols` was
    read instead); the real/synthetic boundary is derived from the input size
    instead of the hard-coded 88 and set through `.loc`; the caller's frame is
    no longer modified when no augmentation runs; dead locals were removed.
    """
    from sklearn.preprocessing import StandardScaler
    from sklearn.decomposition import PCA
    from sklearn.metrics import mean_absolute_error

    if times > 1:
        n_real = data.shape[0]

        ######### CREATE AUGMENTED DATA
        # Numerical values: original column followed by `times` noisy copies.
        noisy = pd.DataFrame()
        for key, value in instructionsDic.items():
            noisy[key] = createNoise(data, key, value[0], value[1], value[2], value[3], times)

        # Categorical values: duplicated unchanged for every round.
        duplicateCats = createDuplicate(data, catcols, times)

        # Merge and drop the last row (as before); copy so the column
        # assignments below act on an independent frame.
        augmented = noisy.join(duplicateCats).iloc[:-1].copy()

        # Activate and show the per-column figures, as before. Nothing is
        # drawn into them here.
        for item in noisy.columns:
            plt.figure(item)
            plt.show()

        # Rows 0 .. n_real-1 are the originals; everything after is synthetic.
        augmented["origin"] = "real"
        augmented.loc[augmented.index >= n_real, "origin"] = "synthetic"

        data = augmented

    else:
        # New frame, so the caller's data is left untouched.
        data = data.assign(origin="real")

    ########### RUN PCA ON NUMERICAL DATA
    data_num = data[['Offset', 'PrintHeight', 'WetLength', 'WetAngStart', 'WetArea','DeltaHeight']]

    scaler = StandardScaler()
    data_num_scaled = scaler.fit_transform(data_num)

    model_pca = PCA()
    model_pca.fit(data_num_scaled)
    data_num_reduced_df = pd.DataFrame(model_pca.transform(data_num_scaled))

    ########## CLUSTER PCA DATA
    kmeans_pca, clustered_PCAdata = clusterPlot(8, data_num_reduced_df, 5, "PCA-reduced", False)
    data["clusterPCA"] = clustered_PCAdata["cluster"]

    ########## PREPARE DATA
    # One-hot encode the joint type, dropping the first level; the two dummy
    # columns end up last and are renamed positionally.
    dummies = pd.get_dummies(data = data, columns = ["FemMal?"], drop_first = True)
    colnames = dummies.columns.values.tolist()
    colnames = colnames[:-2] + ["maleJ", "femaleJ"]
    dummies.columns = colnames
    dummies_x = dummies.drop(columns = ["BlockID", "origin", "clusterPCA", "DeltaHeight", "edgeID"])

    scaler = StandardScaler()
    dummies_scaled = scaler.fit_transform(dummies_x)
    dummies_scaled_df = pd.DataFrame(dummies_scaled, columns=dummies_x.columns)
    dummies_scaled_df["DeltaHeight"] = dummies["DeltaHeight"]

    ############ SPLIT TRAIN AND TEST
    # Eight test rows per cluster; the fixed random_state keeps the split reproducible.
    data_test_idx = dummies.groupby("clusterPCA").sample(n=8, random_state=40).index

    data_test = dummies_scaled_df.iloc[data_test_idx]
    data_train = dummies_scaled_df.drop(data_test_idx)

    x_train_scaled = data_train.drop(columns = ["DeltaHeight"])
    y_train = data_train[["DeltaHeight"]]

    x_test_scaled = data_test.drop(columns = ["DeltaHeight"])
    y_test = data_test[["DeltaHeight"]]

    # TRAIN MODEL
    np.random.seed(2)

    if tag == "Artificial Neural Network":
        history = model.fit(x_train_scaled,y_train,epochs=400, validation_split=0.1, verbose = 0)
    else:
        model.fit(x_train_scaled,y_train)

    y_pred_train = model.predict(x_train_scaled)
    y_pred_test = model.predict(x_test_scaled)

    mae_train = round(mean_absolute_error(y_train, y_pred_train), 2)
    mae_test = round(mean_absolute_error(y_test, y_pred_test), 2)

    fig,ax = plt.subplots(figsize = (5,5))
    plt.suptitle ("%s \n mae train =  %smm \n mae test = %smm  " % (tag, mae_train,mae_test), y=1)

    # Dotted diagonal = perfect prediction.
    ax.plot([0, 1], [0, 1], transform=ax.transAxes, linewidth=1, color='black', ls = ":", alpha = 0.5)
    l1 = ax.scatter(y_pred_train, y_train, c= "m", alpha = 0.6, label = "train")
    l2= ax.scatter(y_pred_test, y_test, c= "r", alpha = 0.7, label = "test")
    ax.legend(handles = [l1, l2])
    ax.set_xlabel("y prediction (mm)")
    ax.set_ylabel("y truth (mm) ")
    plt.ylim(20, 45)
    plt.xlim(20,45)
    plt.savefig("WD_GAData%s.png" % tag)

    return model, mae_train, mae_test, y_pred_train, y_pred_test


In [None]:
# Noise settings per column: [mean, sd, low, upp], passed in that order to createNoise.
instructionsDic = {
    "Offset":      [0, 1, -2, 2],
    "PrintHeight": [0, 1, -2, 2],
    "WetLength":   [0, 0.5, -1, 1],
    "WetAngStart": [0, 0.125, -0.25, 0.25],
    "WetArea":     [0, 0.0005, -0.001, 0.001],
    "DeltaHeight": [0, 0.5, -1, 1],
}
# Columns copied unchanged into every synthetic round (raw, un-encoded data here).
catCols = ["FemMal?", "Cross?", "BlockID", "edgeID"]

from sklearn.linear_model import LinearRegression
from sklearn.kernel_ridge import KernelRidge
import xgboost as xgb
import tensorflow as tf

model_lr = LinearRegression()
model_krr = KernelRidge(kernel='polynomial', degree=5, alpha=1.0)  #check this
model_xgb = xgb.XGBRegressor(n_estimators=20, max_depth=4)

# Small fully connected network with a fixed input width of 8 features.
tf.random.set_seed(42)
model_ann = tf.keras.models.Sequential()
n_cols = 8
hidden_layer = tf.keras.layers.Dense(4, input_shape=(n_cols,), activation='relu')
model_ann.add(hidden_layer)
output_layer = tf.keras.layers.Dense(1, activation="linear")
model_ann.add(output_layer)
model_ann.compile(optimizer="adam", loss="mean_squared_error")

# Display name -> estimator; the name doubles as the `tag` of the predict helpers.
modelList = {
    "Linear Regression": model_lr,
    "Polynomial Kerner Ridge Regression deg 5": model_krr,
    "XGboost 20 trees": model_xgb,
    "Artificial Neural Network": model_ann,
}

times = 5

In [None]:
# Reload the raw dataset so the augmentation experiment starts from an
# untouched copy. `sep=r'\s+'` is the supported spelling of the deprecated
# `delim_whitespace=True` and parses the file the same way.
data_v = pd.read_csv('DemonstratorDataset.csv', sep=r'\s+')

# BlockID is the part of edgeID that precedes the first ';', minus its first
# character. Slicing each string directly gives the same result as the former
# find/apply/strip sequence, without a temporary 'pos' column and without the
# positional `drop('pos', 1)` call that pandas >= 2.0 rejects.
data_v["BlockID"] = [edge[1:edge.find(";")] for edge in data_v["edgeID"]]

# Rescale both length columns by a factor of 10.
# NOTE(review): presumably a unit conversion — confirm the units of the CSV.
data_v["WetLength"] = data_v["WetLength"] * 10
data_v["DryLength"] = data_v["DryLength"] * 10

# Run the augmentation + prediction pipeline once per registered model;
# `key` is the display label and `value` the estimator.
for key, value in modelList.items():
    augmentPredict_WHOLE(data_v, value, times, instructionsDic, catCols, key)

### Random train test split on raw data

In [None]:
def RDPredict(dummies, model, tag, seed):
    """Fit ``model`` on a random 80/20 split of ``dummies`` and report the MAE.

    The rows are split first; the feature columns (everything except
    ``DeltaHeight``) are then standardised with a scaler fitted on the
    training rows only, so no statistic of the held-out rows leaks into the
    preprocessing.

    Parameters
    ----------
    dummies : pandas.DataFrame
        Table containing a ``DeltaHeight`` target column; all other columns
        are used as features and are passed through ``StandardScaler``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is fitted in
        place and returned.
    tag : str
        Model label. Kept for signature compatibility; not used here.
    seed : int
        ``random_state`` forwarded to ``train_test_split``.

    Returns
    -------
    tuple
        ``(model, mae_train, mae_test, y_pred_train, y_pred_test)``; both MAE
        values are rounded to two decimals.
    """
    from sklearn.metrics import mean_absolute_error
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler

    features = dummies.drop(columns=["DeltaHeight"])
    # Target as a column vector, matching what the estimators were fed before.
    target = np.asarray(dummies["DeltaHeight"]).reshape(-1, 1)

    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.20, random_state=seed
    )

    # Fit the scaler on the training rows only, then apply it to both parts.
    scaler = StandardScaler()
    x_train_scaled = pd.DataFrame(
        scaler.fit_transform(x_train), columns=features.columns, index=x_train.index
    )
    x_test_scaled = pd.DataFrame(
        scaler.transform(x_test), columns=features.columns, index=x_test.index
    )

    # Reseed NumPy's global generator right before fitting.
    np.random.seed(2)
    model.fit(x_train_scaled, y_train)

    y_pred_train = model.predict(x_train_scaled)
    y_pred_test = model.predict(x_test_scaled)

    mae_train = round(mean_absolute_error(y_train, y_pred_train), 2)
    mae_test = round(mean_absolute_error(y_test, y_pred_test), 2)

    return model, mae_train, mae_test, y_pred_train, y_pred_test


In [None]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import LinearRegression
import xgboost as xgb

# Fresh, unfitted estimators for the random-split experiment, keyed by the
# label used in plot titles and file names. No neural network in this registry.
modelList = {
    "Linear Regression": LinearRegression(),
    "Polynomial Kerner Ridge Regression deg 5": KernelRidge(kernel='polynomial', degree=5, alpha=1.0),
    "XGboost 20 trees": xgb.XGBRegressor(n_estimators=20, max_depth=4),
}

# Keep the individual handles available under their usual names.
model_lr, model_krr, model_xgb = modelList.values()




In [None]:
import random
# The 100 split seeds 0..99 in shuffled order. A dedicated, seeded generator
# keeps that order — and therefore the x-axis of the plots — reproducible.
seeds = random.Random(0).sample(range(0, 100), 100)

for key, value in modelList.items():
    print(key)
    # Reference run on the fixed hold-out rows listed in data_test_idx.
    simplePredict(dummies, data_test_idx, value, key)

    # One random 80/20 split per seed; collect the train/test MAE of each.
    err_train = []
    err_test = []
    for s in seeds:
        model, mae_train, mae_test, y_pred_train, y_pred_test = RDPredict(dummies, value, key, s)
        err_train.append(mae_train)
        err_test.append(mae_test)

    df_xgb = pd.DataFrame({"mae_train": err_train, "mae_test": err_test})

    az = df_xgb.plot(color=['red', "pink"])
    az.set_xlabel('Index')
    az.set_ylabel('Error (mm)')
    az.set_ylim(0, 20)
    az.set_title('Generalization error wrt Train/Test split \n %s' % key)

    # Save the figure that owns the axes just drawn. Grabbing plt.gcf() before
    # DataFrame.plot returned a different figure, and the file name referenced
    # `tag`, which is not defined in this cell; `key` is the model label.
    figuuu = az.figure
    figuuu.savefig('noise %s.jpg' % key)



### Export predictions of the 20-tree XGBoost model on the 15-feature data

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise every feature column (all columns except the DeltaHeight target).
dummies_x = dummies.drop(columns=["DeltaHeight"])
scaler = StandardScaler()
dummies_scaled = scaler.fit_transform(dummies_x)

# Rebuild a labelled frame and put the unscaled target back beside the features.
dummies_scaled_df = pd.DataFrame(dummies_scaled, columns=dummies_x.columns).assign(
    DeltaHeight=dummies["DeltaHeight"]
)

# Rows listed in data_test_idx form the hold-out set; the rest is for training.
data_test = dummies_scaled_df.iloc[data_test_idx]
data_train = dummies_scaled_df.drop(data_test_idx)

# Features / target for each part (targets stay single-column DataFrames).
x_train_scaled, y_train = data_train.drop(columns=["DeltaHeight"]), data_train[["DeltaHeight"]]
x_test_scaled, y_test = data_test.drop(columns=["DeltaHeight"]), data_test[["DeltaHeight"]]

# Sanity check: print the four shapes, train first, then test.
for split_part in (x_train_scaled, y_train, x_test_scaled, y_test):
    print(split_part.shape)


In [None]:
from sklearn.metrics import mean_absolute_error
import xgboost as xgb

# Reseed NumPy's global generator before building the model.
np.random.seed(2)

# 20 boosted trees of depth 4, fitted on the training part only.
xgb_settings = {"max_depth": 4, "n_estimators": 20}
model = xgb.XGBRegressor(**xgb_settings)
model.fit(x_train_scaled, y_train)

# Predict for every row of the scaled table (training and hold-out rows alike).
all_features_scaled = dummies_scaled_df.drop(columns=["DeltaHeight"])
predictions = model.predict(all_features_scaled)

In [None]:
# Attach the model output and the original edge identifier to `dummies`
# (in place, in this order), then write the enriched table to disk.
export_columns = {"predictions": predictions, "edgeID": data["edgeID"]}
for column_name, column_values in export_columns.items():
    dummies[column_name] = column_values

dummies.to_csv("XGB_20_predictions.csv")