In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.gaussian_process import GaussianProcessRegressor
from subprocess import check_output
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

import os

In [2]:
# Full reduced dataset; partition-specific CSVs are read in later cells.
data = pd.read_csv(filepath_or_buffer='data/data_reduced.csv')

In [3]:
def select_columns_elasticity(data):
    """Split a table into elasticity features and the ``blendE`` target.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing ``blendE`` and the nine other columns listed in
        ``excluded`` below.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(xE, yE)``: ``xE`` is ``data`` without the ten excluded columns,
        ``yE`` is the one-column frame ``data[['blendE']]``.

    Raises
    ------
    KeyError
        If any of the excluded columns is missing from ``data``.
    """
    target = 'blendE'
    excluded = [
        target,
        'blendSTRENGTH',
        'blendStrainbreak',
        'blendImpact',
        'impurityImpact',
        'impuritySTRENGTH',
        'impurityStrainbreak',
        'matrixImpact',
        'matrixSTRENGTH',
        'matrixStrainbreak',
    ]
    features = data.drop(columns=excluded)
    return features, data[[target]]
#     xE = data.drop(['blendSTRENGTH','blendStrainbreak', 'blendImpact', 'impurityImpact', 'impuritySTRENGTH', 'impurityStrainbreak', 'matrixImpact', 'matrixSTRENGTH', 'matrixStrainbreak'], axis=1)
#     return xE

class PreprocessTypePlastic:
    """Stateless fit/transform helper for the polymer-type columns.

    ``transform`` one-hot encodes ``MinorityPolymer`` and removes
    ``MajorityPolymer``; ``fit`` learns nothing.
    """

    def fit(self, X, y=None):
        """Do nothing and return ``self`` (no state is learned)."""
        return self

    def transform(self, X):
        """Return a new frame with ``MinorityPolymer`` dummies and no ``MajorityPolymer``.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain the columns ``MinorityPolymer`` and ``MajorityPolymer``.

        Returns
        -------
        pandas.DataFrame
            Encoded copy of ``X``; ``X`` itself is not modified.
        """
        return (
            pd.get_dummies(X, columns=['MinorityPolymer'])
            .drop(columns='MajorityPolymer')
        )

# def preprocess_type_plastic(data):
#     encoded = pd.get_dummies(data, columns = ['MinorityPolymer'])
#     encoded = encoded.drop('MajorityPolymer', axis =1)
#     return data
    

# X_train, X_test, y_train, y_test = train_test_split(xE, yE, test_size = 0.1, random_state = 0)

In [4]:
# Build the feature/target split inside this cell so it runs on a fresh kernel;
# previously the names below were never assigned anywhere in the notebook.
# NOTE(review): assumes every column of xE is numeric -- confirm, or apply
# PreprocessTypePlastic to `data` first.
xE, yE = select_columns_elasticity(data)
X_train, X_test, y_train, y_test = train_test_split(
    xE, yE, test_size=0.1, random_state=0
)

# Seeded so that re-running the cell reproduces the same forest.
rf = RandomForestRegressor(random_state=0)
# yE is a one-column DataFrame; scikit-learn expects a 1-D target.
rf.fit(X_train, y_train.values.ravel())

print(rf.feature_importances_)

result = permutation_importance(
    rf, X_test, y_test, n_repeats=10, random_state=0, n_jobs=2
)

# argsort is ascending: the most important feature ends up last.
sorted_importances_idx = result.importances_mean.argsort()
importances = pd.DataFrame(
    result.importances[sorted_importances_idx].T,
    columns=xE.columns[sorted_importances_idx],
)
print(importances)
ax = importances.plot.box(vert=False, whis=10)
ax.set_title("Permutation Importances (test set)")
ax.axvline(x=0, color="k", linestyle="--")
ax.set_xlabel("Decrease in accuracy score")
ax.figure.tight_layout()
# plt.show() instead of Figure.show(), which warns on non-GUI (inline) backends.
plt.show()


In [5]:
# Number of ranked features (the index array is 1-D, one entry per feature).
len(sorted_importances_idx)

In [6]:
n = sorted_importances_idx.size
# The index array is ascending, so its last ten entries are the ten highest
# mean importances; [::-1] flips them to most-important-first.
fsRFPermutation = xE.columns[sorted_importances_idx[n - 10:n]][::-1]


In [7]:
# Same selection viewed in the opposite order.
fsRFPermutation[::-1]

In [8]:
# Returned indices are sorted in ascending order of importance!
def fs_permutation_importance(x, y, random_state=0):
    """Rank the columns of ``x`` by random-forest permutation importance.

    The data are split 90/10 (fixed seed), a ``RandomForestRegressor`` is
    fitted on the 90% part, and permutation importance is computed on the
    held-out 10% with 10 repeats.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature table.
    y : pandas.DataFrame, pandas.Series or array-like
        Target values; a one-column frame is flattened before fitting.
    random_state : int, default 0
        Seed for the random forest so repeated calls give the same ranking.

    Returns
    -------
    sorted_importances_idx : numpy.ndarray
        Column positions of ``x`` ordered from least to most important
        (ascending mean importance).
    result : object
        The object returned by ``sklearn.inspection.permutation_importance``.
    """
    # Split the arguments that were passed in, not notebook-level globals.
    X_train, X_test, y_train, y_test = train_test_split(
        x, y, test_size=0.1, random_state=0
    )
    rf = RandomForestRegressor(random_state=random_state)

    # scikit-learn expects a 1-D target; y may arrive as a one-column frame.
    rf.fit(X_train, np.asarray(y_train).ravel())

    result = permutation_importance(
        rf, X_test, y_test, n_repeats=10, random_state=0, n_jobs=2
    )

    sorted_importances_idx = result.importances_mean.argsort()
    return sorted_importances_idx, result


def get_best_t(x, sorted_idx, t):
    """Select the ``t`` top-ranked features from an ascending ranking.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature table whose columns were ranked.
    sorted_idx : numpy.ndarray
        Column positions of ``x`` in ascending order of importance.
    t : int
        Number of features to keep. If ``t`` exceeds the number of ranked
        features, all of them are returned.

    Returns
    -------
    fsRFPermutation_names : numpy.ndarray
        Names of the selected columns, still in ascending order.
    fsRFPermutation_idx : numpy.ndarray
        The matching column positions, in the same order.
    """
    n = sorted_idx.size
    # Clamp the start: a negative start would wrap around and silently
    # return fewer features than are available when t > n.
    start = max(n - t, 0)
    fsRFPermutation_idx = sorted_idx[start:n]
    fsRFPermutation_names = x.columns.values[fsRFPermutation_idx]
    return fsRFPermutation_names, fsRFPermutation_idx

    
# assuming in order already and only best t     
def get_plot(result, xE, sorted_idx, title):
    """Draw a horizontal box plot of permutation importances.

    Parameters
    ----------
    result : object
        Permutation-importance result exposing an ``importances`` array
        indexed by feature along its first axis.
    xE : pandas.DataFrame
        Feature table; supplies the column names for the plot.
    sorted_idx : array-like of int
        Column positions to plot, in the order they should appear.
    title : str
        Plot title.
    """
    per_repeat = result.importances[sorted_idx].T
    selected_names = xE.columns.values[sorted_idx]
    frame = pd.DataFrame(per_repeat, columns=selected_names)

    ax = frame.plot.box(vert=False, whis=10)
    ax.set_title(title)
    # Dashed reference line at zero importance.
    ax.axvline(x=0, color="k", linestyle="--")
    ax.set_xlabel("Decrease in accuracy score")

    fig = ax.figure
    fig.tight_layout()
    fig.show()

    

# comparing partitions


#### P vs NP 

In [9]:
# Read the P and NP partitions in one unpacking.
P, NP = (
    pd.read_csv(csv_path)
    for csv_path in ('data/data_reduced_p.csv', 'data/data_reduced_np.csv')
)

In [10]:
# Feature/target split for each partition. select_columns_elasticity drops the
# same ten columns that were previously listed inline here.
xPE, yPE = select_columns_elasticity(P)
# The NP split must be built from NP (it was previously built from P, which
# made the NP analysis a duplicate of the P analysis).
xNPE, yNPE = select_columns_elasticity(NP)

X_trainP, X_testP, y_trainP, y_testP = train_test_split(
    xPE, yPE, test_size=0.1, random_state=0
)
X_trainNP, X_testNP, y_trainNP, y_testNP = train_test_split(
    xNPE, yNPE, test_size=0.1, random_state=0
)

In [11]:
# Seeded so that re-running the cell reproduces the same forest.
rfP = RandomForestRegressor(random_state=0)

# Fit on the training data. y_trainP is a one-column DataFrame, so flatten it
# to the 1-D target scikit-learn expects (avoids a DataConversionWarning).
rfP.fit(X_trainP, y_trainP.values.ravel())

resultP = permutation_importance(
    rfP, X_testP, y_testP, n_repeats=10, random_state=0, n_jobs=2
)

# argsort is ascending: the most important feature ends up last.
sorted_importances_idxP = resultP.importances_mean.argsort()
importancesP = pd.DataFrame(
    resultP.importances[sorted_importances_idxP].T,
    columns=xPE.columns[sorted_importances_idxP],
)
ax = importancesP.plot.box(vert=False, whis=10)
ax.set_title("Permutation Importances (test set) from Polyfine family")
ax.axvline(x=0, color="k", linestyle="--")
ax.set_xlabel("Decrease in accuracy score")
ax.figure.tight_layout()
# plt.show() instead of Figure.show(), which warns on non-GUI (inline) backends.
plt.show()


In [12]:
# taking best 10 features 
# Ten best features for the P partition, most important first: the index
# array is ascending, so take its last ten entries and flip them.
nP = sorted_importances_idxP.size
fsRFPermutationP = xPE.columns[sorted_importances_idxP[nP - 10:nP]][::-1]
fsRFPermutationP

In [13]:
# Seeded so that re-running the cell reproduces the same forest.
rfNP = RandomForestRegressor(random_state=0)

# Fit on the training data. y_trainNP is a one-column DataFrame, so flatten it
# to the 1-D target scikit-learn expects (avoids a DataConversionWarning).
rfNP.fit(X_trainNP, y_trainNP.values.ravel())

resultNP = permutation_importance(
    rfNP, X_testNP, y_testNP, n_repeats=10, random_state=0, n_jobs=2
)

# argsort is ascending: the most important feature ends up last.
sorted_importances_idxNP = resultNP.importances_mean.argsort()
importancesNP = pd.DataFrame(
    resultNP.importances[sorted_importances_idxNP].T,
    columns=xNPE.columns[sorted_importances_idxNP],
)
ax = importancesNP.plot.box(vert=False, whis=10)
ax.set_title("Permutation Importances on NONPoly family")
ax.axvline(x=0, color="k", linestyle="--")
ax.set_xlabel("Decrease in accuracy score")
ax.figure.tight_layout()
# plt.show() instead of Figure.show(), which warns on non-GUI (inline) backends.
plt.show()



In [14]:
# Ten best features for the NP partition, most important first: the index
# array is ascending, so take its last ten entries and flip them.
nNP = sorted_importances_idxNP.size
fsRFPermutationNP = xNPE.columns[sorted_importances_idxNP[nNP - 10:nNP]][::-1]
fsRFPermutationNP

### Matrix Topology 

In [15]:

# DataFrame.drop returns a new frame, so the result has to be kept; the
# partition column is removed right after loading.
branched = pd.read_csv('data/data_reduced_branched_topo.csv').drop(columns='Matrix_topology')
linear = pd.read_csv('data/data_reduced_linear_topo.csv').drop(columns='Matrix_topology')
xBTE, yBTE = select_columns_elasticity(branched)
xLTE, yLTE = select_columns_elasticity(linear)
# Author's note: these files already have the polymers one-hot encoded.

# First return value is the ascending index ranking, second the raw result.
fsRFPermutationBT, resultB = fs_permutation_importance(xBTE, yBTE)
fsRFPermutationLT, resultL = fs_permutation_importance(xLTE, yLTE)

# Getting the 10 best features.
fsRFPermutationBT_names, fsRFPermutationBT_idx = get_best_t(x=xBTE, sorted_idx=fsRFPermutationBT, t=10)
fsRFPermutationLT_names, fsRFPermutationLT_idx = get_best_t(x=xLTE, sorted_idx=fsRFPermutationLT, t=10)

get_plot(result=resultB, xE=xBTE, sorted_idx=fsRFPermutationBT_idx, title="Permutation Importances Branched Topology")
get_plot(result=resultL, xE=xLTE, sorted_idx=fsRFPermutationLT_idx, title="Permutation Importances Linear Topology")


In [16]:
# Column names at positions 1 through 9 of the linear-topology features.
xLTE.columns.to_numpy()[1:10]

### Matrix Crystallinity

In [None]:
# DataFrame.drop returns a new frame, so the result has to be kept; the
# partition column is removed right after loading.
high = pd.read_csv('data/data_reduced_high_crys.csv').drop(columns='Matrix_crystallinity')
low = pd.read_csv('data/data_reduced_low_crys.csv').drop(columns='Matrix_crystallinity')
amorp = pd.read_csv('data/data_reduced_amorp_crys.csv').drop(columns='Matrix_crystallinity')

xHCE, yHCE = select_columns_elasticity(high)
xLCE, yLCE = select_columns_elasticity(low)
xACE, yACE = select_columns_elasticity(amorp)


In [None]:
# Rank the features of each crystallinity partition (ascending index order).
fsRFPermutationHC, resultHC = fs_permutation_importance(xHCE, yHCE)
fsRFPermutationLC, resultLC = fs_permutation_importance(xLCE, yLCE)
fsRFPermutationAC, resultAC = fs_permutation_importance(xACE, yACE)

# Keep the ten top-ranked features per partition.
fsRFPermutationHC_names, fsRFPermutationHC_idx = get_best_t(
    x=xHCE, sorted_idx=fsRFPermutationHC, t=10
)
fsRFPermutationLC_names, fsRFPermutationLC_idx = get_best_t(
    x=xLCE, sorted_idx=fsRFPermutationLC, t=10
)
fsRFPermutationAC_names, fsRFPermutationAC_idx = get_best_t(
    x=xACE, sorted_idx=fsRFPermutationAC, t=10
)

# One box plot per partition, in the order high, low, amorphous.
get_plot(
    result=resultHC,
    xE=xHCE,
    sorted_idx=fsRFPermutationHC_idx,
    title="Permutation Importances High Crystalinity",
)
get_plot(
    result=resultLC,
    xE=xLCE,
    sorted_idx=fsRFPermutationLC_idx,
    title="Permutation Importances Low Crystalinity",
)
get_plot(
    result=resultAC,
    xE=xACE,
    sorted_idx=fsRFPermutationAC_idx,
    title="Permutation Importances amorphous Crystalinity",
)


### Polymer Matrix type

In [None]:
# One CSV per polymer matrix type, read in a single unpacking.
# NOTE(review): every other file in this notebook is named 'data/data_reduced_*';
# the doubled 'datadata_' prefix here may be a typo -- confirm against the data folder.
HDPE, LDPE, LLDPE, PET, PS1, PS2, PP1, PP2, PP3, PA = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/datadata_reduced_hdpe.csv',
        'data/datadata_reduced_ldpe.csv',
        'data/datadata_reduced_lldpe.csv',
        'data/datadata_reduced_pet.csv',
        'data/datadata_reduced_ps1.csv',
        'data/datadata_reduced_ps2.csv',
        'data/datadata_reduced_pp1.csv',
        'data/datadata_reduced_pp2.csv',
        'data/datadata_reduced_pp3.csv',
        'data/datadata_reduced_pa.csv',
    )
)

In [None]:
# Column names of the PA table as a NumPy array.
PA.columns.to_numpy()

### trying to automate with pipeline

In [None]:
# A scikit-learn Pipeline only accepts estimator objects (fit / transform, with
# a predictor allowed only as the last step), so the plain helper functions are
# called directly and only the model itself is wrapped in a Pipeline.

# NOTE(review): PreprocessTypePlastic requires the columns 'MinorityPolymer' and
# 'MajorityPolymer' in `data` -- confirm that data_reduced.csv still has them.
encoded = PreprocessTypePlastic().fit(data).transform(data)
X = select_columns_elasticity(encoded)[0]
y = data['blendE']

X_fit, X_eval, y_fit, y_eval = train_test_split(
    X, y, test_size=0.1, random_state=0
)

pipeline2 = Pipeline([
    ('rf', RandomForestRegressor(random_state=0)),
])
pipeline2.fit(X_fit, y_fit)

# Permutation importance on the held-out rows, ranked in ascending order.
perm_result = permutation_importance(
    pipeline2, X_eval, y_eval, n_repeats=10, random_state=0, n_jobs=2
)
sorted_idx = perm_result.importances_mean.argsort()

# Names (and positions) of the ten top-ranked features.
best_features, best_features_idx = get_best_t(x=X, sorted_idx=sorted_idx, t=10)

print("Best Features:")
print(best_features)

    

# gaussian


In [None]:
# Build the split in this cell so it does not rely on leftover kernel state.
# NOTE(review): assumes every column of xE is numeric -- confirm.
xE, yE = select_columns_elasticity(data)
X_train, X_test, y_train, y_test = train_test_split(
    xE, yE, test_size=0.1, random_state=0
)

# Fit the model on the training rows (1-D target).
gpr = GaussianProcessRegressor()
gpr.fit(X_train, y_train.values.ravel())

# Predictions on the rows the model was fitted on.
y_pred_train = gpr.predict(X_train)

# Predictions on the held-out rows.
y_pred = gpr.predict(X_test)

# The model has many input columns, so everything is drawn against the single
# feature 'matrixE'. Markers are used instead of lines because the rows are
# not ordered by 'matrixE'.
plt.plot(xE['matrixE'], yE.values.ravel(), "ko", markersize=6)
plt.plot(X_train['matrixE'], y_pred_train, "r.")
plt.plot(X_test['matrixE'], y_pred, "b.")
plt.legend(["Training data", "Interpolation", "Extrapolation"])
plt.xlabel("x")
plt.ylabel("f(x)")
plt.show()