# Regression Dataset 5 (Superconductivty Data)

In [30]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve 
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import VotingRegressor
import math

In [3]:
df = pd.read_csv("train.csv")
df

Unnamed: 0,number_of_elements,mean_atomic_mass,wtd_mean_atomic_mass,gmean_atomic_mass,wtd_gmean_atomic_mass,entropy_atomic_mass,wtd_entropy_atomic_mass,range_atomic_mass,wtd_range_atomic_mass,std_atomic_mass,...,wtd_mean_Valence,gmean_Valence,wtd_gmean_Valence,entropy_Valence,wtd_entropy_Valence,range_Valence,wtd_range_Valence,std_Valence,wtd_std_Valence,critical_temp
0,4,88.944468,57.862692,66.361592,36.116612,1.181795,1.062396,122.90607,31.794921,51.968828,...,2.257143,2.213364,2.219783,1.368922,1.066221,1,1.085714,0.433013,0.437059,29.00
1,5,92.729214,58.518416,73.132787,36.396602,1.449309,1.057755,122.90607,36.161939,47.094633,...,2.257143,1.888175,2.210679,1.557113,1.047221,2,1.128571,0.632456,0.468606,26.00
2,4,88.944468,57.885242,66.361592,36.122509,1.181795,0.975980,122.90607,35.741099,51.968828,...,2.271429,2.213364,2.232679,1.368922,1.029175,1,1.114286,0.433013,0.444697,19.00
3,4,88.944468,57.873967,66.361592,36.119560,1.181795,1.022291,122.90607,33.768010,51.968828,...,2.264286,2.213364,2.226222,1.368922,1.048834,1,1.100000,0.433013,0.440952,22.00
4,4,88.944468,57.840143,66.361592,36.110716,1.181795,1.129224,122.90607,27.848743,51.968828,...,2.242857,2.213364,2.206963,1.368922,1.096052,1,1.057143,0.433013,0.428809,23.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21258,4,106.957877,53.095769,82.515384,43.135565,1.177145,1.254119,146.88130,15.504479,65.764081,...,3.555556,3.223710,3.519911,1.377820,0.913658,1,2.168889,0.433013,0.496904,2.44
21259,5,92.266740,49.021367,64.812662,32.867748,1.323287,1.571630,188.38390,7.353333,69.232655,...,2.047619,2.168944,2.038991,1.594167,1.337246,1,0.904762,0.400000,0.212959,122.10
21260,2,99.663190,95.609104,99.433882,95.464320,0.690847,0.530198,13.51362,53.041104,6.756810,...,4.800000,4.472136,4.781762,0.686962,0.450561,1,3.200000,0.500000,0.400000,1.98
21261,2,99.663190,97.095602,99.433882,96.901083,0.690847,0.640883,13.51362,31.115202,6.756810,...,4.690000,4.472136,4.665819,0.686962,0.577601,1,2.210000,0.500000,0.462493,1.84


Checking for any null values

In [4]:
df.isna().sum()

number_of_elements       0
mean_atomic_mass         0
wtd_mean_atomic_mass     0
gmean_atomic_mass        0
wtd_gmean_atomic_mass    0
                        ..
range_Valence            0
wtd_range_Valence        0
std_Valence              0
wtd_std_Valence          0
critical_temp            0
Length: 82, dtype: int64

# Data Preprocessing

In [19]:
copy_df = df.copy(deep=True)
y = copy_df['critical_temp']
X = copy_df.drop(['critical_temp'], axis=1)

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [21]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [22]:
#Validation metrics for Regression algorithms
def validationmetrics_reg(model,testX,testY, verbose=True):
    
    predictions = model.predict(testX)
       
    #y_pred = model.predict(testX)
    df = pd.DataFrame({'Actual': testY, 'Predicted': predictions})
    
    # R-squared
    r2 = r2_score(testY,predictions)
    
    # Adjusted R-squared
    r2_adjusted = 1-(1-r2)*(testX.shape[0]-1)/(testX.shape[0]-testX.shape[1]-1)
    
    # MSE
    mse = mean_squared_error(testY,predictions)
    
    #RMSE
    rmse = math.sqrt(mse)
    
    if verbose:
        print("R-Squared Value: ", r2)
        print("Adjusted R-Squared: ", r2_adjusted)
        print("RMSE: ", rmse)
        #print(df)
    
    res_map = {
                "r2": r2,
                "r2_adjusted": r2_adjusted,
                "rmse": rmse,
                "model_obj": model,
                #"df": df
              }
         

In [18]:
# Regression Algorithms
def LinearReg(trainX, testX, trainY, testY, verbose=True, clf=None):
    if not clf:
        clf  = LinearRegression()
    clf.fit(trainX , trainY)
    return validationmetrics_reg(clf, testX, testY, verbose=verbose)

def RandomForestReg(trainX, testX, trainY, testY, verbose=True, clf=None):
    if not clf:
        clf = RandomForestRegressor(n_estimators=100)
    clf.fit(trainX , trainY)
    return validationmetrics_reg(clf, testX, testY, verbose=verbose)

def GradientBoostingReg(trainX, testX, trainY, testY, verbose=True, clf=None):
    if not clf:
        clf = GradientBoostingRegressor()
    clf.fit(trainX , trainY)
    return validationmetrics_reg(clf, testX, testY, verbose=verbose)

def AdaBooostReg(trainX, testX, trainY, testY, verbose=True, clf=None):
    if not clf:
        clf = AdaBoostRegressor(random_state=0, n_estimators=100)
    clf.fit(trainX , trainY)
    return validationmetrics_reg(clf, testX, testY, verbose=verbose)



def VotingReg(trainX, testX, trainY, testY, verbose=True, clf=None):
    lr = LinearRegression()
    rf = RandomForestRegressor(n_estimators=100)
    gb = GradientBoostingRegressor()
    ab = AdaBoostRegressor(random_state=0, n_estimators=100)
    if not clf:
        clf = VotingRegressor([('rf', rf), ('lr', lr), ('gb', gb), ('ab', ab)])
    clf.fit(trainX , trainY)
    return validationmetrics_reg(clf, testX, testY, verbose=verbose)

# Principle Component Analysis DR

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

copy_df = df.copy(deep=True)

y = copy_df['critical_temp']
X = copy_df.drop(['critical_temp'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [33]:
from sklearn.decomposition import PCA
pca = PCA()
X_train = pca.fit_transform(X_train)
X_test = pca.transform(X_test)

# MLR

In [26]:
LinearReg(X_train, X_test, y_train, y_test, verbose=True, clf=None)

R-Squared Value:  0.7350976364618504
Adjusted R-Squared:  0.729953284640563
RMSE:  17.559079451644354


# Voting Regression

In [31]:
VotingReg(X_train, X_test, y_train, y_test, verbose=True, clf=None)

R-Squared Value:  0.8547744784575282
Adjusted R-Squared:  0.8519542273798633
RMSE:  13.001106016947073


# t-SNE DR

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

copy_df = df.copy(deep=True)

y = copy_df['critical_temp']
X = copy_df.drop(['critical_temp'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [None]:
#takes plenty of time to execute
from sklearn.manifold import TSNE
import time
#time_start = time.time()
tsne = TSNE(n_components = 2, random_state = 0)
#tsne = TSNE()
X_train =tsne.fit_transform(X_train)
X_test = tsne.fit_transform(X_test)

#print('t-SNE done! Time elapsed: {} seconds'.format(time.time()-time_start))

# MLR

In [34]:
LinearReg(X_train, X_test, y_train, y_test, verbose=True, clf=None)

R-Squared Value:  0.7350976364618504
Adjusted R-Squared:  0.729953284640563
RMSE:  17.559079451644358


# Voting Regression

In [39]:
VotingReg(X_train, X_test, y_train, y_test, verbose=True, clf=None)

R-Squared Value:  0.8549267900394896
Adjusted R-Squared:  0.8521094968228026
RMSE:  12.99428649071536


# Lasso L1 FS

In [40]:
from sklearn.linear_model import  Lasso 
model = Lasso(alpha=0.1)
model.fit(X, y)

# get importance
importance = model.coef_

#feat_importances = pd.Series(importance, index=X_train.columns)

In [43]:
res = pd.Series(importance, index=X.columns.values).sort_values(ascending=False)*100
res.nlargest(10)
#print(res)
threshold = 10
impftrs = list(res[res > threshold].keys())
    #impftrs.append(label_col)
    
print ("Selected Features =" + str(impftrs))


Selected Features =['entropy_ThermalConductivity', 'wtd_range_Valence', 'wtd_entropy_FusionHeat', 'wtd_entropy_ThermalConductivity', 'std_ElectronAffinity', 'wtd_mean_ElectronAffinity', 'std_FusionHeat', 'mean_atomic_radius', 'range_Valence', 'wtd_mean_atomic_radius', 'wtd_mean_ThermalConductivity', 'mean_atomic_mass', 'mean_FusionHeat', 'gmean_ElectronAffinity', 'wtd_std_atomic_radius', 'wtd_range_FusionHeat', 'std_ThermalConductivity', 'range_atomic_mass', 'range_atomic_radius', 'wtd_gmean_FusionHeat']


In [44]:
from sklearn.model_selection import train_test_split

Selected_Features =['entropy_ThermalConductivity', 'wtd_range_Valence', 'wtd_entropy_FusionHeat', 'wtd_entropy_ThermalConductivity', 'std_ElectronAffinity', 'wtd_mean_ElectronAffinity', 'std_FusionHeat', 'mean_atomic_radius', 'range_Valence', 'wtd_mean_atomic_radius', 'wtd_mean_ThermalConductivity', 'mean_atomic_mass', 'mean_FusionHeat', 'gmean_ElectronAffinity', 'wtd_std_atomic_radius', 'wtd_range_FusionHeat', 'std_ThermalConductivity', 'range_atomic_mass', 'range_atomic_radius', 'wtd_gmean_FusionHeat']

copy_df = df.copy(deep=True)

y = copy_df['critical_temp']
X = copy_df[Selected_Features]

 #manual split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)


# MLR

In [45]:
LinearReg(X_train, X_test, y_train, y_test, verbose=True, clf=None)

R-Squared Value:  0.5960356232335577
Adjusted R-Squared:  0.5941265288253987
RMSE:  21.683548803490755


# voting Regression

In [46]:
VotingReg(X_train, X_test, y_train, y_test, verbose=True, clf=None)

R-Squared Value:  0.8221514575431634
Adjusted R-Squared:  0.821310963486184
RMSE:  14.387455139186024


# Recursive Feature Elimination FS

In [48]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

copy_df = df.copy(deep=True)

y = copy_df['critical_temp']
X = copy_df.drop(['critical_temp'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# MLR

In [49]:
LinearReg(X_train, X_test, y_train, y_test, verbose=True, clf=None)

R-Squared Value:  0.7350976364618504
Adjusted R-Squared:  0.729953284640563
RMSE:  17.559079451644354


# voting Regression

In [50]:
VotingReg(X_train, X_test, y_train, y_test, verbose=True, clf=None)

R-Squared Value:  0.8547419753333847
Adjusted R-Squared:  0.8519210930514389
RMSE:  13.002560833279173


# Random Forest Feature Selection

In [52]:
from sklearn.ensemble import RandomForestRegressor
trees=35
threshold = 1
clf  = RandomForestRegressor(n_estimators=trees, random_state=43)
clf.fit(X_train, y_train)
#validationmetrics(clf,testX,testY)
res = pd.Series(clf.feature_importances_, index=X.columns.values).sort_values(ascending=False)*100
res.nlargest(10)

impftrs = list(res[res > threshold].keys())
    #impftrs.append(label_col)
    
print ("Selected Features =" + str(impftrs))


Selected Features =['range_ThermalConductivity', 'wtd_gmean_ThermalConductivity', 'std_atomic_mass', 'wtd_gmean_Valence', 'wtd_entropy_ThermalConductivity', 'std_Density', 'mean_Density', 'range_atomic_radius']


In [53]:
Selected_Features =['range_ThermalConductivity', 'wtd_gmean_ThermalConductivity', 'std_atomic_mass', 'wtd_gmean_Valence', 'wtd_entropy_ThermalConductivity', 'std_Density', 'mean_Density', 'range_atomic_radius']

In [54]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

copy_df = df.copy(deep=True)

y = copy_df['critical_temp']
X = copy_df[Selected_Features]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# MLR

In [55]:
LinearReg(X_train, X_test, y_train, y_test, verbose=True, clf=None)

R-Squared Value:  0.5306583882568381
Adjusted R-Squared:  0.5297736726833354
RMSE:  23.37240254131212


# Voting Regression

In [56]:
VotingReg(X_train, X_test, y_train, y_test, verbose=True, clf=None)

R-Squared Value:  0.8133072552212592
Adjusted R-Squared:  0.8129553367579628
RMSE:  14.740850706409313


# XGBoost FS

In [58]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

copy_df = df.copy(deep=True)

y = copy_df['critical_temp']
X = copy_df.drop(['critical_temp'], axis=1)

In [59]:
from xgboost import plot_importance
from xgboost import XGBRegressor
from matplotlib import pyplot
# fit model no training data
model = XGBRegressor()
model.fit(X, y)

model.feature_importances_
# eval model
#X_train = X_train.feature_names

array([4.2191874e-05, 1.7712238e-04, 4.6843546e-04, 7.7097898e-04,
       1.8977236e-03, 4.6787944e-04, 3.4179803e-04, 1.4333441e-03,
       1.6497705e-03, 7.9780258e-03, 1.0751603e-03, 9.1478886e-04,
       5.6213245e-04, 1.4577749e-03, 6.4081344e-04, 7.0154131e-04,
       1.2919659e-03, 2.5012335e-03, 1.7838139e-03, 1.1008037e-03,
       6.6611433e-04, 1.2285092e-03, 4.9558200e-04, 9.7456184e-04,
       5.1447074e-04, 7.5199036e-04, 5.4272846e-04, 2.5924237e-02,
       9.4335625e-04, 3.7393263e-03, 9.7590522e-04, 4.4796518e-03,
       1.1236536e-03, 7.1711657e-03, 1.3665581e-03, 3.6313445e-03,
       1.2841166e-03, 4.4343213e-04, 9.8304846e-04, 1.3008382e-02,
       4.7172917e-04, 8.8417344e-04, 1.1174984e-03, 1.4615644e-02,
       2.7600154e-03, 5.8266573e-04, 9.5098867e-04, 8.9433067e-04,
       1.2135064e-03, 2.8338931e-03, 3.9688209e-03, 7.0860924e-04,
       7.2597258e-04, 1.2870702e-03, 1.1737121e-03, 1.2025309e-03,
       1.2831900e-03, 1.4585519e-03, 4.4313032e-04, 1.1323994e

In [60]:
feature = []
for col,score in zip(X.columns,model.feature_importances_):
    feature.append(col)
features = pd.Series(feature)

In [61]:
X = X[features]

In [62]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

# MLR

In [63]:
LinearReg(X_train, X_test, y_train, y_test, verbose=True, clf=None)

R-Squared Value:  0.7350976364618504
Adjusted R-Squared:  0.729953284640563
RMSE:  17.559079451644354


# Voting Regression

In [64]:
VotingReg(X_train, X_test, y_train, y_test, verbose=True, clf=None)

R-Squared Value:  0.8549271023778133
Adjusted R-Squared:  0.8521098152266752
RMSE:  12.994272502550876
