In [0]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.model_selection import cross_val_predict, cross_val_score
from sklearn.model_selection import KFold
from sklearn import metrics  as metrics
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate
from scipy.sparse import csr_matrix

from sklearn.feature_selection import f_regression, mutual_info_regression

import matplotlib.pyplot as plt

In [0]:
df1 = pd.read_csv('network_backup_dataset.csv',delimiter=',', header=0)

In [0]:
# Scalar Encoding
le = preprocessing.LabelEncoder()

row=df1.shape[0]
col=df1.shape[1]

df = df1

# Scalar Encoding of the 3 categorical values
le.fit(df.iloc[0:row,1])
df.iloc[0:row,1]= le.transform(df1.iloc[0:row,1])

le.fit(df.iloc[0:row,3])
df.iloc[0:row,3]= le.transform(df1.iloc[0:row,3])

le.fit(df.iloc[0:row,4])
df.iloc[0:row,4]= le.transform(df1.iloc[0:row,4])
    
X = df.iloc[0:row,0:5]

y = df1.iloc[0:row, 5]

lr = linear_model.LinearRegression()

scores = cross_validate(lr, X, y,scoring='neg_mean_squared_error',cv=10)

print('Simple Regression with scalar encoding')
test_rmse = np.sqrt(np.mean(np.abs(scores['test_score'])))
train_rmse =np.sqrt(np.mean(np.abs(scores['train_score'])))
print('Test RMSE',test_rmse)
print('Train RMSE',train_rmse)

# Plot of actual vs fitted values

y_predicted = cross_val_predict(lr, X, y, cv=10)


fig, ax = plt.subplots()
ax.scatter(x=y, y=y_predicted,color='g',marker='o',s=0.25)
ax.plot([0, y.max()], [y.min(), y.max()],  'k--', lw=4)
ax.set_xlabel('Actual')
ax.set_ylabel('Fitted')
plt.title('Actual vs Fitted Values')
plt.savefig('Actual vs Fitted Values')
plt.show()


# Plot of Fitted vs Residual values
y_residual = y - y_predicted
fig, ax = plt.subplots()
ax.scatter(y_predicted, y_residual,color='g',marker='o',s=0.25)
ax.set_xlabel('Fitted')
ax.set_ylabel('Residual')
plt.title('Fitted vs Residual Values')
plt.savefig('Fitted vs Residual Values')
plt.show()

datapoints = range(row)

# Plot of actual and fitted values
fig, ax = plt.subplots()
a=ax.scatter(datapoints, y,color='r',marker='o',s=0.25)
b=ax.scatter(datapoints, y_predicted,color='g',marker='o',s=0.25)
ax.plot([0, y.max()], [y.min(), y.max()],  'k--', lw=4)
ax.set_xlabel('Data Points')
ax.set_ylabel('Actual and Fitted values')
plt.title('Actual and Fitted Values')
ax.legend((a,b),('Actual','Fitted'))
plt.savefig('Actual and Fitted values')
plt.show()


# Plot of Fitted and Residual values
y_residual = y - y_predicted
fig, ax = plt.subplots()
a=ax.scatter(datapoints, y_residual,color='r',marker='o',s=0.25)
b=ax.scatter(datapoints, y_predicted,color='g',marker='o',s=0.25)
ax.set_xlabel('Data Points')
ax.set_ylabel('Residual and Fitted values ')
plt.title('Fitted and Residual Values')
ax.legend((a,b),('Residual','Fitted'))
plt.savefig('Fitted and Residual Values')
plt.show()


Simple Regression with scalar encoding
('Test RMSE', 0.10193944624209859)
('Train RMSE', 0.10183435819796752)


In [0]:
# Standardization

lr = linear_model.LinearRegression()
scaler = StandardScaler()
data = X


scaler.fit(data)
standardized_data = scaler.transform(data)

scores = cross_validate(lr, standardized_data, y,scoring='neg_mean_squared_error',cv=10)
test_rmse = np.sqrt(np.mean(np.abs(scores['test_score'])))
train_rmse =np.sqrt(np.mean(np.abs(scores['train_score'])))

print('After Standardization')
print('Test RMSE',test_rmse)
print('Train RMSE',train_rmse)

predicted = cross_val_predict(lr, standardized_data, y, cv=10)



After Standardization
('Test RMSE', 0.10193944624209859)
('Train RMSE', 0.10183435819796753)


In [0]:
# Selecting Features

f_test, _ = f_regression(data, y)
f_test /= np.max(f_test)

mi = mutual_info_regression(data, y)
mi /= np.max(mi)

print('Selecting 3 best features from f_regression')
print('f_regression')
print(f_test)

print('Selecting 3 best features from Mutual Information')
print('mutual information')
print(mi)


X_reduced = X.iloc[0:row, [False,True,True,False,True]]
        
scores = cross_validate(lr,X_reduced,y,scoring='neg_mean_squared_error',cv=10)
test_rmse = np.sqrt(np.mean(np.abs(scores['test_score'])))
train_rmse =np.sqrt(np.mean(np.abs(scores['train_score'])))
print('Test RMSE after selecting 3 best features using f_regression ',test_rmse)
print('Train RMSE',train_rmse)

# Plot of actual vs fitted values

y_predicted = cross_val_predict(lr, X_reduced, y, cv=10)

fig, ax = plt.subplots()
ax.scatter(x=y, y=y_predicted,color='g',marker='o',s=0.25)
ax.plot([0, y.max()], [y.min(), y.max()],  'k--', lw=4)
ax.set_xlabel('Actual')
ax.set_ylabel('Fitted')
plt.title('Actual vs Fitted Values')
plt.savefig('After Feature Selection: Actual vs Fitted Values')
plt.show()


# Plot of Fitted vs Residual values
y_residual = y - y_predicted
fig, ax = plt.subplots()
ax.scatter(y_predicted, y_residual,color='g',marker='o',s=0.25)
ax.set_xlabel('Fitted')
ax.set_ylabel('Residual')
plt.title('Fitted vs Residual Values')
plt.savefig('After Feature Selection: Fitted vs Residual Values')
plt.show()

datapoints = range(row)
# Plot of actual and fitted values
fig, ax = plt.subplots()
a=ax.scatter(datapoints, y,color='r',marker='o',s=0.25)
b=ax.scatter(datapoints, y_predicted,color='g',marker='o',s=0.25)
ax.plot([0, y.max()], [y.min(), y.max()],  'k--', lw=4)
ax.set_xlabel('Data Points')
ax.set_ylabel('Actual and Fitted values')
plt.title('After Feature Selection: Actual and Fitted Values')
ax.legend((a,b),('Actual','Fitted'))
plt.savefig('After Feature Selection: Actual and Fitted values')
plt.show()


# Plot of Fitted and Residual values
y_residual = y - y_predicted
fig, ax = plt.subplots()
a=ax.scatter(datapoints, y_residual,color='r',marker='o',s=0.25)
b=ax.scatter(datapoints, y_predicted,color='g',marker='o',s=0.25)
ax.set_xlabel('Data Points')
ax.set_ylabel('Residual and Fitted values ')
plt.title('After Feature Selection: Fitted and Residual Values')
ax.legend((a,b),('Residual','Fitted'))
plt.savefig('After Feature Selection: Fitted and Residual Values')
plt.show()


X_reduced = X.iloc[0:row, [False,False,True,True,True]]
        
scores = cross_validate(lr,X_reduced,y,scoring='neg_mean_squared_error',cv=10)
test_rmse = np.sqrt(np.mean(np.abs(scores['test_score'])))
train_rmse =np.sqrt(np.mean(np.abs(scores['train_score'])))
print('Test RMSE after selecting 3 best features using nutual information' ,test_rmse)
print('Train RMSE',train_rmse)

Selecting 3 best features from f_regression
f_regression
[  1.83383477e-05   4.78772999e-01   3.27138366e-01   5.67261995e-02
   1.00000000e+00]
Selecting 3 best features from Mutual Information
mutual information
[ 0.00414402  0.45521467  0.62228067  0.7461687   1.        ]
('Test RMSE after selecting 3 best features using f_regression ', 0.1018934267654872)
('Train RMSE', 0.10187197479409449)
('Test RMSE after selecting 3 best features using nutual information', 0.10254728677099285)
('Train RMSE', 0.10246541286731993)


In [0]:
# One hot encoding

test_rmse_enc = [0]*32
train_rmse_enc = [0]*32

super_X = []
            

for i in range(0,32):
    
    b = "{0:05b}".format(i)

    b = list(b)
    value = []
    for x in b:
        if x=='0':
            value.append(False)
        else:
            value.append(True)
    enc = OneHotEncoder(categorical_features=value)
    enc.fit(X)
    X_encoded = enc.transform(X)
    
    if i!=0:
        X_encoded = X_encoded.todense()
        
    super_X.append(X_encoded)
    
    
    scores = cross_validate(lr,X_encoded,y,scoring='neg_mean_squared_error',cv=10)
    test_rmse = np.sqrt(np.mean(np.abs(scores['test_score'])))
    train_rmse =np.sqrt(np.mean(np.abs(scores['train_score'])))
    test_rmse_enc[i] = test_rmse
    train_rmse_enc[i] = train_rmse
    
print('Test RMSE of the 32 models') 
print(test_rmse_enc)
print('Minimum Test RMSE',min(test_rmse_enc))
print('Train RMSE of the 32 models')
print(train_rmse_enc)


Test RMSE of the 32 models
[0.10193944624209859, 0.090968244789234071, 0.090965435745335804, 0.090967777891921967, 0.1007047686560026, 0.089573220589374003, 0.089574171641399442, 0.089584844887701701, 0.1009757241245799, 0.089910883168603975, 0.089908188253164198, 0.089920089156094091, 0.099738534894066852, 0.088507099324723421, 0.088507360144404648, 0.088508373687921721, 8065370829.5817604, 17461609275.458393, 15615450317.490131, 26896548047.500687, 7001681485.6499958, 36800500816.852264, 22583523876.873058, 109184256519.27597, 13148003254.87126, 17207308281.642895, 19447376948.255455, 52596226886.327904, 23400956820.067589, 245424233914.4487, 18502358572.67556, 555674402231.9502]
('Minimum Test RMSE', 0.088507099324723421)
Train RMSE of the 32 models
[0.10183435819796752, 0.090793594435595593, 0.090796238285714856, 0.090793626996754584, 0.10058577720887095, 0.089385277370006191, 0.089386331588134707, 0.089385689070760124, 0.10088992814449357, 0.089758534916399665, 0.08975818893131426

In [0]:
# Large difference in Train and Test error is observed when all the features are vector encoded

# We try to improve the results by regularization

# We optimize the parameters for the best model i.e the one with first parameter scalar encoded and the rest vector encoded

X_chosen = super_X[16]

print('Ridge Regularizer')
al = [0.1,0.2,0.5,1,2,4,8,16,20,24,28,32,36,40,44,48,60,80,100]

test_rmse_ridge = []

for a in al:

    ridge = Ridge(alpha=a)
    lr_ridge = ridge.fit(X_chosen, y)

    scores = cross_validate(lr_ridge,X_chosen,y,scoring='neg_mean_squared_error',cv=10)
    test_rmse = np.sqrt(np.mean(np.abs(scores['test_score'])))
    train_rmse =np.sqrt(np.mean(np.abs(scores['train_score'])))

    test_rmse_ridge.append(test_rmse)
    print(test_rmse,'alpha=',a)
    
plt.plot(al,test_rmse_ridge,'x',linestyle='-')
plt.xlabel('Alpha value')
plt.ylabel('Test RMSE ')
plt.title('Test RMS value vs alpha using Ridge Regularization')
plt.show()
plt.savefig('Test RMS value vs alpha using Ridge Regularization')



print('Lasso Regularizer')
al = [0.001,0.002,0.003,0.004,0.005]

test_rmse_lasso = []

for a in al:
    lasso = Lasso(alpha=a)
    lr_lasso = lasso.fit(X_chosen, y)

    scores = cross_validate(lr_lasso,X_chosen,y,scoring='neg_mean_squared_error',cv=10)
    test_rmse = np.sqrt(np.mean(np.abs(scores['test_score'])))
    train_rmse =np.sqrt(np.mean(np.abs(scores['train_score'])))

    print(test_rmse, 'alpha=', a)
  
    test_rmse_lasso.append(test_rmse)
    
plt.plot(al,test_rmse_lasso,'x',linestyle='-')
plt.xlabel('Alpha value')
plt.ylabel('Test RMSE ')
plt.title('Test RMS value vs alpha using Lasso Regularization')
plt.show()
plt.savefig('Test RMS value vs alpha using Lasso Regularization')

print('Elastic Net Regression')

l = [0,0.01,0.02,0.03,0.04,0.05,0.06,0.07]

test_rmse_elastic = []
for r in l:

    elasticNet = ElasticNet(alpha=.001, l1_ratio=r, random_state=0)
    lr_elastic = elasticNet.fit(X_chosen, y)

    scores = cross_validate(lr_elastic,X_encoded,y,scoring='neg_mean_squared_error',cv=10)
    test_rmse = np.sqrt(np.mean(np.abs(scores['test_score'])))
    train_rmse =np.sqrt(np.mean(np.abs(scores['train_score'])))

    print('Test RMSE',test_rmse)
    print('Train RMSE',train_rmse)
    test_rmse_elastic.append(test_rmse)
    
# Best model 
# Best value seems to be alpha = 0.001 and l1_ratio=0.04

elasticNet = ElasticNet(alpha=.001, l1_ratio=0.04, random_state=0)
lr_elastic = elasticNet.fit(X_chosen, y)
y_predicted = cross_val_predict(lr_elastic, X_chosen, y, cv=10)   
scores = cross_validate(lr_elastic,X_encoded,y,scoring='neg_mean_squared_error',cv=10)
test_rmse = np.sqrt(np.mean(np.abs(scores['test_score'])))
train_rmse =np.sqrt(np.mean(np.abs(scores['train_score'])))

print(test_rmse)
print(train_rmse)

plt.plot(l,test_rmse_elastic,'x',linestyle='-')
plt.xlabel('L1 ratio')
plt.ylabel('Test RMSE ')
plt.title('Test RMS value vs l1 ratio using Elastic Net Regularization')
plt.show()
plt.savefig('Test RMS value vs l1 ratio using Elastic Net Regularization')   




datapoints = range(row)
# Plot of actual and fitted values
fig, ax = plt.subplots()
a=ax.scatter(datapoints, y,color='r',marker='o',s=0.25)
b=ax.scatter(datapoints, y_predicted,color='g',marker='o',s=0.25)
ax.plot([0, y.max()], [y.min(), y.max()],  'k--', lw=4)
ax.set_xlabel('Data Points')
ax.set_ylabel('Actual and Fitted values')
plt.title('Best Model: Actual and Fitted Values')
ax.legend((a,b),('Actual','Fitted'))
plt.savefig('Best Model: Actual and Fitted values')
plt.show()


# Plot of Fitted and Residual values
y_residual = y - y_predicted
fig, ax = plt.subplots()
a=ax.scatter(datapoints, y_residual,color='r',marker='o',s=0.25)
b=ax.scatter(datapoints, y_predicted,color='g',marker='o',s=0.25)
ax.set_xlabel('Data Points')
ax.set_ylabel('Residual and Fitted values ')
plt.title('Best Model: Fitted and Residual Values')
ax.legend((a,b),('Residual','Fitted'))
plt.savefig('Best Model: Fitted and Residual Values')
plt.show()




Ridge Regularizer
(0.10851836822962944, 'alpha=', 0.1)
(0.1075738201990262, 'alpha=', 0.2)
(0.10571574017401884, 'alpha=', 0.5)
(0.10419401459286225, 'alpha=', 1)
(0.10305106183058908, 'alpha=', 2)
(0.10243776629898954, 'alpha=', 4)
(0.10218846954135967, 'alpha=', 8)
(0.10210308285123543, 'alpha=', 16)
(0.10209037500930061, 'alpha=', 20)
(0.10208266393855286, 'alpha=', 24)
(0.10207743964450718, 'alpha=', 28)
(0.10207359927009213, 'alpha=', 32)
(0.1020705951089065, 'alpha=', 36)
(0.10206812942950977, 'alpha=', 40)
(0.10206602870644642, 'alpha=', 44)
(0.10206418603171387, 'alpha=', 48)
(0.1020596224038189, 'alpha=', 60)
(0.10205367644291624, 'alpha=', 80)
(0.10204872771630495, 'alpha=', 100)
Lasso Regularizer
(0.10193834069233614, 'alpha=', 0.001)
(0.10194992688185092, 'alpha=', 0.002)
(0.10194842622436756, 'alpha=', 0.003)
(0.10193610187392778, 'alpha=', 0.004)
(0.10192944490108458, 'alpha=', 0.005)
Elastic Net Regression
('Test RMSE', 0.088522949070799023)
('Train RMSE', 0.088335299311

In [0]:
# Ridge Regularization


print('Ridge Regularizer')
al = [0.1,0.2,0.5,1,2,4,8,16,20,24,28,32,36,40,44,48,60,80,100]

test_rmse_ridge = []

for a in al:
    test_rmse = 0
    for X_chosen in super_X:
    

        ridge = Ridge(alpha=a)
        lr_ridge = ridge.fit(X_chosen, y)

        scores = cross_validate(lr_ridge,X_chosen,y,scoring='neg_mean_squared_error',cv=10)
        test_rmse += np.sqrt(np.mean(np.abs(scores['test_score'])))
        train_rmse =np.sqrt(np.mean(np.abs(scores['train_score'])))

    test_rmse_ridge.append(test_rmse/32.0)
    print(test_rmse/32.0,a)
    
plt.plot(al,test_rmse_ridge,'x',linestyle='-')
plt.xlabel('Alpha value')
plt.ylabel('Average test RMSE value among the 32 models')
plt.title('Average test RMS value vs alpha using Ridge Regularization')
plt.show()
plt.savefig('Average test RMS value vs alpha using Ridge Regularization')


In [0]:
# Lasso Regularization

al = [0,0.001,0.002,0.003,0.004,0.005]

test_rmse_lasso = []


for a in al:
    test_rmse = 0
    for X_chosen in super_X:
    

        lasso = Lasso(alpha=a)
        lr_lasso = lasso.fit(X_chosen, y)

        scores = cross_validate(lr_lasso,X_chosen,y,scoring='neg_mean_squared_error',cv=10)
        test_rmse += np.sqrt(np.mean(np.abs(scores['test_score'])))
        train_rmse =np.sqrt(np.mean(np.abs(scores['train_score'])))

    test_rmse_lasso.append(test_rmse/32.0)
    print(test_rmse_lasso/32.0,a)
    
plt.plot(al,test_rmse_lasso,'x',linestyle='-')
plt.xlabel('Alpha value')
plt.ylabel('Average test RMSE value among the 32 models')
plt.title('Average test RMS value vs alpha using Lasso Regularization')
plt.show('Average test RMS value vs alpha using Lasso Regularization')



TypeError: unsupported operand type(s) for /: 'list' and 'float'