In [19]:
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
import numpy as np

from sklearn.feature_selection import RFE 

In [11]:
col_names = ['Sex','Length','Diameter','Height','Whole weight','Shucked weight','Viscera weight','Shell weight','Rings']
dataset = pd.read_csv("../Datasets/abalone.csv",header=None, names=col_names)

In [12]:
dataset

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
1,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
2,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
3,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
4,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
...,...,...,...,...,...,...,...,...,...
4173,F,0.565,0.45,0.165,0.887,0.37,0.239,0.249,11
4174,M,0.59,0.44,0.135,0.966,0.439,0.2145,0.2605,10
4175,M,0.6,0.475,0.205,1.176,0.5255,0.2875,0.308,9
4176,F,0.625,0.485,0.15,1.0945,0.531,0.261,0.296,10


In [13]:
from sklearn.preprocessing import LabelEncoder
X = dataset.iloc[1:, :-1].values
y = dataset.iloc[1:, 8].values
X

array([['M', '0.455', '0.365', ..., '0.2245', '0.101', '0.15'],
       ['M', '0.35', '0.265', ..., '0.0995', '0.0485', '0.07'],
       ['F', '0.53', '0.42', ..., '0.2565', '0.1415', '0.21'],
       ...,
       ['M', '0.6', '0.475', ..., '0.5255', '0.2875', '0.308'],
       ['F', '0.625', '0.485', ..., '0.531', '0.261', '0.296'],
       ['M', '0.71', '0.555', ..., '0.9455', '0.3765', '0.495']],
      dtype=object)

In [14]:
y

array(['15', '7', '9', ..., '9', '10', '12'], dtype=object)

In [15]:
from sklearn.preprocessing import LabelEncoder
labelencoder_X = LabelEncoder()
X[:,0] = labelencoder_X.fit_transform(X[:,0])
X

array([[2, '0.455', '0.365', ..., '0.2245', '0.101', '0.15'],
       [2, '0.35', '0.265', ..., '0.0995', '0.0485', '0.07'],
       [0, '0.53', '0.42', ..., '0.2565', '0.1415', '0.21'],
       ...,
       [2, '0.6', '0.475', ..., '0.5255', '0.2875', '0.308'],
       [0, '0.625', '0.485', ..., '0.531', '0.261', '0.296'],
       [2, '0.71', '0.555', ..., '0.9455', '0.3765', '0.495']],
      dtype=object)

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)
#print(X_train.shape)
#print(X_test.shape)
#print(Y_train.shape)
#print(Y_test.shape)

In [17]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

lin_model = LinearRegression()
lin_model = RFE(lin_model, 3, step = 1)
lin_model.fit(X_train, y_train)

RFE(estimator=LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
                               normalize=False),
    n_features_to_select=3, step=1, verbose=0)

In [20]:
from sklearn.metrics import r2_score
y_train_predict = lin_model.predict(X_train)
rmse = (np.sqrt(mean_squared_error(y_train, y_train_predict)))
r2 = r2_score(y_train, y_train_predict)

print("The model performance for training set")
print("--------------------------------------")
print('RMSE is {}'.format(rmse))
print('R2 score is {}'.format(r2))
print("\n")

# model evaluation for testing set
y_test_predict = lin_model.predict(X_test)
rmse = (np.sqrt(mean_squared_error(y_test, y_test_predict)))
r2 = r2_score(y_test, y_test_predict)

print("The model performance for testing set")
print("--------------------------------------")
print('RMSE is {}'.format(rmse))
print('R2 score is {}'.format(r2))

The model performance for training set
--------------------------------------
RMSE is 2.2672728077159956
R2 score is 0.5042215277702632


The model performance for testing set
--------------------------------------
RMSE is 2.3090733761039113
R2 score is 0.4883078882137597


In [21]:
lin_model.ranking_

array([6, 5, 1, 3, 1, 1, 2, 4])