## Ridge and Lasso regression

In [8]:
## importing libraries:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

In [9]:
df = pd.read_csv('tested.csv')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [10]:
df.shape

(418, 12)

In [11]:
# step-2: check missing values:

df.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age             86
SibSp            0
Parch            0
Ticket           0
Fare             1
Cabin          327
Embarked         0
dtype: int64

In [12]:
# dropmg columns which has nan values:

df.drop(columns = ['Age','Cabin'], axis = 1,inplace=True)

In [13]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,SibSp,Parch,Ticket,Fare,Embarked
0,892,0,3,"Kelly, Mr. James",male,0,0,330911,7.8292,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,1,0,363272,7.0,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,0,0,240276,9.6875,Q
3,895,0,3,"Wirz, Mr. Albert",male,0,0,315154,8.6625,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,1,1,3101298,12.2875,S


In [14]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Embarked'],
      dtype='object')

In [15]:
cols = df[['Pclass','Sex','SibSp','Parch','Embarked']]

In [16]:
cols.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked
0,3,male,0,0,Q
1,3,female,1,0,S
2,2,male,0,0,Q
3,3,male,0,0,S
4,3,female,1,1,S


In [17]:
cols.isnull().sum()

Pclass      0
Sex         0
SibSp       0
Parch       0
Embarked    0
dtype: int64

In [18]:
# step-3 : applying label encoder to categorical data:

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [19]:
col1 = ['Sex','Embarked']

In [20]:
cols[col1]= cols[col1].apply(le.fit_transform)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [16]:
cols.head()

Unnamed: 0,Pclass,Sex,SibSp,Parch,Embarked
0,3,1,0,0,1
1,3,0,1,0,2
2,2,1,0,0,1
3,3,1,0,0,2
4,3,0,1,1,2


In [17]:
## creating dummy values on pclass:

df1 = cols['Pclass']

df2 = pd.get_dummies(df1,drop_first=True)
df2

Unnamed: 0,2,3
0,0,1
1,0,1
2,1,0
3,0,1
4,0,1
...,...,...
413,0,1
414,0,0
415,0,1
416,0,1


In [18]:
cols.drop(['Pclass'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [19]:
cols.head()

Unnamed: 0,Sex,SibSp,Parch,Embarked
0,1,0,0,1
1,0,1,0,2
2,1,0,0,1
3,1,0,0,2
4,0,1,1,2


In [20]:
df3 = pd.concat([cols,df2],axis=1)
df3.head()

Unnamed: 0,Sex,SibSp,Parch,Embarked,2,3
0,1,0,0,1,0,1
1,0,1,0,2,0,1
2,1,0,0,1,1,0
3,1,0,0,2,0,1
4,0,1,1,2,0,1


In [21]:
# step-4: model building:

x = df3.copy()
x.head()

Unnamed: 0,Sex,SibSp,Parch,Embarked,2,3
0,1,0,0,1,0,1
1,0,1,0,2,0,1
2,1,0,0,1,1,0
3,1,0,0,2,0,1
4,0,1,1,2,0,1


In [22]:
y = df['Survived']
y.head()

0    0
1    1
2    0
3    0
4    1
Name: Survived, dtype: int64

In [23]:
# train test split:

from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=1)

In [24]:
len(x_train)

292

In [25]:
len(x_test)

126

In [26]:
# scaling:

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

scaler.fit_transform(x_train)
scaler.transform(x_test)

array([[ 0.78310898, -0.51121335, -0.38880303, -0.40213824, -0.5191967 ,
         0.97965889],
       [ 0.78310898, -0.51121335, -0.38880303,  0.73790415,  1.92605229,
        -1.02076346],
       [ 0.78310898, -0.51121335, -0.38880303, -1.54218064, -0.5191967 ,
         0.97965889],
       [ 0.78310898, -0.51121335, -0.38880303,  0.73790415, -0.5191967 ,
        -1.02076346],
       [-1.27696148,  0.61965255,  0.581543  ,  0.73790415, -0.5191967 ,
         0.97965889],
       [ 0.78310898, -0.51121335, -0.38880303,  0.73790415,  1.92605229,
        -1.02076346],
       [-1.27696148,  4.01225026,  1.55188903,  0.73790415, -0.5191967 ,
         0.97965889],
       [ 0.78310898, -0.51121335, -0.38880303,  0.73790415, -0.5191967 ,
        -1.02076346],
       [ 0.78310898, -0.51121335, -0.38880303, -0.40213824, -0.5191967 ,
         0.97965889],
       [ 0.78310898, -0.51121335, -0.38880303,  0.73790415, -0.5191967 ,
         0.97965889],
       [ 0.78310898, -0.51121335, -0.38880303,  0.

In [27]:
# uing lr model:

from sklearn.linear_model import LinearRegression
lr = LinearRegression()

In [28]:
lr.fit(x_train,y_train)

LinearRegression()

In [29]:
# usiing RFE:

from sklearn.feature_selection import RFE

rfe = RFE(lr,3)
rfe.fit(x,y)



RFE(estimator=LinearRegression(), n_features_to_select=3)

In [30]:
# making list of a  column selected by rfe:

col3 = x_train.columns[rfe.support_]
col3

Index(['Sex', 'Embarked', 2], dtype='object')

In [54]:
# update x_train having 3 columns ith rfe support:

x_train= x_train[col3]

In [42]:
x_train.shape

(415, 3)

In [55]:
# update x_test having 3 columns with rfe supprt:

x_test = x_test[col3]

In [56]:
x_test.shape

(126, 3)

In [57]:
# using lasso:

from sklearn.linear_model import Lasso

lasso = Lasso(alpha=0.0001)
lasso.fit(x_train,y_train)

Lasso(alpha=0.0001)

In [70]:
lasso.predict(x_test).shape

(126,)

In [69]:
lasso.coef_

array([-0.99957561, -0.        , -0.        ])

In [64]:
# mean squared error:

from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test,lasso.predict(x_test))
mse

4.007543591590342e-08

In [65]:
# using ridge:

from sklearn.linear_model import Ridge

ridge = Ridge(alpha=0.0001)
ridge.fit(x_train,y_train)

Ridge(alpha=0.0001)

In [66]:
ridge.predict(x_test).shape

(126,)

In [67]:
ridge.coef_

array([-9.99998523e-01, -2.51820601e-08, -1.67805078e-07])

In [68]:
# mean squared error:

from sklearn.metrics import mean_squared_error
mse = mean_squared_error(y_test,ridge.predict(x_test))
mse

4.933676296467313e-13