### Support Vector Regression Implementation

In [1]:
# Import the dataset (seaborn's built-in "tips" dataset) and preview the first rows
import seaborn as sns
df = sns.load_dataset('tips')
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [2]:
# Summarise column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [3]:
# For the categorical columns, look at how many rows fall into each category
df['sex'].value_counts()

sex
Male      157
Female     87
Name: count, dtype: int64

In [4]:
# Category frequencies for the 'smoker' column
df['smoker'].value_counts()


smoker
No     151
Yes     93
Name: count, dtype: int64

In [5]:
# Category frequencies for the 'day' column
df['day'].value_counts()


day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64

In [6]:
# Category frequencies for the 'time' column
df['time'].value_counts()


time
Dinner    176
Lunch      68
Name: count, dtype: int64

### Let's do feature encoding for the categorical data
 1. label encoding
 2. one hot encoding

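 Label encoding - each category is replaced with an integer code (0 and 1 for a two-category column).
One-hot encoding - each category gets its own column; the column matching the row's value is 1 and all the others are 0.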

## Always remember to do the train-test split before encoding to prevent DATA LEAKAGE

In [7]:
# List the column names so the feature/target split below can be written out explicitly
df.columns

Index(['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size'], dtype='object')

In [8]:
# Independent features (X) and dependent/target feature (y):
# the model predicts 'total_bill' from every other column.
X = df[['tip', 'sex', 'smoker', 'day', 'time', 'size']]
y = df['total_bill']

In [9]:
# Train/test split: hold out 25% of the rows for testing.
# random_state is fixed so the split is reproducible across runs.
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=10)

In [10]:
# Preview the training features before any encoding
X_train.head()

Unnamed: 0,tip,sex,smoker,day,time,size
58,1.76,Male,Yes,Sat,Dinner,2
1,1.66,Male,No,Sun,Dinner,3
2,3.5,Male,No,Sun,Dinner,3
68,2.01,Male,No,Sat,Dinner,2
184,3.0,Male,Yes,Sun,Dinner,2


In [11]:
# Feature encoding for the categorical data
# Label encoding: one encoder per two-category column, so each keeps its own
# fitted mapping (le1 -> 'sex', le2 -> 'smoker', le3 -> 'time' in the next cell).
from sklearn.preprocessing import LabelEncoder
le1 = LabelEncoder()
le2 = LabelEncoder()
le3 = LabelEncoder()

In [12]:
# Label-encode the two-category columns, fitting each encoder on the training
# split only so nothing is learned from the test rows.
#
# assign() builds a new DataFrame instead of writing column-by-column into the
# frame returned by train_test_split (which pandas may flag as a copy of df and
# answer with SettingWithCopyWarning). With the cause removed there is no need
# for a blanket warnings.filterwarnings('ignore'), which would also hide every
# genuine warning raised by later cells.
X_train = X_train.assign(
    sex=le1.fit_transform(X_train['sex']),
    smoker=le2.fit_transform(X_train['smoker']),
    time=le3.fit_transform(X_train['time']),
)

In [13]:
# Check the result: 'sex', 'smoker' and 'time' are now integer codes, 'day' is still text
X_train.head()

Unnamed: 0,tip,sex,smoker,day,time,size
58,1.76,1,1,Sat,0,2
1,1.66,1,0,Sun,0,3
2,3.5,1,0,Sun,0,3
68,2.01,1,0,Sat,0,2
184,3.0,1,1,Sun,0,2


In [14]:
# Apply the same label encoding to the test split.
# Only transform() is used here: the encoders were fitted on the training data,
# so the test rows are mapped with the training-time category codes.
# assign() returns a new DataFrame rather than writing into the frame returned
# by train_test_split, which avoids pandas' SettingWithCopyWarning.
X_test = X_test.assign(
    sex=le1.transform(X_test['sex']),
    smoker=le2.transform(X_test['smoker']),
    time=le3.transform(X_test['time']),
)

In [15]:
# Check the label-encoded test features ('day' is still text at this point)
X_test.head()

Unnamed: 0,tip,sex,smoker,day,time,size
162,2.0,0,0,Sun,0,3
60,3.21,1,1,Sat,0,2
61,2.0,1,1,Sat,0,2
63,3.76,1,1,Sat,0,4
69,2.09,1,1,Sat,0,2


In [16]:
# One Hot Encoding-----Column transformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

In [17]:
# One-hot encode the 'day' column and leave every other column untouched.
#
# - The column is selected by name rather than by position 3. A positional index
#   silently encodes the wrong column if the column order changes, or if the
#   fit cell is re-run on the already-encoded array; a name fails loudly instead.
# - The column spec is a list so the encoder receives 2-D input
#   (OneHotEncoder does not accept a 1-D column).
# - drop='first' drops the first category to avoid a redundant indicator column.
# - remainder='passthrough' keeps the non-encoded columns; with 'drop' only the
#   encoded columns would be kept and all the others discarded.
ct = ColumnTransformer(
    transformers=[('onehot', OneHotEncoder(drop='first'), ['day'])],
    remainder='passthrough',
)

In [19]:
# Fit the column transformer on the training split and replace X_train with the encoded result.
# NOTE(review): X_train is rebound here, so re-running this cell feeds the already-encoded
# output back into ct -- restart the kernel and run all cells instead of re-running it.
X_train = ct.fit_transform(X_train)

In [20]:
# Encode the test split with the transformer fitted on the training split (transform only, no refit).
# NOTE(review): X_test is rebound here, so this cell is not safe to re-run on its own.
X_test = ct.transform(X_test)

In [24]:
X_test  # first 3 columns are the one-hot encoding of 'day'; the remaining columns are the passthrough columns (tip, sex, smoker, time, size)

array([[0.  , 1.  , 0.  , 2.  , 0.  , 0.  , 0.  , 3.  ],
       [1.  , 0.  , 0.  , 3.21, 1.  , 1.  , 0.  , 2.  ],
       [1.  , 0.  , 0.  , 2.  , 1.  , 1.  , 0.  , 2.  ],
       [1.  , 0.  , 0.  , 3.76, 1.  , 1.  , 0.  , 4.  ],
       [1.  , 0.  , 0.  , 2.09, 1.  , 1.  , 0.  , 2.  ],
       [0.  , 0.  , 1.  , 5.  , 1.  , 1.  , 1.  , 2.  ],
       [0.  , 1.  , 0.  , 3.51, 1.  , 0.  , 0.  , 2.  ],
       [1.  , 0.  , 0.  , 5.16, 1.  , 1.  , 0.  , 4.  ],
       [0.  , 1.  , 0.  , 5.  , 1.  , 0.  , 0.  , 2.  ],
       [1.  , 0.  , 0.  , 3.6 , 1.  , 0.  , 0.  , 3.  ],
       [0.  , 1.  , 0.  , 5.65, 1.  , 1.  , 0.  , 2.  ],
       [1.  , 0.  , 0.  , 2.5 , 0.  , 1.  , 0.  , 3.  ],
       [0.  , 0.  , 1.  , 1.44, 1.  , 0.  , 1.  , 2.  ],
       [1.  , 0.  , 0.  , 3.09, 0.  , 1.  , 0.  , 4.  ],
       [0.  , 1.  , 0.  , 2.  , 1.  , 0.  , 0.  , 4.  ],
       [0.  , 0.  , 1.  , 1.36, 0.  , 0.  , 1.  , 3.  ],
       [0.  , 0.  , 1.  , 2.  , 0.  , 0.  , 1.  , 2.  ],
       [0.  , 0.  , 1.  , 1.68,

###  Support Vector Regression

In [25]:
from sklearn.svm import SVR

In [26]:
# Baseline model: SVR with its default hyperparameters, trained on the encoded training data.
# NOTE(review): the features are not scaled before fitting; SVR is usually sensitive to
# feature scale, so consider adding a scaler -- TODO confirm the effect on the scores.
svr = SVR()
svr.fit(X_train,y_train)

In [27]:
# Predict total_bill for the held-out test rows with the baseline model
y_predict = svr.predict(X_test)

In [30]:
from sklearn.metrics import r2_score,mean_absolute_error

# Evaluate the baseline model on the test split.
# The scores are stored first and printed afterwards: print() returns None, so
# `r2 = print(...)` would leave the variable empty. The second metric is the
# mean absolute error, so it is named `mae` (not `mse`).
r2 = r2_score(y_test, y_predict)
mae = mean_absolute_error(y_test, y_predict)
print(r2)
print(mae)


0.46028114561159283
4.1486423210190235


### Hyperparameter Tuning

In [31]:
from sklearn.model_selection import GridSearchCV

In [32]:
# Search space for the RBF-kernel SVR: 5 values of C x 4 values of gamma = 20 candidates.
para_grid = dict(
    C=[0.1, 1, 10, 100, 1000],
    gamma=[1, 0.1, 0.01, 0.001],
    kernel=['rbf'],
)

# Cross-validated grid search over the candidates above. refit=True retrains the
# best candidate on the whole training set so `grid` can be used for prediction;
# verbose=3 logs the score of every individual fit.
grid = GridSearchCV(
    estimator=SVR(),
    param_grid=para_grid,
    refit=True,
    verbose=3,
)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 20 candidates, totalling 100 fits
[CV 1/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.067 total time=   0.0s
[CV 2/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.058 total time=   0.0s
[CV 3/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.145 total time=   0.0s
[CV 4/5] END ........C=0.1, gamma=1, kernel=rbf;, score=0.025 total time=   0.0s
[CV 5/5] END .......C=0.1, gamma=1, kernel=rbf;, score=-0.089 total time=   0.0s
[CV 1/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.013 total time=   0.0s
[CV 2/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.021 total time=   0.0s
[CV 3/5] END .....C=0.1, gamma=0.1, kernel=rbf;, score=-0.010 total time=   0.0s
[CV 4/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.124 total time=   0.0s
[CV 5/5] END ......C=0.1, gamma=0.1, kernel=rbf;, score=0.050 total time=   0.0s
[CV 1/5] END ....C=0.1, gamma=0.01, kernel=rbf;, score=-0.053 total time=   0.0s
[CV 2/5] END ....C=0.1, gamma=0.01, kernel=rbf;

In [33]:
# Hyperparameter combination that scored best in cross-validation
grid.best_params_

{'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}

In [34]:
# Predict on the test split with the refitted best estimator found by the grid search
grid_pred = grid.predict(X_test)

In [35]:
# Evaluate the tuned model on the test split.
# The scores are stored first and printed afterwards: print() returns None, so
# assigning its result would leave the variables empty. The second metric is the
# mean absolute error, so it is named `mae_sc` (not `mse_sc`).
r2sc = r2_score(y_test, grid_pred)
mae_sc = mean_absolute_error(y_test, grid_pred)
print(r2sc)
print(mae_sc)

0.5066965174863194
3.8772383122901077
