# SVM (Support Vector Machine)

In [1]:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn import svm
from warnings import filterwarnings
filterwarnings('ignore')

In [2]:
# Read the training data from a CSV in the current working directory,
# then preview the first rows as the cell's output.
dataset = pd.read_csv('train.csv')
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [3]:
dataset.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [4]:
dataset.shape

(889, 12)

In [5]:
# Preprocessing
dataset.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

In [6]:
# Drop Cabin: it is null in 687 of the 889 rows, so it carries too little
# information to impute sensibly.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent: re-running it after Cabin is already gone is a no-op rather
# than a KeyError.
dataset = dataset.drop(columns=['Cabin'], errors='ignore')

In [7]:
dataset.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,S


In [8]:
dataset.isnull().sum()

PassengerId    0
Survived       0
Pclass         0
Name           0
Sex            0
Age            0
SibSp          0
Parch          0
Ticket         0
Fare           0
Embarked       0
dtype: int64

In [9]:
# Dropping irrelevant columns: PassengerId, Name and Ticket are not used as
# model features in this notebook.
# Rebinding with errors='ignore' (rather than an in-place drop) makes the cell
# safe to re-run once the columns have already been removed.
dataset = dataset.drop(columns=['PassengerId', 'Name', 'Ticket'], errors='ignore')

In [10]:
dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [11]:
# Conversion of variables: replace the string-valued categorical columns with
# integer codes so they can be fed to the classifier.
# A separate fitted encoder is kept per column in `label_encoders`, so each
# mapping can later be inspected (`.classes_`) or reversed
# (`.inverse_transform`). Re-fitting one shared encoder would discard the
# mapping of every column but the last.
# NOTE(review): the integer codes impose an arbitrary ordering on Embarked;
# consider one-hot encoding if that ordering should not influence the model.
label_encoders = {}
for col in ['Sex', 'Embarked']:
    le = preprocessing.LabelEncoder()
    dataset[col] = le.fit_transform(dataset[col])
    label_encoders[col] = le

In [12]:
dataset.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,1,22.0,1,0,7.25,2
1,1,1,0,38.0,1,0,71.2833,0
2,1,3,0,26.0,0,0,7.925,2
3,1,1,0,35.0,1,0,53.1,2
4,0,3,1,35.0,0,0,8.05,2


In [13]:
# Applying SVM
def SVM(dataset, dv, test_size=0.3, random_state=0, gamma=0.01, C=100):
    """Fit an SVC that predicts column ``dv`` from every other column and report hold-out metrics.

    The frame is split into train and test parts, the classifier is fitted on
    the training part, and the accuracy and confusion matrix on the test part
    are printed and returned.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input frame; every column except ``dv`` is used as a feature.
        Assumes all feature columns are already numerically encoded.
    dv : str
        Name of the dependent (target) column.
    test_size : float, optional
        Forwarded to ``train_test_split``. Default 0.3.
    random_state : int, optional
        Seed forwarded to ``train_test_split`` so the split is reproducible.
        Default 0.
    gamma : float, optional
        Kernel coefficient forwarded to ``svm.SVC``. Default 0.01.
    C : float, optional
        Regularisation parameter forwarded to ``svm.SVC``. Default 100.

    Returns
    -------
    tuple
        ``(accuracy, confusion)``: the accuracy score and the confusion matrix
        computed on the test part.

    Raises
    ------
    KeyError
        If ``dv`` is not a column of ``dataset``.
    """
    y = dataset[dv]
    X = dataset.drop(columns=[dv])
    cols = list(X.columns)
    print('\nThe Independent Variables are ',cols)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # NOTE(review): features are fed to the classifier unscaled; wide-ranging
    # columns such as Fare may dominate. Consider a scaler fitted on X_train.
    clf = svm.SVC(gamma=gamma, C=C)
    clf.fit(X_train, y_train)
    ypred = clf.predict(X_test)

    # Compute each metric once so the printed and returned values always agree.
    accuracy = accuracy_score(y_test, ypred)
    confusion = confusion_matrix(y_test, ypred)
    print('\nAccuracy Score:',accuracy)
    print('\nConfusion matrix:\n',confusion)
    return accuracy, confusion
    
# Use each of these columns in turn as the target and fit a classifier on the
# remaining columns; a rule line and a heading are printed before each run.
DV = [
    'Pclass',
    'Survived',
    'Sex',
    'SibSp',
    'Parch',
    'Embarked',
]
for dv in DV:
    print('-----------------------------------------------------------------------------------------')
    print('\nThe Dependent Variable is', dv)
    SVM(dataset, dv)

-----------------------------------------------------------------------------------------

The Dependent Variable is Pclass

The Independent Variables are  ['Survived', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

Accuracy Score: 0.9026217228464419

Confusion matrix:
 [[ 64   4   2]
 [  4  36   9]
 [  1   6 141]]
-----------------------------------------------------------------------------------------

The Dependent Variable is Survived

The Independent Variables are  ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

Accuracy Score: 0.7415730337078652

Confusion matrix:
 [[124  33]
 [ 36  74]]
-----------------------------------------------------------------------------------------

The Dependent Variable is Sex

The Independent Variables are  ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']

Accuracy Score: 0.7265917602996255

Confusion matrix:
 [[ 66  32]
 [ 41 128]]
-----------------------------------------------------------------------------

# Inference
The Accuracy is 0.9026, i.e. 90.26%, when the Dependent Variable is Pclass.

The Accuracy is 0.7416, i.e. 74.16%, when the Dependent Variable is Survived.

The Accuracy is 0.7266, i.e. 72.66%, when the Dependent Variable is Sex.

The Accuracy is 0.7490, i.e. 74.90%, when the Dependent Variable is SibSp.

The Accuracy is 0.7977, i.e. 79.77%, when the Dependent Variable is Parch.

The Accuracy is 0.7677, i.e. 76.77%, when the Dependent Variable is Embarked.

Hence, the Accuracy is highest when the Dependent Variable is Pclass.