# Implementation of SVM on GOT data

Here, we will use Game of Thrones Character data set to identify whether the character is alive or dead based on various features.

In [1]:
#Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split                      #To split train and test dataset
from sklearn import svm                                                   #To implement svm model
from sklearn.metrics import accuracy_score                                #To measure accuracy score

In [57]:
#Load the data
df_orig = pd.read_csv("character-predictions_pose.csv")
df = df_orig
df.head()

Unnamed: 0,S.No,plod,name,title,male,culture,dateOfBirth,DateoFdeath,mother,father,...,isAliveHeir,isAliveSpouse,isMarried,isNoble,age,numDeadRelations,boolDeadRelations,isPopular,popularity,isAlive
0,1,0.946,Viserys II Targaryen,,1,,,,Rhaenyra Targaryen,Daemon Targaryen,...,0.0,,0,0,,11,1,1,0.605351,0
1,2,0.613,Walder Frey,Lord of the Crossing,1,Rivermen,208.0,,,,...,,1.0,1,1,97.0,1,1,1,0.896321,1
2,3,0.507,Addison Hill,Ser,1,,,,,,...,,,0,1,,0,0,0,0.267559,1
3,4,0.924,Aemma Arryn,Queen,0,,82.0,105.0,,,...,,0.0,1,1,23.0,0,0,0,0.183946,0
4,5,0.383,Sylva Santagar,Greenstone,0,Dornish,276.0,,,,...,,1.0,1,1,29.0,0,0,0,0.043478,1


In [58]:
df.shape

(1946, 30)

In [59]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1946 entries, 0 to 1945
Data columns (total 30 columns):
S.No                 1946 non-null int64
plod                 1946 non-null float64
name                 1946 non-null object
title                938 non-null object
male                 1946 non-null int64
culture              677 non-null object
dateOfBirth          433 non-null float64
DateoFdeath          444 non-null float64
mother               21 non-null object
father               26 non-null object
heir                 23 non-null object
house                1519 non-null object
spouse               276 non-null object
book1                1946 non-null int64
book2                1946 non-null int64
book3                1946 non-null int64
book4                1946 non-null int64
book5                1946 non-null int64
isAliveMother        21 non-null float64
isAliveFather        26 non-null float64
isAliveHeir          23 non-null float64
isAliveSpouse        276 non-

In [60]:
#Remove identity columns
#Data contains few rows where age is negative which is not possible, so remove such rows

removecolumns = ['S.No', 'name', 'title', 'culture', 'DateoFdeath', 'mother', 'father', 'heir', 'house', 'spouse', 'age']
df = df.drop(columns=removecolumns)
df.head()

Unnamed: 0,plod,male,dateOfBirth,book1,book2,book3,book4,book5,isAliveMother,isAliveFather,isAliveHeir,isAliveSpouse,isMarried,isNoble,numDeadRelations,boolDeadRelations,isPopular,popularity,isAlive
0,0.946,1,,0,0,0,0,0,1.0,0.0,0.0,,0,0,11,1,1,0.605351,0
1,0.613,1,208.0,1,1,1,1,1,,,,1.0,1,1,1,1,1,0.896321,1
2,0.507,1,,0,0,0,1,0,,,,,0,1,0,0,0,0.267559,1
3,0.924,0,82.0,0,0,0,0,0,,,,0.0,1,1,0,0,0,0.183946,0
4,0.383,0,276.0,0,0,0,1,0,,,,1.0,1,1,0,0,0,0.043478,1


In [61]:
df.isna().sum()

plod                    0
male                    0
dateOfBirth          1513
book1                   0
book2                   0
book3                   0
book4                   0
book5                   0
isAliveMother        1925
isAliveFather        1920
isAliveHeir          1923
isAliveSpouse        1670
isMarried               0
isNoble                 0
numDeadRelations        0
boolDeadRelations       0
isPopular               0
popularity              0
isAlive                 0
dtype: int64

In [62]:
#Remove above columns where null value pecentage is high

list2 = ['dateOfBirth', 'isAliveMother', 'isAliveFather', 'isAliveHeir', 'isAliveSpouse']

df = df.drop(columns=list2)
df.isna().sum()

plod                 0
male                 0
book1                0
book2                0
book3                0
book4                0
book5                0
isMarried            0
isNoble              0
numDeadRelations     0
boolDeadRelations    0
isPopular            0
popularity           0
isAlive              0
dtype: int64

In [68]:
df.shape

(1946, 14)

In [63]:
#Seperate dependent and independent variables
x = df.drop(columns=['isAlive'])
y = df['isAlive']

In [64]:
#Split the datset into train and test dataset

xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.2)

In [67]:
print(xtrain.shape)
print(xtest.shape)
print(ytrain.shape)
print(ytest.shape)

(1556, 13)
(390, 13)
(1556,)
(390,)


In [69]:
#Train SVM model on train dataset

classifier = svm.SVC()
classifier = classifier.fit(xtrain, ytrain)

In [70]:
classifier

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [71]:
#Predict on test data using the model
pred1 = classifier.predict(xtest)

In [72]:
#Check the accuracy of the model

acc1 = accuracy_score(pred1, ytest)
print(acc1)

0.8051282051282052


In [75]:
#Lets create a function to build a svm model, predict on test data and give the accuracy score

def svmclassifier(xtrain, ytrain,xtest, ytest, kernel, cvalue, gammavalue):
    mod = svm.SVC(kernel = kernel, C = cvalue, gamma = gammavalue)
    mod = mod.fit(xtrain, ytrain)
    pred = mod.predict(xtest)
    acc = accuracy_score(pred, ytest)
    print("Accuray of model is :", acc)

In [None]:
#Lets build the model for various parameters of c and gamma values

In [77]:
mod1 = svmclassifier(xtrain, ytrain, xtest, ytest, 'rbf', 1, 0.1)

('Accuray of model is :', 0.8179487179487179)


In [79]:
mod1 = svmclassifier(xtrain, ytrain, xtest, ytest, 'rbf', 1, 1)

('Accuray of model is :', 0.8076923076923077)


In [80]:
mod1 = svmclassifier(xtrain, ytrain, xtest, ytest, 'rbf', 1, 10)

('Accuray of model is :', 0.7948717948717948)


In [81]:
mod1 = svmclassifier(xtrain, ytrain, xtest, ytest, 'rbf', 1, 100)

('Accuray of model is :', 0.7871794871794872)


In [82]:
mod1 = svmclassifier(xtrain, ytrain, xtest, ytest, 'rbf', 10, 0.1)

('Accuray of model is :', 0.8128205128205128)


In [83]:
mod1 = svmclassifier(xtrain, ytrain, xtest, ytest, 'rbf', 100, 0.1)

('Accuray of model is :', 0.8)


In [84]:
mod1 = svmclassifier(xtrain, ytrain, xtest, ytest, 'rbf', 1000, 0.1)

('Accuray of model is :', 0.7897435897435897)


In [85]:
mod1 = svmclassifier(xtrain, ytrain, xtest, ytest, 'rbf', 10000, 0.1)

('Accuray of model is :', 0.7897435897435897)


In [87]:
#Based on above observations, we get the best model for c = 10, gamma = 0.1
mod1 = svmclassifier(xtrain, ytrain, xtest, ytest, 'rbf', 10, 0.1)

('Accuray of model is :', 0.8128205128205128)
