# SVM for Cell Samples

In [1]:
import pandas as pd
import numpy as np
from sklearn.datasets.samples_generator import make_blobs
from sklearn.model_selection import train_test_split, cross_val_predict
from sklearn.svm import LinearSVC
from sklearn.linear_model import Perceptron
from sklearn.metrics import confusion_matrix, accuracy_score, roc_curve, precision_score, recall_score, f1_score, precision_recall_curve
from sklearn.metrics import classification_report
from matplotlib import pyplot as plt
%matplotlib inline




In [2]:
df = pd.read_csv("F:\Dataset\CELL_SAMPLES.csv")

In [3]:
df.head()

Unnamed: 0,ID,Clump,UnifSize,UnifShape,MargAdh,SingEpiSize,BareNuc,BlandChrom,NormNucl,Mit,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2


In [4]:
df.shape

(699, 11)

In [5]:
df.isnull().sum()

ID             0
Clump          0
UnifSize       0
UnifShape      0
MargAdh        0
SingEpiSize    0
BareNuc        0
BlandChrom     0
NormNucl       0
Mit            0
Class          0
dtype: int64

In [6]:
df.dropna(inplace=True)
df.isnull().sum()

ID             0
Clump          0
UnifSize       0
UnifShape      0
MargAdh        0
SingEpiSize    0
BareNuc        0
BlandChrom     0
NormNucl       0
Mit            0
Class          0
dtype: int64

In [7]:
df.isnull().values.any()

False

In [8]:
df.nunique()

ID             645
Clump           10
UnifSize        10
UnifShape       10
MargAdh         10
SingEpiSize     10
BareNuc         11
BlandChrom      10
NormNucl        10
Mit              9
Class            2
dtype: int64

In [9]:
def mapping_cat_to_num (col_name):
    df[col_name].unique()
    col_list = list(df[col_name].unique())
    col_dict = {}
    for i in range (len(col_list)):
        col_dict[col_list[i]] = i
    return col_dict


In [10]:
categorical_column = list(df.select_dtypes(exclude = ['number']).columns)

In [11]:
categorical_column

['BareNuc']

In [12]:
cleanup_nums = {}
for i in categorical_column:
    cleanup_nums[i] = mapping_cat_to_num(i)


In [13]:
cleanup_nums

{'BareNuc': {'1': 0,
  '10': 1,
  '2': 2,
  '4': 3,
  '3': 4,
  '9': 5,
  '7': 6,
  '?': 7,
  '5': 8,
  '8': 9,
  '6': 10}}

In [14]:
df.replace(cleanup_nums, inplace=True)

In [15]:
df

Unnamed: 0,ID,Clump,UnifSize,UnifShape,MargAdh,SingEpiSize,BareNuc,BlandChrom,NormNucl,Mit,Class
0,1000025,5,1,1,1,2,0,3,1,1,2
1,1002945,5,4,4,5,7,1,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,3,3,7,1,2
4,1017023,4,1,1,3,2,0,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,0,1,1,1,2
696,888820,5,10,10,3,7,4,8,10,2,4
697,897471,4,8,6,4,3,3,10,6,1,4


In [17]:
X = df.iloc[:,1:10]
Y = df.iloc[:,10]

In [30]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2,random_state=90)

In [31]:
svc = LinearSVC()
svc.fit(x_train, y_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [32]:
y_pred = svc.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print("Accuracy:",accuracy_score(y_test, y_pred))

[[86  2]
 [ 3 49]]
              precision    recall  f1-score   support

           2       0.97      0.98      0.97        88
           4       0.96      0.94      0.95        52

    accuracy                           0.96       140
   macro avg       0.96      0.96      0.96       140
weighted avg       0.96      0.96      0.96       140

Accuracy: 0.9642857142857143


In [33]:
clf = Perceptron(tol=1e-3, random_state=0)
clf.fit(x_train, y_train)
Perceptron()
clf.score(x_train, y_train)

0.962432915921288

In [34]:
y_pred = clf.predict(x_test)
print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print("Accuracy:",accuracy_score(y_test, y_pred))

[[83  5]
 [ 0 52]]
              precision    recall  f1-score   support

           2       1.00      0.94      0.97        88
           4       0.91      1.00      0.95        52

    accuracy                           0.96       140
   macro avg       0.96      0.97      0.96       140
weighted avg       0.97      0.96      0.96       140

Accuracy: 0.9642857142857143
