# Практика застосування SVM-класифікатора. EDA

## EDA датасету PetFinder

### Знайомство з даними

In [1]:
import warnings
import pandas as pd
import numpy as np
import category_encoders as ce
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
data = pd.read_csv('../datasets/mod_04_topic_08_petfinder_data.csv.gz')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11537 entries, 0 to 11536
Data columns (total 15 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Type           11537 non-null  object
 1   Age            11537 non-null  int64 
 2   Breed1         11537 non-null  object
 3   Gender         11537 non-null  object
 4   Color1         11537 non-null  object
 5   Color2         11537 non-null  object
 6   MaturitySize   11537 non-null  object
 7   FurLength      11537 non-null  object
 8   Vaccinated     11537 non-null  object
 9   Sterilized     11537 non-null  object
 10  Health         11537 non-null  object
 11  Fee            11537 non-null  int64 
 12  Description    11528 non-null  object
 13  PhotoAmt       11537 non-null  int64 
 14  AdoptionSpeed  11537 non-null  int64 
dtypes: int64(4), object(11)
memory usage: 1.3+ MB


In [3]:
data.nunique()

Type                 2
Age                105
Breed1             166
Gender               2
Color1               7
Color2               7
MaturitySize         3
FurLength            3
Vaccinated           3
Sterilized           3
Health               3
Fee                 65
Description      10691
PhotoAmt            31
AdoptionSpeed        5
dtype: int64

### Адаптація ознак і цільової змінної

In [4]:
data['Description'].head()

0    Nibble is a 3+ month old ball of cuteness. He ...
1    I just found it alone yesterday near my apartm...
2    Their pregnant mother was dumped by her irresp...
3    Good guard dog, very alert, active, obedience ...
4    This handsome yet cute boy is up for adoption....
Name: Description, dtype: object

In [5]:
data.drop('Description', axis=1, inplace=True)

In [6]:
data['AdoptionSpeed'].value_counts().sort_index()

AdoptionSpeed
0     329
1    2432
2    3153
3    2543
4    3080
Name: count, dtype: int64

In [7]:
data['AdoptionSpeed'] = np.where(data['AdoptionSpeed'] == 4, 0, 1)
data['AdoptionSpeed'].value_counts()

AdoptionSpeed
1    8457
0    3080
Name: count, dtype: int64

In [8]:
data['Fee'] = data['Fee'].astype(bool).astype(int).astype(str)

# Практика застосування SVM-класифікатора. Підготовка й обробка даних

### Розбиття на тренувальну і тестову вибірки

In [9]:
X_train, X_test, y_train, y_test = (
    train_test_split(
        data.drop('AdoptionSpeed', axis=1),
        data['AdoptionSpeed'],
        test_size=0.2,
        random_state=42))

In [10]:
num_cols = X_train.select_dtypes(exclude='object').columns

kbins = KBinsDiscretizer(encode='ordinal').fit(X_train[num_cols])

X_train[num_cols] = (kbins
                     .transform(
                         X_train[num_cols])
                     .astype(int)
                     .astype(str))

X_test[num_cols] = (kbins
                    .transform(
                        X_test[num_cols])
                    .astype(int)
                    .astype(str))

### Кодування категоріальних змінних

In [11]:
encoder = ce.TargetEncoder()

X_train = encoder.fit_transform(X_train, y_train)
X_test = encoder.transform(X_test)

X_train.head()

Unnamed: 0,Type,Age,Breed1,Gender,Color1,Color2,MaturitySize,FurLength,Vaccinated,Sterilized,Health,Fee,PhotoAmt
3527,0.719879,0.856507,0.669992,0.760396,0.712899,0.714331,0.725716,0.715193,0.797233,0.808532,0.739566,0.742222,0.719681
6377,0.719879,0.622047,0.669992,0.716901,0.712899,0.714331,0.766751,0.715193,0.688564,0.613491,0.739566,0.742222,0.719681
8256,0.719879,0.893978,0.881118,0.716901,0.733775,0.712644,0.725716,0.715193,0.797233,0.808532,0.739566,0.742222,0.683806
916,0.719879,0.749135,0.669992,0.716901,0.712899,0.755591,0.766751,0.715193,0.689878,0.613491,0.739566,0.742222,0.719681
5972,0.719879,0.622047,0.881118,0.760396,0.712899,0.714331,0.741463,0.715193,0.689878,0.599174,0.739566,0.742222,0.806862


### Нормалізація змінних

In [12]:
scaler = StandardScaler().set_output(transform='pandas')

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Практика застосування SVM-класифікатора. Навчання й оцінка моделі

### Побудова моделі

In [13]:
clf = SVC(class_weight='balanced',
          kernel='poly',
          probability=True,
          random_state=42)

clf.fit(X_train, y_train)

In [14]:
preds = clf.predict(X_test)

confusion_matrix(y_test, preds)

array([[ 404,  239],
       [ 429, 1236]], dtype=int64)

In [15]:
print(f'Model accuracy is: {accuracy_score(y_test, preds):.1%}')

Model accuracy is: 71.1%


### Приклад використання моделі

In [16]:
pet = pd.DataFrame(
    data={
        'Type': 'Cat',
        'Age': 3,
        'Breed1': 'Tabby',
        'Gender': 'Male',
        'Color1': 'Black',
        'Color2': 'White',
        'MaturitySize': 'Small',
        'FurLength': 'Short',
        'Vaccinated': 'No',
        'Sterilized': 'No',
        'Health': 'Healthy',
        'Fee': True,
        'PhotoAmt': 2,
    },
    index=[0])

In [17]:
pet[num_cols] = kbins.transform(pet[num_cols]).astype(int).astype(str)

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    prob = (clf
            .predict_proba(
                scaler
                .transform(
                    encoder
                    .transform(
                        pet)))
            .flatten())

print(f'This pet has a {prob[1]:.1%} probability "of getting adopted"')

This pet has a 80.7% probability "of getting adopted"
