# РК2


ИУ5-65Б: Метод опорных векторов, Градиентный бустинг

In [1]:
import numpy as np
import pandas as pd
from typing import Dict, Tuple
from scipy import stats
from IPython.display import Image
from io import StringIO
from IPython.display import Image
import graphviz
import pydotplus
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor, export_graphviz
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import ExtraTreesClassifier, ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_squared_log_error, median_absolute_error, r2_score
from sklearn.metrics import roc_curve, roc_auc_score
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(style="ticks")

In [2]:
from google.colab import files

uploaded = files.upload()

for filename in uploaded.keys():
  print('Uploaded file "{name}" with length bytes'.format(name='houses_to_rent.csv', length=len(uploaded['houses_to_rent.csv'])))

Saving houses_to_rent.csv to houses_to_rent.csv
Uploaded file "houses_to_rent.csv" with length bytes


## Загрузка и первичный анализ данных


In [3]:
data = pd.read_csv('houses_to_rent.csv', sep=",")

In [4]:
data.shape

(6080, 14)

In [5]:
# типы колонок
data.dtypes

Unnamed: 0         int64
city               int64
area               int64
rooms              int64
bathroom           int64
parking spaces     int64
floor             object
animal            object
furniture         object
hoa               object
rent amount       object
property tax      object
fire insurance    object
total             object
dtype: object

In [6]:
data.head(5)

Unnamed: 0.1,Unnamed: 0,city,area,rooms,bathroom,parking spaces,floor,animal,furniture,hoa,rent amount,property tax,fire insurance,total
0,0,1,240,3,3,4,-,acept,furnished,R$0,"R$8,000","R$1,000",R$121,"R$9,121"
1,1,0,64,2,1,1,10,acept,not furnished,R$540,R$820,R$122,R$11,"R$1,493"
2,2,1,443,5,5,4,3,acept,furnished,"R$4,172","R$7,000","R$1,417",R$89,"R$12,680"
3,3,1,73,2,2,1,12,acept,not furnished,R$700,"R$1,250",R$150,R$16,"R$2,116"
4,4,1,19,1,1,0,-,not acept,not furnished,R$0,"R$1,200",R$41,R$16,"R$1,257"


In [7]:
data.replace('-', np.nan, inplace=True)

In [8]:
# кол-во пропущенные значений (тк нулевые значения в данном датасете - конкретное значения)
nan_count = (data.isna().sum())
print(nan_count)

Unnamed: 0           0
city                 0
area                 0
rooms                0
bathroom             0
parking spaces       0
floor             1555
animal               0
furniture            0
hoa                  0
rent amount          0
property tax         0
fire insurance       0
total                0
dtype: int64


## Обрабработка данных

In [9]:
data = data.drop('floor', axis=1)
data = data.drop('Unnamed: 0', axis=1)

In [10]:
def convert_to_float(value):
    try:
        return float(value.replace('R', '').replace('$', '').replace(',', '.'))
    except ValueError:
        return np.nan

data['hoa'] = data['hoa'].apply(convert_to_float)
data['rent amount'] = data['rent amount'].apply(convert_to_float)
data['property tax'] = data['property tax'].apply(convert_to_float)
data['fire insurance'] = data['fire insurance'].apply(convert_to_float)
data['total'] = data['total'].apply(convert_to_float)


In [11]:
data['total'] = data['total'] * 1000

In [12]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df_int = le.fit_transform(data['animal'])
data['animal'] = df_int
df_int = le.fit_transform(data['furniture'])
data['furniture'] = df_int

In [13]:
data['hoa'].fillna(data['hoa'].median(), inplace=True)
data['property tax'].fillna(data['property tax'].median(), inplace=True)

In [14]:
data.head()

Unnamed: 0,city,area,rooms,bathroom,parking spaces,animal,furniture,hoa,rent amount,property tax,fire insurance,total
0,1,240,3,3,4,0,0,0.0,8.0,1.0,121.0,9121.0
1,0,64,2,1,1,0,1,540.0,820.0,122.0,11.0,1493.0
2,1,443,5,5,4,0,0,4.172,7.0,1.417,89.0,12680.0
3,1,73,2,2,1,0,1,700.0,1.25,150.0,16.0,2116.0
4,1,19,1,1,0,1,1,0.0,1.2,41.0,16.0,1257.0


In [15]:
nan_count = (data.isnull().sum())
print(nan_count)

city              0
area              0
rooms             0
bathroom          0
parking spaces    0
animal            0
furniture         0
hoa               0
rent amount       0
property tax      0
fire insurance    0
total             0
dtype: int64


## Разделение на обучающую и тестовую выборки.

In [16]:
target = data['total']
data_X_train, data_X_test, data_y_train, data_y_test = train_test_split(
    data, target, test_size=0.2, random_state=1)

In [17]:
data_X_train.shape, data_X_test.shape, data_y_train.shape, data_y_test.shape

((4864, 12), (1216, 12), (4864,), (1216,))

# Метод опорных векторов

In [18]:
from sklearn.svm import SVC, NuSVC, LinearSVC, OneClassSVM, SVR, NuSVR, LinearSVR

svr_1 = SVR() # задача регрессии
svr_1.fit(data_X_train, data_y_train)

In [19]:
data_y_pred_1 = svr_1.predict(data_X_test)

In [20]:
mean_absolute_error(data_y_test, data_y_pred_1), mean_squared_error(data_y_test, data_y_pred_1)

(24600.715122863443, 19094975128.615555)

## Модель с масштабированием данных

In [21]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
data_scaled = scaler.fit_transform(data)
data_scaled = pd.DataFrame(data_scaled, columns=data.columns)

In [22]:
target_2 = data_scaled['total']
data_X_train_2, data_X_test_2, data_y_train_2, data_y_test_2 = train_test_split(
    data_scaled, target_2, test_size=0.2, random_state=1)

In [23]:
svr_2 = SVR() # задача регрессии
svr_2.fit(data_X_train_2, data_y_train_2)

In [24]:
data_y_pred_2 = svr_2.predict(data_X_test_2)

In [25]:
mean_absolute_error(data_y_test_2, data_y_pred_2), mean_squared_error(data_y_test_2, data_y_pred_2)

(0.07029521425571589, 0.005165783043996089)

# Градиентный бустинг

## Модель градиентного бустинга с использованием библиотеки catboost для бинарной классификации по признаку "animal"

In [27]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.5-cp310-cp310-manylinux2014_x86_64.whl (98.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.5


In [29]:
from catboost import CatBoostClassifier

target_3 = data['animal']
data_X_train_3, data_X_test_3, data_y_train_3, data_y_test_3 = train_test_split(
    data, target_3, test_size=0.2, random_state=1)

ct = CatBoostClassifier(iterations=100, learning_rate=0.1, depth=6, loss_function='Logloss')

ct.fit(data_X_train_3, data_y_train_3, verbose=False)

data_y_pred_3 = ct.predict(data_X_test_3)

In [30]:
mean_absolute_error(data_y_test_3, data_y_pred_3), mean_squared_error(data_y_test_3, data_y_pred_3)

(0.0, 0.0)

In [32]:
accuracy_score(data_y_test_3, data_y_pred_3)

1.0

In [33]:
f1_score(data_y_test_3, data_y_pred_3, average='micro')

1.0

In [34]:

f1_score(data_y_test_3, data_y_pred_3, average='macro')

1.0

In [35]:
f1_score(data_y_test_3, data_y_pred_3, average='weighted')

1.0

**Вывод**: модель градиентного бустинга показала себя лучше, чем модель, основанная на методе опорных векторов