Importando módulos iniciais

In [1]:
import pandas as pd
import numpy as np
from sklearn import neighbors, metrics, preprocessing
np.set_printoptions(suppress=True)

Abrindo o arquivo insurance.csv como dataframe chamado originalDF

In [2]:
originalDF = pd.read_csv('insurance.csv', encoding = "ISO-8859-1")

Codificando os campos que não eram compostos por números, e limpando nulos e duplicados, resultando em um novo dataframe encodedDF

In [3]:
label_encoder = preprocessing.LabelEncoder()

encodedDF = originalDF.copy()
encodedDF['region']= label_encoder.fit_transform(encodedDF['region'])
encodedDF['sex']= label_encoder.fit_transform(encodedDF['sex'])
encodedDF['smoker']= label_encoder.fit_transform(encodedDF['smoker'])

encodedDF = encodedDF[~np.isnan(encodedDF).any(axis=1)]
encodedDF.dropna()
encodedDF.drop_duplicates()

encodedDF.reset_index(drop=True, inplace=True)
encodedDF

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.900,0,1,3,16884.92400
1,18,1,33.770,1,0,2,1725.55230
2,28,1,33.000,3,0,2,4449.46200
3,33,1,22.705,0,0,1,21984.47061
4,32,1,28.880,0,0,1,3866.85520
...,...,...,...,...,...,...,...
1333,50,1,30.970,3,0,1,10600.54830
1334,18,0,31.920,0,0,0,2205.98080
1335,18,0,36.850,0,0,2,1629.83350
1336,21,0,25.800,0,0,3,2007.94500


Normalizando as colunas necessários

In [4]:
from sklearn.preprocessing import StandardScaler

scaledArr = StandardScaler().fit_transform(encodedDF.loc[:, ['bmi', 'charges']].values)
encodedDF[['bmi', 'charges']] = pd.DataFrame(data = scaledArr, columns = ['bmi', 'charges'])
encodedDF

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,-0.453320,0,1,3,0.298584
1,18,1,0.509621,1,0,2,-0.953689
2,28,1,0.383307,3,0,2,-0.728675
3,33,1,-1.305531,0,0,1,0.719843
4,32,1,-0.292556,0,0,1,-0.776802
...,...,...,...,...,...,...,...
1333,50,1,0.050297,3,0,1,-0.220551
1334,18,0,0.206139,0,0,0,-0.914002
1335,18,0,1.014878,0,0,2,-0.961596
1336,21,0,-0.797813,0,0,3,-0.930362


Verificando as covariâncias de cada variável

In [5]:
corr = encodedDF.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
age,1.0,-0.020856,0.109272,0.042469,-0.025019,0.002127,0.299008
sex,-0.020856,1.0,0.046371,0.017163,0.076185,0.004588,0.057292
bmi,0.109272,0.046371,1.0,0.012759,0.00375,0.157566,0.198341
children,0.042469,0.017163,0.012759,1.0,0.007673,0.016569,0.067998
smoker,-0.025019,0.076185,0.00375,0.007673,1.0,-0.002181,0.787251
region,0.002127,0.004588,0.157566,0.016569,-0.002181,1.0,-0.006208
charges,0.299008,0.057292,0.198341,0.067998,0.787251,-0.006208,1.0


Separando o rótulo dos componentes, sendo y o rótulo

In [6]:
y = encodedDF.loc[:, ['charges']].values
X = encodedDF.loc[:, ['age','bmi','smoker']].values

Dividindo o modelo entre train e test:

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20)

Aplicando KNN onde K = 1:

In [8]:
from sklearn.neighbors import KNeighborsRegressor
regressor = KNeighborsRegressor(n_neighbors=1)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

Obtendo MSE da regressao:

In [9]:
from sklearn.metrics import mean_squared_error
mean_squared_error(y_test, y_pred)

0.4185346844752508

Aplicando KNN onde K = 2:

In [10]:
regressor = KNeighborsRegressor(n_neighbors=2)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

Obtendo MSE da regressao:

In [11]:
mean_squared_error(y_test, y_pred)

0.33200950058170836

Aplicando KNN onde K = 3:

In [12]:
regressor = KNeighborsRegressor(n_neighbors=3)
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

Obtendo MSE da regressao:

In [13]:
mean_squared_error(y_test, y_pred)

0.3863050881311439

Aplicando KNN usando como metrica chebyshev onde K = 3:

In [14]:
regressor = KNeighborsRegressor(n_neighbors=3, metric='chebyshev')
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

Obtendo MSE da regressao:

In [15]:
mean_squared_error(y_test, y_pred)

0.442999140536622

Aplicando SVR usando como kernel rbf;

In [16]:
from sklearn.svm import SVR
regressor = SVR(kernel = 'rbf')
regressor.fit(X_train, np.ravel(y_train))
y_pred = regressor.predict(X_test)

Obtendo MSE da regressao:

In [17]:
mean_squared_error(y_test, y_pred)

0.903569451857559

Aplicando SVR usando como kernel poly;

In [18]:
regressor = SVR(kernel = 'poly')
regressor.fit(X_train, np.ravel(y_train))
y_pred = regressor.predict(X_test)

Obtendo MSE da regressao:

In [19]:
mean_squared_error(y_test, y_pred)

0.6849732239035372

Aplicando SVR usando como kernel poly de 4 grau;

In [20]:
regressor = SVR(kernel = 'poly', degree = 4)
regressor.fit(X_train, np.ravel(y_train))
y_pred = regressor.predict(X_test)

Obtendo MSE da regressao:

In [21]:
mean_squared_error(y_test, y_pred)

0.6595270503445951

Aplicando SVR usando como kernel linear;

In [22]:
regressor = SVR(kernel = 'linear')
regressor.fit(X_train, np.ravel(y_train))
y_pred = regressor.predict(X_test)

Obtendo MSE da regressao:

In [23]:
mean_squared_error(y_test, y_pred)

0.28686125047657207