# Предсказываем стоимость медицинской страховки



## Загружаем необходимые библиотеки

In [None]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

## Загружаем данные

Собраны данные:
- age: возраст
- sex: пол
- bmi: индекс массы тела
- children: количество детей, охваченных медицинским страхованием / количество иждивенцев
- smoker: курение
- region: регион (northeast, southeast, southwest, northwest).
- charges: индивидуальные медицинские расходы (их и хотим предсказать)

In [None]:
# Load the data from insurance.csv into the variable df
# (comma is the default separator for read_csv)
df = pd.read_csv("insurance.csv")

In [None]:
# Preview the first rows of the loaded data
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


## Смотрим статистику и проверяем, что нет пропусков и отсутствующих значений

In [None]:
# Print column dtypes and non-null counts to check for missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


## Преобразуем строковые данные

In [None]:
# Encode the two-valued string columns (sex, smoker) as 0/1 numbers
binary_encodings = {
    'sex': {'male': 1, 'female': 0},
    'smoker': {'yes': 1, 'no': 0},
}
for column, encoding in binary_encodings.items():
    df[column] = df[column].map(encoding)

In [None]:
# Check the result of the sex/smoker encoding on the first rows
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,0,27.9,0,1,southwest,16884.924
1,18,1,33.77,1,0,southeast,1725.5523
2,28,1,33.0,3,0,southeast,4449.462
3,33,1,22.705,0,0,northwest,21984.47061
4,32,1,28.88,0,0,northwest,3866.8552


In [None]:
# Replace the region column with one indicator column per region
# (region_northeast, region_northwest, region_southeast, region_southwest).
# dtype=int keeps the indicators as 0/1 integers on every pandas version;
# pandas >= 2.0 would otherwise produce True/False booleans.
df = pd.get_dummies(df, columns=['region'], dtype=int)
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,charges,region_northeast,region_northwest,region_southeast,region_southwest
0,19,0,27.9,0,1,16884.924,0,0,0,1
1,18,1,33.77,1,0,1725.5523,0,0,1,0
2,28,1,33.0,3,0,4449.462,0,0,1,0
3,33,1,22.705,0,0,21984.47061,0,1,0,0
4,32,1,28.88,0,0,3866.8552,0,1,0,0


## Формируем признаки и целевую переменную

In [None]:
## Сформируйте признаки и целевую переменную

In [None]:
# Feature columns used as model inputs (explicitly listed, in training order)
feature_columns = [
    'age', 'sex', 'bmi', 'children', 'smoker',
    'region_northeast', 'region_northwest',
    'region_southeast', 'region_southwest',
]
X = df[feature_columns]
# Target variable: individual medical charges
y = df['charges']

## Разделяем данные на выборку для обучения/проверки

In [None]:
# Hold out 20% of the rows for evaluation. A fixed random_state makes the
# split, and therefore the fitted coefficients and error metrics,
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

## Создаем и обучаем модель линейной регрессии

In [None]:
# Create the linear regression model (it is trained in the next cell)
lr = LinearRegression()

In [None]:
# Train the model on the training split only
lr.fit(X_train, y_train)

In [None]:
# Learned coefficients, one per feature column of X, in the same order
lr.coef_

array([  255.47314462,    33.90452948,   353.38740552,   613.23775228,
       24290.06798533,   624.52536222,   105.47145154,  -405.65583706,
        -324.3409767 ])

In [None]:
# Predicted charges for every row of the test split
lr.predict(X_test)

array([ 5599.65933054, 11967.63437504, 33186.78877525,  4674.02981903,
       11323.95192189,  8763.9089098 ,  6312.53889636, 10621.59260134,
       14855.17210313,  1858.8445292 ,  1079.19545387,  8691.99463248,
        1805.80609555, 39521.21225643,  2038.78189294, 11224.81799805,
        4199.65188694, 10917.62170708,  3755.58518962,  6835.80062802,
       33071.25078635, 10088.75233589, 16278.90669078,  4030.00226872,
       12940.25731122,  8455.47709204,  3743.14379473,  7678.85784824,
        4251.85908602, 10277.16962882, 11369.42084046,  4336.7926435 ,
       32945.27757266, 13650.46192293,  7133.56787312,  5482.89149864,
       10769.16209838, 12819.0120285 , 16236.26732504, 31079.79005902,
       10014.12466427,  5550.22800116,  7724.28760884, 11863.93033112,
       29275.2504616 , 31104.00490559, 17435.06929572, 11536.21483156,
        6756.16576457, 12852.00149553,  6089.15727809,  9847.20934397,
       14230.60382901,  7403.71370544, 26188.48187002, 30748.10028155,
      

In [None]:
# Side-by-side comparison table: column 0 holds the actual test charges,
# column 1 holds the model's predictions for the same rows
pd.DataFrame([y_test.values, lr.predict(X_test)]).T

Unnamed: 0,0,1
0,4561.18850,5599.659331
1,8068.18500,11967.634375
2,38746.35510,33186.788775
3,27724.28875,4674.029819
4,12629.16560,11323.951922
...,...,...
263,3044.21330,2530.951764
264,17626.23951,3585.305877
265,12094.47800,10283.803610
266,13430.26500,15030.119109


## Получаем предсказание и оцениваем качество

In [None]:
# Получите предсказание

In [None]:
# Оцените качество, при помощи метода mean_squared_error для тестовой выборки

In [None]:
# Mean squared error of the predictions on the test split
mean_squared_error(y_test, lr.predict(X_test))

38321815.1682465

In [None]:
# Mean squared error of the predictions on the training split
mean_squared_error(y_train, lr.predict(X_train))

36160570.46164993

## Делаем предсказание для одного человека

In [None]:
# Feature values for one specific person; the keys use the same names and
# order as the feature columns the model was trained on
data = [dict(
    age=20,
    sex=1,
    bmi=30,
    children=2,
    smoker=1,
    region_northeast=0,
    region_northwest=0,
    region_southeast=1,
    region_southwest=0,
)]

In [None]:
# Wrap the single record in a DataFrame so its columns match the model's features
df_person = pd.DataFrame(data)
df_person.head()

Unnamed: 0,age,sex,bmi,children,smoker,region_northeast,region_northwest,region_southeast,region_southwest
0,20,1,30,2,1,0,0,1,0


In [None]:
# Predicted charges for the single person described in df_person
lr.predict(df_person)

array([27700.8484333])