In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested per read() call while streaming a download.
CHUNK_SIZE = 40960
# Comma-separated list of "<directory>:<url-encoded download URL>" entries;
# each entry is split on ':' and the URL part is unquoted before use.
# NOTE(review): the URL carries X-Goog-Date=20240712T221539Z and
# X-Goog-Expires=259200 query parameters, so it is presumably a time-limited
# signed link that no longer works — confirm and regenerate if needed.
DATA_SOURCE_MAPPING = 'hyundaikialadacars:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F5379848%2F8941010%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240712%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240712T221539Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D10bdb5b8d4bc40dde90c8aa0ba8aec7c39a50775f6e1d326194fd251137e27b86324db4fa985a26d4e9fed656515e96d1dac0990fdd39711116f3038cde267722e4834eb6cb35f3352b294c0b85de3041d57e36b10e53fb74f7a888f8d8f88ddcbf5a8f6655d2a6d36860e7d3005ee3adc9d902d8cafc5b8295906da7a604695150900713c6e370bd738712db65f49686b64605fc550191c4aa8c8239cc7b2ed6fdacf45bc10c8b59e08f6abfee68104995f4771a23da9a2ccb4c13e2f157c1d5b269759bce934b8497c938804d4872e16ad37bb8d6f8fd441c75122ab3178a17f9eb54db6d748c53d2222332de32528c43c651733d073df6dd27d5e799358dd'

# Directory that receives the extracted data sources.
KAGGLE_INPUT_PATH='/kaggle/input'
# Directory created alongside the input directory and symlinked as ../working.
KAGGLE_WORKING_PATH='/kaggle/working'
# Not referenced by any of the visible cells.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<url-encoded URL>" entry of DATA_SOURCE_MAPPING
# into a temporary file and unpack it under KAGGLE_INPUT_PATH/<directory>.
# A failure on one entry is reported and the loop moves on to the next one.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only so an extra colon later in the entry cannot
    # break the two-way unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length can be missing or malformed; in that case the
            # size is unknown and only the running byte count is shown.
            try:
                total_length = int(fileres.headers['content-length'])
            except (TypeError, ValueError):
                total_length = None
            print(f'Downloading {directory}, {total_length} bytes compressed')

            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                if total_length:
                    # Clamp to the bar width in case more bytes arrive than announced
                    done = min(50, int(50 * dl / total_length))
                    sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                else:
                    sys.stdout.write(f"\r{dl} bytes downloaded")
                sys.stdout.flush()

            # Push buffered bytes to disk and rewind so the archive readers
            # below see the complete file from its beginning.
            tfile.flush()
            tfile.seek(0)

            # NOTE(review): extractall() trusts member paths inside the
            # archive; only use this with archives from a trusted source.
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Read from the already open handle and bind the archive to
                # its own name so the `tarfile` module is not shadowed.
                with tarfile.open(fileobj=tfile) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError as e:
        # HTTPError must be handled before OSError, which it derives from
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}: {e}')
    except OSError as e:
        print(f'Failed to load {download_url} to path {destination_path}: {e}')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import joblib
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below /kaggle/input
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/hyundaikialadacars/Cars.csv


In [None]:
# Load Cars.csv (semicolon-separated) into a DataFrame
data = pd.read_csv('/kaggle/input/hyundaikialadacars/Cars.csv', sep=';')

In [None]:
# Preview the first 10 rows of the loaded frame
data.head(10)

Unnamed: 0,brand,litr,horse,toplivo,korob,type,year,price
0,0,12,58,1,2,1,30,1305
1,0,9,36,1,2,1,30,1076
3,0,9,36,1,2,1,30,1141
10,0,10,60,3,5,1,29,1197
11,0,12,50,1,2,1,29,1259
12,0,9,36,1,2,1,28,1042
13,0,6,15,1,2,7,28,736
16,0,12,50,1,2,1,28,1177
17,0,12,57,1,2,1,28,1032
18,0,12,50,1,2,1,28,1206


In [None]:
# Count the rows that repeat an earlier row in every column
data.duplicated().sum()

1166

In [None]:
# Keep one copy of each duplicated row; the original index labels are retained
data = data.drop_duplicates()

In [None]:
# Summarise column dtypes, non-null counts and memory usage
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2477 entries, 0 to 3642
Data columns (total 8 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   brand    2477 non-null   int64
 1   litr     2477 non-null   int64
 2   horse    2477 non-null   int64
 3   toplivo  2477 non-null   int64
 4   korob    2477 non-null   int64
 5   type     2477 non-null   int64
 6   year     2477 non-null   int64
 7   price    2477 non-null   int64
dtypes: int64(8)
memory usage: 174.2 KB


In [None]:
# List the distinct values of the 'type' column.
# NOTE(review): the recorded output includes the literal string 'type' among
# the values — possibly a header row repeated inside the CSV; confirm and
# filter such rows out before encoding if so.
unique_values = data['type'].unique()
print(unique_values)

['внедорожник' 'седан' 'минивэн' 'хэтчбек' 'универсал' 'купе' 'type'
 'лифтбек' 'компактвэн' 'фургон' 'пикап']


In [None]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Names of all object-dtype (string) columns
cat_columns = data.select_dtypes(include='object').columns.tolist()

# Ordinal-encode the string columns (categories unseen at fit time map to -1)
# and pass every other column through untouched.
# NOTE(review): column_transformer is not used by any of the visible cells.
column_transformer = ColumnTransformer(
    transformers=[
        (
            'ordinal_encoder',
            OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
            cat_columns,
        ),
    ],
    remainder='passthrough',
)

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace every object-dtype column of `data` with integer codes.
# Each column gets its own fitted LabelEncoder, stored in `label_encoders`
# under the column name, so the string <-> code mapping can be inverted or
# re-applied to new rows later. Refitting a single shared encoder would
# overwrite the mapping of every column except the last one.
label_encoder = LabelEncoder()  # stays unfitted only when there is nothing to encode
label_encoders = {}

cat_columns = list(data.select_dtypes(include='object').columns)
for one in cat_columns:
    label_encoder = LabelEncoder()
    data[one] = label_encoder.fit_transform(data[one])
    label_encoders[one] = label_encoder

In [None]:
# Features are every column except the price; the price itself is the target
X = data.drop(columns=["price"])
Y = data["price"]

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=11,
)

In [None]:
from sklearn.neighbors import KNeighborsRegressor  # k-nearest-neighbours regression
from sklearn.metrics import mean_squared_log_error  # MSLE (mean squared log error, no square root)

# The price is a continuous target, so a regressor is used: a classifier would
# treat every distinct price as a separate class label.
knn = KNeighborsRegressor()  # default hyper-parameters
knn.fit(X_train, Y_train)  # fit on the training split

Y_train_predict = knn.predict(X_train)  # predictions on the training split
Y_test_predict = knn.predict(X_test)  # predictions on unseen data, used to judge the fit

error1 = mean_squared_log_error(Y_test, Y_test_predict)
print(error1)

0.779787573825513


In [None]:
from sklearn.metrics import mean_absolute_error # MAE
# Mean absolute error of the k-NN predictions on the test split
error2 = mean_absolute_error(Y_test, Y_test_predict)
print(error2)

218.5625


In [None]:
from xgboost import XGBRegressor
# Import every metric this cell uses so it runs on a fresh kernel;
# r2_score was otherwise only imported by a later cell.
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    mean_squared_log_error,
    r2_score,
)

# Create and train the model
model = XGBRegressor()
model.fit(X_train, Y_train)

# Predict on the test split
Y_pred = model.predict(X_test)
r2 = r2_score(Y_test, Y_pred)
print(f'R^2 Score: {r2}')

# Evaluate model quality: MSE, then MAE, then MSLE
mse = mean_squared_error(Y_test, Y_pred)
print(f'Mean Squared Error: {mse}')
error2 = mean_absolute_error(Y_test, Y_pred)
print(error2)
error1 = mean_squared_log_error(Y_test, Y_pred)
print(error1)

R^2 Score: 0.5197620747028687
Mean Squared Error: 81707.36935608283
179.73894228473787
0.38967211956667785


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Создаем объект модели линейной регрессии
# Build a linear regression model and fit it on the training split
model = LinearRegression().fit(X_train, Y_train)

# Predict prices for the test split
Y_predict = model.predict(X_test)

# Score the predictions and report MSE and R^2 ...
mse, r2 = mean_squared_error(Y_test, Y_predict), r2_score(Y_test, Y_predict)
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

# ... followed by MAE ...
error2 = mean_absolute_error(Y_test, Y_predict)
print(error2)

# ... and finally MSLE
error1 = mean_squared_log_error(Y_test, Y_predict)
print(error1)

Mean Squared Error: 132654.2029885531
R^2 Score: 0.2203202755489938
310.6413824272798
0.6115109135856818


In [None]:
from sklearn.ensemble import RandomForestRegressor
# Train a 100-tree random forest; the fixed seed makes the fit repeatable
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, Y_train)

# Predict on the test split and report MSE, MAE and R^2
y_pred = model.predict(X_test)
print(f'Mean Squared Error: {mean_squared_error(Y_test, y_pred)}')
print(f'Mean Absolute Error: {mean_absolute_error(Y_test, y_pred)}')
print(f'R^2 Score: {r2_score(Y_test, y_pred)}')

# Save the fitted forest to model.pkl; being the last expression of the cell,
# the list of written file names is shown as the cell output
joblib.dump(model, 'model.pkl')

Mean Squared Error: 80685.85470683187
Mean Absolute Error: 178.14685161433823
R^2 Score: 0.5257660628337189


['model.pkl']