In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt

# The CSV has no header row, so pandas labels the columns 0..25.
# Attribute names are listed in file order; their position is the column index.
column_names = [
    'symboling',
    'normalized-losses',
    'make',
    'fuel-type',
    'aspiration',
    'num-of-doors',
    'body-style',
    'drive-wheels',
    'engine-location',
    'wheel-base',
    'length',
    'width',
    'height',
    'curb-weight',
    'engine-type',
    'num-of-cylinders',
    'engine-size',
    'fuel-system',
    'bore',
    'stroke',
    'compression-ratio',
    'horsepower',
    'peak-rpm',
    'city-mpg',
    'highway-mpg',
    'price',
]
columns = dict(enumerate(column_names))     # {0: 'symboling', 1: 'normalized-losses', ...}

X = pd.read_csv("/kaggle/input/imports-85.csv", header = None)
X = X.rename(columns = columns)

In [None]:
# Report the (rows, columns) shape of the loaded frame, then preview its first rows
print(X.shape)
X.head()

In [None]:
# Keep only the samples whose target 'normalized-losses' is present ('?' marks a missing value)
target_present = X['normalized-losses'] != '?'
X = X[target_present]
print(X.shape)

In [None]:
# Remove the 'symboling' column
X = X.drop(columns = ['symboling'])

# Turn the '?' placeholders into real missing values (np.nan)
X = X.replace('?', np.nan)

# Number of missing values per column
X.isna().sum()

In [None]:
# Convert columns that hold numbers but were read as strings (object dtype) to numeric dtypes.
#
# pd.to_numeric is used instead of .astype(str).astype(int): the string round-trip
# turns a missing value into the text 'nan', which int() cannot parse, so a single
# NaN in an integer column would raise ValueError. to_numeric yields int64 for
# all-integer columns and float64 when decimals or NaN are present.
numeric_cols = ['normalized-losses', 'bore', 'stroke', 'horsepower', 'peak-rpm', 'price']
for col in numeric_cols:
    X[col] = pd.to_numeric(X[col])

# Door counts are spelled out as words; map them to their integer values
X['num-of-doors'] = X['num-of-doors'].map({'two' : 2, 'four' : 4})

X.dtypes

In [None]:
# Fill every remaining missing value with the most frequent value of its column.
# X.mode() can return several rows when values tie; the first row is used.
most_frequent = X.mode().iloc[0]
X = X.fillna(most_frequent)

# Encode categorical variables into numerical

In [None]:
# Count the distinct values in each column
X.nunique()

In [None]:
# Columns to be converted to integer labels
encode_cols = ['make']

# Columns to be expanded into one indicator column per category
one_hot_cols = [
    'fuel-type',
    'aspiration',
    'body-style',
    'drive-wheels',
    'engine-location',
    'engine-type',
    'num-of-cylinders',
    'fuel-system',
]

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

label_encoder = LabelEncoder()
# The encoder is left at its default (sparse) output and densified explicitly below.
# The `sparse=` keyword was renamed to `sparse_output=` in scikit-learn 1.2 and
# removed in 1.4, so passing either spelling ties the notebook to a version range.
One_encoder = OneHotEncoder(handle_unknown='ignore')

label_X = X.copy()

# Replace each label-encoded column with integer category codes
for cols in encode_cols:
    label_X[cols] = label_encoder.fit_transform(X[cols])

# One-hot encode the remaining categorical columns
encoded = One_encoder.fit_transform(label_X[one_hot_cols])

# Give every indicator column a string name of the form '<column>_<category>'.
# With the default integer names (0, 1, 2, ...) the final frame would mix str and
# int column labels, which recent scikit-learn versions reject when fitting.
OH_names = [
    f"{col}_{category}"
    for col, categories in zip(one_hot_cols, One_encoder.categories_)
    for category in categories
]
OH_cols = pd.DataFrame(encoded.toarray(), columns = OH_names, index = label_X.index)

# Swap the original categorical columns for their one-hot expansion
OH_X = label_X.drop(one_hot_cols, axis = 1)

num_X = pd.concat([OH_X, OH_cols], axis = 1)

In [None]:
# Scatter of the target against price, drawn on an explicit Axes
fig, ax = plt.subplots()
ax.scatter(num_X['price'], num_X['normalized-losses'], color = 'green')
ax.set_xlabel('price')
ax.set_ylabel('normalized_losses')
plt.show()

Split the data into training and testing sets

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. random_state is fixed so that the split,
# and therefore every metric reported below, is the same on each run.
train, test = train_test_split(num_X, test_size = 0.2, random_state = 0)

In [None]:
# Separate the target column from the features in both partitions
target_col = 'normalized-losses'

train_x, train_y = train.drop(columns = target_col), train[target_col]
test_x, test_y = test.drop(columns = target_col), test[target_col]

# Modelling using *LinearRegression* model from *sklearn*

In [None]:
from sklearn.linear_model import LinearRegression

# Fit a linear regression on the training features and target
model = LinearRegression()
model.fit(X = train_x, y = train_y)

**Evaluation**

Evaluating the model's prediction error on the test set using the MSE (Mean Squared Error) metric; lower values are better.

In [None]:
from sklearn import metrics

# Predict the held-out samples and report the mean squared error
y_pred = model.predict(test_x)

print(metrics.mean_squared_error(y_true = test_y, y_pred = y_pred))

In [None]:
# Order the training rows by price so the prediction line is drawn left to right
train_sorted = train_x.sort_values('price')
x_pred = model.predict(train_sorted)

plt.scatter(train_x['price'], train_y, color = 'blue', label = 'actual')
# Plot against the price column only; passing the whole feature frame as x
# would draw one line per feature column.
plt.plot(train_sorted['price'], x_pred, label = 'predicted')
plt.xlabel('price')
plt.ylabel('normalized-losses')
plt.legend()
plt.show()

# RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Fit a 100-tree random forest with a fixed seed, then report its test-set MSE
model1 = RandomForestRegressor(n_estimators=100, random_state=0).fit(train_x, train_y)
preds = model1.predict(test_x)
print(metrics.mean_squared_error(y_true=test_y, y_pred=preds))