# Lab1 - Linear regression

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import max_error, mean_absolute_error, mean_squared_error, r2_score

## Introduction

**Goal** : Model the price of cars with the available independent variables.

### Processing flow
1. Read raw data
2. Data preprocess
3. Build model

### Dataset - [Car Price Prediction Multiple Linear Regression](https://www.kaggle.com/hellbuoy/car-price-prediction)
#### Files ####
1. CarPrice_Assignment.csv
1. Data Dictionary - carprices.xlsx

<img src="linear_regression_dataset.jpg" style="width: 600px;"/>

## Read raw data
1. read data
1. basic observation

In [None]:
# Load the car-price table from the CSV file expected next to this notebook.
df_raw_data = pd.read_csv(filepath_or_buffer="CarPrice_Assignment.csv")

# Preview the first five rows (five is also the default for head()).
df_raw_data.head(n=5)

In [None]:
# List every column label of the raw table.
df_raw_data.columns

In [None]:
# Look up the storage dtype of the 'fueltype' column.
df_raw_data.dtypes['fueltype']

In [None]:
# List the distinct values found in 'fueltype', most frequent first.
df_raw_data['fueltype'].value_counts().index

## Data preprocess
### Select the features used for prediction and the prediction target (ground truth)
- **symboling** : The assigned insurance risk rating. A value of +3 indicates that the car is risky; -3 indicates that it is probably pretty safe. (Categorical)
- **fueltype** : Car fuel type, i.e. gas or diesel (Categorical)
- **fuelsystem** : Fuel system of car (Categorical)
- **horsepower** : Horsepower (Numeric)
- **price** : Price of car (Numeric)

In [None]:
# Keep only the modelling columns: four input features plus the target (price).
# Take an explicit copy so that the in-place edits made while handling missing
# data act on an independent frame instead of a slice of df_raw_data (this
# avoids pandas' chained-assignment / SettingWithCopy ambiguity).
df_selected_data = df_raw_data[
    ['symboling', 'fueltype', 'fuelsystem', 'horsepower', 'price']
].copy()
print(f'There are {df_selected_data.shape[0]} pieces of data.\n')

# Preview the selected columns.
df_selected_data.head()

### process missing data
1. check nan value
2. fill a cell manually
2. fillna by column
3. dropna

In [None]:
# Count the missing entries in each selected column.
df_selected_data.isna().sum()

In [None]:
# Display only the rows that contain at least one missing value.
df_selected_data.loc[df_selected_data.isna().any(axis='columns')]

In [None]:
# Manual single-cell repair: set 'symboling' of the row labelled 0 to 3.
df_selected_data.at[0, 'symboling'] = 3

# Column-wise default: any missing 'horsepower' becomes 145.
values = dict(horsepower=145)

# Apply the defaults, then discard every row that still has a missing value.
df_selected_data = df_selected_data.fillna(value=values).dropna()

# Recount the missing entries per column to confirm the clean-up.
df_selected_data.isna().sum()

### Other preprocess
1. make sure every input feature is a quantitative feature.
   - **symboling** : Categorical and quantitative.
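   - **fueltype** : Categorical with two values (gas or diesel), so it can be encoded as a single 0/1 indicator and treated as quantitative.
   - **fuelsystem** : Categorical but not quantitative; it is encoded with one 0/1 indicator column per value (one-hot encoding).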
   - **horsepower** : Numeric and quantitative
   - **price** : Numeric and quantitative

In [None]:
# Storage dtype of the 'symboling' column.
df_raw_data.dtypes['symboling']

In [None]:
# Distinct fuel types, ordered from most to least frequent; preprocess() uses
# the first entry as the reference value for its 0/1 fuel-type indicator.
fueltype_values = df_raw_data['fueltype'].value_counts().index
fueltype_values

In [None]:
# Distinct fuel systems, ordered from most to least frequent; preprocess()
# creates one indicator column for each entry.
fuelsystem_values = df_raw_data['fuelsystem'].value_counts().index
fuelsystem_values

In [None]:
# Storage dtype of the 'horsepower' column.
df_raw_data.dtypes['horsepower']

In [None]:
# Storage dtype of the 'price' column (the regression target).
df_raw_data.dtypes['price']

In [None]:
def preprocess(df_selected_data, fueltype_categories=None, fuelsystem_categories=None):
    """Encode the selected columns as one purely numeric 2-D array.

    Output columns, in order:
      1. ``symboling`` as-is,
      2. a 0/1 indicator that is 1 where ``fueltype`` equals the first entry
         of ``fueltype_categories``,
      3. one 0/1 indicator per entry of ``fuelsystem_categories``,
      4. ``horsepower`` as-is,
      5. ``price`` as-is (last column, used as the regression target).

    Args:
        df_selected_data: DataFrame with the columns 'symboling', 'fueltype',
            'fuelsystem', 'horsepower' and 'price'.
        fueltype_categories: Indexable collection of fuel-type values; only
            its first entry is used. Defaults to the notebook-level
            ``fueltype_values``.
        fuelsystem_categories: Iterable of fuel-system values to one-hot
            encode. Defaults to the notebook-level ``fuelsystem_values``.

    Returns:
        numpy.ndarray with one row per input row and the columns listed above.
    """
    # Fall back to the notebook globals so existing calls keep working.
    if fueltype_categories is None:
        fueltype_categories = fueltype_values
    if fuelsystem_categories is None:
        fuelsystem_categories = fuelsystem_values

    # symboling: already numeric, used unchanged
    columns = [df_selected_data['symboling'].to_numpy()]

    # fueltype: single 0/1 indicator against the reference category
    fueltype = df_selected_data['fueltype'].to_numpy()
    columns.append((fueltype == fueltype_categories[0]).astype(int))

    # fuelsystem: one 0/1 indicator per category (one-hot encoding)
    fuelsystem = df_selected_data['fuelsystem'].to_numpy()
    columns.extend(
        (fuelsystem == category).astype(int)
        for category in fuelsystem_categories
    )

    # horsepower: numeric feature, used unchanged
    columns.append(df_selected_data['horsepower'].to_numpy())

    # price: regression target, kept as the last column
    columns.append(df_selected_data['price'].to_numpy())

    # Each entry is a 1-D array of equal length; stack them side by side.
    return np.stack(columns, axis=1)

In [None]:
# Turn the cleaned DataFrame into the numeric feature/target matrix.
preprocessed_data = preprocess(df_selected_data=df_selected_data)

## Build model
1. Split the dataset into a training set and a testing set.
2. Training model
3. Evaluation

### Split the dataset into a training set and a testing set

In [None]:
# Every column except the last is an input feature; the last column is the target.
X, Y = preprocessed_data[:, :-1], preprocessed_data[:, -1]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

### Build model

In [None]:
# Fit the linear model on the training split (fit() returns the estimator itself).
linear_regression_model = LinearRegression().fit(X_train, Y_train)

# Report the learned parameters.
print(f'coef: {linear_regression_model.coef_}')
print(f'intercept: {linear_regression_model.intercept_}')

### Evaluation
1. calculating max error
1. calculating mean absolute error
2. calculating root mean squared error
3. calculating r2 score (coefficient of determination)
1. [Other metrics](https://scikit-learn.org/stable/modules/model_evaluation.html)

In [None]:
# Predict the target for the held-out test rows.
Y_pred = linear_regression_model.predict(X_test)

# max error
print(f'Max error : {max_error(Y_test,Y_pred)}')

# mean absolute error
print(f'Mean absolute error : {mean_absolute_error(Y_test,Y_pred)}')

# root mean squared error
# The square root is taken explicitly because the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6;
# np.sqrt(MSE) works on every version.
print(f'Root mean squared error : {np.sqrt(mean_squared_error(Y_test,Y_pred))}')

# r2 score
print(f'R2 score (Coefficient of determination) : {r2_score(Y_test,Y_pred)}')

## Practice - adding new feature into linear regression model

**Hint :** 
1. Make sure that the added feature is quantitative.
2. Choose a feature that is related to the output (price).