## Exploratory Data Analysis

In [57]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Added in this session
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn 
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [None]:
# read the car-price CSV into a dataframe and preview its first rows
csv_path = "CarPrice_missing.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# summary of the raw dataframe: column dtypes and non-null counts
df.info()

In [None]:
# count the NaNs in every column, then draw those counts as a bar chart
missing_per_column = df.isna().sum()
missing_per_column.plot(kind="bar")


Handling missing values - drop the rows and columns with missing values

In [None]:
# work on an independent copy so the original dataframe stays untouched
copyone = df.copy(deep=True)
copyone.head(5)

In [None]:
# show the frame without any row that has at least one missing value
# (the result is only displayed here; copyone itself is not modified)
copyone.dropna(axis=0, how="any")

In [None]:
# drop every column that contains at least one missing value
copytwo = df.copy()

# dropna returns a new dataframe, so the result must be assigned back
copytwo = copytwo.dropna(axis=1)

# inspect the columns that survived (the original df is left unchanged)
copytwo.info()

Handling Missing values - Imputation (mean and median)

In [None]:
# mean of every numeric column (a Series indexed by column name)
numeric_means = df.select_dtypes(["int64", "float64"]).mean()

# impute on a copy: NaNs in each numeric column are replaced by that column's mean
copythree = df.copy().fillna(numeric_means)

# missing values that remain per column, followed by the dtype summary
copythree.isna().sum().plot(kind="bar")
copythree.info()

'''
fillna method applies the argument to the df but doesn't save it anywhere, 
so make sure you assign it to the existing or a new dataframe
'''

In [None]:
copyfour = df.copy()

# fill missing values with the median of each numeric column
# (this cell previously called .mean(), which just repeated the mean imputation)
numeric_medians = df.select_dtypes(['int64', 'float64']).median()
copyfour = copyfour.fillna(numeric_medians)

# missing values that remain per column
copyfour.isna().sum().plot(kind="bar")

In [None]:
# mode imputation is possible as well, although it is not recommended for this dataset
numeric_modes = df.select_dtypes(['int64', 'float64']).mode()

# mode() gives a DataFrame; its first row is a Series keyed by column name
first_mode = numeric_modes.iloc[0]

copyfive = df.copy().fillna(first_mode)
copyfive.isna().sum().plot(kind='bar')

'''
df.mean() returns a pandas Series
df.mode() returns a pandas Df

mode() returns a df because a column may have multiple modes, pandas returns a df to handle this possibility, 
with each row representing a mode

when you call fillna() with a df, pandas tries to align on the row index, the result of mode() is a df that
usually has only one row with an index of 0, therefore, it fills only NaN in copyfive df at rowindex 0, leaving
other NaN unctouched

The solution is to select the first row from the mode DataFrame. This converts it into a Series, 
which fillna() will then use to fill values by matching column names—exactly the behavior you want.
'''

Handling Missing Values - Hot deck imputation: forward fill and backward fill

In [None]:
# forward fill on a copy: each NaN takes the last valid value above it in its column
copysix = df.copy().ffill()
copysix

In [None]:
# backward fill on a copy: each NaN takes the next valid value below it in its column
copyseven = df.copy().bfill()
copyseven

## Categorical Encoding
### One-Hot Encoding

In [None]:
# one-hot encode a copy of the data; the encoded frame is displayed, not stored
one_hot = df.copy()
pd.get_dummies(data=one_hot)

### Label Encoding

Grab the "object" type columns and convert them into numeric values.
This is done with the LabelEncoder class from sklearn.

In [None]:
# encode on a copy so the original dataframe keeps its text columns
labels = df.copy()

# names of the object-type columns, kept for reference
# (this list used to be created empty and never filled)
object_columns = list(labels.select_dtypes("object").columns)

for col in object_columns:
    # a fresh encoder per column, so every column gets its own integer mapping
    le = LabelEncoder()

    # replace the text values of the column with their integer codes
    labels[col] = le.fit_transform(labels[col])

Now all columns will have numeric values

In [None]:
# dtype summary of the label-encoded frame
labels.info()

# preview the encoded frame; only the last expression of a cell is displayed,
# so the preview has to come after info()
labels.head()

# Feature Scaling
## Normalisation

Fill the missing values by mean imputation, then apply the min-max scaler.

In [None]:
norm_data = df.copy()

# instantiate the min-max scaler
norm = MinMaxScaler()

# take the numeric column names from the data itself; a hand-copied list only
# matches one particular state of df and raises a shape error otherwise
numeric_cols = norm_data.select_dtypes(['int64', 'float64']).columns

# fill missing values with the mean of each numeric column
norm_data = norm_data.fillna(norm_data[numeric_cols].mean())

# apply the min-max scaler to the numeric columns, returns a numpy array
scaled_values = norm.fit_transform(norm_data[numeric_cols])

# wrap the array in a dataframe again, restoring the column names and row index
norm_data = pd.DataFrame(scaled_values, columns=numeric_cols, index=norm_data.index)

# the min is 0 and max is 1
norm_data.describe().T # T stands for transpose (for better view)

## Standardisation

Fill up the missing values with median and apply z-score (standardisation on this dataset is not recommended as the data is not normally distributed)

In [None]:
stan = df.copy()

# take the numeric column names from the data itself instead of a hand-copied list
numeric_cols = stan.select_dtypes(['int64', 'float64']).columns

# fill up the missing values with median (median is safer);
# fillna returns a new dataframe, so the result has to be assigned back
stan = stan.fillna(stan[numeric_cols].median())

# instantiate the standard scaler class
stan_scaler = StandardScaler()

# apply the standard scaler to the numeric columns, returns a numpy array
standardised_values = stan_scaler.fit_transform(stan[numeric_cols])

# convert the array into a pandas dataframe, restoring the column names and row index
stan = pd.DataFrame(standardised_values, columns=numeric_cols, index=stan.index)

# The mean of every column is zero up to floating-point rounding, whatever the
# shape of the distribution. The std shown by describe() is slightly above 1
# because describe() uses the sample std (ddof=1) while StandardScaler divides
# by the population std (ddof=0).
# Kept as the last expression so that the table is actually displayed.
stan.describe().T


# Outliers

## Detection

In [None]:
# one small box plot per numeric column to make the outliers visible
numeric_columns = df.select_dtypes(['int64', 'float64']).columns

for column in numeric_columns:
    # a dedicated 3 x 3 inch figure for this column
    fig, ax = plt.subplots(figsize=(3, 3))

    # seaborn draws the box plot onto that figure's axes
    sn.boxplot(y=column, data=df, ax=ax)

    # render the figure before moving on to the next column
    plt.show()

## Outlier Handling

In [None]:
outlier_data = df.copy()
numeric_columns = outlier_data.select_dtypes(['int64', 'float64']).columns

for column in numeric_columns:
    values = outlier_data[column]

    # inter-quartile range of the column
    first_quartile, third_quartile = values.quantile(0.25), values.quantile(0.75)
    spread = third_quartile - first_quartile

    # anything beyond 1.5 * IQR from the quartiles counts as an outlier
    lower_fence = first_quartile - 1.5 * spread
    upper_fence = third_quartile + 1.5 * spread

    # the replacement value for the outliers
    column_median = values.median()

    # NaN compares False on both sides, so missing values are left as they are
    is_outlier = (values < lower_fence) | (values > upper_fence)
    outlier_data[column] = np.where(is_outlier, column_median, values)

# Binning/Grouping data

In [67]:
# discretise the continuous wheelbase into three labelled ranges:
# (85, 95] -> Small, (95, 100] -> Mid, (100, 120] -> Large
bin_edges = [85, 95, 100, 120]
bin_labels = ['Small', 'Mid', 'Large']
df['wheelbase_bin'] = pd.cut(df['wheelbase'], bins=bin_edges, labels=bin_labels)

# show the raw value next to its bin
df[['wheelbase', 'wheelbase_bin']]

Unnamed: 0,wheelbase,wheelbase_bin
0,88.6,Small
1,88.6,Small
2,94.5,Small
3,99.8,Mid
4,99.4,Mid
...,...,...
200,109.1,Large
201,109.1,Large
202,109.1,Large
203,109.1,Large


In [70]:
# average wheelbase for every (fuel type, car body) combination
group_keys = ['fueltype', 'carbody']
df.groupby(group_keys)['wheelbase'].mean()

fueltype  carbody
0         1          106.700000
          2           95.700000
          3          103.133333
          4          112.800000
1         0           92.700000
          1           97.328571
          2           95.533333
          3          100.154321
          4          100.704545
Name: wheelbase, dtype: float64

# Duplicate values

In [72]:
# number of rows that are exact duplicates of an earlier row
df.duplicated().sum()

np.int64(0)

## Model Building and Training

Split the data into independent (X) and dependent (y) variables. The dependent variable, the one we want to predict, is the car price, so the `price` column becomes `y` and the remaining columns become `X`, after dropping the columns that are not needed as features.

In [None]:
# label-encode the text columns of df itself so that the model receives numbers only
# NOTE: this changes df in place; re-running the cell is harmless because no
# object-type columns are left after the first run

# names of the columns that get encoded, kept for reference
# (this list used to be created empty and never filled)
object_columns = list(df.select_dtypes("object").columns)

for col in object_columns:
    # a fresh encoder per column, so every column gets its own integer mapping
    le = LabelEncoder()

    # replace the text values of the column with their integer codes
    df[col] = le.fit_transform(df[col])

In [None]:
# the binned helper column holds text categories that a linear model cannot use;
# errors='ignore' keeps the cell working when the binning cell has not been run
model_df = df.drop(columns=['wheelbase_bin'], errors='ignore')

# rows without a target value can neither be trained on nor scored
model_df = model_df.dropna(subset=['price'])

# LinearRegression rejects NaN, so fill the remaining numeric gaps with the column median
# NOTE: the medians are computed before the train/test split, which is acceptable
# for this example but leaks a little test information into training
model_df = model_df.fillna(model_df.select_dtypes('number').median())

# features: everything except the dropped columns and the target; target: the price
X = model_df.drop(['symboling', 'CarName', 'price'], axis=1)
y = model_df['price']

In [None]:
# hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

Build the model, using Multiple Linear Regression (Only as an example)

In [None]:
# linear regression over all selected feature columns
model = LinearRegression()

# fit on the training split only
model.fit(X=X_train, y=y_train)

In [None]:
# predictions for both splits, so that train and test scores can be compared
y_pred_train = model.predict(X=X_train)
y_pred_test = model.predict(X=X_test)


In [None]:
# R^2 score of the predictions against the true prices, once per split
train_accuracy = r2_score(y_true=y_train, y_pred=y_pred_train)
test_accuracy = r2_score(y_true=y_test, y_pred=y_pred_test)

print("Test Accuracy:", test_accuracy)
print("Train Accuracy:", train_accuracy)