# Introduction to Data Science and Machine Learning

In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("./Nigerian_Car_Prices.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Make,Year of manufacture,Condition,Mileage,Engine Size,Fuel,Transmission,Price,Build
0,0,Toyota,2007.0,Nigerian Used,166418.0,2400.0,Petrol,Automatic,3120000,
1,1,Lexus,,,138024.0,,,Automatic,5834000,
2,2,Mercedes-Benz,2008.0,Nigerian Used,376807.0,3000.0,Petrol,Automatic,3640000,
3,3,Lexus,,,213362.0,,,Automatic,3594000,
4,4,Mercedes-Benz,,,106199.0,,,Automatic,8410000,


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4095 entries, 0 to 4094
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           4095 non-null   int64  
 1   Make                 4095 non-null   object 
 2   Year of manufacture  3617 non-null   float64
 3   Condition            3616 non-null   object 
 4   Mileage              4024 non-null   float64
 5   Engine Size          3584 non-null   float64
 6   Fuel                 3607 non-null   object 
 7   Transmission         4075 non-null   object 
 8   Price                4095 non-null   object 
 9   Build                1127 non-null   object 
dtypes: float64(3), int64(1), object(6)
memory usage: 320.1+ KB


### Missing Values

In [4]:
missing_percentage = df.isnull().sum() / len(df) * 100
missing_percentage

Unnamed: 0              0.000000
Make                    0.000000
Year of manufacture    11.672772
Condition              11.697192
Mileage                 1.733822
Engine Size            12.478632
Fuel                   11.916972
Transmission            0.488400
Price                   0.000000
Build                  72.478632
dtype: float64

In [5]:
df['Build'].unique()

array([nan, 'SUV'], dtype=object)

In [6]:
# Dropping column

df.drop(columns=['Build'])
df

Unnamed: 0.1,Unnamed: 0,Make,Year of manufacture,Condition,Mileage,Engine Size,Fuel,Transmission,Price,Build
0,0,Toyota,2007.0,Nigerian Used,166418.0,2400.0,Petrol,Automatic,3120000,
1,1,Lexus,,,138024.0,,,Automatic,5834000,
2,2,Mercedes-Benz,2008.0,Nigerian Used,376807.0,3000.0,Petrol,Automatic,3640000,
3,3,Lexus,,,213362.0,,,Automatic,3594000,
4,4,Mercedes-Benz,,,106199.0,,,Automatic,8410000,
...,...,...,...,...,...,...,...,...,...,...
4090,4090,Honda,2004.0,Nigerian Used,207446.0,3500.0,Petrol,Automatic,1125000,
4091,4091,Toyota,2005.0,Nigerian Used,106914.0,1800.0,Petrol,Automatic,2643750,
4092,4092,Honda,2006.0,Nigerian Used,247149.0,1800.0,Petrol,Automatic,1462500,
4093,4093,Toyota,2007.0,Nigerian Used,249325.0,2500.0,Petrol,Automatic,2475000,


In [7]:
df.drop(columns=['Build'], inplace=True)
df

Unnamed: 0.1,Unnamed: 0,Make,Year of manufacture,Condition,Mileage,Engine Size,Fuel,Transmission,Price
0,0,Toyota,2007.0,Nigerian Used,166418.0,2400.0,Petrol,Automatic,3120000
1,1,Lexus,,,138024.0,,,Automatic,5834000
2,2,Mercedes-Benz,2008.0,Nigerian Used,376807.0,3000.0,Petrol,Automatic,3640000
3,3,Lexus,,,213362.0,,,Automatic,3594000
4,4,Mercedes-Benz,,,106199.0,,,Automatic,8410000
...,...,...,...,...,...,...,...,...,...
4090,4090,Honda,2004.0,Nigerian Used,207446.0,3500.0,Petrol,Automatic,1125000
4091,4091,Toyota,2005.0,Nigerian Used,106914.0,1800.0,Petrol,Automatic,2643750
4092,4092,Honda,2006.0,Nigerian Used,247149.0,1800.0,Petrol,Automatic,1462500
4093,4093,Toyota,2007.0,Nigerian Used,249325.0,2500.0,Petrol,Automatic,2475000


In [8]:
df['Mileage'].fillna(df['Mileage'].mean(), inplace=True)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Mileage'].fillna(df['Mileage'].mean(), inplace=True)


Unnamed: 0.1,Unnamed: 0,Make,Year of manufacture,Condition,Mileage,Engine Size,Fuel,Transmission,Price
0,0,Toyota,2007.0,Nigerian Used,166418.0,2400.0,Petrol,Automatic,3120000
1,1,Lexus,,,138024.0,,,Automatic,5834000
2,2,Mercedes-Benz,2008.0,Nigerian Used,376807.0,3000.0,Petrol,Automatic,3640000
3,3,Lexus,,,213362.0,,,Automatic,3594000
4,4,Mercedes-Benz,,,106199.0,,,Automatic,8410000
...,...,...,...,...,...,...,...,...,...
4090,4090,Honda,2004.0,Nigerian Used,207446.0,3500.0,Petrol,Automatic,1125000
4091,4091,Toyota,2005.0,Nigerian Used,106914.0,1800.0,Petrol,Automatic,2643750
4092,4092,Honda,2006.0,Nigerian Used,247149.0,1800.0,Petrol,Automatic,1462500
4093,4093,Toyota,2007.0,Nigerian Used,249325.0,2500.0,Petrol,Automatic,2475000


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4095 entries, 0 to 4094
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           4095 non-null   int64  
 1   Make                 4095 non-null   object 
 2   Year of manufacture  3617 non-null   float64
 3   Condition            3616 non-null   object 
 4   Mileage              4095 non-null   float64
 5   Engine Size          3584 non-null   float64
 6   Fuel                 3607 non-null   object 
 7   Transmission         4075 non-null   object 
 8   Price                4095 non-null   object 
dtypes: float64(3), int64(1), object(5)
memory usage: 288.1+ KB


In [10]:
df['Condition'].unique()

array(['Nigerian Used', nan, 'Foreign Used', 'Brand New'], dtype=object)

In [11]:
df['Condition'].fillna(df['Condition'].mode()[0], inplace=True)
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Condition'].fillna(df['Condition'].mode()[0], inplace=True)


Unnamed: 0.1,Unnamed: 0,Make,Year of manufacture,Condition,Mileage,Engine Size,Fuel,Transmission,Price
0,0,Toyota,2007.0,Nigerian Used,166418.0,2400.0,Petrol,Automatic,3120000
1,1,Lexus,,Nigerian Used,138024.0,,,Automatic,5834000
2,2,Mercedes-Benz,2008.0,Nigerian Used,376807.0,3000.0,Petrol,Automatic,3640000
3,3,Lexus,,Nigerian Used,213362.0,,,Automatic,3594000
4,4,Mercedes-Benz,,Nigerian Used,106199.0,,,Automatic,8410000
...,...,...,...,...,...,...,...,...,...
4090,4090,Honda,2004.0,Nigerian Used,207446.0,3500.0,Petrol,Automatic,1125000
4091,4091,Toyota,2005.0,Nigerian Used,106914.0,1800.0,Petrol,Automatic,2643750
4092,4092,Honda,2006.0,Nigerian Used,247149.0,1800.0,Petrol,Automatic,1462500
4093,4093,Toyota,2007.0,Nigerian Used,249325.0,2500.0,Petrol,Automatic,2475000


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4095 entries, 0 to 4094
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           4095 non-null   int64  
 1   Make                 4095 non-null   object 
 2   Year of manufacture  3617 non-null   float64
 3   Condition            4095 non-null   object 
 4   Mileage              4095 non-null   float64
 5   Engine Size          3584 non-null   float64
 6   Fuel                 3607 non-null   object 
 7   Transmission         4075 non-null   object 
 8   Price                4095 non-null   object 
dtypes: float64(3), int64(1), object(5)
memory usage: 288.1+ KB


### Handling Duplicates

In [13]:
duplicates = df.duplicated().sum()
print(f"\nNumber of duplicate rows: {duplicates}")


Number of duplicate rows: 0


In [14]:
duplicate_rows = df[df.duplicated(keep=False)]
duplicate_rows

Unnamed: 0.1,Unnamed: 0,Make,Year of manufacture,Condition,Mileage,Engine Size,Fuel,Transmission,Price


In [15]:
df.drop_duplicates(inplace=True)
df

Unnamed: 0.1,Unnamed: 0,Make,Year of manufacture,Condition,Mileage,Engine Size,Fuel,Transmission,Price
0,0,Toyota,2007.0,Nigerian Used,166418.0,2400.0,Petrol,Automatic,3120000
1,1,Lexus,,Nigerian Used,138024.0,,,Automatic,5834000
2,2,Mercedes-Benz,2008.0,Nigerian Used,376807.0,3000.0,Petrol,Automatic,3640000
3,3,Lexus,,Nigerian Used,213362.0,,,Automatic,3594000
4,4,Mercedes-Benz,,Nigerian Used,106199.0,,,Automatic,8410000
...,...,...,...,...,...,...,...,...,...
4090,4090,Honda,2004.0,Nigerian Used,207446.0,3500.0,Petrol,Automatic,1125000
4091,4091,Toyota,2005.0,Nigerian Used,106914.0,1800.0,Petrol,Automatic,2643750
4092,4092,Honda,2006.0,Nigerian Used,247149.0,1800.0,Petrol,Automatic,1462500
4093,4093,Toyota,2007.0,Nigerian Used,249325.0,2500.0,Petrol,Automatic,2475000


### Removing Outliers

In [17]:
# Define a function to calculate Z-scores and remove outliers
def remove_outliers_manual(df, column):
    # Calculate menan and standard deviation
    mean = df[column].mean()
    std_dev = df[column].std()

    # Calculate Z-scores manually
    z_scores = np.abs((df[column] - mean) / std_dev)

    # Filter the rows with Z-scores less than 3
    df_no_outliers = df[z_scores < 3]

    return df_no_outliers

In [18]:
df = remove_outliers_manual(df, 'Mileage')
df

Unnamed: 0.1,Unnamed: 0,Make,Year of manufacture,Condition,Mileage,Engine Size,Fuel,Transmission,Price
0,0,Toyota,2007.0,Nigerian Used,166418.0,2400.0,Petrol,Automatic,3120000
1,1,Lexus,,Nigerian Used,138024.0,,,Automatic,5834000
2,2,Mercedes-Benz,2008.0,Nigerian Used,376807.0,3000.0,Petrol,Automatic,3640000
3,3,Lexus,,Nigerian Used,213362.0,,,Automatic,3594000
4,4,Mercedes-Benz,,Nigerian Used,106199.0,,,Automatic,8410000
...,...,...,...,...,...,...,...,...,...
4090,4090,Honda,2004.0,Nigerian Used,207446.0,3500.0,Petrol,Automatic,1125000
4091,4091,Toyota,2005.0,Nigerian Used,106914.0,1800.0,Petrol,Automatic,2643750
4092,4092,Honda,2006.0,Nigerian Used,247149.0,1800.0,Petrol,Automatic,1462500
4093,4093,Toyota,2007.0,Nigerian Used,249325.0,2500.0,Petrol,Automatic,2475000


In [19]:
df['Condition'].unique()

array(['Nigerian Used', 'Foreign Used', 'Brand New'], dtype=object)

In [20]:
df['Fuel'].unique()

array(['Petrol', nan, 'Diesel', 'Hybrid', 'Electric'], dtype=object)

In [21]:
df['Make'].unique()

array(['Toyota', 'Lexus', 'Mercedes-Benz', 'Fiat', 'Land Rover', 'Foton',
       'JAC', 'Acura', 'Tata', 'Saturn', 'Porsche', 'Lincoln', 'GMC',
       'Renault', 'Mini', 'Ford', 'Buick', 'Cadillac', 'Audi', 'Infiniti',
       'Jeep', 'Peugeot', 'Dodge', 'Jaguar', 'Seat', 'Isuzu', 'Skoda',
       'Nissan', 'Hyundai', 'Pontiac', 'Hummer', 'Chevrolet', 'MG',
       'Honda', 'Chrysler', 'Kia', 'SsangYong', 'Mazda', 'Changan',
       'Mitsubishi', 'Suzuki', 'BMW', 'Rover', 'Citroen', 'Volkswagen',
       'Volvo', 'Opel', 'Bentley', 'Daihatsu'], dtype=object)

In [22]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()

In [None]:
# List of categoricl columns to encode
categorical_cols = ['Make', 'Condition', 'Fuel', 'Transmission']

# Apply Label Encoding to each categorical column
for col in categorical_col:
    df[col] = 