In [None]:
# First we import all the necessary packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# We will follow the EDA workflow explained in lecture 13a

# 1. Load & Initial Reconnaissance
# Reads the data set and prints a first overview: shape, dtypes, head/tail,
# number of distinct values per candidate categorical column, and summary statistics.
raw_data = pd.read_csv("data/data.csv")

# Columns that are candidates for conversion to a categorical dtype. The list is
# defined once so the printed description and the nunique() selection below
# cannot drift apart.
categorical_candidates = [
    "Make",
    "Model",
    "Year",
    "Engine Fuel Type",
    "Transmission Type",
    "Driven_Wheels",
    "Number of Doors",
    "Market Category",
    "Vehicle Size",
    "Vehicle Style",
]

# After loading the data we perform an initial exploration to understand the structure and content of the data set
print(
    f"An initial exploration of the data shows that there are {raw_data.shape[0]} enties (rows), and {raw_data.shape[1]} attributes (columns). All columns are object, float64 and int64 data types. However, some columns could be converted to categorical variables to make a better use of the data and optimize performace."
)
print(
    f"These columns are: {', '.join(categorical_candidates[:-1])}, and {categorical_candidates[-1]}."
)
print(
    "The only column that represents time data is the year. However, it is no woth converting it to a datetime variable since the analysis can be done considering it a categorical variable."
)
print(
    f"\nWe can also see that the non-null count for some of the variables(columns) is different than the total number of rows ({raw_data.shape[0]}), which indicates that there are missing values. In particular, the columns missing some values are: Engine Fuel Type, Engine HP, Engine Cylinders, Number of Doors, and Market Category.\n"
)
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() would append a stray "None" to the output.
raw_data.info()


# First and last rows
print(
    "\nBy looking at the first and last rows of the data we can deduce several things. Firstly, we notice that the data seems to be ordered by the Model column, since it starts with those models whose first character is a 1 and ends with those whose first character is z."
)
print(
    "We can also see some of values on the different categories, like Transmission Type (manual and automatic), how the Market Category is defined, and the order of magnitude of the Popularity rating."
)
print("\nFirst five rows:")
display(raw_data.head())
print("\nLast five rows:")
display(raw_data.tail())
print("\nNow we explore the number of categories for each variable:\n")
print(raw_data[categorical_candidates].nunique())

# Transmission Type
print(
    "\nWe are surpreised to find that there are five different transmission types, since we only expected to find two (manual and automatic). Below are all the transmission types.\n"
)
print(raw_data["Transmission Type"].value_counts())
print(
    "\nWe have found that the fourth category is 'UNKNOWN', which we consider missing data. This indicates that some of the variables could be storing missing values inside of specific categories. In this case the number of missing values is 19, which is an acceptable amount of data to drop from the set."
)


# Driven wheels
print(
    "\nAs for the driven wheels we were expecting three different values: front, rear and all wheel drive. Below we print all the alternatives.\n"
)
print(raw_data["Driven_Wheels"].value_counts())
print(
    " \nWe have found out that four wheel drive and all wheel drive are different things, and that explains the fourth category. We will later check all the values for the different variables to verify that there are no missing values stored in any category."
)


# Summary statistics of the numeric columns
print("\nNow we perform some basic statistics on the data:")
print(raw_data.describe())
print(
    "\nThere aren't many relevant conclusions that we can derive from this analysis. However, there are some interesting inshights. For example, there is a 1000 HP car, a car with 16 cyclinders, popularity ranges between 2 and 5657 with an average of 1554.9, a car with a highway milage of 354 MPG, and the cars years ranges between 1990 and 2007.\n"
)

In [None]:
# 2. Data Quality Assessment
# Prints the value counts of the low-cardinality columns, recodes the
# "UNKNOWN" transmission type as a missing value, and reports the resulting
# missing-value counts per column.
print(
    "In this phase we will assess the quality of the data. To do so we will explore the values of the different categories, looking for variable categories storing missing values. We will also look for missing values and outliers in the data."
)

# Exploration of the categories and missing values
print(
    "\nFirst, we will analyzie some of the relevant categories in order to find which are the possible values they can take and ensure no missing data are passed to the analysis phase.\n"
)
# The three high-cardinality columns were explored with the lines below; they
# are kept commented out to keep the output readable.
# print(raw_data['Make'].unique())
# print(raw_data['Model'].unique())
# print(raw_data['Market Category'].unique())

# Print the value counts of every remaining candidate column, each followed by
# a blank line as a separator.
for column in [
    "Engine Fuel Type",
    "Transmission Type",
    "Driven_Wheels",
    "Number of Doors",
    "Vehicle Size",
    "Vehicle Style",
]:
    print(raw_data[column].value_counts())
    print()

# The category name quoted in this message must match the value replaced below.
print(
    "Thanks to this analysis we have found that Transmission type contains a category for missing values called 'UNKNOWN' with a total count of 19."
)
raw_data["Transmission Type"] = raw_data["Transmission Type"].replace(
    "UNKNOWN", np.nan
)  # to include UNKNOWN value as missing data
print(
    "After adding that variable value to the list of NaN values, the total count of missing values is:\n"
)
# Only list the columns that actually contain missing values.
print(raw_data.isna().sum()[lambda x: x > 0])
print(
    "\nWe have found that the quality of the data is good, with sufficient information for an analysis, a good organization and structure. However, there are some some missing values that need to be handled."
)
print(
    f"Moreover, the values in the Market Category variable contain multiple classifications. By extracting and splitting those pieces of information a better understanding of the data could be achieved, and the subsequent analysis would be simplified. To better understand this take row 0 as an example: row 0 stores in the Market Category the value: {raw_data.loc[0, 'Market Category']} which could be split into 3 categories."
)
print(
    "In conclusion, data requires to be cleaned and processed to make it adequate for the subsequent analysis."
)

In [None]:
# Break every comma-separated Market Category entry into its individual labels,
# one label per row, trimmed of surrounding whitespace.
split_market_labels = raw_data["Market Category"].str.split(",")
individual_market_labels = split_market_labels.explode().str.strip()
# Distinct labels found across the whole column (missing entries show up as NaN).
Market_Categories = individual_market_labels.unique()
print(
    f"The market Category variable can take any combination of {len(Market_Categories)} values: {Market_Categories}. As expected there is a category storing missing values in the Market Category variable. "
)

In [None]:
# 3. Cleaning Decisions
# Converts the candidate columns to the category dtype, inspects the rows with
# missing values column by column, fills the values that can be justified
# (Dodge RAM 150 transmission, Market Category) and drops the remaining
# incomplete rows into `clean_data`.

# Firstly, we will reassign the variable types to the best suitable type.
raw_data = raw_data.astype(
    {
        "Make": "category",
        "Year": "category",
        "Model": "category",
        "Engine Fuel Type": "category",
        "Transmission Type": "category",
        "Driven_Wheels": "category",
        "Number of Doors": "category",
        "Market Category": "category",
        "Vehicle Size": "category",
        "Vehicle Style": "category",
    }
)

# We confirm that the variable types have been converted. A bare expression in
# the middle of a cell is not shown, so the dtypes are printed explicitly.
print(raw_data.dtypes)

# Now we explore the missing values
print(
    "\nWe can see that there are missing values on six different categories, but the count of missing values is very different. While Engine Fuel Type is only missing 3 values, market category is missing 3742.\n"
)
print(raw_data.isna().sum()[lambda x: x > 0])

print(
    "\n\nNext, we analyze the rows with missing data, in search of any patterns or reasons for that data to be missing, and with the intention to fill the missing values when possible:\n"
)

# IGNORE the following commented lines
# print("Engine Fuel Type:")
# print(raw_data[raw_data['Engine Fuel Type'].isna()])
# print("\nAll the rows missing the value of the Engine Fuel Type correspond to Suzuki Veronas. If we filter all the rows of Suzuki Verona we see that their Engine Fuel Type is regular unleaded in all cases. However, all values missing are from 2004 and it would be risky to assume that prior models were all regular unleaded as well. Therefore, we will simply drop teh missing values\n")
# print(raw_data[raw_data['Model']=='Verona'])

# print("\n\nNumber of doors:")
# print(raw_data[raw_data['Number of Doors'].isna()])
# print("\nMost of the rows missing the value of the Number of Doors correspond to Tesla Model S cars. Since all Tesla Model S cars have 4 doors we will manually correct the missing data replacing it with a 4 ")
# print(raw_data[raw_data['Model']=='Model S'])


# Engine Fuel Type & Number of Doors
print(
    f"Dropping the rows missing the values for engine fuel type and number of doors results in losing 9 rows, which we consider acceptable compared to the size of the data set ({raw_data.shape[0]}) ."
)


# Engine HP
print(
    "\n\nWe suspect that most the rows missing values for Engine HP might be electric, and that most electric vehicles don't have a value for Engine HP in most of the cases."
)
missing_values_hp = raw_data[(raw_data["Engine HP"].isna())]
print(
    f"\nRows missing Engine HP data with sorted by Engine Fuel Type: {missing_values_hp['Engine Fuel Type'].value_counts()[lambda x: x > 0]}"
)
print(
    f"\nTotal amount of rows with 'Electric' as Engine Fuel Type: {(raw_data['Engine Fuel Type'] == 'electric').sum()}"
)
print(
    "This indicates that 44 out of the 66 electric vehicle entries don't include the Engine HP. We will try to correct the missing values, when possible."
)
print(
    "Looking at the raw data for those rows missing Engine HP and 'electric' as Engine Fuel Type we can conclude that see that Tesla Model S and Nissan Leaf take most of the missing values"
)
# observed=True limits the groups to the models actually present in the subset
# (Model is categorical at this point).
print(missing_values_hp.groupby("Model", observed=True).size())
print(
    f"\nIn particular Tesla Model S accounts for {missing_values_hp[missing_values_hp['Model'] == 'Model S'].shape[0]} rows of missing data and Nissan leaf for {missing_values_hp[missing_values_hp['Model'] == 'Leaf'].shape[0]}."
)
# print(raw_data[(raw_data['Engine Fuel Type']=='electric') & (raw_data['Engine HP'].isna() == True)]) #I comment this part of code to simplify output, but it was used to investigate rows with missing data about Engine HP
# print(raw_data[raw_data['Model']=='Model S'][['Make','Model','Engine HP']]) #I comment this out, but it was used to verify that there is no data about Tesla Model S Engine HP
# print(raw_data[raw_data['Model']=='Leaf'][['Make','Model','Engine HP']]) #I comment this out, but it was used to verify that there is no data about Nissan Leaf Engine HP
print(
    "Unfortunately, non of the data enties for Tesla Model S or Nissan Leaf include information about the HP that we could assume equal to all the models, so we will have to drop all rows missing values for Engine HP"
)


# Engine Cylinders
missing_values_cylinders = raw_data[raw_data["Engine Cylinders"].isna()]
print(
    f"\n\nNow we look at the rows missing data about the engine cylinders, and find that Mazda RX-8 accounts for {missing_values_cylinders[missing_values_cylinders['Model'] == 'RX-8'].shape[0]} of the {missing_values_cylinders.shape[0]} total missing."
)
# print(missing_values_cylinders[['Make','Model','Engine Cylinders']]) #I comment this part of the code to simplify the output, but it was used to investigate rows missing data about Engine Cylinders
print(missing_values_cylinders.groupby("Model", observed=True).size())
# print(raw_data[raw_data['Model']=='RX-8'][['Make','Model','Engine Cylinders']]) #I comment this out, but it was used to verify that there is no data about Mazda RX-8 Engine Cylinders
print(
    "Unfortunately, once again there is no information about this model on the other rows that could be used to replace the NaN values, therefore all rows with missing values for Engince Cylinders will be dropped."
)


# Transmission Type
missing_values_transmission = raw_data[raw_data["Transmission Type"].isna()]
print(
    f"\n\nNow we look at the rows missing data about the transmission type, and find that Dodge RAM 150 accounts for {missing_values_transmission[missing_values_transmission['Model'] == 'RAM 150'].shape[0]} of the {missing_values_transmission.shape[0]} total missing."
)
print(missing_values_transmission.groupby("Model", observed=True).size())
# print(missing_values_transmission[['Make','Model','Transmission Type']]) #I comment this part of the code to simplify the output, but it was used to investigate rows missing data about Transmission Type
# print(raw_data[raw_data['Model']=='RAM 150']) #I comment this out, but it was used to verify that the rows of Dodge RAM 150 could be used to fill missing values on other rows
print(
    "\nIn this case we can see that there are more entries for this car model and they all have similar data. Therefore we will take the risk and assume that the transmission was manual on the rows with missing information for the Dodge RAM 150."
)
# Only the Dodge RAM 150 rows are assumed to be manual. Every other row that is
# missing its transmission type stays NaN and is dropped further below, as
# stated in the conclusion. A blanket fillna("MANUAL") would also relabel the
# other models.
# Assumes "MANUAL" is already one of the Transmission Type categories.
ram_150_missing_transmission = raw_data["Transmission Type"].isna() & (
    raw_data["Model"] == "RAM 150"
)
raw_data.loc[ram_150_missing_transmission, "Transmission Type"] = "MANUAL"
# print(raw_data[raw_data['Model']=='RAM 150'][['Make','Model','Transmission Type']]) #verify that the change was made


# Market category
# The placeholder category is registered only once: add_categories raises a
# ValueError if the category already exists, which would break a re-run of
# this cell.
if "NO CATEGORY" not in raw_data["Market Category"].cat.categories:
    raw_data["Market Category"] = raw_data["Market Category"].cat.add_categories(
        "NO CATEGORY"
    )
print(
    f"\n\nSince the missing values in the Market Category column are considerable ({raw_data['Market Category'].isna().sum()}) and the different unique possible values of this variable are too many ({raw_data['Market Category'].nunique()}) a new category will be created and assigned to those missing values."
)
raw_data["Market Category"] = raw_data["Market Category"].fillna("NO CATEGORY")


# Clean data
print(
    "\n\nIn conclusion, to clean the data we have dediced to drop the missing values for all the variables except for those missing the Transmission Type corresponding to Dodge RAM 150, and those for the Market Category, which will be reassigned to a new category. We've demonstated that we can handle missing values by dropping them or replacing them when adequate."
)
# Rows still missing any of these columns could not be filled and are removed.
clean_data = raw_data.dropna(
    subset=[
        "Engine Fuel Type",
        "Engine HP",
        "Engine Cylinders",
        "Transmission Type",
        "Number of Doors",
    ]
)

In [None]:
# Look at the rows behind two extreme values in the cleaned data: the
# 16-cylinder engine and the 354 MPG highway mileage. The 354 MPG rows are then
# removed from clean_data.
print("We also want to explore some of the potential outliers:")
sixteen_cylinder_rows = clean_data[clean_data["Engine Cylinders"] == 16]
print(sixteen_cylinder_rows)
print()
extreme_highway_mpg_rows = clean_data[clean_data["highway MPG"] == 354]
print(extreme_highway_mpg_rows)
# print(raw_data[raw_data['Model']=='A6'][['Make','Model','highway MPG']]) #I comment this line of code, but it was used to verify that there was an outlier
print(
    "\nAs we suspected, there is an outlier. It corresponds to an Audi A6 with a highway mileage of 354 MPG. When compared to other Audi A6 milage we see that the outlier is 10 times higher. We dediced to remove this row."
)
# Drop the rows selected above by their index labels.
clean_data = clean_data.drop(extreme_highway_mpg_rows.index)

print(
    "\nWe are aware that there might be more outliers. A visual analysis of the data is required to continue with the data exploration"
)

In [None]:
# 5. Transformation
