In [None]:
# First we import all the necessary packages
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Load the cleaned dataset from its pickle file into `clean_data`.
# NOTE(review): the path below is absolute and machine-specific, so the notebook
# only runs as-is where this exact folder exists; pickle files should also only
# be loaded from trusted sources.
cleaned_data_path = (
    "C:/Users/gongi/miniconda3/envs/insy6500/my_repo/projects/final_project/data/cleaned_data.pkl"
)
clean_data = pd.read_pickle(cleaned_data_path)

In [None]:
# 4. Statistical EDA & 5. Transformation & Features
# Print the goal of the EDA phase followed by the guiding questions, one per line.
eda_intro_lines = (
    "Our objective is to understand the dataset and learn from it. Thats why we ask ourselves the following questions which we will try to answer in the EDA phase.",
    "- What numerical characteristics are related to one another. To do this we will do a correlation heatmap?",
    "- How many cars have been produced each year. Does it follow a trend? Is there any year with more cars?",
    "- Is there a relationship between the type of transmission of the car and its horsepower?",
    "- What brands and models are more popular and why?",
    "- What is the relationship between HP and MPG?",
    "- What is the relationship between price and the category of the car (luxury, performance, etc.)?",
    "- What other factors affect price",
)
for intro_line in eda_intro_lines:
    print(intro_line)
# Correlation Heatmap
print(
    "A great way to start the analysis is to make a correlation heatmap. For this we have to select only the numerical data."
)
# Only float64/int64 columns take part in the correlation.
# NOTE(review): later statements add derived float columns (Popularity_norm,
# MSRP_log, MSRP_log_norm, Pop_to_Price_Ratio) to clean_data, so re-running this
# without reloading the data would include them in the heatmap.
numeric_data = clean_data.select_dtypes(include=["float64", "int64"])
correlation_matrix = numeric_data.corr()

# Annotated heatmap of the pairwise correlations, two decimals per cell.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(6, 4))
sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm", fmt=".2f", ax=heatmap_ax)
heatmap_ax.set_title("Correlation Heatmap")
plt.show()

# Written interpretation of the heatmap, printed in order, then a blank line.
heatmap_conclusions = (
    "From this graph we can come to many conclusions like: \n-More cyliders mean more horsepower.",
    "-The more HP, the lower the MPG is in both the city and the highway, since the engine uses more fuel to create that extra HP\n-The more HP the engine has, the higher is the retail price for the car (MSRP)",
    "-We can also see there is a strong correlation between the MPG in the city and in the highway, which is obvious.",
    "-Finally, we can see that the popularity score of the cars isn't related to any metrics in this dataset. Maybe it is because popularity is more related to a variable not included in the dataset like the appearance of the car.",
    "",
)
for heatmap_conclusion in heatmap_conclusions:
    print(heatmap_conclusion)
print("Now we will take a look at the cars per year, to see when have the most cars been produced.")

# Histogram of the "Year" column drawn on an explicit axes with 28 bins.
fig, ax = plt.subplots(figsize=(10, 4))
clean_data["Year"].hist(ax=ax, bins=28, alpha=0.7, edgecolor="black")
ax.set(xlabel="Year", ylabel="Frequency", title="Distribution of cars per year")
fig.tight_layout()
plt.show()

print(
    "We notice that the number of manufactured cars increases slowly from 1990, and that the majority of the cars in this dataset are from the years 2015, 2016 and 2017."
)
print("")


# Transmission vs Engine HP
print(
    "We can now take a look at the relationship between the transmission type and the engine HP. "
)

# One violin per transmission type showing the distribution of engine HP.
violin_fig, violin_ax = plt.subplots(figsize=(10, 4))
sns.violinplot(data=clean_data, x="Transmission Type", y="Engine HP", ax=violin_ax)
plt.show()

# Written interpretation of the violin plot, printed in order.
transmission_conclusions = (
    "From this graph we can come to a few conclusions: \n-Direct drive type transmsion is only used on cars with low HP, this is because direct drive connects the engine directly to the driven part, which would probably break in cars with high HP.",
    "-We can also see how between manual and automatic cars, the automatic option is more used when the car has higher HP. This is probably because both high HP and manual transmission are features of high-end models and it wouldnt make much sense to have one without the other.",
    "-Finally, we can say that the cars with the most HP almost always use automated-manual transmission. This is probably because cars with that much HP are sports cars directed for people who love driving and they will maybe want to use the manual function to get have a better and more authentic driving experience.",
)
for transmission_conclusion in transmission_conclusions:
    print(transmission_conclusion)


# Popularity of cars
print("")
print("Now we will analyze the popularity of the cars:")

# Mean popularity score per brand, sorted from highest to lowest so the bar
# chart lists the brands in that order.
plot_data = (
    clean_data.groupby("Make", observed=True)["Popularity"]
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
my_order = plot_data["Make"]

# Exactly one figure is opened for the chart: a second, never-drawn-on figure
# would be rendered by plt.show() as an extra blank output.
plt.figure(figsize=(8, 10))
sns.barplot(x="Popularity", y="Make", data=plot_data, order=my_order)
plt.title("Popularity Score by Brand (Ordered)")
plt.xlabel("Popularity Score")
plt.show()


print("We notice that the majority of the cars are between 0 and 2000 popualrity.")
print("We take a look at the most popular car makers.")

# Keep only the cars whose popularity rating is higher than 3000 and count how
# many of them each brand contributes.
data_popular_cars = clean_data[clean_data["Popularity"] > 3000]
brands = data_popular_cars["Make"].value_counts()
# Drop brands with no car above the threshold.
# NOTE(review): presumably "Make" is categorical, in which case value_counts
# also lists brands with a zero count - confirm.
popular_brands = brands[brands > 0]
print(popular_brands)
print(
    "We can see that the 3 most popular car brands are Ford, BMW and Audi, with Ford being the most popular out of all of them. We will now take a look at the most popular model for this brand, to find out what makes it so special."
)

# Number of highly popular Ford rows per model; only models with more than 100
# such rows are printed.
ford_popular_cars = data_popular_cars[data_popular_cars["Make"] == "Ford"]
popular_ford = ford_popular_cars.groupby("Model", observed=True)["Popularity"].count()
print("")
print(popular_ford[popular_ford > 100])
print("\nWe find out that the most popular car is the ford F-150")

print("")


# Popularity to Price Ratio by Car Brand
print(
    "We found that Ford is the most popular brand in general. However, we want to integrate the MSRP in the popularity analysis since we suspect that the price of the car will influece the perception and expectations of the consumer. We will normalize the values of Popularity and MSRP and take the ratio of Popularity to Price. This will tell us how popular the brands are compared to their price."
)

# Min-max scale Popularity: the smallest value maps to 0 and the largest to 1.
popularity_scores = clean_data["Popularity"]
popularity_range = popularity_scores.max() - popularity_scores.min()
clean_data["Popularity_norm"] = (popularity_scores - popularity_scores.min()) / popularity_range

# MSRP is passed through log1p first and the logged values are then min-max
# scaled the same way; the cheapest car therefore gets MSRP_log_norm == 0.
clean_data["MSRP_log"] = np.log1p(clean_data["MSRP"])
logged_msrp = clean_data["MSRP_log"]
logged_msrp_range = logged_msrp.max() - logged_msrp.min()
clean_data["MSRP_log_norm"] = (logged_msrp - logged_msrp.min()) / logged_msrp_range


def safe_ratio(row):
    """Return the popularity-to-price ratio of one row, or None if undefined.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or a dict)
        Must provide the keys "Popularity_norm" and "MSRP_log_norm".

    Returns
    -------
    float or None
        ``row["Popularity_norm"] / row["MSRP_log_norm"]``, or None when the
        denominator is zero, a key is missing, or the values cannot be divided.
    """
    try:
        numerator = row["Popularity_norm"]
        denominator = row["MSRP_log_norm"]
        # NumPy floats (what DataFrame rows hold) do not raise
        # ZeroDivisionError: x / 0 silently becomes inf or nan. The min-max
        # scaling gives the cheapest car a denominator of exactly 0, so the
        # zero case has to be checked explicitly.
        if denominator == 0:
            return None
        return numerator / denominator
    except (KeyError, TypeError, ZeroDivisionError):
        # Missing column or non-numeric value: no ratio can be computed.
        return None


def _plot_ratio_by_make(ratio_table, make_order, title):
    """Draw a bar chart of the mean popularity-to-price ratio per brand.

    ratio_table needs the columns "Make" and "Pop_to_Price_Ratio"; make_order
    fixes the order of the bars and title is used as the figure title.
    """
    plt.figure(figsize=(12, 6))
    sns.barplot(x="Make", y="Pop_to_Price_Ratio", data=ratio_table, order=make_order)
    plt.xticks(rotation=90)
    plt.title(title)
    plt.xlabel("Car Maker")
    plt.ylabel("Average Popularity / MSRP (normalized)")
    plt.show()


# Row-wise ratio of normalized popularity to normalized log-price.
clean_data["Pop_to_Price_Ratio"] = clean_data.apply(safe_ratio, axis=1)

# Mean ratio per brand, highest first.
mean_ratio_by_make = clean_data.groupby("Make", observed=True)["Pop_to_Price_Ratio"].mean()
maker_ratio = mean_ratio_by_make.sort_values(ascending=False).reset_index()
my_order = maker_ratio["Make"]

# Plot of Popularity to price ratio
_plot_ratio_by_make(maker_ratio, my_order, "Average Popularity-to-Price Ratio by Car Brand")


print(
    "We can see that Plymouth is the car brand with the highest popularity to price ratio. However, after a deeper analysis we found that plymouth has low values for both popularity, and price. This creates a flase sense of high popularity. We will analyze the results afetr removing Plymouth from the dataset."
)

# Plot of popularity to price ratio after removing plymouth
cars_without_plymouth = clean_data[clean_data["Make"] != "Plymouth"]
mean_ratio_no_plymouth = cars_without_plymouth.groupby("Make", observed=True)[
    "Pop_to_Price_Ratio"
].mean()
maker_ratio_no_plymouth = mean_ratio_no_plymouth.sort_values(ascending=False).reset_index()
my_order = maker_ratio_no_plymouth["Make"]
_plot_ratio_by_make(
    maker_ratio_no_plymouth,
    my_order,
    "Average Popularity-to-Price Ratio by Car Brand (Plymouth removed)",
)

print(
    "After removing Plymouth from the data set we see that the tree brands with the highest Popularity to price ratio are Ford, Dodge, and Toyota. Therefore, Ford remains as the most popular car brand."
)


# Scatter plot Engine HP vs MPG
print(
    "Now we will make a scatter plot to see the relationship between the cars HP and its MPG in more detail. "
)

# Rows whose fuel type is "electric" are left out of this plot.
not_electric = clean_data["Engine Fuel Type"] != "electric"
gas_cars = clean_data[not_electric]

# Semi-transparent scatter with a red second-order polynomial fit; the y axis
# is limited to the 0-60 range.
hp_mpg_fig, hp_mpg_ax = plt.subplots(figsize=(12, 6))
sns.regplot(
    x="Engine HP",
    y="highway MPG",
    data=gas_cars,
    order=2,
    scatter_kws={"alpha": 0.3},
    line_kws={"color": "red"},
    ax=hp_mpg_ax,
)
hp_mpg_ax.set_title("HP vs MPG")
hp_mpg_ax.set_ylim(0, 60)
plt.show()

print(
    "As we saw earlier on the correlation heatmap, there is a significant correlation between HP and the MPG. However this graph shows a curve, meaning that going from 100 HP to 200 HP will probably have a big impact on MPG but if we go from 500 HP to 600 HP the MPG is not gonna be really affected. "
)

# PRICE
print("\n##PRICE##")
print("Now we will take a look at how the cateogry of the car affects price.")

# Give every market category of a car its own row with explode(), working on a
# copy of the data.
# NOTE(review): no split or dropna step runs here, so this assumes
# "Market Category" already holds list-like values - confirm.
cat_clean = clean_data.copy()
cat_exploded = cat_clean.explode("Market Category")

# Categories ordered by median MSRP, most expensive first, so the plot is sorted.
median_msrp_by_category = cat_exploded.groupby("Market Category")["MSRP"].median()
sorted_order = median_msrp_by_category.sort_values(ascending=False).index

# Horizontal box plot per category; showfliers=False hides the outlier points
# (the very expensive cars) from the drawing.
category_fig, category_ax = plt.subplots(figsize=(12, 8))
sns.boxplot(
    data=cat_exploded,
    x="MSRP",
    y="Market Category",
    hue="Market Category",
    order=sorted_order,
    palette="magma",
    showfliers=False,
    legend=False,
    ax=category_ax,
)
category_ax.set_title("Price Ranges by Market Category")
category_ax.set_xlabel("Price (USD)")
category_ax.set_ylabel("Market Category")
plt.show()

for category_conclusion in (
    "From this plot we learn that the category of a car is a great indicator for its price. We cleary see that exotic cars are on average the most expensive ones, followed by high-pperformance, factory-tunner and luxury cars. We can also see that the cheapest cars are the hatchback cars.",
    "We know that the engine HP (from the correlation heatmap) and the category of the cars are two great indicators of price. We will take a look at the rest of cateogrical data to see if any of them is related to price.",
):
    print(category_conclusion)


# 2x2 grid so the four price box plots can be compared side by side.
fig, axes = plt.subplots(2, 2, figsize=(14, 11))
(transmission_ax, wheels_ax), (size_ax, style_ax) = axes

# Top row: vertical box plots with the category names rotated for readability.
top_row_panels = (
    ("Transmission Type", "Price by Transmission", transmission_ax),
    ("Driven_Wheels", "Price by Driven Wheels", wheels_ax),
)
for panel_column, panel_title, panel_ax in top_row_panels:
    sns.boxplot(x=panel_column, y="MSRP", data=clean_data, showfliers=False, ax=panel_ax)
    panel_ax.set_title(panel_title)
    panel_ax.tick_params(axis="x", rotation=45)

# Bottom left: vehicle sizes in a fixed Compact -> Midsize -> Large order.
sns.boxplot(
    data=clean_data,
    x="Vehicle Size",
    y="MSRP",
    order=["Compact", "Midsize", "Large"],
    showfliers=False,
    ax=size_ax,
)
size_ax.set_title("Price by Vehicle Size")

# Bottom right: horizontal boxes, with the vehicle styles on the y axis.
sns.boxplot(data=clean_data, x="MSRP", y="Vehicle Style", showfliers=False, ax=style_ax)
style_ax.set_title("Price by Vehicle Style")

fig.tight_layout()
plt.show()

# Written interpretation of the four panels, printed in order.
price_conclusions = (
    "From this graphs we can make different conclusions:",
    "-From the first graph that relates price vs transmission type we can see that the automated_manual category is generally the one with the most expensive cars. This makes sense as the type of transimission is used in high end cars with high HP. We can also see that manual transimission cars are generally the cheapest.",
    "-From the second plot we can see that see that rear wheel drive and all wheel drive are generally the cars with the highest prices while front wheel drive cars are the cheapest ones.",
    "-From the third graph realating price vs vehicle size we can clearly see that the bigger the car, the more expensive it is.",
    "-Finally, from the last plot we conclude that coupes and convertibles are generally the most expensice type of cars. 2dr cars, cargo vans, covertible suvs and cab pick ups are generally the cheapest type of cars.",
    "\nFrom this analysis we can conclude for example that a large, exotic high-performance car with high HP, automated-manual transmission, rear-wheel drive and convertible is gonna be more expensive than a hatchback, manual, front wheel drive, compact 2 wheel, 2dr SUV car..",
    "It would be very interesting to predict the price of a car based on its  categories, however, that is a machine learning project.",
)
for price_conclusion in price_conclusions:
    print(price_conclusion)