In [None]:
import pandas as pd
from plotnine import ggplot, aes, geom_point, labs, scale_color_manual, theme_minimal, theme, element_text, element_rect, element_line


In [None]:
# Columns that must be present for a row to be usable downstream
columns_to_check = ['flipper_length_mm', 'body_mass_g', 'bill_length_mm']

# Read the penguins dataset and, in the same step, discard every row that has
# a NaN in any of the columns listed above.
# NOTE(review): the file name is spelled "penglings.csv" -- confirm it matches
# the file on disk.
data = pd.read_csv("penglings.csv").dropna(subset=columns_to_check)

In [None]:
# Min-max normalize the bill length: (value - min) / (max - min)
min_bill_length = data["bill_length_mm"].min()
max_bill_length = data["bill_length_mm"].max()
data["normalized_bill_length"] = (data["bill_length_mm"] - min_bill_length) / (max_bill_length - min_bill_length)
data["scaled_sizes"] = data["normalized_bill_length"]

# Set the 'sizes' column from 'scaled_sizes', replacing NaN with a default of 0.
# NaN appears here when every bill length is identical, because the
# normalization above then divides 0 by 0.
# The filled Series is assigned back to the column rather than calling
# fillna(..., inplace=True) on the column selection: that chained in-place form
# emits a FutureWarning in pandas 2.x and leaves the DataFrame unchanged under
# Copy-on-Write.
data["sizes"] = data["scaled_sizes"].fillna(0)

# Create scatter plot using plotnine: body mass against flipper length,
# one colour per species. Every point is drawn at a fixed size of 4.
scatter_plot = (
    ggplot(data, aes(x="flipper_length_mm", y="body_mass_g", color="species"))
    + geom_point(size=4)
    + scale_color_manual(values={"Adelie": "orange", "Gentoo": "green", "Chinstrap": "purple"})
    # The legend title text belongs in labs(): it is the label of the `color`
    # aesthetic. element_text() only styles text, and its first positional
    # argument is the font family, so element_text("Species") requested a font
    # called "Species" instead of titling the legend.
    + labs(x="Flipper Length (mm)", y="Body Mass (g)", color="Species")
    + theme_minimal()
    + theme(
        panel_background=element_rect(fill="white"),
        axis_text=element_text(color="black"),
        axis_line=element_line(color="black"),
        panel_grid_major=element_line(color="lightgray"),
        panel_grid_minor=element_line(color="lightgray"),
        plot_background=element_rect(fill="white"),
    )
)

# Show the plot in Jupyter Notebook
scatter_plot.draw()