# Data Analysis

This notebook loads the BMW sales dataset (2010–2024), previews it, and computes summary and grouped statistics with small helper functions.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Plot appearance: seaborn "whitegrid" theme and a default figure size of (10, 6)
sns.set_style("whitegrid")
plt.rcParams["figure.figsize"] = (10, 6)

# Read the BMW sales CSV, then report its dimensions and column names
df = pd.read_csv('../datasets/BMW-sales-data-2010-2024.csv')
print(f"Data loaded: {df.shape[0]} rows, {df.shape[1]} columns")
print(f"\nColumns: {df.columns.tolist()}")
print(f"\nFirst rows:")
# Final expression of the cell, so the preview is rendered as the cell output
df.head()

Matplotlib is building the font cache; this may take a moment.


Data loaded: 50000 rows, 11 columns

Columns: ['Model', 'Year', 'Region', 'Color', 'Fuel_Type', 'Transmission', 'Engine_Size_L', 'Mileage_KM', 'Price_USD', 'Sales_Volume', 'Sales_Classification']

First rows:


Unnamed: 0,Model,Year,Region,Color,Fuel_Type,Transmission,Engine_Size_L,Mileage_KM,Price_USD,Sales_Volume,Sales_Classification
0,5 Series,2016,Asia,Red,Petrol,Manual,3.5,151748,98740,8300,High
1,i8,2013,North America,Red,Hybrid,Automatic,1.6,121671,79219,3428,Low
2,5 Series,2022,North America,Blue,Petrol,Automatic,4.5,10991,113265,6994,Low
3,X3,2024,Middle East,Blue,Petrol,Automatic,1.7,27255,60971,4047,Low
4,7 Series,2020,South America,Black,Diesel,Manual,2.1,122131,49898,3080,Low


## Data Loading and Statistics Summary


In [None]:
# ============================================
# EXERCISE: Autocomplete in Cursor
# ============================================
# The exercises below showcase Cursor's tab autocomplete feature
# Tip: Begin typing code, then press TAB to receive suggestions
# Tip: Select a line and press CMD + L to copy the selected lines to the chat

# Step 1: Read the BMW sales CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='../datasets/BMW-sales-data-2010-2024.csv')

# Task 1: Helper function with AI code autocomplete
# ============================================
# Helper function: Calculates statistical summary for numeric columns
def calculate_summary_statistics(df):
    """
    Calculates a statistical summary for the numeric columns of a DataFrame.

    Args:
        df: A pandas DataFrame

    Returns:
        A DataFrame holding the describe() output (count, mean, std, min,
        25%, 50%, 75%, max) with one column per numeric column of df

    Raises:
        ValueError: If df contains no numeric columns
    """
    # Select only numeric columns
    # Tip: Start typing "df.select" and press TAB for autocomplete
    # Tip: When typing "np.number" you will also get autocomplete suggestions
    numeric_df = df.select_dtypes(include=[np.number])

    # Fail early with a specific message: calling describe() on a frame that has
    # no columns left only produces a generic "without columns" error
    if numeric_df.shape[1] == 0:
        raise ValueError("DataFrame has no numeric columns to summarize")

    # Calculate the statistical summary
    # Tip: After select_dtypes() you can type ".describe" and press TAB
    return numeric_df.describe()

# Call the summary helper on the loaded data and print the resulting table
print("\nGeneral Statistics:")
result = calculate_summary_statistics(df=df)
print(result)



General Statistics:
               Year  Engine_Size_L     Mileage_KM      Price_USD  Sales_Volume
count  50000.000000   50000.000000   50000.000000   50000.000000  50000.000000
mean    2017.015700       3.247180  100307.203140   75034.600900   5067.514680
std        4.324459       1.009078   57941.509344   25998.248882   2856.767125
min     2010.000000       1.500000       3.000000   30000.000000    100.000000
25%     2013.000000       2.400000   50178.000000   52434.750000   2588.000000
50%     2017.000000       3.200000  100388.500000   75011.500000   5087.000000
75%     2021.000000       4.100000  150630.250000   97628.250000   7537.250000
max     2024.000000       5.000000  199996.000000  119998.000000   9999.000000


In [None]:
# Task 2: Helper function with CMD+K (Inline Edit)
# ============================================
# Helper function: Groups data and calculates aggregated statistics
def calculate_grouped_statistics(df, group_by_column, agg_columns=None):
    """
    Groups data by a column and calculates aggregated statistics.

    Args:
        df: A pandas DataFrame
        group_by_column: Column to group by (e.g. 'Model', 'Region', 'Fuel_Type')
        agg_columns: List of columns for aggregation (optional; if None, all numeric
            columns are used except the column(s) named in group_by_column)

    Returns:
        A DataFrame with one row per group. The grouping column is restored as a
        regular column by reset_index(), followed by mean, sum and count for each
        aggregated column (two-level column labels: column name, statistic)

    Raises:
        ValueError: If agg_columns is None and df has no numeric column other
            than the grouping column(s)
    """

    # If no columns specified, use all numeric columns
    if agg_columns is None:
        # A numeric grouping column (e.g. 'Year') must not be aggregated itself:
        # its mean/sum within its own group is meaningless. Only string column
        # names are excluded; other kinds of grouping keys are left untouched.
        group_keys = group_by_column if isinstance(group_by_column, list) else [group_by_column]
        excluded = {key for key in group_keys if isinstance(key, str)}
        numeric_columns = df.select_dtypes(include=[np.number]).columns.tolist()
        agg_columns = [column for column in numeric_columns if column not in excluded]

        if not agg_columns:
            raise ValueError(
                "No numeric columns to aggregate besides the grouping column(s)"
            )

    # Group by the specified column and calculate aggregated statistics
    # Tip: Select the following line and press CMD+K for Inline Edit
    # Tip: Describe what you want to implement, e.g. "group by group_by_column, aggregate agg_columns with mean, sum and count, and reset the index"
    grouped = df.groupby(group_by_column)[agg_columns].agg(['mean', 'sum', 'count']).reset_index()

    return grouped

# Call the grouping helper and print its result
# Example: per-model statistics for price and sales volume
result_grouped = calculate_grouped_statistics(
    df,
    group_by_column='Model',
    agg_columns=['Price_USD', 'Sales_Volume'],
)
print("\nGrouped Statistics by Model:")
print(result_grouped)


Grouped Statistics by Model:
       Model     Price_USD                  Sales_Volume                
                      mean        sum count         mean       sum count
0   3 Series  75566.233950  347226845  4595  5066.660065  23281303  4595
1   5 Series  75287.844077  345721780  4592  5029.947517  23097519  4592
2   7 Series  75570.196742  352610538  4666  5097.828118  23786466  4666
3         M3  74841.588715  330275931  4413  5064.512576  22349694  4413
4         M5  74474.930996  333498741  4478  5087.022778  22779688  4478
5         X1  75262.219037  343948341  4570  5121.676149  23406060  4570
6         X3  75016.616856  337349726  4497  5057.933956  22745529  4497
7         X5  74708.116782  335215320  4487  5061.232226  22709749  4487
8         X6  74434.600491  333318141  4478  5060.738276  22661986  4478
9         i3  74800.268081  345427638  4618  5009.495236  23133849  4618
10        i8  75366.270951  347137044  4606  5085.516934  23423891  4606
