# Aircraft Price Analysis

This notebook contains a comprehensive analysis of aircraft prices and their determining factors.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
import plotly.express as px
import plotly.graph_objects as go

## 1. Data Loading and Cleaning

In [None]:
def load_data(path='../data/airplane_price_dataset.csv'):
    """Load the raw aircraft price dataset.

    Args:
        path: Location of the CSV file to read. Defaults to the dataset
            path used by this notebook, so ``load_data()`` behaves as before.

    Returns:
        A DataFrame with the CSV contents exactly as parsed by
        ``pd.read_csv`` (no renaming or cleaning is applied here).
    """
    return pd.read_csv(path)

def clean_data(df):
    """Rename the dataset's columns to English and drop IQR outliers.

    A row is kept only if, for every column in ``numerical_cols``, its value
    lies within ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``. All fences are computed
    on the same (complete) frame and applied together, so the result does not
    depend on the order in which the columns are checked. Rows with a missing
    value in any of these columns are dropped as well, because a missing
    value never falls inside the fences.

    Args:
        df: Raw DataFrame; columns may carry the original Turkish names or
            the English names already.

    Returns:
        A new DataFrame with English column names and outlier rows removed.
        The input frame is not modified.

    Raises:
        KeyError: If one of the numerical columns is missing after renaming.
    """
    df_clean = df.copy()

    # Rename columns to English
    column_names = {
        'Üretim Yılı': 'Production_Year',
        'Motor Sayısı': 'Number_of_Engines',
        'Motor Türü': 'Engine_Type',
        'Kapasite': 'Capacity',
        'Menzil (km)': 'Range_km',
        'Yakıt Tüketimi (L/saat)': 'Fuel_Consumption_Lph',
        'Saatlik Bakım Maliyeti ($)': 'Hourly_Maintenance_Cost',
        'Yaş': 'Age',
        'Satış Bölgesi': 'Sales_Region',
        'Fiyat ($)': 'Price_USD'
    }
    df_clean = df_clean.rename(columns=column_names)

    # Remove outliers using IQR method
    numerical_cols = ['Price_USD', 'Capacity', 'Range_km', 'Fuel_Consumption_Lph', 'Hourly_Maintenance_Cost']

    # Accumulate one mask instead of filtering inside the loop: filtering
    # in-loop would make later columns' quartiles depend on rows already
    # removed by earlier columns.
    keep = pd.Series(True, index=df_clean.index)
    for col in numerical_cols:
        q1 = df_clean[col].quantile(0.25)
        q3 = df_clean[col].quantile(0.75)
        iqr = q3 - q1
        # between() is inclusive on both ends, matching >= lower and <= upper.
        keep &= df_clean[col].between(q1 - 1.5 * iqr, q3 + 1.5 * iqr)

    return df_clean[keep]

## 2. Feature Engineering

In [None]:
def _quartile_category(values, labels, reference_values=None):
    """Label each value with the quartile bin it falls into.

    Without ``reference_values`` the quartiles come from ``values`` itself
    (``pd.qcut``). With ``reference_values`` the three inner quartile edges
    come from that series and the outer bins are open-ended, so values outside
    the reference range still receive the first or last label.
    """
    if reference_values is None:
        return pd.qcut(values, q=4, labels=labels)
    inner_edges = reference_values.quantile([0.25, 0.5, 0.75]).tolist()
    return pd.cut(values, bins=[-np.inf, *inner_edges, np.inf], labels=labels)


def engineer_features(df, reference=None):
    """Create new features for analysis.

    Added columns:
        Cost_Per_Seat: ``Price_USD / Capacity`` (only if ``Price_USD`` exists).
        Maintenance_Cost_Per_Seat: ``Hourly_Maintenance_Cost / Capacity``.
        Fuel_Efficiency: ``Fuel_Consumption_Lph / Capacity``, i.e. fuel burn
            per seat.
        Range_Efficiency: ``Range_km / Fuel_Consumption_Lph``.
        Price_Per_Range: ``Price_USD / Range_km`` (only if ``Price_USD``
            exists).
        Operational_Cost_Index: ``(Hourly_Maintenance_Cost +
            Fuel_Consumption_Lph * 100) / Capacity``.
        Age_Factor: ``exp(-0.05 * Age)``.
        Size_Category / Range_Category: quartile bins of ``Capacity`` and
            ``Range_km``.

    Args:
        df: Cleaned DataFrame with English column names. ``Price_USD`` is
            optional so that rows with an unknown price can be processed.
        reference: Optional DataFrame whose ``Capacity`` and ``Range_km``
            quartiles define the category bins. Pass the training data here
            when engineering new rows so they are binned consistently with it
            (quartiles cannot be derived from a single row). When omitted,
            the quartiles are taken from ``df`` itself.

    Returns:
        A copy of ``df`` with the engineered columns appended; the input
        frame is not modified.
    """
    df = df.copy()
    # The two price-derived columns are skipped when the price is unknown.
    has_price = 'Price_USD' in df.columns

    # Cost per seat
    if has_price:
        df['Cost_Per_Seat'] = df['Price_USD'] / df['Capacity']

    # Maintenance cost per seat
    df['Maintenance_Cost_Per_Seat'] = df['Hourly_Maintenance_Cost'] / df['Capacity']

    # Fuel efficiency
    df['Fuel_Efficiency'] = df['Fuel_Consumption_Lph'] / df['Capacity']

    # Range efficiency
    df['Range_Efficiency'] = df['Range_km'] / df['Fuel_Consumption_Lph']

    # Price per range km
    if has_price:
        df['Price_Per_Range'] = df['Price_USD'] / df['Range_km']

    # Operational cost index
    # NOTE(review): the factor 100 presumably converts fuel volume into a
    # cost; its origin is not documented here, confirm before reusing it.
    df['Operational_Cost_Index'] = (df['Hourly_Maintenance_Cost'] +
                                    df['Fuel_Consumption_Lph'] * 100) / df['Capacity']

    # Age factor
    df['Age_Factor'] = np.exp(-0.05 * df['Age'])

    # Categories
    df['Size_Category'] = _quartile_category(
        df['Capacity'],
        ['Small', 'Medium', 'Large', 'Extra Large'],
        None if reference is None else reference['Capacity'],
    )
    df['Range_Category'] = _quartile_category(
        df['Range_km'],
        ['Short', 'Medium', 'Long', 'Ultra Long'],
        None if reference is None else reference['Range_km'],
    )

    return df

## 3. Load and Prepare Data

In [None]:
# Run the preparation pipeline: raw CSV -> cleaned rows -> engineered features.
# Each intermediate frame keeps its own name so it can be inspected on its own.
df_raw = load_data()
df_clean = clean_data(df_raw)
df = engineer_features(df_clean)

# Preview the first five engineered rows
df.head(5)

## 4. Exploratory Data Analysis

In [None]:
# Box plot of the sale price for each aircraft model
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(x='Model', y='Price_USD', data=df, ax=ax)
# Tilt the model names so neighbouring labels do not overlap
plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_title('Price Distribution by Aircraft Model')
plt.show()

In [None]:
# Scatter of price against age; colour encodes the model, marker size the capacity
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x='Age',
    y='Price_USD',
    hue='Model',
    size='Capacity',
    data=df,
    ax=ax,
)
ax.set_title('Price vs Age by Aircraft Model')
plt.show()

## 5. Feature Importance Analysis

In [None]:
# Prepare features for model
numeric_features = [
    'Production_Year', 'Number_of_Engines', 'Capacity', 'Range_km',
    'Fuel_Consumption_Lph', 'Hourly_Maintenance_Cost', 'Age',
    'Cost_Per_Seat', 'Maintenance_Cost_Per_Seat', 'Fuel_Efficiency',
    'Range_Efficiency', 'Price_Per_Range', 'Operational_Cost_Index',
    'Age_Factor'
]

categorical_features = ['Model', 'Sales_Region', 'Size_Category', 'Range_Category']

# Prepare features
X = df[numeric_features].copy()
X_cat = pd.get_dummies(df[categorical_features])
X = pd.concat([X, X_cat], axis=1)
y = df['Price_USD']

# Train model
model = RandomForestRegressor(n_estimators=200, random_state=42)
model.fit(X, y)

# Plot feature importance
feature_importance = pd.DataFrame({
    'feature': X.columns,
    'importance': model.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False).head(10)

plt.figure(figsize=(12, 6))
sns.barplot(data=feature_importance, x='importance', y='feature')
plt.title('Top 10 Most Important Features')
plt.show()

## 6. Price Prediction Example

In [None]:
# Example prediction
example_data = pd.DataFrame({
    'Production_Year': [2020],
    'Number_of_Engines': [2],
    'Capacity': [50],
    'Range_km': [3000],
    'Fuel_Consumption_Lph': [8.42],
    'Hourly_Maintenance_Cost': [2782],
    'Age': [5],
    'Model': ['Bombardier CRJ200'],
    'Sales_Region': ['Asia']
})

# Engineer features for the example
example_data = engineer_features(example_data)

# Prepare features
X_example = example_data[numeric_features].copy()
X_example_cat = pd.get_dummies(example_data[categorical_features])
X_example = pd.concat([X_example, X_example_cat], axis=1)

# Add missing columns
for col in X.columns:
    if col not in X_example.columns:
        X_example[col] = 0

# Make prediction
prediction = model.predict(X_example[X.columns])[0]
print(f"Predicted price: ${prediction:,.2f}")