In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the dataset through kagglehub and keep whatever dataset_download returns
# (NOTE(review): presumably the local download directory — confirm against kagglehub docs).
mahatiratusher_flight_price_dataset_of_bangladesh_path = kagglehub.dataset_download(
    'mahatiratusher/flight-price-dataset-of-bangladesh'
)

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for file_path in (os.path.join(folder, entry) for entry in names):
        print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

## Title: Bangladesh Flight Prices Dataset

#### Description: This dataset contains flight price information for various domestic and international routes in Bangladesh. It includes details such as airline, departure and arrival times, duration, stops, class, and ticket prices. Useful for price trend analysis and travel cost predictions.

## Import dataset

In [None]:
# Resolve the CSV location. Inside Kaggle the dataset is mounted under /kaggle/input;
# elsewhere that directory does not exist, so fall back to the directory that the
# kagglehub download in the first cell stored in
# `mahatiratusher_flight_price_dataset_of_bangladesh_path`.
csv_name = 'Flight_Price_Dataset_of_Bangladesh.csv'
kaggle_csv = os.path.join('/kaggle/input/flight-price-dataset-of-bangladesh', csv_name)
if os.path.exists(kaggle_csv):
    csv_path = kaggle_csv
else:
    # NOTE(review): assumes the CSV sits at the top level of the kagglehub
    # download directory — confirm.
    csv_path = os.path.join(mahatiratusher_flight_price_dataset_of_bangladesh_path, csv_name)

df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the loaded frame (pandas default: 5 rows).
df.head()

In [None]:
# Preview the last rows of the loaded frame (pandas default: 5 rows).
df.tail()

In [None]:
# Print a concise summary of the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Descriptive statistics for the numeric columns (count, mean, std, quartiles, min/max).
df.describe()

In [None]:
# Count missing values per column.
df.isna().sum()

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Data type of each column.
df.dtypes

In [None]:
# Pairwise correlation matrix of the numeric columns.
# The method must be called (a bare `df.corr` only displays the bound method), and
# numeric_only=True is required because the frame still holds string columns here.
df.corr(numeric_only=True)

In [None]:
# List the column labels of the frame.
df.columns

## Data visualizations

In [None]:
import plotly.express as px
# Parse both timestamp columns into pandas datetimes (overwrites the columns in df).
for timestamp_col in ('Departure Date & Time', 'Arrival Date & Time'):
    df[timestamp_col] = pd.to_datetime(df[timestamp_col])

# 1. Histogram of Total Fare (BDT) with a KDE overlay, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(data=df, x='Total Fare (BDT)', bins=30, kde=True, color='blue', ax=ax)
ax.set_title('Distribution of Total Fare (BDT)')
ax.set_xlabel('Total Fare (BDT)')
ax.set_ylabel('Frequency')
plt.show()

# 2. Boxplot of Total Fare by Airline; x labels are rotated so airline names do not overlap.
fig, ax = plt.subplots(figsize=(12, 6))
sns.boxplot(data=df, x='Airline', y='Total Fare (BDT)', ax=ax)
ax.tick_params(axis='x', labelrotation=90)
ax.set_title('Flight Prices by Airline')
plt.show()

# 3. Line Plot - mean Total Fare for each value of Days Before Departure.
avg_fare_by_days = df.groupby('Days Before Departure')['Total Fare (BDT)'].mean()
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(avg_fare_by_days.index, avg_fare_by_days.values, marker='o', linestyle='-')
ax.set_title('Average Total Fare vs. Days Before Departure')
ax.set_xlabel('Days Before Departure')
ax.set_ylabel('Average Total Fare (BDT)')
ax.grid()
plt.show()

# 4. Scatter Plot - flight duration against total fare, coloured by the Stopovers value.
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=df, x='Duration (hrs)', y='Total Fare (BDT)', hue='Stopovers', ax=ax)
ax.set_title('Duration vs. Total Fare')
plt.show()

# 5. Pie Chart - share of rows per value of the Class column.
class_counts = df['Class'].value_counts()
fig, ax = plt.subplots(figsize=(6, 6))
ax.pie(
    class_counts,
    labels=class_counts.index,
    autopct='%1.1f%%',
    colors=['blue', 'green', 'red'],
)
ax.set_title('Class Distribution')
plt.show()

# 6. Heatmap - mean Total Fare for every (Seasonality, Airline) combination.
pivot_table = df.pivot_table(
    index='Seasonality',
    columns='Airline',
    values='Total Fare (BDT)',
    aggfunc='mean',
)
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(pivot_table, annot=True, fmt='.0f', cmap='coolwarm', ax=ax)
ax.set_title('Average Fare by Seasonality & Airline')
plt.show()

# 7. Map of Source-Destination Pairs with Average Fare
# Mean fare per (Source, Destination) pair, kept as ordinary columns for plotly.
flight_routes = df.groupby(['Source', 'Destination'], as_index=False)['Total Fare (BDT)'].mean()
# NOTE(review): locationmode='country names' expects country names in `Source`;
# if that column holds airport codes the markers will not be placed — confirm.
fig = px.scatter_geo(
    flight_routes,
    locations='Source',
    locationmode='country names',
    hover_name='Destination',
    size='Total Fare (BDT)',
    title='Flight Prices by Route',
)
fig.show()


## Predictive modeling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score

In [None]:
# Drop the two descriptive name columns, which are not used as model features.
# errors='ignore' keeps this cell re-runnable: on a second execution the columns
# are already gone and a plain drop would raise KeyError.
df = df.drop(columns=['Source Name', 'Destination Name'], errors='ignore')

In [None]:
# Label Encode categorical variables
# One fitted LabelEncoder per categorical column is kept in `label_encoders`
# so the integer codes can be mapped back to the original labels later.
categorical_cols = ['Airline', 'Source', 'Destination', 'Aircraft Type', 'Class', 'Booking Source', 'Seasonality', 'Stopovers']
label_encoders = {col: LabelEncoder().fit(df[col]) for col in categorical_cols}
for col, encoder in label_encoders.items():
    # Replace the column in place with its integer codes.
    df[col] = encoder.transform(df[col])

In [None]:
# Train-test split
target_col = 'Total Fare (BDT)'
# NOTE(review): the published dataset is believed to also contain the two fare
# components below, which together determine the total fare. Keeping them as
# features leaks the target and yields near-perfect but meaningless scores.
# errors='ignore' makes this a no-op if the columns are absent — confirm with df.columns.
leaky_cols = ['Base Fare (BDT)', 'Tax & Surcharge (BDT)']
X = df.drop(columns=[target_col]).drop(columns=leaky_cols, errors='ignore')
y = df[target_col]
# 80/20 split with a fixed seed so the evaluation is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Remove timestamp columns before scaling: StandardScaler cannot consume them.
# Columns already parsed to datetime64 are detected by dtype; the two known
# timestamp columns are also listed by name so this cell still works when the
# plotting cell (which performs the pd.to_datetime conversion) has not been run
# and they are still plain strings.
known_timestamp_cols = ['Departure Date & Time', 'Arrival Date & Time']
datetime_cols = X_train.select_dtypes(include=['datetime64']).columns.union(known_timestamp_cols)

# errors='ignore' tolerates names that are not present in the feature frames.
X_train = X_train.drop(columns=datetime_cols, errors='ignore')
X_test = X_test.drop(columns=datetime_cols, errors='ignore')

In [None]:
# Standardize numerical features
# The scaler is fitted on the training split only, then applied to both splits.
scaler = StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Candidate regressors keyed by display name; insertion order is the evaluation order.
models = dict([
    ('Linear Regression', LinearRegression()),
    ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('Support Vector Regressor', SVR()),
    ('K-Nearest Neighbors', KNeighborsRegressor(n_neighbors=5)),
    ('Decision Tree', DecisionTreeRegressor(random_state=42)),
])

In [None]:
# Train and evaluate models
# Each model is fitted on the training split and scored on the test split with R²
# (coefficient of determination), stored as a percentage. The dict keeps its
# original name `accuracy_results` because the plotting cell reads it, but the
# metric is R², not classification accuracy, so the printed label says so.
accuracy_results = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    r2_percent = r2_score(y_test, y_pred) * 100
    accuracy_results[name] = r2_percent
    print(f'{name} R2 score: {r2_percent:.2f}%')

In [None]:
# Display results: one bar per model, height = its stored R2 score in percent.
model_names = list(accuracy_results)
model_scores = [accuracy_results[label] for label in model_names]
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=model_names, y=model_scores, palette='coolwarm', ax=ax)
ax.tick_params(axis='x', labelrotation=45)
ax.set_ylabel('R2 Score (%)')
ax.set_title('Model Performance on Flight Price Prediction')
plt.show()

## Thank you! Please upvote if you found this notebook helpful.