In [None]:
#This section imports all of the needed libraries 
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
from scipy.stats import pearsonr, norm, ttest_ind

In [None]:
# Path of the dataset file (no directory component, so it is resolved against the current working directory)
file_path = 'Aviation _Market_Canada.csv'
# Load the CSV into the DataFrame that the analysis cells below operate on
data = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# Strip leading/trailing whitespace from every column header so columns can be
# looked up by their clean names; the result is rebound to `data` rather than
# mutating the frame in place.
data = data.rename(columns=str.strip)

In [None]:
# Read the CSV file into a separate frame and preview its first rows.
# `file_path` is reused instead of repeating the literal so this load always
# points at the same file as the one that produced `data`.
# Unlike `data`, `df` keeps the column headers exactly as they appear in the file
# (they are not whitespace-stripped).
df = pd.read_csv(file_path)
print(df.head())

In [None]:
# Print the column labels of `data` so the exact header names used by the cells below can be checked
print(data.columns)

Histogram and Descriptive Analysis:

In [None]:
#This section plots the histograms considering the 5 variables. 
def plot_histogram_and_descriptive_stats(data, column_name):
    """Plot a histogram of one column and return its descriptive statistics.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column to plot and summarise.
    column_name : str
        Label of the column in ``data``.

    Returns
    -------
    dict
        Keys ``"Mean"``, ``"Mode"``, ``"Standard Deviation"``, ``"Min"`` and
        ``"Max"``. ``"Mode"`` is the first entry returned by ``Series.mode()``,
        or ``None`` when ``mode()`` returns nothing (e.g. a column with no
        non-missing values).
    """
    # Select the column once instead of re-indexing the frame for every statistic.
    series = data[column_name]

    # Histogram of the non-missing values.
    plt.figure(figsize=(8, 5))
    plt.hist(series.dropna(), bins=15, edgecolor='black')
    plt.title(f'Histogram of {column_name}')
    plt.xlabel(column_name)
    plt.ylabel('Frequency')
    plt.show()

    # mode() scans the whole column, so evaluate it a single time and reuse the result.
    modes = series.mode()
    mode_value = None if modes.empty else modes[0]

    # Descriptive statistics for the column.
    return {
        "Mean": series.mean(),
        "Mode": mode_value,
        "Standard Deviation": series.std(),
        "Min": series.min(),
        "Max": series.max(),
    }

In [None]:
#Plot each variable of interest and collect its summary statistics, keyed by column name
# NOTE(review): 'Operating Airline   Capacity' contains three consecutive spaces; the header
# clean-up only strips leading/trailing whitespace, so confirm this label matches the CSV exactly.
columns_to_describe = ["Month", "Operating Airline   Capacity", "DepCount", "Distance (km)", "Seats per Operation"]
descriptive_stats = {
    column: plot_histogram_and_descriptive_stats(data, column)
    for column in columns_to_describe
}

Probability Mass Function

In [None]:
# Split the flights at the median of 'Seats per Operation':
# strictly above the median -> high-seat group, at or below the median -> low-seat group.
seats_per_operation = data['Seats per Operation']
median_seats = seats_per_operation.median()
high_seats_flights = data.loc[seats_per_operation > median_seats]
low_seats_flights = data.loc[seats_per_operation <= median_seats]

In [None]:
# This section will calculate PMF for high-seat and low-seat flights
def calculate_pmf(data, column_name):
    """Return the empirical probability mass function of one column.

    The result is a Series indexed by the distinct values of
    ``data[column_name]`` in ascending order, holding each value's relative
    frequency.
    """
    column = data[column_name]
    relative_frequencies = column.value_counts(normalize=True)
    return relative_frequencies.sort_index()
# PMF of 'Seats per Operation' within each of the two median-split groups
seats_column = 'Seats per Operation'
pmf_high_seats = calculate_pmf(high_seats_flights, seats_column)
pmf_low_seats = calculate_pmf(low_seats_flights, seats_column)

In [None]:
#Draw both PMFs on the same axes so the two groups can be compared directly
fig, ax = plt.subplots(figsize=(14, 6))
line_style = dict(marker='o', linestyle='-', alpha=0.7)
ax.plot(pmf_high_seats.index, pmf_high_seats.values, label='High Seats Flights', **line_style)
ax.plot(pmf_low_seats.index, pmf_low_seats.values, label='Low Seats Flights', **line_style)
ax.set_title('PMF of Seats per Operation: High vs Low Seats Flights')
ax.set_xlabel('Seats per Operation')
ax.set_ylabel('Probability')
ax.legend()
fig.tight_layout()
plt.show()

CDF Analysis:

In [None]:
#This section will calculate and plot the CDF for Seats per Operation
def calculate_cdf(data, column_name):
    """Return the empirical CDF of one column.

    Missing values are dropped first. Returns a pair of arrays: the remaining
    values in ascending order, and for each of them its cumulative probability
    ``rank / n`` (rank starting at 1).
    """
    observed = data[column_name].dropna().to_numpy()
    ordered = np.sort(observed)
    n_obs = len(ordered)
    ranks = np.arange(1, n_obs + 1)
    return ordered, ranks / n_obs
# Empirical CDF of 'Seats per Operation' over the whole dataset
sorted_seats, cumulative_prob_seats = calculate_cdf(data=data, column_name='Seats per Operation')

In [None]:
# Plot the empirical CDF computed above
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(sorted_seats, cumulative_prob_seats, marker='.', linestyle='-', color='b')
ax.set_title('CDF of Seats per Operation')
ax.set_xlabel('Seats per Operation')
ax.set_ylabel('Cumulative Probability')
ax.grid(True)
fig.tight_layout()
plt.show()

Analytical Distribution Fit

In [None]:
#Fit a normal distribution to the non-missing 'Seats per Operation' values
seats_data = data['Seats per Operation'].dropna()
mu, sigma = norm.fit(seats_data)

#Evaluate the fitted density on 100 evenly spaced points spanning the observed range
x_values = np.linspace(start=seats_data.min(), stop=seats_data.max(), num=100)
fitted_pdf = norm.pdf(x_values, loc=mu, scale=sigma)

In [None]:
#Overlay the fitted normal density on a density-normalised histogram of the observed values
fig, ax = plt.subplots(figsize=(10, 6))
ax.hist(seats_data, bins=30, density=True, alpha=0.6, color='g', edgecolor='black', label='Observed Data')
ax.plot(x_values, fitted_pdf, 'r--', linewidth=2, label='Fitted Normal Distribution')
ax.set_title('Seats per Operation - Fitted Normal Distribution')
ax.set_xlabel('Seats per Operation')
ax.set_ylabel('Density')
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()

Scatterplot and Correlation Analysis

In [None]:
# Scatterplot of seats per operation against the number of departures
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(data['DepCount'], data['Seats per Operation'], alpha=0.5, edgecolor='k')
ax.set_title('Seats per Operation vs DepCount')
ax.set_xlabel('DepCount (Number of Departures)')
ax.set_ylabel('Seats per Operation')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Pearson's r is defined on paired observations, so drop every row in which
# either value is missing. Dropping NaNs from each column independently can
# leave the two inputs with different lengths or pair values from different rows.
depcount_seats = data[['DepCount', 'Seats per Operation']].dropna()
seats_depcount_corr, _ = pearsonr(depcount_seats['DepCount'], depcount_seats['Seats per Operation'])
print(f"Pearson correlation between DepCount and Seats per Operation: {seats_depcount_corr}")

In [None]:
# Keep only the rows in which both Distance and Seats per Operation are present
distance_seats_columns = ['Distance (km)', 'Seats per Operation']
filtered_data = data[distance_seats_columns].dropna()

In [None]:
# Scatterplot of seats per operation against flight distance
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(data['Distance (km)'], data['Seats per Operation'], alpha=0.5, edgecolor='k')
ax.set_title('Seats per Operation vs Distance')
ax.set_xlabel('Distance (km)')
ax.set_ylabel('Seats per Operation')
ax.grid(True)
fig.tight_layout()
plt.show()

In [None]:
# Pearson correlation on the jointly filtered rows, so every distance is paired with its own seat count
distance_km = filtered_data['Distance (km)']
seats_per_op = filtered_data['Seats per Operation']
seats_distance_corr, _ = pearsonr(distance_km, seats_per_op)
print(f"Pearson correlation between Distance and Seats per Operation: {seats_distance_corr}")

Hypothesis Testing 

In [None]:
#Two-sample t-test: does mean 'Seats per Operation' differ between flights above the median DepCount and flights at or below it?
dep_median = data['DepCount'].median()
high_dep_flights = data[data['DepCount'] > dep_median]
low_dep_flights = data[data['DepCount'] <= dep_median]

# Missing seat values are removed from each group before the test
high_dep_seats = high_dep_flights['Seats per Operation'].dropna()
low_dep_seats = low_dep_flights['Seats per Operation'].dropna()
t_stat, p_value = ttest_ind(high_dep_seats, low_dep_seats)
print(f"Hypothesis Test - T-statistic: {t_stat}, P-value: {p_value}")

In [None]:
# Regress 'Seats per Operation' on 'DepCount'.
# Keep only the rows in which both the predictor and the response are present,
# so the two inputs have the same length and stay paired row-for-row. Dropping
# NaNs from each column independently can leave them mismatched.
regression_data = data[['DepCount', 'Seats per Operation']].dropna()
X = regression_data['DepCount']
y = regression_data['Seats per Operation']

#This is the constant for the intercept
X = sm.add_constant(X)

model = sm.OLS(y, X).fit()
print(model.summary())