In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset
file_path = "M1_final.csv"

df = pd.read_csv(r"C:\Dev\GIT\DataScience_Bootcamp\Week4\M1_final.csv")

# Display basic information about the dataset
df.info(), df.head()


In [10]:
# Check for missing values
missing_values = df.isnull().sum()
missing_values[missing_values > 0]


Series([], dtype: int64)

The Wind column has 2 missing values, due to the dataset's size, the two missing values in the Wind column are considered insignificant.
I will fill these with the most frequent values in this case (the mode)

In [11]:
# Fill missing values in 'Wind' with the most frequent value (mode)
df['Wind'].fillna(df['Wind'].mode()[0], inplace=True)

# Verify if missing values are handled
df.isnull().sum().sum()  # Should be 0 if all missing values are addressed


0

Check for data inconsistencies, convert categorical variables where needed, and analyze correlations

In [None]:
# Convert relevant categorical columns to appropriate types
df['Dew Point'] = pd.to_numeric(df['Dew Point'], errors='coerce')  # Convert to numeric
df['Wind Speed'] = pd.to_numeric(df['Wind Speed'], errors='coerce')  # Convert to numeric
df['Wind Gust'] = pd.to_numeric(df['Wind Gust'], errors='coerce')  # Convert to numeric

# Check for inconsistencies after conversion
df.info()


In [None]:
#check for missing values after conversion
df.isnull().sum()

This showed that Dew Point had 1,725 missing values. Fill missing value with median of column


In [17]:
df['Dew Point'].fillna(df['Dew Point'].median(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Dew Point'].fillna(df['Dew Point'].median(), inplace=True)


In [None]:
# 1. Summary statistics of numerical features
num_summary = df.describe()

# 2. Distribution of the target variable (TAXI_OUT)
plt.figure(figsize=(8, 5))
sns.histplot(df['TAXI_OUT'], bins=30, kde=True, color='blue')
plt.title("Distribution of TAXI_OUT")
plt.xlabel("Taxi-Out Time (minutes)")
plt.ylabel("Frequency")
plt.show()

# 3. Box plots of Taxi-Out time by Day of Week
plt.figure(figsize=(8, 5))
sns.boxplot(x=df['DAY_OF_WEEK'], y=df['TAXI_OUT'], palette="coolwarm")
plt.title("Taxi-Out Time by Day of the Week")
plt.xlabel("Day of Week (1=Monday, 7=Sunday)")
plt.ylabel("Taxi-Out Time (minutes)")
plt.show()

# 4. Scatter plot of Departure Delay vs. Taxi-Out Time
plt.figure(figsize=(8, 5))
sns.scatterplot(x=df['DEP_DELAY'], y=df['TAXI_OUT'], alpha=0.5, color='red')
plt.title("Departure Delay vs. Taxi-Out Time")
plt.xlabel("Departure Delay (minutes)")
plt.ylabel("Taxi-Out Time (minutes)")
plt.show()

# 5. Boxplot of Taxi-Out Time by Month
plt.figure(figsize=(10, 6))
sns.boxplot(x=df['MONTH'], y=df['TAXI_OUT'], palette="viridis")
plt.title("Taxi-Out Time by Month")
plt.xlabel("Month")
plt.ylabel("Taxi-Out Time (minutes)")
plt.show()

# Display summary statistics
import ace_tools as tools
tools.display_dataframe_to_user(name="Summary Statistics", dataframe=num_summary)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Select only numeric columns
numeric_df = df.select_dtypes(include=['float64', 'int64'])

# Compute the correlation matrix
corr_matrix = numeric_df.corr()

# Plot the heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", linewidths=0.5)
plt.title("Correlation Matrix of Numerical Features")
plt.show()


In [20]:
# Encode categorical variables using one-hot encoding for relevant features
df_encoded = pd.get_dummies(df, columns=['OP_UNIQUE_CARRIER', 'DEST', 'Condition'], drop_first=True)

# Drop non-essential columns that do not contribute to prediction
df_encoded.drop(columns=['TAIL_NUM', 'Wind'], inplace=True)

# Display the updated dataset structure
df_encoded.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 28820 entries, 0 to 28819
Columns: 114 entries, MONTH to Condition_Wintry Mix / Windy
dtypes: bool(96), float64(2), int64(16)
memory usage: 6.6 MB


In [21]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Define features and target variable
X = df_encoded.drop(columns=['TAXI_OUT'])
y = df_encoded['TAXI_OUT']

# Split into training and test sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model performance
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5
r2 = r2_score(y_test, y_pred)

mae, rmse, r2


(5.085751902053599, 6.388169166486702, 0.13421742880847676)