In [2]:
import pprint

import numpy as np
import pandas as pd
# Read the raw flight-delay table and keep an array view of it so that
# columns can be picked by position.
dataframe = pd.read_csv(r'/content/Flight_delay.csv')
data_array = dataframe.to_numpy()
# Working frame for the ID3 cell: the first 100 rows of four columns, each
# selected by its positional index in the CSV and given an explicit name.
# NOTE(review): the name/position pairing assumes the CSV column order is
# fixed -- confirm against the file header.
data = pd.DataFrame({
    column_name: data_array[:100, position]
    for column_name, position in (
        ('AirTime', 11),
        ('TaxiIn', 19),
        ('TaxiOut', 20),
        ('CarrierDelay', 24),
    )
})
# Function to calculate entropy
def entropy(class_labels):
  """Return the Shannon entropy (in bits) of the values in `class_labels`.

  Every distinct value reported by ``np.unique`` counts as one class, and its
  probability is its number of occurrences divided by ``len(class_labels)``.
  """
  occurrences = np.unique(class_labels, return_counts=True)[1]
  probabilities = occurrences / len(class_labels)
  # H = -sum(p * log2(p)) over the observed classes.
  return -np.sum(probabilities * np.log2(probabilities))
# Function to calculate information gain
def information_gain(data, attribute, class_label):
  """Return the entropy reduction obtained by splitting `data` on `attribute`.

  Args:
    data: DataFrame holding both the attribute column and the class column.
    attribute: Name of the column whose distinct values define the split.
    class_label: Name of the column holding the class values.

  Returns:
    Entropy of the class column over all rows, minus the size-weighted sum of
    the class-column entropies of the rows matching each distinct attribute
    value.
  """
  labels = data[class_label]
  attribute_column = data[attribute]
  n_rows = len(data)
  remainder = 0
  for value in attribute_column.unique():
    # Class values of the rows whose attribute equals this value.
    branch_labels = labels[attribute_column == value]
    remainder += (len(branch_labels) / n_rows) * entropy(branch_labels)
  return entropy(labels) - remainder
# ID3 algorithm for attribute selection
def id3(data, class_label, attributes):
  """Recursively build an ID3 decision tree as nested dictionaries.

  Args:
    data: DataFrame containing the attribute columns and the class column.
    class_label: Name of the class column.
    attributes: Names of the attribute columns still available for splitting.

  Returns:
    Either a single class value (a leaf) or a dict of the form
    ``{attribute: {attribute_value: subtree_or_leaf, ...}}``.
  """
  if len(attributes) == 0:
    # If no attributes are left, return the majority class
    return data[class_label].mode().iloc[0]
  unique_classes = data[class_label].unique()
  if len(unique_classes) == 1:
    # If all examples have the same class, return that class
    return unique_classes[0]
  # Split on the attribute with the highest information gain; on ties, max()
  # keeps the first such attribute in `attributes`.
  best_attribute = max(attributes, key=lambda attr: information_gain(data, attr, class_label))
  # The attributes left for the subtrees do not depend on the branch value, so
  # build the list once instead of on every loop iteration.
  remaining_attributes = [attr for attr in attributes if attr != best_attribute]
  branches = {}
  for value in data[best_attribute].unique():
    subset = data[data[best_attribute] == value]
    if len(subset) == 0:
      # No row compares equal to this value (presumably only possible for
      # values such as NaN -- TODO confirm); fall back to the majority class.
      branches[value] = data[class_label].mode().iloc[0]
    else:
      branches[value] = id3(subset, class_label, remaining_attributes)
  return {best_attribute: branches}
# Define the class label and attributes
class_label = 'CarrierDelay'
attributes = ['AirTime', 'TaxiIn', 'TaxiOut']
# Build the ID3 decision tree
decision_tree = id3(data, class_label, attributes)
# Print the decision tree (pprint is imported with the other imports at the
# top of the notebook rather than in the middle of the cell)
pprint.pprint(decision_tree)

{'AirTime': {32: 0,
             34: 3,
             36: {'TaxiIn': {3: 27, 4: 0, 5: 0}},
             37: 4,
             41: {'TaxiIn': {9: 61, 16: 18}},
             42: {'TaxiIn': {6: 0, 7: 16}},
             43: {'TaxiIn': {3: 114, 5: 2, 7: 0}},
             44: 13,
             45: 15,
             46: {'TaxiOut': {10: 20, 15: 9, 17: 17, 18: 1}},
             47: {'TaxiOut': {5: 7, 16: 10, 24: 9}},
             48: {'TaxiIn': {3: 4, 4: 6}},
             49: {'TaxiIn': {2: 12, 5: 2, 7: 0}},
             59: {'TaxiOut': {7: 282, 9: 0}},
             60: {'TaxiOut': {7: 26, 10: 7, 11: 15}},
             65: 50,
             72: {'TaxiIn': {5: {'TaxiOut': {9: 0}}}},
             73: 9,
             76: 2,
             77: {'TaxiOut': {8: 45, 10: 2, 13: 0, 16: 26}},
             78: {'TaxiIn': {2: 10, 4: {'TaxiOut': {10: 11, 18: 14}}, 6: 7}},
             80: 2,
             81: {'TaxiIn': {3: 15, 8: 7}},
             88: 0,
             90: 25,
             91: 3,
             95: 27

In [11]:
import pandas as pd
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load the dataset from CSV file
dataset = pd.read_csv('/content/Flight_delay.csv')

# Drop non-numeric columns
dataset = dataset.select_dtypes(include='number')

# Split the dataset into features (X) and target variable (y)
# NOTE(review): every remaining numeric column is used as a feature; confirm
# that none of them is derived from the target, otherwise the error reported
# below is optimistic.
X = dataset.drop('ActualElapsedTime', axis=1)
y = dataset['ActualElapsedTime']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create an instance of the DecisionTreeRegressor.
# The tree is seeded as well as the split: scikit-learn permutes features at
# every split, so ties between equally good splits are otherwise broken
# differently from run to run and the reported error is not reproducible.
regressor = DecisionTreeRegressor(random_state=42)

# Train the regressor on the training data
regressor.fit(X_train, y_train)

# Make predictions on the testing data
y_pred = regressor.predict(X_test)

# Calculate the mean squared error of the regressor
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

Mean Squared Error: 3.9583535408777126
