<a href="https://colab.research.google.com/github/dreamingv-oid/CS290/blob/main/Project2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import matplotlib.dates as mdates
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix




In [None]:
def Attribute_selection_method(df, target_var, criterion):
    """Return the feature a decision tree considers most important.

    The target column is separated from the remaining columns, object- and
    category-typed feature columns are one-hot encoded with
    ``pd.get_dummies``, and a ``DecisionTreeClassifier`` using the requested
    split criterion is fit on the whole frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Data containing both the features and the target column.
    target_var : str
        Name of the target column in ``df``.
    criterion : str
        Either ``"entropy"`` or ``"gini"``.

    Returns
    -------
    The column label with the largest feature importance. The label is taken
    from the encoded frame, so for an encoded categorical feature it is the
    dummy column's label rather than the original column name. If
    ``criterion`` is not one of the two supported values, the string
    ``"Invalid criterion"`` is returned instead.
    """
    supported_criteria = ("entropy", "gini")

    # Separate predictors from the column being predicted
    features = df.drop(columns=target_var)
    labels = df[target_var]

    # One-hot encode text-like columns so the tree receives numeric input
    text_like_cols = features.select_dtypes(include=['object', 'category']).columns
    if not text_like_cols.empty:
        features = pd.get_dummies(features, columns=text_like_cols)

    # Anything other than the two supported criteria is reported, not raised
    if criterion not in supported_criteria:
        return "Invalid criterion"

    tree = DecisionTreeClassifier(criterion=criterion)
    tree.fit(features, labels)

    # Position of the largest importance maps directly onto the column index
    top_position = np.argmax(tree.feature_importances_)
    return features.columns[top_position]

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Step 1: Data Preprocessing with One-Hot Encoding
def preprocess_data(df):
    """Split a frame into features and a label-encoded ``satisfaction`` target.

    Parameters
    ----------
    df : pandas.DataFrame
        Data that contains a ``satisfaction`` column.

    Returns
    -------
    tuple
        ``(X, y, categorical_cols, target_encoder)`` where ``X`` is ``df``
        without the ``satisfaction`` column, ``y`` is the target encoded by
        a ``LabelEncoder``, ``categorical_cols`` is the list of
        object-typed column names in ``X``, and ``target_encoder`` is the
        fitted encoder (its ``classes_`` map the codes back to labels).
    """
    # Everything except the target is a feature
    features = df.drop(columns='satisfaction')

    # Object-typed columns are the ones that will need one-hot encoding later
    object_columns = features.select_dtypes(include='object').columns.tolist()

    # Encode the target labels as integer codes
    target_encoder = LabelEncoder()
    encoded_target = target_encoder.fit_transform(df['satisfaction'])

    return features, encoded_target, object_columns, target_encoder

# Read the training CSV from the course repository (raw.githubusercontent.com
# serves the file contents directly) and discard rows with missing values
df = pd.read_csv('https://raw.githubusercontent.com/dreamingv-oid/CS290/main/train.csv').dropna()

# Features, encoded target, categorical column names and the fitted target encoder
X, y, categorical_cols, target_encoder = preprocess_data(df)

# Hold out 20% of the rows for the final evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One-hot encode the categorical columns; every other column passes through unchanged
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)],
    remainder='passthrough',
)

# Chain encoding and the decision tree so both are fit together on each fold
clf = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42)),
])

# 5-fold cross-validation on the training portion only
cv_scores = cross_val_score(clf, X_train, y_train, cv=5)
print(f"Cross-Validation Accuracy: {cv_scores.mean():.4f}")

# Fit on the full training split, then score the held-out rows
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=target_encoder.classes_))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

# Last expression of the cell: the most important attribute under the gini criterion
Attribute_selection_method(df,'satisfaction','gini')


Cross-Validation Accuracy: 0.9430
Accuracy Score: 0.9473912833630966

Classification Report:
                          precision    recall  f1-score   support

neutral or dissatisfied       0.95      0.95      0.95     11655
              satisfied       0.94      0.94      0.94      9064

               accuracy                           0.95     20719
              macro avg       0.95      0.95      0.95     20719
           weighted avg       0.95      0.95      0.95     20719


Confusion Matrix:
 [[11107   548]
 [  542  8522]]


'Online boarding'

Project 2 Task 1

In [None]:
import pandas as pd
import numpy as np

def naive_bayes_classifier(X, y, new_instance):
    """Predict the class of one instance with a categorical naive Bayes model.

    Every feature is treated as categorical: a likelihood is the relative
    frequency of the exact value within a class, with add-one smoothing over
    the number of distinct values the feature takes in ``X``. Scores are
    accumulated as log probabilities.

    Only the features named in ``new_instance`` are inspected, and the rows
    of each class are selected once per class, so columns that are not part
    of the query cost nothing.

    Parameters
    ----------
    X : pandas.DataFrame
        Training features, one row per sample.
    y : array-like
        Training labels, aligned with the rows of ``X`` by position (both
        are re-indexed from zero before use).
    new_instance : dict
        Mapping of feature name to observed value. Columns of ``X`` that do
        not appear in the mapping are ignored.

    Returns
    -------
    The class label with the highest log-posterior. On a tie, the class
    that comes first in ``y.value_counts()`` order wins.

    Raises
    ------
    ValueError
        If ``y`` contains no labels.
    KeyError
        If ``new_instance`` names a feature that is not a column of ``X``.
    """
    # Align by position regardless of the indices the caller passed in
    y = pd.Series(y).reset_index(drop=True)
    X = X.reset_index(drop=True)

    class_counts = y.value_counts().to_dict()
    if not class_counts:
        raise ValueError("Cannot classify: no training labels were provided")

    unknown_features = [name for name in new_instance if name not in X.columns]
    if unknown_features:
        raise KeyError(f"Features not present in the training data: {unknown_features}")

    total_count = len(y)

    # Distinct-value count per queried feature: the smoothing term in the denominator
    n_distinct = {feature: len(X[feature].unique()) for feature in new_instance}

    posteriors = {}
    for cls, cls_count in class_counts.items():
        # Rows of this class, selected once and reused for every feature
        X_class = X[y == cls]

        # Start from the log prior, then add one log-likelihood per feature
        score = np.log(cls_count / total_count)
        for feature, value in new_instance.items():
            # A value never seen in this class (or never seen at all) counts as 0,
            # so smoothing alone determines its probability
            seen = X_class[feature].value_counts().to_dict().get(value, 0)
            score += np.log((seen + 1) / (cls_count + n_distinct[feature]))

        posteriors[cls] = score

    # Predict the class with the highest posterior probability
    return max(posteriors, key=posteriors.get)


# Example: classify a single hypothetical passenger.
# X and y are expected to be the outputs of preprocess_data from the earlier cell.
new_instance = {
    'Gender': 'Female',
    'Customer Type': 'Loyal Customer',
    'Age': 35,
    'Class': 'Business',
}
predicted_class = naive_bayes_classifier(X, y, new_instance)
print("Predicted Class:", predicted_class)


Predicted Class: 1


Task 2
I use the same dataset here, but train a regression model with `Arrival Delay in Minutes` as the target variable.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

# Load the airline dataset and drop rows with missing values for simplicity
url = 'https://raw.githubusercontent.com/dreamingv-oid/CS290/main/train.csv'
df = pd.read_csv(url).dropna()

# Regression target; every other column is used as a feature
target_var = 'Arrival Delay in Minutes'
X = df.drop(columns=[target_var])
y = df[target_var]

# Identify categorical columns
categorical_cols = X.select_dtypes(include=['object']).columns

# One-hot encode categorical features; remaining columns pass through unchanged
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ], remainder='passthrough'
)

# Split first, then fit the encoder on the training rows only, so nothing about
# the held-out rows influences preprocessing. handle_unknown='ignore' covers any
# category that appears only in the test rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Initialize and train the DecisionTreeRegressor
regressor = DecisionTreeRegressor(random_state=42)
regressor.fit(X_train, y_train)

# Make predictions on the test set
y_pred = regressor.predict(X_test)

# Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Output the performance metrics
print("Performance of DecisionTreeRegressor on the Airline Dataset:")
print(f"Mean Squared Error (MSE): {mse:.2f}")
print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"R-squared (R²): {r2:.2f}")


Performance of DecisionTreeRegressor on the Airline Dataset:
Mean Squared Error (MSE): 212.42
Mean Absolute Error (MAE): 7.62
R-squared (R²): 0.86


Task 3

In [2]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
import pandas as pd

# Fetch the airline data and discard incomplete rows
url = 'https://raw.githubusercontent.com/dreamingv-oid/CS290/main/train.csv'
df = pd.read_csv(url).dropna()

# The satisfaction label is the target; all remaining columns are features
y = df['satisfaction']
X = df.drop(columns=['satisfaction'])

# Hold out 20% of the rows for the final evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One-hot encode the object-typed columns; every other column passes through unchanged
categorical_cols = X.select_dtypes(include=['object']).columns
preprocessor = ColumnTransformer(
    transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)],
    remainder='passthrough',
)

# Encoding and classifier are chained so each CV fold fits its own encoder
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=42)),
])

# Candidate tree settings; the `classifier__` prefix addresses the pipeline step
param_grid = {
    'classifier__max_depth': [5, 10, 15, None],
    'classifier__min_samples_split': [2, 10, 20],
    'classifier__min_samples_leaf': [1, 5, 10]
}

# Exhaustive 5-fold search over the grid, scored by accuracy, using all cores
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=1,
)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)

# Use the search's best estimator to predict the held-out rows
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Report accuracy and per-class metrics (display names are supplied by hand)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy of Tuned Model:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=['dissatisfied', 'satisfied']))


Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Parameters: {'classifier__max_depth': 15, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 20}
Accuracy of Tuned Model: 0.9554515179304021

Classification Report:
               precision    recall  f1-score   support

dissatisfied       0.95      0.97      0.96     11655
   satisfied       0.97      0.93      0.95      9064

    accuracy                           0.96     20719
   macro avg       0.96      0.95      0.95     20719
weighted avg       0.96      0.96      0.96     20719



Task 4

In [5]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline

# Build a dedicated random-forest pipeline. The Task 3 `pipeline` wraps a
# DecisionTreeClassifier, so fitting it here would not evaluate a random forest.
# `preprocessor`, `X_train`, `X_test`, `y_train` and `y_test` come from the Task 3 cell.
rf_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(random_state=42))  # seeded for reproducible results
])

# Train the RandomForestClassifier
rf_pipeline.fit(X_train, y_train)

# Make predictions on the test set
y_pred = rf_pipeline.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print("RandomForestClassifier Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=['dissatisfied', 'satisfied']))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


RandomForestClassifier Accuracy: 0.9649596988271635

Classification Report:
               precision    recall  f1-score   support

dissatisfied       0.96      0.98      0.97     11655
   satisfied       0.98      0.94      0.96      9064

    accuracy                           0.96     20719
   macro avg       0.97      0.96      0.96     20719
weighted avg       0.97      0.96      0.96     20719


Confusion Matrix:
 [[11451   204]
 [  522  8542]]
