In [1]:
import warnings
# Suppress every warning for the rest of the session. NOTE(review): this also
# hides deprecation/future warnings raised by the code below.
warnings.simplefilter("ignore")

In [2]:
import numpy as np
import pandas as pd

# **First, we build the tools we are going to use from scratch:**

In [3]:
#Simple Imputer
class SimpleImputer:
    """Column-wise imputer for pandas objects.

    ``fit`` learns one replacement value per column and ``transform`` writes
    it into the missing entries. Numeric (non-boolean) columns use
    ``strategy``; every other column always uses its most frequent value.

    Parameters
    ----------
    strategy : {'mean', 'median', 'most_frequent'}, default 'mean'
        Statistic used to fill numeric columns.

    Attributes
    ----------
    fill_values_ : dict
        Maps column label -> learned fill value. Rebuilt on every ``fit``.
    """

    _STRATEGIES = ('mean', 'median', 'most_frequent')

    def __init__(self, strategy='mean'):
        self.strategy = strategy
        self.fill_values_ = {}

    def fit(self, X):
        """Learn the fill value of every column of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame or pandas.Series
            Data containing the missing values.

        Raises
        ------
        ValueError
            If ``strategy`` is not one of the supported names.
        """
        if self.strategy not in self._STRATEGIES:
            raise ValueError(
                f"Unknown strategy {self.strategy!r}; expected one of {self._STRATEGIES}"
            )
        if isinstance(X, pd.Series):
            X = X.to_frame()

        # Start from a clean slate so that columns seen by a previous fit do
        # not leak into (and break) the next transform.
        self.fill_values_ = {}
        for col in X.columns:
            column = X[col]
            # Any numeric dtype counts (float32, int32, nullable ints, ...),
            # not only float64/int64. Booleans are treated as categorical.
            is_numeric = (pd.api.types.is_numeric_dtype(column)
                          and not pd.api.types.is_bool_dtype(column))
            if is_numeric and self.strategy == 'mean':
                self.fill_values_[col] = column.mean()
            elif is_numeric and self.strategy == 'median':
                self.fill_values_[col] = column.median()
            else:
                modes = column.mode()
                # An all-missing column has no mode: there is nothing to learn.
                if not modes.empty:
                    self.fill_values_[col] = modes.iloc[0]

    def transform(self, X):
        """Return a copy of ``X`` with missing values replaced.

        Parameters
        ----------
        X : pandas.DataFrame or pandas.Series
            Data with the same column labels that were seen in ``fit``.

        Returns
        -------
        pandas.DataFrame or pandas.Series
            Same kind of object as ``X``; the input itself is not modified.
        """
        is_series = isinstance(X, pd.Series)
        if is_series:
            X = X.to_frame()

        X_filled = X.copy()
        for col, fill_value in self.fill_values_.items():
            # Assign the result back instead of ``fillna(..., inplace=True)``
            # on a column selection: that chained form mutates a temporary
            # object under pandas Copy-on-Write and leaves the frame unchanged.
            X_filled[col] = X_filled[col].fillna(fill_value)

        return X_filled.iloc[:, 0] if is_series else X_filled

    def fit_transform(self, X):
        """Fit on ``X`` and return its imputed copy."""
        self.fit(X)
        return self.transform(X)

In [4]:
#Standard Scaler
class StandardScaler:
    """Standardize features to zero mean and unit (population) variance.

    Attributes
    ----------
    mean_ : per-feature mean computed by ``fit`` along axis 0.
    std_ : per-feature standard deviation computed by ``fit`` along axis 0.
    """

    def fit(self, X):
        """Compute the per-feature mean and standard deviation of ``X``."""
        self.mean_ = np.mean(X, axis=0)
        self.std_ = np.std(X, axis=0)

    def _safe_std(self):
        """Return ``std_`` with zeros replaced by 1.

        A constant feature has zero spread; dividing by it would turn the
        whole column into NaN/inf. Dividing by 1 maps it to all zeros instead.
        """
        std = self.std_
        if isinstance(std, pd.Series):
            # Keep the labels so DataFrame inputs still align by column name.
            return std.where(std != 0, 1.0)
        return np.where(np.asarray(std) == 0, 1.0, std)

    def transform(self, X):
        """Return ``(X - mean_) / std_`` using the statistics learned in ``fit``."""
        return (X - self.mean_) / self._safe_std()

    def fit_transform(self, X):
        """Fit on ``X`` and return its standardized version."""
        self.fit(X)
        return self.transform(X)


In [5]:
#Ordinal Encoder
class OrdinalEncoder:
    """Encode each column's categories as integers 0..k-1 in sorted order.

    Attributes
    ----------
    categories_ : list of arrays
        Sorted unique values of every column seen in ``fit``.
    category_dicts_ : list of dict
        Per-column mapping ``category -> integer code``.
    """

    @staticmethod
    def _as_2d(X):
        """Convert ``X`` to a 2-D NumPy array of shape (n_samples, n_features).

        Going through ``np.asarray`` makes DataFrames and nested lists behave
        like arrays; iterating a DataFrame directly would yield column labels
        rather than rows and silently encode the wrong thing.
        """
        arr = np.asarray(X)
        if arr.ndim != 2:
            raise ValueError(
                f"Expected a 2-D input of shape (n_samples, n_features), got {arr.ndim}-D"
            )
        return arr

    def fit(self, X):
        """Learn the sorted categories of every column of ``X``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D.
        """
        X = self._as_2d(X)
        self.categories_ = [np.unique(col) for col in X.T]
        self.category_dicts_ = [{cat: idx for idx, cat in enumerate(cats)} for cats in self.categories_]

    def transform(self, X):
        """Replace every value by its integer code.

        Values that were not seen during ``fit`` are encoded as -1.

        Returns
        -------
        numpy.ndarray
            Integer codes with one row per input row.
        """
        X = self._as_2d(X)
        return np.array([[self.category_dicts_[i].get(value, -1) for i, value in enumerate(row)] for row in X])

    def fit_transform(self, X):
        """Fit on ``X`` and return its encoded version."""
        self.fit(X)
        return self.transform(X)


In [6]:
#train_test_split
def train_test_split(x, y=None, test_size=0.2, random_state=None):
    """Randomly split ``x`` (and optionally ``y``) into train and test parts.

    Parameters
    ----------
    x : array-like, pandas.DataFrame or pandas.Series
        Samples, indexed along the first axis.
    y : array-like, optional
        Targets with the same number of samples as ``x``.
    test_size : float, default 0.2
        Fraction of the samples placed in the test part, between 0 and 1.
    random_state : int, optional
        Seed for a reproducible shuffle. When omitted, the global NumPy
        random state is used.

    Returns
    -------
    tuple
        ``(x_train, x_test)`` or, when ``y`` is given,
        ``(x_train, x_test, y_train, y_test)``.

    Raises
    ------
    ValueError
        If ``test_size`` is outside [0, 1] or ``x`` and ``y`` differ in length.
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

    n_samples = len(x)
    if y is not None and len(y) != n_samples:
        raise ValueError(
            f"x and y must have the same number of samples ({n_samples} != {len(y)})"
        )

    # A private RandomState yields the same permutation as seeding the global
    # generator with the same value, but does not reset the caller's global
    # random stream as a side effect.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    indices = np.arange(n_samples)
    rng.shuffle(indices)

    split_index = int(n_samples * (1 - test_size))
    train_indices = indices[:split_index]
    test_indices = indices[split_index:]

    def take(data, idx):
        # pandas objects need positional indexing; plain ``data[idx]`` would
        # be interpreted as a column/label lookup.
        if hasattr(data, "iloc"):
            return data.iloc[idx]
        return np.asarray(data)[idx]

    x_train = take(x, train_indices)
    x_test = take(x, test_indices)

    if y is not None:
        return x_train, x_test, take(y, train_indices), take(y, test_indices)

    return x_train, x_test

In [7]:
#Dummy Classifier
class DummyClassifier:
    """Baseline classifier that always predicts the most frequent training label.

    Attributes
    ----------
    most_frequent_class_ : label that occurred most often in ``fit``
        (the smallest such label when several are tied).
    """

    def fit(self, X, y):
        """Memorize the most frequent label in ``y``; ``X`` is ignored.

        Raises
        ------
        ValueError
            If ``y`` is empty.
        """
        labels = np.asarray(y)
        if labels.size == 0:
            raise ValueError("y must contain at least one label")
        # np.unique handles any sortable label type (strings, negative or
        # float values), whereas np.bincount only accepts non-negative ints.
        classes, counts = np.unique(labels, return_counts=True)
        self.most_frequent_class_ = classes[np.argmax(counts)]

    def predict(self, X):
        """Return the memorized label once per sample in ``X``."""
        return np.full(len(X), self.most_frequent_class_)

In [8]:
#Logistic Regression
class LogisticRegression:
    """Binary logistic regression trained with batch gradient descent.

    Parameters
    ----------
    learning_rate : float, default 0.01
        Step size of every gradient update.
    num_iterations : int, default 1000
        Number of full-batch gradient steps performed by ``fit``.

    Attributes
    ----------
    theta : numpy.ndarray of shape (n_features,)
        Learned weights (set by ``fit``).
    bias : float
        Learned intercept (set by ``fit``).
    """

    def __init__(self, learning_rate=0.01, num_iterations=1000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations

    def sigmoid(self, z):
        """Numerically stable logistic function ``1 / (1 + exp(-z))``.

        Only ``exp`` of non-positive numbers is evaluated, so large negative
        inputs cannot overflow.
        """
        z = np.asarray(z, dtype=float)
        e = np.exp(-np.abs(z))
        # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- the same value.
        # The trailing [()] turns a 0-d result back into a scalar.
        return np.where(z >= 0, 1.0 / (1.0 + e), e / (1.0 + e))[()]

    def fit(self, X, y):
        """Fit weights and bias on features ``X`` and 0/1 targets ``y``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of n_samples values (flattened to 1-D)

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or ``y`` has a different number of
            samples.
        """
        X = np.asarray(X, dtype=float)
        # Flatten y: an (n, 1) column would otherwise broadcast against the
        # (n,) predictions into an (n, n) matrix.
        y = np.asarray(y, dtype=float).ravel()
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (n_samples, n_features), got {X.ndim}-D")
        n_samples = X.shape[0]
        if n_samples == 0:
            raise ValueError("X must contain at least one sample")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X and y must have the same number of samples ({n_samples} != {y.shape[0]})"
            )

        self.theta = np.zeros(X.shape[1])
        self.bias = 0.0
        inv_n = 1 / n_samples

        for _ in range(self.num_iterations):
            # Residual between predicted probabilities and targets.
            error = self.sigmoid(np.dot(X, self.theta) + self.bias) - y

            # Gradients of the mean log-loss.
            dw = inv_n * np.dot(X.T, error)
            db = inv_n * np.sum(error)

            # Update parameters
            self.theta -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict_proba(self, X):
        """Return the predicted probability of class 1 for every row of ``X``."""
        linear_model = np.dot(np.asarray(X, dtype=float), self.theta) + self.bias
        return self.sigmoid(linear_model)

    def predict(self, X):
        """Return 0/1 predictions using a 0.5 probability threshold."""
        y_pred_proba = self.predict_proba(X)
        return (y_pred_proba >= 0.5).astype(int)

In [9]:
#Accuracy
def accuracy(y_true, y_pred):
    """Return the fraction of predictions equal to the true labels.

    Parameters
    ----------
    y_true, y_pred : array-like
        Labels of identical shape.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    """
    # Convert first: ``==`` on plain Python lists yields a single bool instead
    # of an element-wise comparison.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape ({y_true.shape} != {y_pred.shape})"
        )
    return np.mean(y_true == y_pred)

# **Now let's test them on a dataset:**

In [10]:
# Load the airline passenger satisfaction CSV from the Kaggle input directory.
airline = pd.read_csv("/kaggle/input/airline/airline_passenger_satisfaction.csv")
# Show the loaded frame as the cell output.
airline

Unnamed: 0,ID,Gender,Age,Customer Type,Type of Travel,Class,Flight Distance,Departure Delay,Arrival Delay,Departure and Arrival Time Convenience,...,On-board Service,Seat Comfort,Leg Room Service,Cleanliness,Food and Drink,In-flight Service,In-flight Wifi Service,In-flight Entertainment,Baggage Handling,Satisfaction
0,1,Male,48,First-time,Business,Business,821,2,5.0,3,...,3,5,2,5,5,5,3,5,5,Neutral or Dissatisfied
1,2,Female,35,Returning,Business,Business,821,26,39.0,2,...,5,4,5,5,3,5,2,5,5,Satisfied
2,3,Male,41,Returning,Business,Business,853,0,0.0,4,...,3,5,3,5,5,3,4,3,3,Satisfied
3,4,Male,50,Returning,Business,Business,1905,0,0.0,2,...,5,5,5,4,4,5,2,5,5,Satisfied
4,5,Female,49,Returning,Business,Business,3470,0,1.0,3,...,3,4,4,5,4,3,3,3,3,Satisfied
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,129876,Male,28,Returning,Personal,Economy Plus,447,2,3.0,4,...,5,1,4,4,4,5,4,4,4,Neutral or Dissatisfied
129876,129877,Male,41,Returning,Personal,Economy Plus,308,0,0.0,5,...,5,2,5,2,2,4,3,2,5,Neutral or Dissatisfied
129877,129878,Male,42,Returning,Personal,Economy Plus,337,6,14.0,5,...,3,3,4,3,3,4,2,3,5,Neutral or Dissatisfied
129878,129879,Male,50,Returning,Personal,Economy Plus,337,31,22.0,4,...,4,4,5,3,3,4,5,3,5,Satisfied


In [11]:
# Count the missing values in each column.
airline.isna().sum()

ID                                          0
Gender                                      0
Age                                         0
Customer Type                               0
Type of Travel                              0
Class                                       0
Flight Distance                             0
Departure Delay                             0
Arrival Delay                             393
Departure and Arrival Time Convenience      0
Ease of Online Booking                      0
Check-in Service                            0
Online Boarding                             0
Gate Location                               0
On-board Service                            0
Seat Comfort                                0
Leg Room Service                            0
Cleanliness                                 0
Food and Drink                              0
In-flight Service                           0
In-flight Wifi Service                      0
In-flight Entertainment           

# **Use the Simple Imputer on the Arrival Delay column to fill in the NaN values**

In [12]:
# Replace the missing 'Arrival Delay' entries with the column mean.
imputer = SimpleImputer(strategy="mean")
airline['Arrival Delay'] = imputer.fit_transform(airline['Arrival Delay'])

In [13]:
# Re-count the missing values per column after imputation.
airline.isna().sum()

ID                                        0
Gender                                    0
Age                                       0
Customer Type                             0
Type of Travel                            0
Class                                     0
Flight Distance                           0
Departure Delay                           0
Arrival Delay                             0
Departure and Arrival Time Convenience    0
Ease of Online Booking                    0
Check-in Service                          0
Online Boarding                           0
Gate Location                             0
On-board Service                          0
Seat Comfort                              0
Leg Room Service                          0
Cleanliness                               0
Food and Drink                            0
In-flight Service                         0
In-flight Wifi Service                    0
In-flight Entertainment                   0
Baggage Handling                

# **Ordinal Encoder on categorical columns**

In [14]:
#Categorical columns
cat = ['Gender', 'Customer Type', 'Type of Travel', 'Class', 'Satisfaction']
#Encode the Categorical columns
def encode_cat(df, columns=None):
    """Ordinal-encode categorical columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is modified in place and also returned.
    columns : list of str, optional
        Columns to encode. Defaults to the module-level ``cat`` list.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with each listed column replaced by integer
        codes from a freshly fitted ``OrdinalEncoder``.
    """
    if columns is None:
        columns = cat
    for col in columns:
        encoder = OrdinalEncoder()
        # The encoder returns one row per sample; flatten it so a plain 1-D
        # array is assigned to the column.
        df[col] = encoder.fit_transform(df[[col]].values).ravel()
    return df

In [15]:
# Inspect the distinct 'Gender' values before encoding.
airline["Gender"].unique()

array(['Male', 'Female'], dtype=object)

In [16]:
# Encode the categorical columns listed in `cat`, then display the frame.
airline = encode_cat(airline)
airline

Unnamed: 0,ID,Gender,Age,Customer Type,Type of Travel,Class,Flight Distance,Departure Delay,Arrival Delay,Departure and Arrival Time Convenience,...,On-board Service,Seat Comfort,Leg Room Service,Cleanliness,Food and Drink,In-flight Service,In-flight Wifi Service,In-flight Entertainment,Baggage Handling,Satisfaction
0,1,1,48,0,0,0,821,2,5.0,3,...,3,5,2,5,5,5,3,5,5,0
1,2,0,35,1,0,0,821,26,39.0,2,...,5,4,5,5,3,5,2,5,5,1
2,3,1,41,1,0,0,853,0,0.0,4,...,3,5,3,5,5,3,4,3,3,1
3,4,1,50,1,0,0,1905,0,0.0,2,...,5,5,5,4,4,5,2,5,5,1
4,5,0,49,1,0,0,3470,0,1.0,3,...,3,4,4,5,4,3,3,3,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,129876,1,28,1,1,2,447,2,3.0,4,...,5,1,4,4,4,5,4,4,4,0
129876,129877,1,41,1,1,2,308,0,0.0,5,...,5,2,5,2,2,4,3,2,5,0
129877,129878,1,42,1,1,2,337,6,14.0,5,...,3,3,4,3,3,4,2,3,5,0
129878,129879,1,50,1,1,2,337,31,22.0,4,...,4,4,5,3,3,4,5,3,5,1


# **Standard Scaler on Numerical columns**

In [17]:
# List the column labels to pick the numerical ones to scale.
airline.columns

Index(['ID', 'Gender', 'Age', 'Customer Type', 'Type of Travel', 'Class',
       'Flight Distance', 'Departure Delay', 'Arrival Delay',
       'Departure and Arrival Time Convenience', 'Ease of Online Booking',
       'Check-in Service', 'Online Boarding', 'Gate Location',
       'On-board Service', 'Seat Comfort', 'Leg Room Service', 'Cleanliness',
       'Food and Drink', 'In-flight Service', 'In-flight Wifi Service',
       'In-flight Entertainment', 'Baggage Handling', 'Satisfaction'],
      dtype='object')

In [18]:
#Numerical columns
num = ['Age', 'Flight Distance', 'Departure Delay', 'Arrival Delay']
#Scale the Numerical columns
def scaler(df, columns=None):
    """Standardize numerical columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to scale. It is modified in place and also returned.
    columns : list of str, optional
        Columns to scale. Defaults to the module-level ``num`` list.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with each listed column replaced by the
        output of a freshly fitted ``StandardScaler``.
    """
    if columns is None:
        columns = num
    for col in columns:
        # Named differently from the enclosing function to avoid shadowing it.
        column_scaler = StandardScaler()
        # The scaler returns an (n_rows, 1) array; flatten it so a plain 1-D
        # array is assigned to the column.
        df[col] = column_scaler.fit_transform(df[[col]].values).ravel()
    return df

In [19]:
# Standardize the numerical columns listed in `num`, then display the frame.
airline = scaler(airline)
airline

Unnamed: 0,ID,Gender,Age,Customer Type,Type of Travel,Class,Flight Distance,Departure Delay,Arrival Delay,Departure and Arrival Time Convenience,...,On-board Service,Seat Comfort,Leg Room Service,Cleanliness,Food and Drink,In-flight Service,In-flight Wifi Service,In-flight Entertainment,Baggage Handling,Satisfaction
0,1,1,0.566960,0,0,0,-0.370261,-0.333948,-0.262740,3,...,3,5,2,5,5,5,3,5,5,0
1,2,0,-0.292868,1,0,0,-0.370261,0.296454,0.622509,2,...,5,4,5,5,3,5,2,5,5,1
2,3,1,0.103976,1,0,0,-0.338179,-0.386481,-0.392924,4,...,3,5,3,5,5,3,4,3,3,1
3,4,1,0.699242,1,0,0,0.716512,-0.386481,-0.392924,2,...,5,5,5,4,4,5,2,5,5,1
4,5,0,0.633101,1,0,0,2.285515,-0.386481,-0.366887,3,...,3,4,4,5,4,3,3,3,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129875,129876,1,-0.755852,1,1,2,-0.745218,-0.333948,-0.314814,4,...,5,1,4,4,4,5,4,4,4,0
129876,129877,1,0.103976,1,1,2,-0.884573,-0.386481,-0.392924,5,...,5,2,5,2,2,4,3,2,5,0
129877,129878,1,0.170117,1,1,2,-0.855499,-0.228881,-0.028409,5,...,3,3,4,3,3,4,2,3,5,0
129878,129879,1,0.699242,1,1,2,-0.855499,0.427787,0.179885,4,...,4,4,5,3,3,4,5,3,5,1


# **Modeling**

In [20]:
# Features: every column except the row identifier and the label.
x = airline.drop(columns = ['ID', 'Satisfaction'], axis=1)
# Target: the encoded 'Satisfaction' column as a NumPy array.
y = airline['Satisfaction'].values

In [21]:
# Hold out 20% of the rows for testing; the seed fixes the shuffle.
x_train, x_test, y_train, y_test = train_test_split(x.values, y, test_size=0.2, random_state=4)

In [22]:
# Train the from-scratch logistic regression on the training split.
model = LogisticRegression(learning_rate=0.01, num_iterations=1000)
model.fit(x_train, y_train)

In [23]:
# Predict 0/1 labels for the held-out rows.
y_pred = model.predict(x_test)

In [24]:
# Store the score under its own name: rebinding `accuracy` would replace the
# accuracy() function with a float, so re-running this cell or scoring another
# model later would fail with "object is not callable".
model_accuracy = accuracy(y_test, y_pred)
print("Model Accuracy:", model_accuracy)

Model Accuracy: 0.8233369263935941


In [25]:
# Fit the majority-class baseline on the same training split for comparison.
model1 = DummyClassifier()
model1.fit(x_train, y_train)

In [26]:
# Baseline predictions; note this overwrites `y_pred` from the logistic model.
y_pred = model1.predict(x_test)