## Feature Engineering - Lab 01

Importing necessary libraries

In [None]:
import pandas as pd

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from xgboost import XGBClassifier

In [34]:
# Load the CSV files holding the train, validation and test splits.
# Each file is read from the current working directory and the resulting
# (rows, columns) shape is printed immediately as a quick sanity check.
df_train = pd.read_csv('train.csv')
print(f"Shape of the train data set: {df_train.shape}")

df_valid = pd.read_csv('valid.csv')
print(f"Shape of the validation data set: {df_valid.shape}")

# NOTE(review): the recorded output shows this frame with one column fewer
# than train/valid — presumably the target column is absent; confirm.
df_test = pd.read_csv('X_test.csv')
print(f"Shape of the test data set: {df_test.shape}")


Shape of the train data set: (517788, 145)
Shape of the validation data set: (172596, 145)
Shape of the test data set: (172596, 144)


In [None]:
# Count the missing entries in every column of the training split.
null_counts = df_train.isna().sum()
print(null_counts)

In [35]:
# Drop every column whose share of missing values in the training split
# exceeds the threshold. The decision is made on the training data only and
# the same columns are then removed from all three splits.
threshold = 0.5
missing_ratio = df_train.isna().mean()
columns_to_drop = missing_ratio[missing_ratio > threshold].index.tolist()

frames = (df_train, df_valid, df_test)
for frame in frames:
    frame.drop(columns=columns_to_drop, inplace=True)

# Report the resulting shapes in train / validation / test order.
for frame in frames:
    print(frame.shape)

(517788, 87)
(172596, 87)
(172596, 86)


In [None]:
# Write the current state of the test split to Mid.csv, omitting the index column.
df_test.to_csv('Mid.csv', index=False)

In [36]:
# Count the distinct (non-missing) values found in each training column.
unique_counts = df_train.nunique()

# A column with a single distinct value carries no signal; collect those names.
constant_columns = [name for name, count in unique_counts.items() if count == 1]

# Remove the constant columns from all three splits.
frames = (df_train, df_valid, df_test)
for frame in frames:
    frame.drop(columns=constant_columns, inplace=True)

# Report the resulting shapes in train / validation / test order.
for frame in frames:
    print(frame.shape)

(517788, 82)
(172596, 82)
(172596, 81)


In [37]:
# Columns stored with the object dtype in the training split are treated as
# the categorical features for the steps that follow.
categorical_cols = df_train.select_dtypes(include='object').columns
print(categorical_cols)

Index(['term', 'grade', 'sub_grade', 'emp_title', 'emp_length',
       'home_ownership', 'verification_status', 'issue_d', 'purpose', 'title',
       'zip_code', 'addr_state', 'earliest_cr_line', 'initial_list_status',
       'last_pymnt_d', 'last_credit_pull_d', 'application_type',
       'disbursement_method', 'debt_settlement_flag'],
      dtype='object')


In [38]:
# Impute missing values in the categorical columns.
# The fill value is the most frequent value of the column in the training
# split, and that same value is applied to the validation and test splits.
for col in categorical_cols:
    mode_value = df_train[col].mode().iloc[0]

    for frame in (df_train, df_valid, df_test):
        frame[col] = frame[col].fillna(mode_value)

In [40]:
# Categorical columns that are encoded with the ordinal encoder.
ordinal_cols = pd.Index([
    'term',
    'grade',
    'sub_grade',
    'emp_length',
    'home_ownership',
    'verification_status',
    'initial_list_status',
    'application_type',
    'disbursement_method',
    'debt_settlement_flag',
])

# Whatever categorical columns remain are handled as nominal features.
nominal_cols = categorical_cols.drop(ordinal_cols)

print(ordinal_cols)
print(nominal_cols)

Index(['term', 'grade', 'sub_grade', 'emp_length', 'home_ownership',
       'verification_status', 'initial_list_status', 'application_type',
       'disbursement_method', 'debt_settlement_flag'],
      dtype='object')
Index(['emp_title', 'issue_d', 'purpose', 'title', 'zip_code', 'addr_state',
       'earliest_cr_line', 'last_pymnt_d', 'last_credit_pull_d'],
      dtype='object')


In [41]:
# Perform ordinal encoding on ordinal columns
from sklearn.preprocessing import OrdinalEncoder

# Fit on the training split only so that nothing from the validation data
# leaks into preprocessing. Categories that were not seen while fitting are
# mapped to -1 instead of raising, so the validation and test splits can
# always be transformed.
# NOTE(review): with the default categories='auto' the codes follow the sorted
# order of the raw values (e.g. for 'emp_length' this is lexical, not
# chronological) — pass explicit `categories=` if a true ordering is needed.
ordinal_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
ordinal_encoder.fit(df_train[ordinal_cols])

df_train[ordinal_cols] = ordinal_encoder.transform(df_train[ordinal_cols])
df_valid[ordinal_cols] = ordinal_encoder.transform(df_valid[ordinal_cols])
df_test[ordinal_cols] = ordinal_encoder.transform(df_test[ordinal_cols])

print(df_train.shape)
print(df_valid.shape)
print(df_test.shape)

(517788, 82)
(172596, 82)
(172596, 81)


In [60]:
# Encode the nominal columns as integer codes.
#
# The code book for each column is built from the training split only: the
# sorted distinct training values receive the codes 0..n-1 (the same codes a
# LabelEncoder fitted on the training column would assign). Values in the
# validation/test splits that never occur in training are encoded as -1
# rather than raising an "unseen labels" error, which is what
# LabelEncoder.transform does for such values.
for col in nominal_cols:
    # Compute the categories before any split is overwritten with codes.
    train_categories = pd.Index(df_train[col].unique()).sort_values()

    for frame in (df_train, df_valid, df_test):
        codes = pd.Categorical(frame[col], categories=train_categories).codes
        # Categorical codes use the smallest fitting integer type; widen them
        # so every encoded column has the same int64 dtype.
        frame[col] = codes.astype('int64')

print(df_train.shape)
print(df_valid.shape)
print(df_test.shape)

(517788, 82)
(172596, 82)
(172596, 81)


In [None]:
# Separate the target variable (y) from the features (X) for the training
# and validation splits.
target_col = 'loan_status'

# Training split
y_train = df_train[target_col]
X_train = df_train.drop(target_col, axis=1)

# Validation split
y_val = df_valid[target_col]
X_val = df_valid.drop(target_col, axis=1)

In [None]:
# Train a model with the XGBoost Classifier.
# The classifier is created without arguments, i.e. with the library's
# default hyperparameters, and fitted on the training features and target.
xgb_model = XGBClassifier()
xgb_model.fit(X_train, y_train)

In [None]:
# Predict on the validation features and report the fraction of
# predictions that match the true validation labels.
y_pred = xgb_model.predict(X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print("Accuracy:", accuracy)

In [None]:
# Attach the predicted labels to the validation features as a
# 'loan_status' column and write the combined table to a CSV file.
results = pd.DataFrame(y_pred, columns=['loan_status'])
combined_data = pd.concat(objs=[X_val, results], axis=1)
combined_data.to_csv('210173T.csv', index=False)