In [9]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input mount recursively and echo the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/teco-psda-exercisesheet1-credit-2024/credit_test.csv
/kaggle/input/teco-psda-exercisesheet1-credit-2024/credit_test_sample.csv
/kaggle/input/teco-psda-exercisesheet1-credit-2024/credit_train.csv


In [3]:
# Read the training CSV from the competition's input directory.
data_path = '/kaggle/input/teco-psda-exercisesheet1-credit-2024/credit_train.csv'
data = pd.read_csv(filepath_or_buffer=data_path)

# Preview the leading rows; as the last expression this is rendered as the cell output.
data.head(n=5)

Unnamed: 0.1,Unnamed: 0,Income,Limit,Rating,Cards,Age,Education,Gender,Student,Married,Ethnicity,Balance
0,1,14.891,3606,283,2,34,11,Male,No,Yes,Caucasian,333
1,3,104.593,7075,514,4,71,11,Male,No,No,Asian,580
2,4,148.924,9504,681,3,36,11,Female,No,No,Asian,964
3,5,55.882,4897,357,2,68,16,Male,No,Yes,Caucasian,331
4,7,20.996,3388,259,2,37,12,Female,No,No,African American,203


In [4]:
def remove_outliers(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1/Q3
    are the 25th/75th percentiles of ``df[column]`` and ``IQR = Q3 - Q1``.
    Both fences are inclusive. The input frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter.
    column : str
        Name of the numeric column used to detect outliers.
    factor : float, default 1.5
        Multiplier applied to the IQR when building the fences. The default
        reproduces the previously hard-coded 1.5 rule.

    Returns
    -------
    pandas.DataFrame
        The subset of rows whose value in ``column`` is within the fences;
        the original index labels are kept.
    """
    values = df[column]
    q1, q3 = values.quantile(0.25), values.quantile(0.75)
    spread = factor * (q3 - q1)

    # `between` is inclusive on both ends by default, matching `>=` / `<=`.
    return df[values.between(q1 - spread, q3 + spread)]

# Numeric columns screened for outliers.
columns_to_check = ['Income', 'Limit', 'Rating']

# Start from a copy so `data` stays untouched. The filters are applied one
# after another, so each column's quartiles are computed on the rows that
# survived the previous columns' filters.
filtered_data = data.copy()
for column in columns_to_check:
    filtered_data = remove_outliers(filtered_data, column)

# Show the (rows, columns) shape before and after filtering.
original_shape, filtered_shape = data.shape, filtered_data.shape
original_shape, filtered_shape

((350, 12), (321, 12))

In [11]:
# One-hot encode the categorical columns. drop_first=True omits the first level
# of each column, so that level becomes the implicit baseline.
train_data_encoded = pd.get_dummies(
    filtered_data,
    columns=['Student', 'Married', 'Gender', 'Ethnicity'],
    drop_first=True,
)

# Features are every column except the target.
# NOTE(review): X still contains 'Unnamed: 0.1' and 'Unnamed: 0', which look like
# row identifiers rather than predictors. Confirm they are intended as features.
X = train_data_encoded.drop(columns=['Balance'])
y = train_data_encoded['Balance']

# Hold out 20% of the rows for validation; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
from sklearn.metrics import mean_absolute_error
import xgboost as xgb

# Hyper-parameters passed to the booster (squared-error regression, depth-3 trees).
params = {
    'objective': 'reg:squarederror',
    'max_depth': 3,
    'gamma': 0.2,
    'alpha': 10,
    'learning_rate': 0.1,
    'colsample_bytree': 0.9,
    'subsample': 0.8,
}

# Wrap the train / hold-out splits in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

# Fit 100 boosting rounds on the training split, then score the hold-out split.
model_xgb = xgb.train(params=params, dtrain=dtrain, num_boost_round=100)
predictions_xgb = model_xgb.predict(dtest)

# Mean absolute error on the hold-out split.
mae = mean_absolute_error(y_true=y_test, y_pred=predictions_xgb)
print("MAE: ", mae)

MAE:  59.763629630895764


In [13]:
# Load the rows to be scored for the submission.
test_values = pd.read_csv('/kaggle/input/teco-psda-exercisesheet1-credit-2024/credit_test.csv')

# Encode every category level present in the test file (no drop_first here), then
# align to the exact training feature columns. Encoding the test set on its own
# with drop_first=True would silently shift the baseline or change the column
# set whenever a level seen in training is absent from the test file. After the
# reindex, baseline or unseen levels become all-zero dummies, training columns
# missing from the test file are added as 0, and column order matches training.
test_data_encoded_corrected = (
    pd.get_dummies(test_values, columns=['Student', 'Married', 'Gender', 'Ethnicity'])
    .reindex(columns=X_train.columns, fill_value=0)
)

best_model = model_xgb
# `dtest` is rebound here: from this point on it holds the submission matrix.
dtest = xgb.DMatrix(test_data_encoded_corrected)
test_predictions = best_model.predict(dtest)
# NOTE(review): the recorded output contains a few negative predictions. If the
# target cannot be negative, consider clipping at 0 before submitting. Confirm.
print(test_predictions)

# Submission file: one row per test record, keyed by the 'Unnamed: 0' column.
results_df = pd.DataFrame({
    'Id': test_values['Unnamed: 0'],
    'Expected': test_predictions
})

results_df.to_csv('predictions_xgb.csv', index=False)

[ 828.8682    1054.2678      58.299007   918.08044    969.67804
  976.4312       5.2253566  282.83356   1089.2023     983.7004
  905.5854     430.02924    543.58887     -7.504702    52.00324
  112.85096    151.78253     33.223583   746.5915     398.58368
  769.03876     34.118305   528.55804   1046.0884    1050.1025
  933.0319       6.314094   162.63896    184.48509    469.87396
  184.54236    732.54083    161.88261     58.50698   1081.1266
  544.3246    1242.4419      30.636057     4.6072335  310.3351
   23.845478  1027.0413     879.462        6.2956276 1044.65
 1044.65       -35.754215   966.1698     380.78302      9.822888 ]
