Data Analysis

In [2]:
import os
from pathlib import Path

import pandas as pd

# Load the data
# Directory holding the competition CSVs. Set the SANTANDER_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset the original
# local directory is used, so existing runs behave exactly as before.
DATA_DIR = Path(os.environ.get(
    'SANTANDER_DATA_DIR',
    '/home/john/ai/kaggle/satander-customer-transaction-binary-prediction',
))

# Kept as plain strings so any later cell that treats them as str still works.
train_path = str(DATA_DIR / 'train.csv')
test_path = str(DATA_DIR / 'test.csv')

# Fail early with an actionable message instead of a bare path error from pandas.
for csv_path in (train_path, test_path):
    if not os.path.isfile(csv_path):
        raise FileNotFoundError(
            f"{csv_path} not found; set SANTANDER_DATA_DIR to the folder "
            "containing train.csv and test.csv"
        )

train_df = pd.read_csv(train_path)
test_df = pd.read_csv(test_path)

In [3]:
# 1. Understand the structure of the data
# Print the (rows, columns) shape of both frames first, then their column labels,
# so the two summaries stay grouped in the output.
labelled_frames = (("Train", train_df), ("Test", test_df))

for frame_label, frame in labelled_frames:
    print(f"{frame_label} DataFrame shape:", frame.shape)

for frame_label, frame in labelled_frames:
    print(f"\n{frame_label} DataFrame columns:\n", frame.columns)

Train DataFrame shape: (200000, 202)
Test DataFrame shape: (200000, 201)

Train DataFrame columns:
 Index(['ID_code', 'target', 'var_0', 'var_1', 'var_2', 'var_3', 'var_4',
       'var_5', 'var_6', 'var_7',
       ...
       'var_190', 'var_191', 'var_192', 'var_193', 'var_194', 'var_195',
       'var_196', 'var_197', 'var_198', 'var_199'],
      dtype='object', length=202)

Test DataFrame columns:
 Index(['ID_code', 'var_0', 'var_1', 'var_2', 'var_3', 'var_4', 'var_5',
       'var_6', 'var_7', 'var_8',
       ...
       'var_190', 'var_191', 'var_192', 'var_193', 'var_194', 'var_195',
       'var_196', 'var_197', 'var_198', 'var_199'],
      dtype='object', length=201)


In [4]:
# 2. Check for missing values
# Count nulls per column, order the counts from largest to smallest and show
# only the five highest entries for each frame.
train_null_counts = train_df.isna().sum()
print(
    "\nMissing values in Train DataFrame:\n",
    train_null_counts.sort_values(ascending=False).head(5),
)

test_null_counts = test_df.isna().sum()
print(
    "\nMissing values in Test DataFrame:\n",
    test_null_counts.sort_values(ascending=False).head(5),
)



Missing values in Train DataFrame:
 ID_code    0
var_136    0
var_126    0
var_127    0
var_128    0
dtype: int64

Missing values in Test DataFrame:
 ID_code    0
var_137    0
var_127    0
var_128    0
var_129    0
dtype: int64


In [5]:
# 3. Descriptive statistics
# describe() yields count, mean, std, min, quartiles and max for the numeric
# columns (the target and every var_* feature).
train_summary = train_df.describe()
print("\nDescriptive statistics for Train DataFrame:\n", train_summary)


Descriptive statistics for Train DataFrame:
               target          var_0          var_1          var_2  \
count  200000.000000  200000.000000  200000.000000  200000.000000   
mean        0.100490      10.679914      -1.627622      10.715192   
std         0.300653       3.040051       4.050044       2.640894   
min         0.000000       0.408400     -15.043400       2.117100   
25%         0.000000       8.453850      -4.740025       8.722475   
50%         0.000000      10.524750      -1.608050      10.580000   
75%         0.000000      12.758200       1.358625      12.516700   
max         1.000000      20.315000      10.376800      19.353000   

               var_3          var_4          var_5          var_6  \
count  200000.000000  200000.000000  200000.000000  200000.000000   
mean        6.796529      11.078333      -5.065317       5.408949   
std         2.043319       1.623150       7.863267       0.866607   
min        -0.040200       5.074800     -32.562600      

In [6]:
# 4. Target variable distribution
# normalize=True reports each class as a fraction of all rows rather than a count.
target_share = train_df['target'].value_counts(normalize=True)
print("\nTarget variable distribution:\n", target_share)


Target variable distribution:
 target
0    0.89951
1    0.10049
Name: proportion, dtype: float64


In [7]:
# 5. Correlation analysis (for a subset to avoid overloading)
# Positions 1..11 skip ID_code (position 0) and cover `target` plus var_0..var_9,
# so the resulting matrix is 11x11 and its first row/column is the target.
leading_columns = train_df.iloc[:, 1:12]
correlation_matrix = leading_columns.corr()
print(
    "\nCorrelation matrix for first 10 features:\n",
    correlation_matrix,
)


Correlation matrix for first 10 features:
           target     var_0     var_1     var_2     var_3     var_4     var_5  \
target  1.000000  0.052390  0.050343  0.055870  0.011055  0.010915  0.030979   
var_0   0.052390  1.000000 -0.000544  0.006573  0.003801  0.001326  0.003046   
var_1   0.050343 -0.000544  1.000000  0.003980  0.000010  0.000303 -0.000902   
var_2   0.055870  0.006573  0.003980  1.000000  0.001001  0.000723  0.001569   
var_3   0.011055  0.003801  0.000010  0.001001  1.000000 -0.000322  0.003253   
var_4   0.010915  0.001326  0.000303  0.000723 -0.000322  1.000000 -0.001368   
var_5   0.030979  0.003046 -0.000902  0.001569  0.003253 -0.001368  1.000000   
var_6   0.066731  0.006983  0.003258  0.000883 -0.000774  0.000049  0.002588   
var_7  -0.003025  0.002429  0.001511 -0.000991  0.002500  0.004549 -0.000995   
var_8   0.019584  0.004962  0.004098  0.002648  0.003553  0.001194  0.000147   
var_9  -0.042805 -0.002613 -0.000832 -0.001932 -0.000826 -0.000918 -0.005279

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate features and target: the label is `target`, the features are every
# column except the row identifier and the label itself.
y = train_df['target']
X = train_df.drop(['ID_code', 'target'], axis=1)

# Split the data into training and validation sets: 20% is held out, stratified
# on the label, with a fixed seed so the split is repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

# Feature Scaling: the scaler is fitted on the training part only and the same
# fitted statistics are then applied to both parts.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score

# Initialize the model: a 100-tree random forest with a fixed seed
model = RandomForestClassifier(random_state=42, n_estimators=100)

# Train the model on the scaled training features
model.fit(X_train_scaled, y_train)

# Predict on the validation set; column 1 of the probability matrix is the
# score for the positive class (target == 1)
val_class_proba = model.predict_proba(X_val_scaled)
y_val_pred = val_class_proba[:, 1]

# Evaluate the model using AUC-ROC on the held-out labels
roc_auc = roc_auc_score(y_true=y_val, y_score=y_val_pred)
print(f'Validation AUC-ROC Score: {roc_auc:.4f}')


Validation AUC-ROC Score: 0.8198
