In [13]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [14]:
IN_DIR = 'kaggle/input/apl-2025-spring-smoker-status'
OUT_DIR = 'kaggle/output'
SEED = 42

## Read the Data

In [15]:
traindata = np.genfromtxt(f"{IN_DIR}/train.csv", delimiter=',', skip_header=1)
testdata = np.genfromtxt(f"{IN_DIR}/test.csv", delimiter=',', skip_header=1)

In [16]:
trainset, valset = train_test_split(traindata, test_size=0.2, random_state=SEED)

In [17]:
X_train, y_train, X_val, y_val = trainset[:, :-1], trainset[:, -1], valset[:, :-1], valset[:, -1]
X_test = testdata

## Build a Model/Pipline

In [18]:
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(max_iter=1_000, random_state=SEED))
])

### Train

In [19]:
pipe.fit(X_train, y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier',
                 LogisticRegression(max_iter=1000, random_state=42))])

### Evaluate on Validation Set

In [20]:
val_preds = pipe.predict(X_val)
val_acc = accuracy_score(y_val, val_preds)

In [21]:
print(f"Validation Accuracy: {val_acc * 100:.4f}%")

Validation Accuracy: 79.3667%


## Make Predictions on Test Set

In [22]:
test_preds = pipe.predict(X_test).astype(int)

### Build the Submission File

In [23]:
test_ids = X_test[:, 0].astype(int)

In [24]:
sub_data = np.column_stack((test_ids, test_preds))

np.savetxt(f"{OUT_DIR}/smoking.csv", sub_data, delimiter=',', header='id,smoking', comments='', fmt=['%d', '%d'])