In [33]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [None]:
# Notebook-wide settings: the random seed and the data locations.
SEED = 42  # reused for the train/validation split and the classifier
DIR = "kaggle/input/apl-2025-spring-smoker-status"  # folder read for train.csv / test.csv
OUT_DIR = "kaggle/output/"  # output folder (not referenced by the visible cells)

## Read the Data

In [25]:
# Both CSVs are parsed the same way: comma-separated, header row skipped.
# np.genfromtxt returns float arrays by default.
csv_opts = dict(delimiter=',', skip_header=1)
traindata = np.genfromtxt(f"{DIR}/train.csv", **csv_opts)
testdata = np.genfromtxt(f"{DIR}/test.csv", **csv_opts)

In [26]:
# Hold out 20% of the labelled rows for validation; the fixed seed keeps the
# split repeatable across runs.
trainset, valset = train_test_split(
    traindata,
    test_size=0.2,
    random_state=SEED,
)

In [27]:
# The last column is the label; every column before it is used as a feature.
# NOTE(review): column 0 appears to be a row id (it is read as the id for the
# submission further down) and is passed to the model as a feature here -
# confirm whether that is intended.
X_train, y_train = trainset[:, :-1], trainset[:, -1]
X_val, y_val = valset[:, :-1], valset[:, -1]

# The test file is used as-is (all columns) as the feature matrix.
X_test = testdata

## Build a Model/Pipeline

In [28]:
# Two-stage pipeline: standardize the features, then fit a logistic regression.
scaler_step = ('scaler', StandardScaler())
classifier_step = (
    'classifier',
    LogisticRegression(max_iter=1000, random_state=SEED),
)
pipe = Pipeline([scaler_step, classifier_step])

### Train

In [30]:
# Fit the scaler and the classifier on the training split only; the fitted
# pipeline is the cell's last expression, so its repr is displayed.
pipe.fit(X=X_train, y=y_train)

Pipeline(steps=[('scaler', StandardScaler()),
                ('classifier',
                 LogisticRegression(max_iter=1000, random_state=42))])

### Evaluate on Validation Set

In [34]:
# Predict on the held-out rows and compare against their true labels.
val_preds = pipe.predict(X_val)
val_acc = accuracy_score(y_true=y_val, y_pred=val_preds)

In [35]:
# val_acc is a fraction in [0, 1]; show it as a percentage with four decimals.
print(f"Validation Accuracy: {val_acc * 100:.4f}%")

Validation Accuracy: 79.3667%


## Make Predictions on Test Set

In [37]:
# Run the fitted pipeline on the full test matrix (all columns of the test CSV).
test_preds = pipe.predict(X=X_test)

### Build the Submission File

In [39]:
# Column 0 of the test matrix is taken as the row id; it was parsed as float,
# so cast it to int for the submission.
id_column = X_test[:, 0]
test_ids = id_column.astype(int)

In [None]:
# Build the two-column submission table (id, predicted label).
# The predictions are floats because the labels were parsed by np.genfromtxt;
# cast them to int so column_stack does not promote the ids to float.
sub_data = np.column_stack((test_ids, test_preds.astype(int)))

# fmt='%d' is required: np.savetxt defaults to '%.18e', which would write both
# the ids and the labels in scientific notation (e.g. 1.590000000000000000e+05).
# comments='' keeps the header line free of the default '# ' prefix.
np.savetxt('smoking.csv', sub_data, fmt='%d', delimiter=',', header='id,smoking', comments='')