# Lab 2 — Transform & Feature Engineering (AWS ML Associate)
Use **SageMaker Processing (sklearn)** to clean/encode/scale and split data.

In [None]:
# Lab-wide S3 settings used by the cells below.
# Replace the placeholder bucket with a real bucket URI before running.
LAB_BUCKET = "s3://CHANGE-ME-BUCKET"
LAB_PREFIX = "ml-assoc/l2"
print(f"{LAB_BUCKET} {LAB_PREFIX}")

### Upload input CSV to S3

In [None]:
# Copy the local file /tmp/l1/customers.csv to <LAB_BUCKET>/<LAB_PREFIX>/input/customers.csv.
# NOTE(review): the /tmp/l1 path suggests the file is produced by an earlier lab — confirm it exists.
# `|| true` forces a zero exit status even when the copy fails, so read the
# command output to confirm the upload actually happened.
!aws s3 cp /tmp/l1/customers.csv {LAB_BUCKET}/{LAB_PREFIX}/input/customers.csv || true

### Create processing script

In [None]:
%%bash
cat > /tmp/processing_l2.py << 'PY'
"""Processing script for Lab 2: clean, split, scale and encode customers.csv.

Reads <input>/customers.csv, drops rows containing missing values, makes a
stratified 80/20 train/test split on the `churn` column, standardizes `age`
and `income`, one-hot encodes `state`, and writes X_train.csv, X_test.csv,
y_train.csv and y_test.csv into <output>.
"""
import argparse, os
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
def main():
  """Parse --input/--output, transform the data and write the four CSV files."""
  p=argparse.ArgumentParser()
  # Defaults match the container paths the Processing job passes explicitly,
  # so the script does not crash on os.path.join(None, ...) if a flag is omitted.
  p.add_argument('--input',default='/opt/ml/processing/input')
  p.add_argument('--output',default='/opt/ml/processing/output')
  a=p.parse_args()
  df=pd.read_csv(os.path.join(a.input,'customers.csv')).dropna()
  X=df[['age','income','state']]; y=df['churn']
  # Split BEFORE fitting any transformer: fitting the scaler/encoder on the
  # full dataset would leak test-set statistics and categories into training.
  Xtr_raw,Xte_raw,ytr,yte=train_test_split(X,y,test_size=0.2,stratify=y,random_state=42)
  scaler=StandardScaler().fit(Xtr_raw[['age','income']])
  # handle_unknown='ignore' encodes states seen only in the test split as all zeros.
  encoder=OneHotEncoder(sparse_output=False,handle_unknown='ignore').fit(Xtr_raw[['state']])
  def transform(part):
    # Column order is unchanged: scaled numerics first, then one-hot state columns.
    return np.hstack([scaler.transform(part[['age','income']]),encoder.transform(part[['state']])])
  Xtr=transform(Xtr_raw); Xte=transform(Xte_raw)
  os.makedirs(a.output,exist_ok=True)
  pd.DataFrame(Xtr).to_csv(os.path.join(a.output,'X_train.csv'),index=False)
  pd.DataFrame(Xte).to_csv(os.path.join(a.output,'X_test.csv'),index=False)
  ytr.to_csv(os.path.join(a.output,'y_train.csv'),index=False)
  yte.to_csv(os.path.join(a.output,'y_test.csv'),index=False)
if __name__=='__main__': main()
PY

### Run Processing job

In [None]:
import sagemaker, os
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.processing import ProcessingInput, ProcessingOutput
# SageMaker session, the role from get_execution_role(), and the bucket name
# with its "s3://" scheme stripped so the URIs below can be rebuilt explicitly.
sess = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = LAB_BUCKET.replace('s3://', '')

# One ml.m5.large instance running the scikit-learn 1.2-1 processing image.
proc = SKLearnProcessor(
    framework_version='1.2-1',
    role=role,
    instance_type='ml.m5.large',
    instance_count=1,
)

# The S3 input prefix is mapped to /opt/ml/processing/input and
# /opt/ml/processing/output is mapped to the S3 output prefix; the same two
# container paths are handed to the script as --input / --output.
proc.run(
    code='/tmp/processing_l2.py',
    inputs=[
        ProcessingInput(
            source=f's3://{bucket}/{LAB_PREFIX}/input/',
            destination='/opt/ml/processing/input',
        )
    ],
    outputs=[
        ProcessingOutput(
            source='/opt/ml/processing/output',
            destination=f's3://{bucket}/{LAB_PREFIX}/output/',
        )
    ],
    arguments=[
        '--input', '/opt/ml/processing/input',
        '--output', '/opt/ml/processing/output',
    ],
)

### Verify

In [None]:
# List the objects under the job's S3 output prefix.
# The processing script writes X_train.csv, X_test.csv, y_train.csv and y_test.csv.
!aws s3 ls {LAB_BUCKET}/{LAB_PREFIX}/output/