In [None]:
import json
import os
import time
from datetime import datetime

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import sagemaker
from sagemaker import get_execution_role
from sagemaker.deserializers import BytesDeserializer
from sagemaker.feature_store.feature_definition import FeatureDefinition, FeatureTypeEnum
from sagemaker.feature_store.feature_group import FeatureGroup
from sagemaker.serializers import IdentitySerializer
from sagemaker.sklearn.estimator import SKLearn

In [20]:
# Load the raw store dataset from CSV into `df`, decoding the file as latin1.
df = pd.read_csv('/home/sagemaker-user/src/storedata_total.csv', encoding='latin1')

In [None]:
# Quick structural overview of the raw data: first rows, schema, summary
# statistics and per-column null counts.
print(df.head())
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None".
df.info()
print(df.describe())
print(df.isnull().sum())

In [None]:
# SageMaker context used by the later steps: the execution role, a session,
# and the S3 bucket/prefix under which artifacts are stored.
role = get_execution_role()
session = sagemaker.Session()
prefix = 'store-prediction'
region, bucket = session.boto_region_name, session.default_bucket()

# Echo the resolved settings so the run log shows where artifacts will go.
print(f"SageMaker Role ARN: {role}")
print(f"SageMaker Session: {session}")
print(f"Default S3 bucket: {bucket}")


In [1]:
# Snapshot of the data right before cleaning: first rows, schema, summary
# statistics and per-column null counts.
print(df.head())
# DataFrame.info() prints its own report and returns None; calling it inside
# print() would emit an extra "None" line.
df.info()
print(df.describe())
print(df.isnull().sum())

# ------------------------------------------------
# Step 2: Data Cleaning
# ------------------------------------------------

# Impute missing values in int64/float64 columns with the column median.
numeric_cols = df.select_dtypes(include=['int64', 'float64']).columns
columns_with_gaps = [name for name in numeric_cols if df[name].isnull().any()]
for name in columns_with_gaps:
    df[name] = df[name].fillna(df[name].median())


# Drop exact duplicate rows, but only when at least one is present.
duplicate_count = df.duplicated().sum()
if duplicate_count:
    df = df.drop_duplicates()


# ------------------------------------------------
# Step 3: Exploratory Data Analysis
# ------------------------------------------------

# Plot style. Newer matplotlib releases renamed the bundled seaborn styles to
# 'seaborn-v0_8-*' and dropped the old names, so use whichever one this
# installation provides (and keep the default style if neither exists).
for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid'):
    if style_name in plt.style.available:
        plt.style.use(style_name)
        break
plt.rcParams['figure.figsize'] = (12, 8)

# Histograms of numerical features, laid out three per row.
HIST_COLUMNS = 3
# Ceiling division so every column gets a panel; at least one row so that
# plt.subplots() never receives nrows=0.
hist_rows = max(1, (len(numeric_cols) + HIST_COLUMNS - 1) // HIST_COLUMNS)
fig, axes = plt.subplots(nrows=hist_rows, ncols=HIST_COLUMNS, figsize=(18, 15), squeeze=False)
axes = axes.flatten()

for ax, col in zip(axes, numeric_cols):
    sns.histplot(df[col], kde=True, ax=ax)
    ax.set_title(f'Distribution of {col}')
    ax.grid(True)

# Hide the panels of the last row that have no column to show.
for ax in axes[len(numeric_cols):]:
    ax.set_visible(False)

fig.tight_layout()
fig.savefig('numerical_distributions.png')
plt.close(fig)

# Correlation heatmap; the upper triangle (diagonal included) is masked so
# each pair of features appears only once.
corr_fig, corr_ax = plt.subplots(figsize=(14, 10))
corr_matrix = df[numeric_cols].corr()
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(corr_matrix, mask=mask, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5, ax=corr_ax)
corr_ax.set_title('Correlation Matrix of Numerical Features')
corr_fig.tight_layout()
corr_fig.savefig('correlation_heatmap.png')
plt.close(corr_fig)

# ------------------------------------------------
# Step 4: Feature Engineering
# ------------------------------------------------


def _safe_ratio(numerator, denominator):
    """Divide two Series element-wise without producing +/-inf.

    Rows whose denominator is zero are treated as missing and then imputed
    with the median of the remaining ratios, mirroring the median imputation
    used for numeric columns during data cleaning. If every ratio is missing
    the result stays NaN.
    """
    ratio = numerator / denominator.replace(0, np.nan)
    return ratio.fillna(ratio.median())


# Check if we have 'retained' column, if not create it for demonstration
if 'retained' not in df.columns:
    if 'Sales' in df.columns:
        median_sales = df['Sales'].median()
        df['retained'] = (df['Sales'] > median_sales).astype(int)
    elif 'Revenue' in df.columns:
        median_revenue = df['Revenue'].median()
        df['retained'] = (df['Revenue'] > median_revenue).astype(int)
    else:
        # Just create a random binary column for demonstration. A fixed seed
        # (the same 42 used for the train/test split) keeps reruns reproducible.
        df['retained'] = np.random.default_rng(42).integers(0, 2, size=df.shape[0])

# 1. Create ratio features (each only when both source columns exist)
if 'Items_Available' in df.columns and 'Items_Sold' in df.columns:
    df['Items_Sold_Ratio'] = _safe_ratio(df['Items_Sold'], df['Items_Available'])
    print("Created 'Items_Sold_Ratio' feature")

if 'Store_Size' in df.columns and 'Sales' in df.columns:
    df['Sales_per_Area'] = _safe_ratio(df['Sales'], df['Store_Size'])
    print("Created 'Sales_per_Area' feature")

if 'Regular_Items' in df.columns and 'Discounted_Items' in df.columns:
    df['Discount_Ratio'] = _safe_ratio(df['Discounted_Items'], df['Regular_Items'] + df['Discounted_Items'])
    print("Created 'Discount_Ratio' feature")

# 2. Create polynomial features for important numerical columns
TARGET_CORR_THRESHOLD = 0.2  # arbitrary cut-off for "important"
MAX_IMPORTANT_COLS = 3       # cap to avoid feature explosion

# Never treat the target itself as a feature.
candidate_cols = [col for col in numeric_cols if col != 'retained']

if pd.api.types.is_numeric_dtype(df['retained']):
    # Absolute correlation of every candidate with the target.
    target_corr = pd.Series(
        {col: abs(df[col].corr(df['retained'])) for col in candidate_cols},
        dtype='float64',
    )
    # Rank by strength so the cap below really keeps the most correlated
    # columns rather than whichever happen to come first in the frame.
    # Constant columns yield NaN and are excluded by the comparison.
    strong_corr = target_corr[target_corr > TARGET_CORR_THRESHOLD]
    important_cols = strong_corr.sort_values(ascending=False).index.tolist()
else:
    # Correlation is undefined for a non-numeric target; keep every candidate.
    important_cols = candidate_cols

important_cols = important_cols[:MAX_IMPORTANT_COLS]
for col in important_cols:
    df[f'{col}_squared'] = df[col] ** 2


# 3. Create interaction terms between important features (each unordered pair once)
for i, col1 in enumerate(important_cols):
    for col2 in important_cols[i + 1:]:
        df[f'{col1}_x_{col2}'] = df[col1] * df[col2]


# 4. Logarithmic transformations for skewed numerical features
# Columns (target excluded) whose absolute skewness exceeds 1 get a log copy.
skewed_cols = [name for name in numeric_cols if name != 'retained' and abs(df[name].skew()) > 1]
for name in skewed_cols:
    lowest = df[name].min()
    # Columns containing non-positive values are shifted up so that the
    # smallest value handed to log1p is 1.
    if lowest <= 0:
        offset = abs(lowest) + 1
    else:
        offset = 0
    df[f'{name}_log'] = np.log1p(df[name] + offset)


# 5. Bin continuous variables into categories
BIN_LABELS = ['Low', 'Medium-Low', 'Medium-High', 'High']
for col in important_cols:
    if df[col].nunique() < 2:
        # A constant column has no quantile structure to bin.
        continue
    # Compute the quantile edges first. With duplicates='drop', heavily tied
    # data can produce fewer than four bins, and handing all four labels to
    # qcut in that case raises ValueError (labels must match the bin count).
    _, bin_edges = pd.qcut(df[col], q=len(BIN_LABELS), retbins=True, duplicates='drop')
    bin_count = len(bin_edges) - 1
    if bin_count < 1:
        continue
    # Apply the edges with exactly as many labels as there are bins; when all
    # four quartile edges are distinct this matches the plain qcut result.
    df[f'{col}_binned'] = pd.cut(
        df[col],
        bins=bin_edges,
        labels=BIN_LABELS[:bin_count],
        include_lowest=True,
    )

# 6. Create aggregations if we have store/group identifiers
# NOTE(review): this is a substring match, so any column whose name merely
# contains 'id', 'store', 'group' or 'region' qualifies - confirm that the
# first match is really a grouping key for this dataset.
group_cols = [col for col in df.columns if any(term in col.lower() for term in ['id', 'store', 'group', 'region'])]
if group_cols:
    group_col = group_cols[0]  # Use the first one
    group_keys = df[group_col]
    for col in numeric_cols:
        if col == 'retained' or col == group_col:
            continue

        # Per-group statistics broadcast back onto every row. transform()
        # keeps the frame's index and avoids one merge per numeric column.
        per_group = df[col].groupby(group_keys)
        for stat in ('mean', 'std', 'min', 'max'):
            df[f'{col}_group_{stat}'] = per_group.transform(stat)

        # Create normalized features. The std is NaN for single-row groups and
        # 0 for constant groups; both are replaced by 1 so the division never
        # yields NaN/inf from the denominator.
        group_spread = df[f'{col}_group_std'].fillna(1).replace(0, 1)
        df[f'{col}_group_normalized'] = (df[col] - df[f'{col}_group_mean']) / group_spread

# ------------------------------------------------
# Step 5: Feature Selection
# ------------------------------------------------


# Refresh the numeric column list now that engineered features exist,
# leaving the target out.
numeric_cols = [
    col
    for col in df.select_dtypes(include=['int64', 'float64']).columns
    if col != 'retained'
]

if len(numeric_cols) > 1:
    corr_matrix = df[numeric_cols].corr().abs()
    # Keep the strict upper triangle only, so each pair is inspected once.
    strict_upper = np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
    upper = corr_matrix.where(strict_upper)

    # Columns whose absolute correlation with some earlier column exceeds 0.95
    high_corr_features = upper.columns[(upper > 0.95).any(axis=0)].tolist()

    if not high_corr_features:
        print("No highly correlated features found.")
    else:
        print(f"Removing {len(high_corr_features)} highly correlated features: {high_corr_features}")
        df = df.drop(columns=high_corr_features)

# 2. Prepare categorical columns for modeling


# Get updated list of categorical columns (including newly created ones)
categorical_cols = df.select_dtypes(include=['object', 'category']).columns
categorical_cols = [col for col in categorical_cols if col != 'retained']  # Exclude target if it's categorical

# For demonstration, convert categorical columns to one-hot encoding
for col in categorical_cols:
    # Only one-hot encode if cardinality is not too high
    if df[col].nunique() < 10:
        # dtype=int gives 0/1 integer indicators regardless of the pandas
        # version (pandas 2 would otherwise emit bool columns).
        dummies = pd.get_dummies(df[col], prefix=col, drop_first=True, dtype=int)
        df = pd.concat([df, dummies], axis=1)
        print(f"One-hot encoded '{col}' column")
    else:
        # For high cardinality, use count encoding. Missing categories have no
        # entry in value_counts() and would map to NaN; they are counted as 0
        # so no NaN is introduced into the feature matrix.
        value_counts = df[col].value_counts()
        df[f'{col}_count'] = df[col].astype(object).map(value_counts).fillna(0).astype(int)
        print(f"Count-encoded high-cardinality column '{col}'")

# Remove original categorical columns that have been encoded
df = df.drop(columns=categorical_cols)
print(f"Removed {len(categorical_cols)} original categorical columns after encoding")

# ------------------------------------------------
# Step 6: Prepare Data for Machine Learning
# ------------------------------------------------


# The target must have survived preprocessing.
if 'retained' not in df.columns:
    raise ValueError("Target column 'retained' not found in the dataset after preprocessing")

# Separate the target from the feature matrix.
y = df['retained']
X = df.drop(columns='retained')


# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Persist the feature names (in column order) for later reference.
feature_columns = list(X.columns)
with open('feature_columns.json', 'w') as columns_file:
    json.dump(feature_columns, columns_file)

# Write the full processed dataset: features first, target as last column.
processed_file_path = '/home/sagemaker-user/src/processed_store_data.csv'
processed_df = pd.concat([X, y], axis=1)
processed_df.to_csv(processed_file_path, index=False)
print(f"Saved processed dataset to {processed_file_path}")

# Write the train and test splits in the same layout for SageMaker.
train_file_path = '/home/sagemaker-user/src/train_store_data.csv'
test_file_path = '/home/sagemaker-user/src/test_store_data.csv'

train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_test, y_test], axis=1)

train_df.to_csv(train_file_path, index=False)
test_df.to_csv(test_file_path, index=False)
print(f"Saved train dataset to {train_file_path}")
print(f"Saved test dataset to {test_file_path}")

# ------------------------------------------------
# Step 7: Create Feature Store Feature Group
# ------------------------------------------------
print("\n=== Step 7: Creating SageMaker Feature Store Feature Group ===")

# Upload data to S3
# boto3 is not imported anywhere in this notebook, so the S3 client is built
# from the boto3 session that the SageMaker session already holds.
s3_client = session.boto_session.client('s3')

# Upload the train and test datasets to S3
s3_train_key = f"{prefix}/train/train_store_data.csv"
s3_test_key = f"{prefix}/test/test_store_data.csv"

s3_client.upload_file(train_file_path, bucket, s3_train_key)
s3_client.upload_file(test_file_path, bucket, s3_test_key)

# Full S3 URIs, used as the training job's input channels.
s3_train_uri = f"s3://{bucket}/{s3_train_key}"
s3_test_uri = f"s3://{bucket}/{s3_test_key}"


# Initialize Feature Store
feature_store_session = sagemaker.Session()

# Work on a copy so the frame that was saved as the training CSV is not
# modified by the Feature Store bookkeeping columns added below.
feature_store_df = train_df.copy()

# NOTE(review): load_feature_definitions() infers feature types from integer,
# float and string dtypes; bool columns are converted to 0/1 integers so they
# can be inferred - confirm against the installed SDK version.
bool_cols = feature_store_df.select_dtypes(include='bool').columns
feature_store_df = feature_store_df.astype({name: 'int64' for name in bool_cols})

# Add required timestamp and record identifier columns.
# The event-time feature must be Fractional (Unix seconds) or String, so the
# timestamp is stored as a float rather than an integer.
current_time_sec = int(round(time.time()))
feature_store_df['EventTime'] = float(current_time_sec)
# The record identifier is the row index rendered as text; the explicit
# 'string' dtype lets it be inferred as a String feature.
feature_store_df['id'] = pd.Series(
    feature_store_df.index.astype(str),
    index=feature_store_df.index,
    dtype='string',
)

# Create a feature group name
feature_group_name = f"store-prediction-features-{int(current_time_sec)}"

# Create the feature group
feature_group = FeatureGroup(
    name=feature_group_name,
    sagemaker_session=feature_store_session
)

# Load feature definitions
feature_group.load_feature_definitions(data_frame=feature_store_df)

# Create the feature group in SageMaker Feature Store
feature_group.create(
    s3_uri=f"s3://{bucket}/{prefix}/feature_store/{feature_group_name}",
    record_identifier_name='id',
    event_time_feature_name='EventTime',
    role_arn=role,
    enable_online_store=True
)

# Wait for feature group creation to complete
status = feature_group.describe().get("FeatureGroupStatus")
while status == "Creating":
    print("Waiting for Feature Group to be created...")
    time.sleep(5)
    status = feature_group.describe().get("FeatureGroupStatus")

print(f"Feature Group status: {status}")

# Stop here if creation did not succeed; ingesting into a failed feature
# group would only produce a less helpful error later.
if status != "Created":
    failure_reason = feature_group.describe().get("FailureReason")
    raise RuntimeError(
        f"Feature Group {feature_group_name} ended in status {status}: {failure_reason}"
    )

# Ingest data into Feature Group
feature_group.ingest(data_frame=feature_store_df, max_workers=3)
print(f"Ingested data into Feature Group: {feature_group_name}")

# ------------------------------------------------
# Step 8: Train Machine Learning Model
# ------------------------------------------------
print("\n=== Step 8: Training Machine Learning Model ===")

# Create a train.py script for SageMaker. The script text below is written
# verbatim to disk and is later referenced as the estimator's entry point.
train_script_path = '/home/sagemaker-user/src/train.py'

with open(train_script_path, 'w') as f:
    f.write('''
import argparse
import os
import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
import json

def parse_args():
    parser = argparse.ArgumentParser()

    # Hyperparameters
    parser.add_argument('--n-estimators', type=int, default=100)
    parser.add_argument('--max-depth', type=int, default=10)
    parser.add_argument('--min-samples-split', type=int, default=2)
    parser.add_argument('--min-samples-leaf', type=int, default=1)

    # SageMaker parameters
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
    parser.add_argument('--output-data-dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR'))

    return parser.parse_args()

def load_data(train_dir, test_dir):
    train_path = os.path.join(train_dir, 'train_store_data.csv')
    test_path = os.path.join(test_dir, 'test_store_data.csv')

    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Split into features and target
    X_train = train_df.drop('retained', axis=1)
    y_train = train_df['retained']

    X_test = test_df.drop('retained', axis=1)
    y_test = test_df['retained']

    # Save feature columns for inference
    feature_columns = X_train.columns.tolist()

    return X_train, y_train, X_test, y_test, feature_columns

def train(args):
    # Load data
    X_train, y_train, X_test, y_test, feature_columns = load_data(args.train, args.test)

    print(f"Training data shape: {X_train.shape}")
    print(f"Test data shape: {X_test.shape}")

    # Train model
    model = RandomForestClassifier(
        n_estimators=args.n_estimators,
        max_depth=args.max_depth,
        min_samples_split=args.min_samples_split,
        min_samples_leaf=args.min_samples_leaf,
        random_state=42
    )

    model.fit(X_train, y_train)

    # Evaluate on test set
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)[:, 1]

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    try:
        auc_roc = roc_auc_score(y_test, y_proba)
    except ValueError as err:
        # roc_auc_score raises ValueError when AUC is undefined (for example
        # only one class present in y_test). Catch just that case, so genuine
        # failures and interrupts are not silently swallowed, and report it.
        print(f"AUC ROC could not be computed ({err}); falling back to 0.5")
        auc_roc = 0.5  # Default value if AUC calculation fails

    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"Test Precision: {precision:.4f}")
    print(f"Test Recall: {recall:.4f}")
    print(f"Test F1 Score: {f1:.4f}")
    print(f"Test AUC ROC: {auc_roc:.4f}")

    # Save metrics
    metrics = {
        'accuracy': float(accuracy),
        'precision': float(precision),
        'recall': float(recall),
        'f1': float(f1),
        'auc_roc': float(auc_roc)
    }

    # Save important features
    feature_importance = pd.DataFrame({
        'feature': feature_columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)

    top_features = feature_importance.head(10).to_dict(orient='records')

    # Save model
    model_path = os.path.join(args.model_dir, 'model.joblib')
    joblib.dump(model, model_path)

    # Save feature columns
    with open(os.path.join(args.model_dir, 'feature_columns.json'), 'w') as f:
        json.dump(feature_columns, f)

    # Save metrics
    with open(os.path.join(args.output_data_dir, 'metrics.json'), 'w') as f:
        json.dump(metrics, f)

    # Save top features
    with open(os.path.join(args.output_data_dir, 'top_features.json'), 'w') as f:
        json.dump(top_features, f)

    print("Training complete.")
    return model, metrics

if __name__ == '__main__':
    args = parse_args()
    model, metrics = train(args)
''')

# Source text of the inference handlers (model_fn / input_fn / predict_fn /
# output_fn). It is written to disk unchanged a few lines further down.
inference_script_source = '''
import os
import json
import pandas as pd
import joblib
import numpy as np

def model_fn(model_dir):
    """
    Load the model from disk
    """
    model_path = os.path.join(model_dir, 'model.joblib')
    model = joblib.load(model_path)
    
    # Load feature columns
    with open(os.path.join(model_dir, 'feature_columns.json'), 'r') as f:
        feature_columns = json.load(f)
    
    return {'model': model, 'feature_columns': feature_columns}

def input_fn(request_body, request_content_type):
    """
    Parse input data payload
    """
    if request_content_type == 'application/json':
        data = json.loads(request_body)
        
        # Convert to DataFrame
        if isinstance(data, dict):
            # Single instance
            df = pd.DataFrame([data])
        else:
            # Multiple instances
            df = pd.DataFrame(data)
        
        return df
    else:
        raise ValueError(f"Unsupported content type: {request_content_type}")

def predict_fn(input_data, model_dict):
    """
    Make predictions
    """
    model = model_dict['model']
    feature_columns = model_dict['feature_columns']
    
    # Ensure all required features are present
    for col in feature_columns:
        if col not in input_data.columns:
            input_data[col] = 0  # Default value for missing features
    
    # Select only features used during training
    input_features = input_data[feature_columns]
    
    # Make predictions
    predictions_proba = model.predict_proba(input_features)[:, 1]
    predictions_binary = (predictions_proba >= 0.5).astype(int)
    
    return {
        'probabilities': predictions_proba.tolist(),
        'predictions': predictions_binary.tolist()
    }

def output_fn(prediction_output, response_content_type):
    """
    Format response
    """
    if response_content_type == 'application/json':
        return json.dumps(prediction_output)
    else:
        raise ValueError(f"Unsupported content type: {response_content_type}")
'''

# Write the inference script next to train.py for use at deployment time.
inference_script_path = '/home/sagemaker-user/src/inference.py'

with open(inference_script_path, 'w') as f:
    f.write(inference_script_source)

# Hyperparameters for the training job; the keys mirror the argparse flags
# declared in train.py.
training_hyperparameters = {
    'n-estimators': 100,
    'max-depth': 10,
    'min-samples-split': 2,
    'min-samples-leaf': 1
}

# Configure the SKLearn estimator that runs train.py from the source directory.
sklearn_estimator = SKLearn(
    entry_point='train.py',
    source_dir='/home/sagemaker-user/src',
    role=role,
    instance_type='ml.m5.xlarge',
    instance_count=1,
    framework_version='0.23-1',
    py_version='py3',
    hyperparameters=training_hyperparameters,
    base_job_name='store-prediction-training'
)

# Launch training with the uploaded train/test CSVs as input channels.
training_channels = {'train': s3_train_uri, 'test': s3_test_uri}
print("Starting model training job...")
sklearn_estimator.fit(training_channels)

# Location of the trained model artifacts.
model_data = sklearn_estimator.model_data
print(f"Model training complete. Model artifacts: {model_data}")

# ------------------------------------------------
# Step 9: Deploy Model
# ------------------------------------------------
print("\n=== Step 9: Deploying Model ===")

# Deploy the model.
# - entry_point/source_dir: serve with inference.py, which defines
#   model_fn/input_fn/predict_fn/output_fn. Without this the hosting model
#   reuses the training entry point (train.py), which has no model_fn.
# - serializer/deserializer: the notebook sends JSON text it has already
#   encoded with json.dumps() and decodes the raw response bytes itself, so
#   the payload is passed through untouched and only tagged/requested as
#   'application/json' - the single content type inference.py accepts.
#   (The previously referenced csv_serializer name is not defined here.)
predictor = sklearn_estimator.deploy(
    initial_instance_count=1,
    instance_type='ml.t2.medium',
    endpoint_name=f'store-prediction-endpoint-{int(time.time())}',
    entry_point='inference.py',
    source_dir='/home/sagemaker-user/src',
    serializer=IdentitySerializer(content_type='application/json'),
    deserializer=BytesDeserializer(accept='application/json')
)

print(f"Model deployed to endpoint: {predictor.endpoint_name}")

# ------------------------------------------------
# Step 10: Test the Endpoint
# ------------------------------------------------
print("\n=== Step 10: Testing the Endpoint ===")

# Use the first five test rows, as a list of per-row dicts, for a smoke test.
sample_data = X_test.head(5).to_dict(orient='records')
print("Sample data for prediction:")
print(json.dumps(sample_data, indent=2))

# Send the rows as JSON text and decode the raw response bytes.
print("\nMaking prediction...")
request_payload = json.dumps(sample_data)
prediction = predictor.predict(request_payload)
result = json.loads(prediction.decode('utf-8'))

print("\nPrediction results:")
print(f"Probabilities: {result['probabilities']}")
print(f"Predictions: {result['predictions']}")

# Ground-truth labels for the same five rows, for side-by-side comparison.
print("\nActual values:")
print(y_test.head(5).tolist())

# ------------------------------------------------
# Step 11: Evaluate Model Performance
# ------------------------------------------------

# Make predictions on the entire test set.
# The rows are sent in fixed-size batches: a single request carrying the whole
# test set as JSON can exceed the endpoint's request-size limit.
PREDICT_BATCH_SIZE = 500
predicted_labels = []
predicted_probabilities = []
for batch_start in range(0, len(X_test), PREDICT_BATCH_SIZE):
    batch_rows = X_test.iloc[batch_start:batch_start + PREDICT_BATCH_SIZE]
    batch_payload = json.dumps(batch_rows.to_dict(orient='records'))
    batch_result = json.loads(predictor.predict(batch_payload).decode('utf-8'))
    predicted_labels.extend(batch_result['predictions'])
    predicted_probabilities.extend(batch_result['probabilities'])

# Calculate AUC ROC and other metrics
y_pred = np.array(predicted_labels)
y_proba = np.array(predicted_probabilities)

accuracy = (y_pred == y_test.values).mean()
auc_roc = roc_auc_score(y_test, y_proba)

# Plot ROC curve (dashed diagonal = chance level)
fpr, tpr, _ = roc_curve(y_test, y_proba)
plt.figure(figsize=(10, 8))
plt.plot(fpr, tpr, label=f'AUC = {auc_roc:.3f}')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='lower right')
plt.grid(True)
plt.savefig('roc_curve.png')
plt.close()

print(f"Test Accuracy: {accuracy:.4f}")
print(f"Test AUC ROC: {auc_roc:.4f}")

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.savefig('confusion_matrix.png')
plt.close()

# NOTE(review): the endpoint created in Step 9 is still running after this
# point; delete it (predictor.delete_endpoint()) once it is no longer needed.

print("\nWorkflow complete!")

NameError: name 'get_execution_role' is not defined