In [83]:
# Cell 1 - Install
!pip uninstall protobuf -y --quiet
!pip install protobuf==3.20.3 --quiet
!pip install kfp==2.3.0 --quiet


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
grpcio-status 1.75.1 requires protobuf<7.0.0,>=6.31.1, but you have protobuf 3.20.3 which is incompatible.
opentelemetry-proto 1.37.0 requires protobuf<7.0,>=5.0, but you have protobuf 3.20.3 which is incompatible.[0m[31m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [84]:
# Cell 2
from kfp.dsl import component, pipeline
from kfp import compiler


In [85]:
# Cell 3 - Component 1: Download
@component(
    base_image="python:3.10-slim",
    packages_to_install=["google-cloud-storage==2.10.0", "google-api-core==2.27.0"],
)
def download_data(raw_data_gcs: str, raw_temp_gcs: str) -> None:
    """Copy the raw dataset from its source GCS object to a staging GCS object.

    The object is first downloaded to a local scratch file and then uploaded
    again to the staging location.

    Args:
        raw_data_gcs: URI of the source object, in the form gs://bucket/path.
        raw_temp_gcs: URI of the staging object the raw file is written to.
    """
    from google.cloud import storage

    banner = "=" * 60
    scratch_file = '/tmp/raw.csv'

    print(banner)
    print("COMPONENT 1: DOWNLOAD DATA")
    print(banner)

    gcs = storage.Client()

    # Everything before the first "/" is the bucket, the remainder is the object name.
    source_bucket, _, source_object = raw_data_gcs.replace("gs://", "").partition("/")
    source_blob = gcs.bucket(source_bucket).blob(source_object)
    source_blob.download_to_filename(scratch_file)
    print(f"✓ Downloaded: {raw_data_gcs}")

    staging_bucket, _, staging_object = raw_temp_gcs.replace("gs://", "").partition("/")
    staging_blob = gcs.bucket(staging_bucket).blob(staging_object)
    staging_blob.upload_from_filename(scratch_file)
    print(f"✓ Uploaded to: {raw_temp_gcs}")
    print(banner)


In [86]:
# Cell 4 - Component 2: Clean
@component(
    packages_to_install=["pandas==2.0.3", "numpy==1.24.3", "google-cloud-storage==2.10.0", "google-api-core==2.27.0"],
    base_image="python:3.10-slim"
)
def clean_data(raw_temp_gcs: str, cleaned_output_gcs: str) -> None:
    """Turn the staged raw reviews CSV into a two-column training table.

    Reads the raw CSV (malformed lines are skipped), keeps only rows that have
    both a review text and a rating, derives the binary label ``highrating``
    (1 when rating >= 4, else 0), lower-cases the text and uploads the result.

    Args:
        raw_temp_gcs: URI (gs://bucket/path) of the staged raw CSV. The file
            must contain the columns ``text`` and ``rating``.
        cleaned_output_gcs: URI (gs://bucket/path) the cleaned CSV, with
            columns ``text`` and ``highrating``, is uploaded to.
    """
    import pandas as pd
    from google.cloud import storage

    print("="*60)
    print("COMPONENT 2: CLEAN DATA")
    print("="*60)

    client = storage.Client()
    parts = raw_temp_gcs.replace("gs://", "").split("/")
    client.bucket(parts[0]).blob("/".join(parts[1:])).download_to_filename('/tmp/raw.csv')

    df = pd.read_csv('/tmp/raw.csv', on_bad_lines='skip', engine='python')
    print(f"✓ Raw: {len(df)} rows")

    # Drop incomplete rows BEFORE deriving the label: a missing rating compares
    # as False against 4, which would silently label the row as negative (0)
    # instead of excluding it.
    rated = df.dropna(subset=['text', 'rating'])
    df = pd.DataFrame({
        'text': rated['text'].astype(str).str.lower(),
        'highrating': (rated['rating'] >= 4).astype(int),
    })

    print(f"✓ Cleaned: {len(df)} rows")

    df.to_csv('/tmp/cleaned.csv', index=False)

    parts_out = cleaned_output_gcs.replace("gs://", "").split("/")
    client.bucket(parts_out[0]).blob("/".join(parts_out[1:])).upload_from_filename('/tmp/cleaned.csv')
    print(f"✓ Uploaded: {cleaned_output_gcs}")
    print("="*60)


In [102]:
# Cell 5 - Component 3: Train Model 
@component(
    packages_to_install=["pandas==2.0.3", "scikit-learn==1.2.2", "numpy==1.24.3", "scipy==1.10.1", "google-cloud-storage==2.10.0", "google-api-core==2.27.0"],
    base_image="python:3.10-slim"
)
def train_model(cleaned_data_gcs: str, metrics_output_gcs: str) -> None:
    """Train a TF-IDF + logistic-regression classifier and upload basic metrics.

    Downloads the cleaned CSV, vectorizes the ``text`` column, fits a
    class-balanced logistic regression on an 80/20 split (random_state=42) and
    writes a small JSON file with accuracy, training sample count and feature
    count to GCS. The fitted model itself is not persisted by this component.

    Args:
        cleaned_data_gcs: URI (gs://bucket/path) of the cleaned CSV with the
            columns ``text`` and ``highrating``.
        metrics_output_gcs: URI (gs://bucket/path) the metrics JSON is
            uploaded to.
    """
    import pandas as pd
    import json
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score
    from google.cloud import storage

    print("="*60)
    print("COMPONENT 3: TRAIN MODEL")
    print("="*60)

    client = storage.Client()
    parts = cleaned_data_gcs.replace("gs://", "").split("/")
    client.bucket(parts[0]).blob("/".join(parts[1:])).download_to_filename('/tmp/cleaned.csv')

    # Read the text column strictly as strings. With the default NA handling a
    # lower-cased review such as "null", "nan" or "n/a" is parsed back as NaN
    # (and an all-numeric review may be parsed as a number), and the vectorizer
    # rejects non-string documents.
    df = pd.read_csv('/tmp/cleaned.csv', dtype={'text': str}, keep_default_na=False)
    X, y = df['text'], df['highrating']

    # NOTE(review): the vectorizer is fitted on all rows before the split, so the
    # held-out rows influence the vocabulary/IDF weights. Kept as-is so the
    # numbers stay comparable with the other components that repeat this recipe.
    vectorizer = TfidfVectorizer(max_features=5000)
    X_vec = vectorizer.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

    model = LogisticRegression(max_iter=500, class_weight='balanced')
    model.fit(X_train, y_train)
    accuracy = accuracy_score(y_test, model.predict(X_test))

    print(f"✓ Accuracy: {accuracy:.4f}")
    print(f"✓ Training samples: {X_train.shape[0]}")
    print("="*60)

    # Write metrics to GCS (cast to built-in types so they are JSON serializable)
    metrics = {
        'accuracy': float(accuracy),
        'samples': int(X_train.shape[0]),
        'features': int(X_vec.shape[1])
    }

    with open('/tmp/metrics.json', 'w') as f:
        json.dump(metrics, f)

    parts_metrics = metrics_output_gcs.replace("gs://", "").split("/")
    client.bucket(parts_metrics[0]).blob("/".join(parts_metrics[1:])).upload_from_filename('/tmp/metrics.json')
    print(f"✓ Metrics saved to: {metrics_output_gcs}")


In [103]:
# Cell 5.5 - Component 3.5: Evaluate Model
@component(
    packages_to_install=["pandas==2.0.3", "scikit-learn==1.2.2", "numpy==1.24.3", "scipy==1.10.1", "google-cloud-storage==2.10.0", "google-api-core==2.27.0"],
    base_image="python:3.10-slim"
)
def evaluate_model(cleaned_data_gcs: str, metrics_input_gcs: str, evaluation_output_gcs: str) -> None:
    """Re-fit the classifier and upload a detailed evaluation report to GCS.

    No fitted model is passed in: this component repeats the training recipe
    (TF-IDF with 5000 features, 80/20 split with random_state=42, balanced
    logistic regression) and evaluates the result on the held-out split.

    Args:
        cleaned_data_gcs: URI (gs://bucket/path) of the cleaned CSV with the
            columns ``text`` and ``highrating``.
        metrics_input_gcs: URI of the training metrics JSON. Currently not
            read by this component; kept so the interface stays stable.
        evaluation_output_gcs: URI (gs://bucket/path) the evaluation JSON is
            uploaded to.
    """
    import pandas as pd
    import json
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
    from google.cloud import storage

    print("="*60)
    print("COMPONENT 3.5: MODEL EVALUATION")
    print("="*60)

    client = storage.Client()

    # Load cleaned data
    parts = cleaned_data_gcs.replace("gs://", "").split("/")
    client.bucket(parts[0]).blob("/".join(parts[1:])).download_to_filename('/tmp/cleaned.csv')

    # Read the text column strictly as strings. With the default NA handling a
    # lower-cased review such as "null", "nan" or "n/a" is parsed back as NaN
    # (and an all-numeric review may be parsed as a number), and the vectorizer
    # rejects non-string documents.
    df = pd.read_csv('/tmp/cleaned.csv', dtype={'text': str}, keep_default_na=False)
    X, y = df['text'], df['highrating']

    # Same preprocessing as training
    vectorizer = TfidfVectorizer(max_features=5000)
    X_vec = vectorizer.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

    # Train model (same as training component for consistency)
    model = LogisticRegression(max_iter=500, class_weight='balanced')
    model.fit(X_train, y_train)

    # Comprehensive evaluation
    y_pred = model.predict(X_test)

    # Fix the label order explicitly so the confusion matrix is always 2x2 and
    # the report rows always map to ['Negative', 'Positive'], even when one
    # class happens to be absent from the held-out split.
    class_labels = [0, 1]

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels).tolist()

    print(f"✓ Accuracy:  {accuracy:.4f}")
    print(f"✓ Precision: {precision:.4f}")
    print(f"✓ Recall:    {recall:.4f}")
    print(f"✓ F1-Score:  {f1:.4f}")
    print(f"✓ Confusion Matrix:\n{conf_matrix}")

    # Detailed classification report
    class_report = classification_report(
        y_test,
        y_pred,
        labels=class_labels,
        target_names=['Negative', 'Positive'],
        output_dict=True,
    )

    # Vectorized count of positive labels (avoids a Python-level loop over the Series)
    positive_count = int(y_test.sum())

    # Save comprehensive evaluation
    evaluation = {
        'accuracy': float(accuracy),
        'precision': float(precision),
        'recall': float(recall),
        'f1_score': float(f1),
        'confusion_matrix': conf_matrix,
        'classification_report': class_report,
        'test_samples': int(X_test.shape[0]),
        'positive_samples': positive_count,
        'negative_samples': int(len(y_test) - positive_count)
    }

    with open('/tmp/evaluation.json', 'w') as f:
        json.dump(evaluation, f, indent=2)

    # Upload to GCS
    parts_eval = evaluation_output_gcs.replace("gs://", "").split("/")
    client.bucket(parts_eval[0]).blob("/".join(parts_eval[1:])).upload_from_filename('/tmp/evaluation.json')

    print(f"\n EVALUATION SAVED: {evaluation_output_gcs}")
    print("="*60)


In [104]:
# Cell 6 - Component 4: Save Model with Metadata
@component(
    packages_to_install=[
        "pandas==2.0.3",
        "scikit-learn==1.2.2",
        "numpy==1.24.3",
        "scipy==1.10.1",
        "google-cloud-storage==2.10.0",
        "google-api-core==2.27.0"
    ],
    base_image="python:3.10-slim"
)
def save_model_with_metadata(cleaned_data_gcs: str, model_output_gcs: str, metrics_input_gcs: str) -> None:
    """Fit the classifier and publish it to GCS if it beats the current champion.

    The component repeats the training recipe (TF-IDF with 5000 features,
    80/20 split with random_state=42, balanced logistic regression), computes
    evaluation metrics and compares accuracy with the metadata JSON stored next
    to the model. The pickled model package and its metadata are uploaded only
    when there is no previous metadata or the new accuracy is strictly higher;
    otherwise the stored champion artifacts are left untouched.

    Args:
        cleaned_data_gcs: URI (gs://bucket/path) of the cleaned CSV with the
            columns ``text`` and ``highrating``.
        model_output_gcs: URI (gs://bucket/path) of the pickled model package.
            The metadata object name is derived from it by replacing
            ``.pickle`` with ``_metadata.json`` (or by appending
            ``_metadata.json`` when the name has no ``.pickle`` part).
        metrics_input_gcs: URI of the training metrics JSON. Currently not
            read by this component; kept so the interface stays stable.
    """
    import pickle
    import json
    import pandas as pd
    from datetime import datetime
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
    from google.cloud import storage
    import time

    print("="*60)
    print("COMPONENT 4: SAVE MODEL WITH ENHANCED METADATA")
    print("="*60)

    # The timer spans download, loading, fitting and evaluation (see
    # training_duration below), not only the call to model.fit.
    start_time = time.time()
    client = storage.Client()

    # Load cleaned data
    parts = cleaned_data_gcs.replace("gs://", "").split("/")
    client.bucket(parts[0]).blob("/".join(parts[1:])).download_to_filename('/tmp/cleaned.csv')

    # Read the text column strictly as strings. With the default NA handling a
    # lower-cased review such as "null", "nan" or "n/a" is parsed back as NaN
    # (and an all-numeric review may be parsed as a number), and the vectorizer
    # rejects non-string documents.
    df = pd.read_csv('/tmp/cleaned.csv', dtype={'text': str}, keep_default_na=False)
    X, y = df['text'], df['highrating']

    # Train model (same process)
    vectorizer = TfidfVectorizer(max_features=5000)
    X_vec = vectorizer.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_vec, y, test_size=0.2, random_state=42)

    model = LogisticRegression(max_iter=500, class_weight='balanced')
    model.fit(X_train, y_train)

    # Calculate COMPREHENSIVE metrics
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    # labels=[0, 1] guarantees a 2x2 matrix, which the TN/FP/FN/TP lookups below rely on.
    conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1]).tolist()

    training_duration = time.time() - start_time

    # Resolve model and metadata locations
    parts_out = model_output_gcs.replace("gs://", "").split("/")
    bucket = client.bucket(parts_out[0])
    model_blob_path = "/".join(parts_out[1:])
    if '.pickle' in model_blob_path:
        metadata_blob_path = model_blob_path.replace('.pickle', '_metadata.json')
    else:
        # Without this fallback the metadata path would equal the model path
        # and the metadata upload would overwrite the model.
        metadata_blob_path = model_blob_path + '_metadata.json'

    previous_version = None
    previous_accuracy = None
    improvement = None
    is_champion = True
    new_version = 1

    # Check for previous version. Only a missing or unreadable metadata object
    # means "no previous version"; any other failure (network, permissions)
    # propagates instead of silently resetting the version counter to 1.
    old_meta = None
    meta_blob = bucket.blob(metadata_blob_path)
    if meta_blob.exists():
        try:
            old_meta = json.loads(meta_blob.download_as_text())
        except ValueError:
            print("✗ Existing metadata is not valid JSON; treating this as the first version")
        else:
            if not isinstance(old_meta, dict):
                print("✗ Existing metadata is not a JSON object; treating this as the first version")
                old_meta = None

    if old_meta is None:
        print("✓ First model version!")
    else:
        # `or 0` also covers keys that are present but null
        previous_version = old_meta.get('current_version') or 0
        previous_accuracy = old_meta.get('current_accuracy') or 0

        # Determine if new model is better
        if accuracy > previous_accuracy:
            new_version = previous_version + 1
            is_champion = True
            if previous_accuracy > 0:
                improvement = ((accuracy - previous_accuracy) / previous_accuracy) * 100
                print(f"✓ NEW CHAMPION! Accuracy improved by {improvement:.2f}%")
            else:
                # Relative improvement is undefined against a zero baseline
                print("✓ NEW CHAMPION! (previous accuracy was 0)")
        else:
            new_version = previous_version
            is_champion = False
            print(f"✗ Model not better than champion. Keeping version {previous_version}")

    # One timestamp shared by the metadata and the pickled package
    timestamp = datetime.now().isoformat()

    # Vectorized class counts (avoids a Python-level loop over the Series)
    positive_total = int((y == 1).sum())
    negative_total = int((y == 0).sum())

    # Create ENHANCED metadata
    metadata = {
        # Version control
        'current_version': new_version,
        'current_timestamp': timestamp,
        'is_champion': is_champion,
        'previous_version': previous_version,
        'previous_accuracy': previous_accuracy,
        'improvement': improvement,

        # Performance metrics
        'current_accuracy': float(accuracy),
        'precision': float(precision),
        'recall': float(recall),
        'f1_score': float(f1),
        'confusion_matrix': conf_matrix,
        'true_negatives': int(conf_matrix[0][0]),
        'false_positives': int(conf_matrix[0][1]),
        'false_negatives': int(conf_matrix[1][0]),
        'true_positives': int(conf_matrix[1][1]),

        # Training data info
        'samples_trained': int(X_train.shape[0]),
        'samples_tested': int(X_test.shape[0]),
        'total_samples': int(len(df)),
        'positive_samples': positive_total,
        'negative_samples': negative_total,
        'class_balance': float(positive_total / len(y)),
        'features': int(X_vec.shape[1]),

        # Model configuration
        'model_type': 'LogisticRegression',
        'vectorizer_type': 'TfidfVectorizer',
        'max_features': 5000,
        'max_iterations': 500,
        'test_split_ratio': 0.2,
        'random_state': 42,

        # Training details
        'training_duration_seconds': float(training_duration),
        'training_duration_formatted': f"{int(training_duration // 60)}m {int(training_duration % 60)}s",

        # Dataset info
        'dataset_source': cleaned_data_gcs,
        'dataset_name': 'Amazon Movies & TV Reviews',

        # Team info
        'team': 'Team-14',
        'pipeline_name': 'sentiment-pipeline-team14-final',
        'course': 'Data Engineering - MLOps',
        'university': 'JADS'
    }

    if is_champion:
        # Save model
        package = {
            'model': model,
            'vectorizer': vectorizer,
            'version': new_version,
            'accuracy': accuracy,
            'timestamp': timestamp,
            'samples_trained': X_train.shape[0],
            'features': X_vec.shape[1]
        }

        with open('/tmp/model.pkl', 'wb') as f:
            pickle.dump(package, f)

        blob_model = bucket.blob(model_blob_path)
        blob_model.upload_from_filename('/tmp/model.pkl')

        # Save enhanced metadata
        with open('/tmp/metadata.json', 'w') as f:
            json.dump(metadata, f, indent=2)

        blob_meta = bucket.blob(metadata_blob_path)
        blob_meta.upload_from_filename('/tmp/metadata.json')

        print(f"\n MODEL SAVED: gs://{parts_out[0]}/{model_blob_path}")
        print(f" METADATA SAVED: gs://{parts_out[0]}/{metadata_blob_path}")
    else:
        # Uploading here would replace the better champion and overwrite its
        # recorded accuracy with the lower challenger value.
        print(f"\n CHAMPION KEPT: gs://{parts_out[0]}/{model_blob_path} (challenger not uploaded)")

    print(f"\n Model Metrics:")
    print(f"   Version: {new_version}")
    print(f"   Accuracy: {accuracy:.4f}")
    print(f"   Precision: {precision:.4f}")
    print(f"   Recall: {recall:.4f}")
    print(f"   F1-Score: {f1:.4f}")
    print(f"   Training Duration: {training_duration:.2f}s")
    print("="*60)


In [105]:
# Cell 7 - Pipeline Definition (UPDATED with evaluation)
@pipeline(
    name='sentiment-pipeline-team14-final',
    description='Team 14 sentiment analysis with champion/challenger and evaluation'
)
def sentiment_pipeline(
    raw_data_gcs: str = 'gs://data_mlops/Movies_and_TV.csv',
    raw_temp_gcs: str = 'gs://temp_data_mlops/raw_temp.csv',
    cleaned_output_gcs: str = 'gs://temp_data_mlops/cleaned_data.csv',
    model_output_gcs: str = 'gs://model_mlops/Team-14-v2.pickle',
    metrics_gcs: str = 'gs://temp_data_mlops/train_metrics.json',
    evaluation_gcs: str = 'gs://temp_data_mlops/evaluation_report.json'
):
    """Run download -> clean -> train -> evaluate -> save as a strict sequence.

    The components exchange data only through the GCS objects named by the
    parameters, so the execution order is enforced explicitly with ``.after()``
    rather than through task outputs.

    Args:
        raw_data_gcs: URI of the source reviews CSV.
        raw_temp_gcs: URI of the staged copy of the raw CSV.
        cleaned_output_gcs: URI of the cleaned CSV.
        model_output_gcs: URI of the pickled model package.
        metrics_gcs: URI of the training metrics JSON.
        evaluation_gcs: URI of the evaluation report JSON.
    """
    # Component 1: Download
    download_task = download_data(raw_data_gcs=raw_data_gcs, raw_temp_gcs=raw_temp_gcs)

    # Component 2: Clean
    clean_task = clean_data(raw_temp_gcs=raw_temp_gcs, cleaned_output_gcs=cleaned_output_gcs)
    clean_task.after(download_task)

    # Component 3: Train
    train_task = train_model(cleaned_data_gcs=cleaned_output_gcs, metrics_output_gcs=metrics_gcs)
    train_task.after(clean_task)

    # Component 3.5: Evaluate
    eval_task = evaluate_model(
        cleaned_data_gcs=cleaned_output_gcs,
        metrics_input_gcs=metrics_gcs,
        evaluation_output_gcs=evaluation_gcs
    )
    eval_task.after(train_task)

    # Component 4: Save Model (runs after evaluation)
    save_task = save_model_with_metadata(
        cleaned_data_gcs=cleaned_output_gcs,
        model_output_gcs=model_output_gcs,
        metrics_input_gcs=metrics_gcs
    )
    save_task.after(eval_task)

    # Every step works purely through GCS side effects and receives the same
    # constant string inputs on each run. With execution caching enabled a
    # re-run could reuse cached results and skip the steps even though the data
    # in the buckets has changed, so caching is disabled per task.
    for task in (download_task, clean_task, train_task, eval_task, save_task):
        task.set_caching_options(False)


In [106]:
# Cell 8 - Compile
# Compile the pipeline function into a YAML package and report where it was written.
PIPELINE_PACKAGE_PATH = 'Team14_FINAL_SUBMISSION.yaml'

pipeline_compiler = compiler.Compiler()
pipeline_compiler.compile(pipeline_func=sentiment_pipeline, package_path=PIPELINE_PACKAGE_PATH)
print(f" Compiled: {PIPELINE_PACKAGE_PATH}")



 Compiled: Team14_FINAL_SUBMISSION.yaml
