In [4]:
%pip install nltk

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [7]:
%pip install --quiet mlflow

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.2.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


## With Pickle

In [None]:
import pickle
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from ClassificationModel.data_cleaning_training import DataPreprocessor

class TextModel:
    def __init__(self, max_features=5000):  # Increased max_features for better feature representation
        self.vectorizer = TfidfVectorizer(max_features=max_features)
        self.classifier = MultinomialNB()
        
    def train(self, X_train, y_train):
        self.classifier.fit(X_train, y_train)
        
    def predict(self, X_test):
        return self.classifier.predict(X_test)
    
    def save(self, vectorizer_path, model_path):
        pickle.dump(self.vectorizer, open(vectorizer_path, 'wb'))
        pickle.dump(self.classifier, open(model_path, 'wb'))
    
    def load(self, vectorizer_path, model_path):
        self.vectorizer = pickle.load(open(vectorizer_path, 'rb'))
        self.classifier = pickle.load(open(model_path, 'rb'))
        return self.classifier, self.vectorizer
    
    def analyze_sentiment(self, sentence):
        sentence_transformed = self.vectorizer.transform([sentence]).toarray()
        result = self.classifier.predict(sentence_transformed)[0]
        return 'Positive review' if result == 1 else 'Negative review'

class Evaluation:
    def __init__(self, y_true, y_pred):
        self.y_true = y_true
        self.y_pred = y_pred

    def calculate_metrics(self, labels=None):
        self.f1 = f1_score(self.y_true, self.y_pred, average='binary', labels=labels, zero_division=0)
        self.confusion_matrix = confusion_matrix(self.y_true, self.y_pred, labels=labels)
        self.accuracy = accuracy_score(self.y_true, self.y_pred)
        
    def print_metrics(self):
        print("F1 Score:", self.f1)
        print("Confusion Matrix:\n", self.confusion_matrix)
        print("Accuracy:", self.accuracy)

def model_train(path):
    try:
        # Load dataset
        df = pd.read_csv(path)
        print(f"Dataset loaded: {len(df)} samples")
        print("Raw dataset head:\n", df.head())
        print("Raw label distribution in dataset:")
        print(df['sentiment'].value_counts())

        # Verify sentiment column values
        print("Unique raw sentiment values:", df['sentiment'].unique())

        # Encode sentiment labels ('positive'/'negative' to 1/0)
        label_encoder = LabelEncoder()
        df['sentiment_encoded'] = label_encoder.fit_transform(df['sentiment'])
        print("Encoded labels mapping:", dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))
        print("Label distribution after encoding:")
        print(df['sentiment_encoded'].value_counts())

        # Data preparation (assuming DataPreprocessor is defined elsewhere)
        preprocessor = DataPreprocessor()
        corpus = preprocessor.preprocess(df)
        print(f"Corpus preprocessed: {len(corpus)} samples")

        # Model NLP fitting
        text_model = TextModel()
        X = text_model.vectorizer.fit_transform(corpus).toarray()
        y = df['sentiment_encoded']  # Use the encoded column
        print(f"Vectorized data: {X.shape}")
        print("Label distribution for training:")
        print(pd.Series(y).value_counts())

        # Split data into training and test sets with stratification
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101, stratify=y)
        print(f"Train set: {X_train.shape}, Test set: {X_test.shape}")
        print("Train label distribution:")
        print(pd.Series(y_train).value_counts())
        print("Test label distribution:")
        print(pd.Series(y_test).value_counts())

        # Train model
        text_model.train(X_train, y_train)

        # Save model and vectorizer
        import os
        os.makedirs("saved_model", exist_ok=True)  # Ensure directory exists
        text_model.save("saved_model/count-Vectorizer.pkl", "saved_model/Classification.pkl")

        return X_train, X_test, y_train, y_test, text_model, label_encoder

    except Exception as e:
        print(f"Error in model_train: {e}")
        return None, None, None, None, None, None
    
def model_predict(X_test, loaded_vector, loaded_model, label_encoder):
    try:
        # X_test is already vectorized, so use it directly for prediction
        print(f"Predicting on test data: {X_test.shape}")
        y_pred = loaded_model.predict(X_test)
        print(f"Predictions made: {y_pred.shape}")
        print("Prediction label distribution:")
        print(pd.Series(y_pred).value_counts())

        # Decode predictions back to original labels ('positive'/'negative')
        y_pred_labels = label_encoder.inverse_transform(y_pred)

        return y_pred, y_pred_labels

    except Exception as e:
        print(f"Error in model_predict: {e}")
        return None, None

# Download NLTK stopwords
try:
    nltk.download('stopwords', quiet=True)
    nltk.download('punkt', quiet=True)  # Added for tokenization
except Exception as e:
    print(f"Error downloading NLTK resources: {e}")

print("NLTK resources downloaded successfully.")

# Path to the IMDB dataset
path = 'datasets/IMDB_Dataset.csv'

# Train the model and get test data
X_train, X_test, y_train, y_test, text_model, label_encoder = model_train(path)
print("Model training completed.")

if X_train is not None:  
    # Load the saved model and vectorizer for prediction
    loaded_model, loaded_vector = text_model.load("saved_model/count-Vectorizer.pkl", "saved_model/Classification.pkl")
    print("Model and vectorizer loaded successfully.")
    
    # Predict on test data
    y_pred, y_pred_labels = model_predict(X_test, loaded_vector, loaded_model, label_encoder)
    print("Prediction completed.")
    
    if y_pred is not None:
        # Evaluate predictions
        eval = Evaluation(y_test, y_pred)
        eval.calculate_metrics(labels=[0, 1])  # Explicitly include both classes
        eval.print_metrics()
        print("Evaluation completed.")
        
        # Print sample predictions
        print("\nSample predictions:")
        for i in range(5):
            print(f"Sample {i+1}: Predicted = {y_pred_labels[i]}, True = {label_encoder.inverse_transform([y_test.iloc[i]])[0]}")
    else:
        print("Prediction failed, cannot proceed with evaluation.")
else:
    print("Training failed, cannot proceed with prediction and evaluation.")

NLTK resources downloaded successfully.
Dataset loaded: 50000 samples
Raw dataset head:
                                               review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
Raw label distribution in dataset:
sentiment
positive    25000
negative    25000
Name: count, dtype: int64
Unique raw sentiment values: ['positive' 'negative']
Encoded labels mapping: {'negative': 0, 'positive': 1}
Label distribution after encoding:
sentiment_encoded
1    25000
0    25000
Name: count, dtype: int64
Corpus preprocessed: 50000 samples
Vectorized data: (50000, 5000)
Label distribution for training:
sentiment_encoded
1    25000
0    25000
Name: count, dtype: int64
Train set: (40000, 5000), Test set: (10000, 5000)
Tr

In [26]:
report_dict = {
    "confusion_matrix": eval.confusion_matrix.tolist(), 
    "f1_score": eval.f1,
    "accuracy": eval.accuracy
}
print("Report dictionary:", report_dict)

Report dictionary: {'confusion_matrix': [[4195, 805], [766, 4234]], 'f1_score': 0.843510309791812, 'accuracy': 0.8429}


#### Experiment Tracking

In [34]:
import mlflow
mlflow.set_experiment("Sentiment Analysis Experiment")
mlflow.set_tracking_uri(uri="http://127.0.0.1:5000/")

with mlflow.start_run():
    mlflow.log_param("max_features", 10000)
    mlflow.log_param("ngram_range", "1-2")
    mlflow.log_param("classifier", "MultinomialNB")
    mlflow.log_metrics({
        'accuracy': report_dict['accuracy'],
        'f1_score': report_dict['f1_score']
    })
    with open("confusion_matrix.txt", "w") as f:
        f.write(str(report_dict['confusion_matrix']))
    mlflow.log_artifact("confusion_matrix.txt")
    mlflow.sklearn.log_model(text_model, "MultinomialNB") 
    # text_model.save(mlflow.active_run().info.run_id)



🏃 View run chill-sponge-452 at: http://127.0.0.1:5000/#/experiments/1/runs/bf7d66f01b2943cb9fb2b8d7fc1d8cac
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


### Model Registry

In [36]:
model_name = 'MultinomialNB'
run_id='bf7d66f01b2943cb9fb2b8d7fc1d8cac'
model_uri = f'runs:/{run_id}/{model_name}'

with mlflow.start_run(run_id=run_id):
    mlflow.register_model(model_uri=model_uri, name=model_name)

Registered model 'MultinomialNB' already exists. Creating a new version of this model...
2025/06/27 15:26:45 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: MultinomialNB, version 2


🏃 View run chill-sponge-452 at: http://127.0.0.1:5000/#/experiments/1/runs/bf7d66f01b2943cb9fb2b8d7fc1d8cac
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/1


Created version '2' of model 'MultinomialNB'.


### Load Model

In [37]:
model_version = 1
model_uri = f"models:/{model_name}/{model_version}"

loaded_model = mlflow.sklearn.load_model(model_uri)
y_pred, y_pred_labels = model_predict(X_test, loaded_vector, loaded_model, label_encoder) #selon my use case
y_pred[:4]

Predicting on test data: (10000, 5000)
Predictions made: (10000,)
Prediction label distribution:
1    5039
0    4961
Name: count, dtype: int64


array([0, 0, 0, 1])

### Transition the Model to Production

In [38]:
current_model_uri = f"models:/{model_name}@challenger"
production_model_name = "anomaly-detection-prod"

client = mlflow.MlflowClient()
client.copy_model_version(src_model_uri=current_model_uri, dst_name=production_model_name)

Successfully registered model 'anomaly-detection-prod'.
Copied version '1' of model 'MultinomialNB' to version '1' of model 'anomaly-detection-prod'.


<ModelVersion: aliases=[], creation_timestamp=1751034712421, current_stage='None', deployment_job_state=<ModelVersionDeploymentJobState: current_task_name='', job_id='', job_state='DEPLOYMENT_JOB_CONNECTION_STATE_UNSPECIFIED', run_id='', run_state='DEPLOYMENT_JOB_RUN_STATE_UNSPECIFIED'>, description='', last_updated_timestamp=1751034712421, metrics=None, model_id=None, name='anomaly-detection-prod', params=None, run_id='bf7d66f01b2943cb9fb2b8d7fc1d8cac', run_link='', source='models:/MultinomialNB/1', status='READY', status_message=None, tags={}, user_id='', version='1'>

=> In docker container, we will be using this model using this code, see the documentation for more details.
https://mlflow.org/docs/latest/ml/model-registry#model-registry-workflows

In [39]:
model_version = 1
prod_model_uri = f"models:/{production_model_name}@champion"

loaded_model = mlflow.sklearn.load_model(model_uri)
y_pred, y_pred_labels = model_predict(X_test, loaded_vector, loaded_model, label_encoder) #selon my use case
y_pred[:4]

Predicting on test data: (10000, 5000)
Predictions made: (10000,)
Prediction label distribution:
1    5039
0    4961
Name: count, dtype: int64


array([0, 0, 0, 1])

## With MLFlow Registry

In [1]:
import pandas as pd
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from ClassificationModel.data_cleaning_training import DataPreprocessor

In [3]:
class TextModel:
    def __init__(self, max_features=5000):  # Increased max_features for better feature representation
        self.vectorizer = TfidfVectorizer(max_features=max_features)
        self.classifier = MultinomialNB()
        
    def train(self, X_train, y_train):
        self.classifier.fit(X_train, y_train)
        
    def predict(self, X_test):
        return self.classifier.predict(X_test)

class Evaluation:
    def __init__(self, y_true, y_pred):
        self.y_true = y_true
        self.y_pred = y_pred

    def calculate_metrics(self, labels=None):
        self.f1 = f1_score(self.y_true, self.y_pred, average='binary', labels=labels, zero_division=0)
        self.confusion_matrix = confusion_matrix(self.y_true, self.y_pred, labels=labels)
        self.accuracy = accuracy_score(self.y_true, self.y_pred)
        
    def print_metrics(self):
        print("F1 Score:", self.f1)
        print("Confusion Matrix:\n", self.confusion_matrix)
        print("Accuracy:", self.accuracy)

def get_processed_data(path):
    # Load dataset
        df = pd.read_csv(path)
        print(f"Dataset loaded: {len(df)} samples")
        print("Raw dataset head:\n", df.head())
        print("Raw label distribution in dataset:")
        print(df['sentiment'].value_counts())

        # Verify sentiment column values
        print("Unique raw sentiment values:", df['sentiment'].unique())

        # Encode sentiment labels ('positive'/'negative' to 1/0)
        label_encoder = LabelEncoder()
        df['sentiment_encoded'] = label_encoder.fit_transform(df['sentiment'])
        print("Encoded labels mapping:", dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))))
        print("Label distribution after encoding:")
        print(df['sentiment_encoded'].value_counts())

        # Data preparation (assuming DataPreprocessor is defined elsewhere)
        preprocessor = DataPreprocessor()
        corpus = preprocessor.preprocess(df)
        print(f"Corpus preprocessed: {len(corpus)} samples")
        return df, corpus, label_encoder

def model_train(path):
    try:
        df, corpus, label_encoder = get_processed_data(path)
   
        # Model NLP fitting
        text_model = TextModel()
        X = text_model.vectorizer.fit_transform(corpus).toarray()
        y = df['sentiment_encoded'] 
        print(f"Vectorized data: {X.shape}")
        print("Label distribution for training:")
        print(pd.Series(y).value_counts())

        # Split data into training and test sets with stratification
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101, stratify=y)
        print(f"Train set: {X_train.shape}, Test set: {X_test.shape}")
        print("Train label distribution:")
        print(pd.Series(y_train).value_counts())
        print("Test label distribution:")
        print(pd.Series(y_test).value_counts())

        # Train model
        text_model.train(X_train, y_train)

        # Save model and vectorizer
        # import os
        # os.makedirs("saved_model", exist_ok=True)  # Ensure directory exists
        # text_model.save("saved_model/count-Vectorizer.pkl", "saved_model/Classification.pkl")

        return X_train, X_test, y_train, y_test, text_model, label_encoder

    except Exception as e:
        print(f"Error in model_train: {e}")
        return None, None, None, None, None, None
    
def model_predict(X_test, loaded_model, label_encoder):
    try:
        # X_test is already vectorized, so use it directly for prediction
        print(f"Predicting on test data: {X_test.shape}")
        y_pred = loaded_model.predict(X_test)
        print(f"Predictions made: {y_pred.shape}")
        print("Prediction label distribution:")
        print(pd.Series(y_pred).value_counts())

        # Decode predictions back to original labels ('positive'/'negative')
        y_pred_labels = label_encoder.inverse_transform(y_pred)

        return y_pred, y_pred_labels

    except Exception as e:
        print(f"Error in model_predict: {e}")
        return None, None


### Step 1: Train the Model

In [4]:
# Download NLTK stopwords
try:
    nltk.download('stopwords', quiet=True)
    nltk.download('punkt', quiet=True)  # Added for tokenization
except Exception as e:
    print(f"Error downloading NLTK resources: {e}")

print("NLTK resources downloaded successfully.")

NLTK resources downloaded successfully.


In [5]:
# Path to the IMDB dataset
path = 'datasets/IMDB_Dataset.csv'

# Train the model and get test data
X_train, X_test, y_train, y_test, text_model, label_encoder = model_train(path)
print("Model training completed.")

Dataset loaded: 50000 samples
Raw dataset head:
                                               review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive
Raw label distribution in dataset:
sentiment
positive    25000
negative    25000
Name: count, dtype: int64
Unique raw sentiment values: ['positive' 'negative']
Encoded labels mapping: {'negative': 0, 'positive': 1}
Label distribution after encoding:
sentiment_encoded
1    25000
0    25000
Name: count, dtype: int64
Corpus preprocessed: 50000 samples
Vectorized data: (50000, 5000)
Label distribution for training:
sentiment_encoded
1    25000
0    25000
Name: count, dtype: int64
Train set: (40000, 5000), Test set: (10000, 5000)
Train label distribution:
sentiment_encode

### Step 2: Track the Model

In [15]:
import mlflow

# mlflow.set_tracking_uri(uri="http://127.0.0.1:5000/")
import dagshub
dagshub.init(repo_owner='cyrineanene', repo_name='Sentiment_Analysis', mlflow=True)

experiment_name = 'sentiment_analysis'
try:
    experiment = mlflow.get_experiment_by_name(experiment_name)
    if experiment is None:
        experiment_id = mlflow.create_experiment(experiment_name)
    else:
        experiment_id = experiment.experiment_id
except Exception as e:
    print(f"Error setting up experiment: {e}")
    experiment_id = mlflow.create_experiment(experiment_name)
mlflow.set_experiment(experiment_name)
mlflow.end_run()
with mlflow.start_run(experiment_id=experiment_id) as run:
    mlflow.log_param("max_features", 10000)
    mlflow.log_param("ngram_range", "1-2")
    mlflow.log_param("classifier", "MultinomialNB")
    # mlflow.log_metrics({
    #     'accuracy': report_dict['accuracy'],
    #     'f1_score': report_dict['f1_score']
    # })
    # with open("confusion_matrix.txt", "w") as f:
    #     f.write(str(report_dict['confusion_matrix']))
    # mlflow.log_artifact("confusion_matrix.txt")
    mlflow.sklearn.log_model(text_model, "MultinomialNB") 



🏃 View run invincible-flea-315 at: http://127.0.0.1:5000/#/experiments/4/runs/0bdd1de953404cada1c80e47b12f956a
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/4




🏃 View run indecisive-dove-177 at: http://127.0.0.1:5000/#/experiments/4/runs/f75edca7faad4ecc88b1bf9e041da0cc
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/4


### Step 3: Register the Model

In [16]:
model_name = 'MultinomialNB'
run_id = run.info.run_id
model_uri = f'runs:/{run_id}/{model_name}'

with mlflow.start_run(run_id=run_id):
    mlflow.register_model(model_uri=model_uri, name=model_name)

Registered model 'MultinomialNB' already exists. Creating a new version of this model...
2025/06/27 21:41:25 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: MultinomialNB, version 6


🏃 View run indecisive-dove-177 at: http://127.0.0.1:5000/#/experiments/4/runs/f75edca7faad4ecc88b1bf9e041da0cc
🧪 View experiment at: http://127.0.0.1:5000/#/experiments/4


Created version '6' of model 'MultinomialNB'.


### Step 4: Load, Predict, Evaluate and log metrics to MLflow

In [17]:
#PREDICT
model_uri = f"models:/{model_name}@challenger"

loaded_model = mlflow.sklearn.load_model(model_uri)

y_pred, y_pred_labels = model_predict(X_test, loaded_model, label_encoder)
print("Prediction completed.")

Predicting on test data: (10000, 5000)
Predictions made: (10000,)
Prediction label distribution:
1    5039
0    4961
Name: count, dtype: int64
Prediction completed.


In [18]:
#EVALUATE
if y_pred is not None:
        # Evaluate predictions
        eval = Evaluation(y_test, y_pred)
        eval.calculate_metrics(labels=[0, 1])  # Explicitly include both classes
        eval.print_metrics()
        print("Evaluation completed.")
        
        # Print sample predictions
        print("\nSample predictions:")
        for i in range(5):
            print(f"Sample {i+1}: Predicted = {y_pred_labels[i]}, True = {label_encoder.inverse_transform([y_test.iloc[i]])[0]}")
else:
        print("Prediction failed, cannot proceed with evaluation.")

report_dict = {
    "confusion_matrix": eval.confusion_matrix.tolist(), 
    "f1_score": eval.f1,
    "accuracy": eval.accuracy
}
print("Report dictionary:", report_dict)

mlflow.log_metrics({
        'accuracy': report_dict['accuracy'],
        'f1_score': report_dict['f1_score']
    })
with open("confusion_matrix.txt", "w") as f:
        f.write(str(report_dict['confusion_matrix']))
mlflow.log_artifact("confusion_matrix.txt")

F1 Score: 0.843510309791812
Confusion Matrix:
 [[4195  805]
 [ 766 4234]]
Accuracy: 0.8429
Evaluation completed.

Sample predictions:
Sample 1: Predicted = negative, True = positive
Sample 2: Predicted = negative, True = negative
Sample 3: Predicted = negative, True = positive
Sample 4: Predicted = positive, True = positive
Sample 5: Predicted = positive, True = positive
Report dictionary: {'confusion_matrix': [[4195, 805], [766, 4234]], 'f1_score': 0.843510309791812, 'accuracy': 0.8429}


## With dagshub

It is used to make the uri public to enable my CI/CD Pipeline

In [None]:
import dagshub

dagshub.init(repo_owner='cyrineanene', repo_name='mlflow_dagshub_sentiment_analysis', mlflow=True)