In [None]:
# Adapted from https://www.kaggle.com/code/ruchitass/predicting-stress-a-machine-learning-approach

In [1]:
!python --version

Python 3.9.19


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import re

# Download the stopwords resource
nltk.download('stopwords')

stemmer = nltk.SnowballStemmer("english")

from wordcloud import STOPWORDS
from PIL import Image
from wordcloud import WordCloud

# from textblob import TextBlob
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


# import nltk
# import re
# stemmer = nltk.SnowballStemmer("english")
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
# import string

import matplotlib.pyplot as plt
import kaggle


from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# import warnings
# warnings.filterwarnings("ignore")

import string
#Get a list of punctuations
punct = []
for char in string.punctuation:
    punct.append(char)

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')

In [4]:
# import xgboost

In [5]:
import mlflow
import os

#os.environ["AWS_PROFILE"] = "dara" # fill in with your AWS profile. More info: https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/setup.html#setup-credentials

TRACKING_SERVER_HOST = "ec2-13-48-31-55.eu-north-1.compute.amazonaws.com" # fill in with the public DNS of the EC2 instance
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")

In [6]:
mlflow.__version__

'2.12.2'

In [7]:
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

tracking URI: 'http://ec2-13-48-31-55.eu-north-1.compute.amazonaws.com:5000'


In [8]:
train = pd.read_csv("../../data/dreaddit-train.csv")
test = pd.read_csv("../../data/dreaddit-test.csv")

In [9]:
numerical_columns = ["lex_liwc_Tone", "lex_liwc_i", "lex_liwc_negemo", "lex_liwc_Clout", "sentiment"]

In [10]:
numerical_columns 

['lex_liwc_Tone',
 'lex_liwc_i',
 'lex_liwc_negemo',
 'lex_liwc_Clout',
 'sentiment']

In [11]:
# Standardization
scaler = StandardScaler()
# X_numerical = scaler.fit_transform(train_numerical)

In [12]:
def process_numerical_features(df):
    X_numerical = df[numerical_columns]
    X_numerical = scaler.fit_transform(train_numerical)
    return X_numerical
    

In [13]:
def removal(text):
    text = str(text).lower()
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [w for w in text.split(' ') if w not in stopwords]
    text=" ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text=" ".join(text)
    return text

In [14]:

vect=CountVectorizer(stop_words="english")

In [15]:
def process_categorical_features(df, vect = None):
    posts = train[["text"]]
    posts["text"] = posts["text"].apply(removal)
    # posts["sentiment"] = posts["text"].apply(mood)
    X = posts["text"]
    if vect:
        print("here")
        X = vect.transform(X)
    else:
        vect=CountVectorizer(stop_words="english")
        X=vect.fit_transform(X)
    return X, vect
    

In [16]:
def prepare_features(df, vect=None):
    X_categorical, vect = process_categorical_features(train, vect)
    X_numerical = process_numerical_features(train)
    X_combined = np.hstack((X_categorical.toarray(), X_numerical))

    return X_combined
    
    

In [17]:
train_numerical = train[numerical_columns] 

In [18]:
X_combined = prepare_features(train)

In [19]:
X_combined.shape;

In [20]:
X_text_features, vect = process_categorical_features(train)
X_text_features.toarray().shape

(2838, 9448)

In [21]:
y = train["label"]

In [22]:
x_train_combined,x_test_combined,y_train,y_test=train_test_split(X_combined,y,random_state=43)
# x_train,x_test,y_train,y_test=train_test_split(train[numerical_columns + categorical_columns],y,random_state=43)

In [23]:
X_combined.shape

(2838, 9453)

In [24]:
# X_text_features

In [25]:
x_train_text,x_test_text,y_train,y_test=train_test_split(X_text_features.toarray(),y,random_state=43)

In [26]:
x_train_combined;

In [28]:
# experiment_name = "project-experiment-tracking"
# mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='s3://mlflows-artifacts-remote/1', creation_time=1720770609928, experiment_id='1', last_update_time=1720770609928, lifecycle_stage='active', name='project-experiment-tracking', tags={}>

In [29]:
# experiment = mlflow.get_experiment_by_name(experiment_name)

In [30]:
# experiment_id = experiment.experiment_id

In [28]:
# experiment_id

    
    

In [56]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from hyperopt.pyll import scope

# mlflow.set_tracking_uri("http://127.0.0.1:5000")
mlflow.set_experiment("random-forest-hyperopt")

def objective(params):
    with mlflow.start_run():
        mlflow.set_tag("developer", "Dara")

        # Log hyperparameters
        for param_name, param_value in params.items():
            mlflow.log_param(param_name, param_value)

        rf = RandomForestClassifier(**params)
        rf.fit(x_train_combined, y_train)
        y_pred = rf.predict(x_test_combined)
        accuracy = accuracy_score(y_test, y_pred)

        mlflow.log_metric("accuracy", accuracy)

    return {'loss': -accuracy, 'status': STATUS_OK}  # Negative because fmin minimizes

def run_optimization(num_trials=15):
    search_space = {
        'n_estimators': scope.int(hp.quniform('n_estimators', 10, 200, 1)),
        'min_samples_split': scope.int(hp.quniform('min_samples_split', 2, 5, 1)),
        'random_state': 42
    }


    rstate = np.random.default_rng(42)  # for reproducible results
    trials = Trials()
    best = fmin(
        fn=objective,
        space=search_space,
        algo=tpe.suggest,
        max_evals=num_trials,
        trials=trials,
        rstate=rstate
    )

    print("Best hyperparameters: ", best)

# Run optimization
run_optimization(num_trials=15)



  0%|                                                     | 0/15 [00:00<?, ?trial/s, best loss=?]




  7%|█▋                        | 1/15 [00:13<03:11, 13.67s/trial, best loss: -0.7309859154929578]




 13%|███▍                      | 2/15 [00:21<02:13, 10.25s/trial, best loss: -0.7450704225352113]




 20%|█████▏                    | 3/15 [00:28<01:42,  8.58s/trial, best loss: -0.7450704225352113]




 27%|██████▉                   | 4/15 [00:37<01:39,  9.08s/trial, best loss: -0.7450704225352113]




 33%|█████████                  | 5/15 [00:48<01:37,  9.77s/trial, best loss: -0.747887323943662]




 40%|██████████▊                | 6/15 [01:02<01:40, 11.12s/trial, best loss: -0.747887323943662]




 47%|████████████▌              | 7/15 [01:16<01:35, 11.88s/trial, best loss: -0.747887323943662]




 53%|██████████████▍            | 8/15 [01:21<01:08,  9.73s/trial, best loss: -0.747887323943662]




 60%|████████████████▏          | 9/15 [01:35<01:06, 11.04s/trial, best loss: -0.747887323943662]




 67%|█████████████████▎        | 10/15 [01:44<00:51, 10.37s/trial, best loss: -0.747887323943662]




 73%|███████████████████       | 11/15 [01:49<00:34,  8.73s/trial, best loss: -0.747887323943662]




 80%|████████████████████▊     | 12/15 [02:03<00:31, 10.40s/trial, best loss: -0.747887323943662]




 87%|██████████████████████▌   | 13/15 [02:10<00:19,  9.54s/trial, best loss: -0.747887323943662]




 93%|████████████████████████▎ | 14/15 [02:16<00:08,  8.26s/trial, best loss: -0.747887323943662]




100%|██████████████████████████| 15/15 [02:23<00:00,  9.54s/trial, best loss: -0.747887323943662]
Best hyperparameters:  {'min_samples_split': 2.0, 'n_estimators': 106.0}


In [57]:
from mlflow.entities import ViewType
from mlflow.tracking import MlflowClient


HPO_EXPERIMENT_NAME = "random-forest-hyperopt"
EXPERIMENT_NAME = "random-forest-best-models"
RF_PARAMS = ['n_estimators', 'min_samples_split', 'random_state']

mlflow.set_experiment(EXPERIMENT_NAME)
mlflow.sklearn.autolog()

def train_and_log_model(params):

    with mlflow.start_run():
        for param in RF_PARAMS:
            params[param] = int(params[param])
        print(params)
        rf = RandomForestClassifier(**params)
        rf.fit(x_train_combined, y_train)

        # Evaluate model on the validation and test sets
        accuracy = accuracy_score(y_test, rf.predict(x_test_combined))
        mlflow.log_metric("accuracy", accuracy)


def run_register_model(top_n: int):

    client = MlflowClient()

    # Retrieve the top_n model runs and log the models
    experiment = client.get_experiment_by_name(HPO_EXPERIMENT_NAME)
    runs = client.search_runs(
        experiment_ids=experiment.experiment_id,
        run_view_type=ViewType.ACTIVE_ONLY,
        max_results=top_n,
        order_by=["metrics.accuracy DESC"]
    )
    for run in runs:
        train_and_log_model(params=run.data.params)

    # Select the model with the lowest test RMSE
    experiment = client.get_experiment_by_name(EXPERIMENT_NAME)
    # best_run = client.search_runs(experiment_ids="1")[0]
    best_run = client.search_runs(experiment_ids=[experiment.experiment_id], max_results=top_n, order_by=["metrics.test_rmse ASC"])
    # best_run = client.search_runs(experiment)[0]
    # print(best_run)

    # Register the best model
    run_id = best_run[0].info.run_id
    model_uri = f'runs:/{run_id}/model'
    print(run_id)
    mlflow.register_model(model_uri=model_uri, name="stress_predictor")



In [55]:
run_register_model(2)



{'bootstrap': 'True', 'ccp_alpha': '0.0', 'class_weight': 'None', 'criterion': 'gini', 'max_depth': 'None', 'max_features': 'sqrt', 'max_leaf_nodes': 'None', 'max_samples': 'None', 'min_impurity_decrease': '0.0', 'min_samples_leaf': '1', 'min_samples_split': 2, 'min_weight_fraction_leaf': '0.0', 'monotonic_cst': 'None', 'n_estimators': 106, 'n_jobs': 'None', 'oob_score': 'False', 'random_state': 42, 'verbose': '0', 'warm_start': 'False'}


InvalidParameterError: The 'bootstrap' parameter of RandomForestClassifier must be an instance of 'bool' or an instance of 'numpy.bool_'. Got 'True' instead.