In [1]:
# Adapted from https://www.kaggle.com/code/ruchitass/predicting-stress-a-machine-learning-approach

In [1]:
!python --version

Python 3.9.19


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import re

# Download the stopwords resource
nltk.download('stopwords')

# English Snowball stemmer; removal() uses it to stem every remaining token.
stemmer = nltk.SnowballStemmer("english")

from wordcloud import STOPWORDS
from PIL import Image
from wordcloud import WordCloud

# from textblob import TextBlob
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


# import nltk
# import re
# stemmer = nltk.SnowballStemmer("english")
from nltk.corpus import stopwords
# NOTE: this rebinds the name `stopwords` from the object imported on the previous line
# to the list of English stopwords; removal() relies on the rebound value.
stopwords = stopwords.words('english')
# import string

import matplotlib.pyplot as plt
import kaggle


from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# import warnings
# warnings.filterwarnings("ignore")

import string
# Collect every character of string.punctuation as a list of one-character strings.
punct = list(string.punctuation)

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
import warnings

# Suppress all warnings
# NOTE(review): this blanket filter silences every warning category for the rest of the
# session, which can also hide genuine problems in later cells — consider narrowing it
# to the specific categories/modules that are known to be noisy.
warnings.filterwarnings('ignore')

In [5]:
# import xgboost

In [6]:
import mlflow
import os

#os.environ["AWS_PROFILE"] = "dara" # fill in with your AWS profile. More info: https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/setup.html#setup-credentials

# Public DNS of the EC2 instance that hosts the MLflow tracking server.
# The TRACKING_SERVER_HOST environment variable overrides the default below, so the
# notebook can be pointed at a different server without editing this cell.
TRACKING_SERVER_HOST = os.environ.get(
    "TRACKING_SERVER_HOST",
    "ec2-16-171-53-183.eu-north-1.compute.amazonaws.com",
)
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")

In [7]:
mlflow.__version__

'2.12.2'

In [8]:
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

tracking URI: 'http://ec2-16-171-53-183.eu-north-1.compute.amazonaws.com:5000'


In [9]:
# Read the Dreaddit train and test CSV files (paths are relative to the notebook).
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../../data/dreaddit-train.csv",
        "../../data/dreaddit-test.csv",
    )
)

In [10]:
# Numeric columns that are standardised and appended to the bag-of-words features.
numerical_columns = [
    "lex_liwc_Tone",
    "lex_liwc_i",
    "lex_liwc_negemo",
    "lex_liwc_Clout",
    "sentiment",
]

In [11]:
numerical_columns 

['lex_liwc_Tone',
 'lex_liwc_i',
 'lex_liwc_negemo',
 'lex_liwc_Clout',
 'sentiment']

In [12]:
# Standardization
# Single module-level StandardScaler instance used by process_numerical_features().
scaler = StandardScaler()

In [13]:
def process_numerical_features(df, fit=True):
    """Standardise the numeric feature columns of ``df``.

    Selects the columns listed in the module-level ``numerical_columns`` and scales
    them with the module-level ``scaler``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing every column named in ``numerical_columns``.
    fit : bool, default True
        When True (the original behaviour) the scaler is re-fitted on ``df`` before
        transforming. Pass False to reuse the statistics from a previous fit, e.g.
        when featurising held-out data, so that it is scaled with the training
        statistics instead of its own.

    Returns
    -------
    numpy.ndarray
        The scaled values, one column per entry of ``numerical_columns``.
    """
    X_numerical = df[numerical_columns]
    if fit:
        return scaler.fit_transform(X_numerical)
    return scaler.transform(X_numerical)
    

In [14]:
def removal(text):
    """Normalise one post for bag-of-words vectorisation.

    Steps, in order: cast to ``str`` and lower-case; strip URLs; strip ``<...>``
    tag-like spans; strip every word that contains a digit; drop tokens found in the
    module-level ``stopwords``; stem the remaining tokens with the module-level
    ``stemmer``.

    Parameters
    ----------
    text : object
        The raw post; any value is accepted because it is passed through ``str()``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces. Tokens are obtained with
        ``split(' ')``, so empty tokens produced by consecutive spaces are kept.
    """
    text = str(text).lower()
    # Raw strings: '\S', '\d' and '\w' are invalid escapes in a normal string literal.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    kept_tokens = [w for w in text.split(' ') if w not in stopwords]
    # Stemming the kept tokens directly is equivalent to the former join/split round
    # trip, because no token produced by split(' ') can itself contain a space.
    return " ".join(stemmer.stem(word) for word in kept_tokens)

In [15]:

# NOTE(review): this unfitted vectorizer is not used by the prepare_features(train) call
# below (which passes no vectorizer) and the name is rebound by the later
# `X_text_features, vect = process_categorical_features(train)` cell.
vect=CountVectorizer(stop_words="english")

In [16]:
def process_categorical_features(df, vect=None):
    """Turn the ``text`` column of ``df`` into a bag-of-words count matrix.

    Every post is cleaned with ``removal`` and then vectorised.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``text`` column. It is not modified.
    vect : fitted vectorizer or None, default None
        When given, it is only used to ``transform`` the cleaned text. When None, a
        new ``CountVectorizer(stop_words="english")`` is created and fitted on it.

    Returns
    -------
    tuple
        ``(X, vect)``: the document-term matrix and the vectorizer that produced it.
    """
    # Work on the Series returned by apply(); assigning into a df[["text"]] slice
    # triggers pandas' SettingWithCopyWarning.
    cleaned_text = df["text"].apply(removal)

    # Explicit None check: the decision must not depend on the object's truthiness.
    if vect is None:
        vect = CountVectorizer(stop_words="english")
        X = vect.fit_transform(cleaned_text)
    else:
        X = vect.transform(cleaned_text)
    return X, vect
    

In [17]:
def prepare_features(df, vect=None, return_vect=False):
    """Build the combined feature matrix: bag-of-words counts plus scaled numerics.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame accepted by both ``process_categorical_features`` and
        ``process_numerical_features``.
    vect : fitted vectorizer or None, default None
        Forwarded to ``process_categorical_features``; None means a new vectorizer
        is fitted on ``df``.
    return_vect : bool, default False
        When True, also return the vectorizer that was used, so the same vocabulary
        can be applied to another frame. The default keeps the original
        single-value return.

    Returns
    -------
    numpy.ndarray or tuple
        The dense combined matrix (text columns first, numeric columns last), or
        ``(matrix, vect)`` when ``return_vect`` is True.
    """
    X_categorical, vect = process_categorical_features(df, vect)
    X_numerical = process_numerical_features(df)
    # The sparse count matrix is densified so it can be stacked with the numeric array.
    X_combined = np.hstack((X_categorical.toarray(), X_numerical))

    if return_vect:
        return X_combined, vect
    return X_combined
    
    

In [18]:
# train_numerical = train[numerical_columns] 

In [19]:
# Combined feature matrix (bag-of-words counts + scaled numeric columns) for `train`.
X_combined = prepare_features(train)

In [20]:
X_combined.shape;

In [21]:
# Text-only features. This repeats the text cleaning and fits another vectorizer on
# `train`; the returned vectorizer rebinds the module-level name `vect`.
X_text_features, vect = process_categorical_features(train)
X_text_features.toarray().shape

(2838, 9448)

In [22]:
# Target vector: the `label` column of the training frame.
y = train["label"]

In [23]:
# Hold out part of the combined feature matrix for evaluation; random_state=43 fixes the shuffle.
x_train_combined,x_test_combined,y_train,y_test=train_test_split(X_combined,y,random_state=43)

In [24]:
X_combined.shape

(2838, 9453)

In [25]:
# X_text_features

In [26]:
# Split the text-only matrix with the same `y` and random_state=43 as the combined split.
# This rebinds y_train / y_test.
# NOTE(review): presumably both splits select the same rows, which the shared
# y_train / y_test in the training loop depends on — verify.
x_train_text,x_test_text,y_train,y_test=train_test_split(X_text_features.toarray(),y,random_state=43)

In [27]:
x_train_combined;

In [28]:
# Select the MLflow experiment that the runs started below are recorded under.
experiment_name = "project-experiment-tracking"
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='s3://mlflows-artifacts-remote/1', creation_time=1720770609928, experiment_id='1', last_update_time=1720770609928, lifecycle_stage='active', name='project-experiment-tracking', tags={}>

In [29]:
# Look the experiment up again by name to get a handle on its metadata.
experiment = mlflow.get_experiment_by_name(experiment_name)

In [30]:
# NOTE(review): experiment_id is only displayed; no later visible cell reads it.
experiment_id = experiment.experiment_id

In [31]:
experiment_id

    
    

'1'

In [32]:
# Define your models and datasets
models_to_train = ["logistic_reg","decision_tree","random_forest","xgboost"]

tags = ["combined", "text_only"]

x_data = {
    "train": {
        'combined': x_train_combined,
        'text_only': x_train_text
    },
    "test": {
        'combined': x_test_combined,
        'text_only': x_test_text
    }
}

# Zero-argument factories rather than instances, so every run trains a freshly
# constructed estimator. The decision tree is seeded like the other tree-based
# models so that re-running the cell gives the same result.
model_factories = {
    "logistic_reg": lambda: LogisticRegression(max_iter=1000),
    "decision_tree": lambda: DecisionTreeClassifier(random_state=42),
    "random_forest": lambda: RandomForestClassifier(n_estimators=100, random_state=42),
    "xgboost": lambda: XGBClassifier(n_estimators=100, random_state=42),
}

# Start a new MLflow run for each (model, feature set) pair
for model_name in models_to_train:
    # Fail before opening a run so that an unknown name does not leave a failed run behind.
    if model_name not in model_factories:
        raise ValueError(f"Unsupported model: {model_name}")

    for tag in tags:
        # The context manager ends the run when the block exits, so no explicit
        # mlflow.end_run() call is needed inside it.
        with mlflow.start_run(run_name=f"{model_name}_{tag}"):
            # Initialize the model
            model = model_factories[model_name]()

            train_data = x_data["train"][tag]
            test_data = x_data["test"][tag]

            # Train the model
            model.fit(train_data, y_train)
            # Predict on the test set
            y_pred = model.predict(test_data)

            # Calculate metrics (accuracy is scaled to a percentage).
            # NOTE(review): the saved output of this cell shows fractions (e.g. 0.70...),
            # so it appears to predate the `* 100`; re-run the cell to refresh it.
            accuracy = accuracy_score(y_test, y_pred) * 100

            # Log parameters, the fitted model and the metric
            mlflow.log_params(model.get_params())
            mlflow.sklearn.log_model(model, "model")
            mlflow.log_metric(f"accuracy_{tag}", accuracy)

            # Print model evaluation
            print(f"Model: {model_name}, Tag: {tag} Accuracy: {accuracy}")
            




Model: logistic_reg, Tag: combined Accuracy: 0.7098591549295775
Model: logistic_reg, Tag: text_only Accuracy: 0.6746478873239437
Model: decision_tree, Tag: combined Accuracy: 0.6605633802816901
Model: decision_tree, Tag: text_only Accuracy: 0.6225352112676056
Model: random_forest, Tag: combined Accuracy: 0.7450704225352113
Model: random_forest, Tag: text_only Accuracy: 0.6605633802816901
Model: xgboost, Tag: combined Accuracy: 0.7281690140845071
Model: xgboost, Tag: text_only Accuracy: 0.6774647887323944
