In [1]:
# Adapted from https://www.kaggle.com/code/ruchitass/predicting-stress-a-machine-learning-approach

In [2]:
!python --version

Python 3.9.19


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import re

# Download the stopwords resource
nltk.download('stopwords')

stemmer = nltk.SnowballStemmer("english")

from wordcloud import STOPWORDS
from PIL import Image
from wordcloud import WordCloud

# from textblob import TextBlob
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier


# import nltk
# import re
# stemmer = nltk.SnowballStemmer("english")
from nltk.corpus import stopwords
stopwords = stopwords.words('english')
# import string

import matplotlib.pyplot as plt
import kaggle
import pickle


from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# import warnings
# warnings.filterwarnings("ignore")

import string
# List of every ASCII punctuation character, one single-character string each.
# Built directly from the string so no loop variable leaks into the notebook namespace.
punct = list(string.punctuation)

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
print(nltk.__version__)

3.8.1


In [5]:
import sklearn
print(sklearn.__version__)

1.4.2


In [6]:
import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')

In [8]:
import mlflow
from mlflow.tracking import MlflowClient
import os

#os.environ["AWS_PROFILE"] = "dara" # fill in with your AWS profile. More info: https://docs.aws.amazon.com/sdk-for-java/latest/developer-guide/setup.html#setup-credentials

# Public DNS of the EC2 instance that hosts the MLflow tracking server.
# Read from the TRACKING_SERVER_HOST environment variable when set (same
# pattern as RUN_ID) so the notebook does not need editing when the host
# changes; falls back to the previously hard-coded value.
TRACKING_SERVER_HOST = os.getenv(
    "TRACKING_SERVER_HOST",
    "ec2-13-53-192-42.eu-north-1.compute.amazonaws.com",
)
mlflow.set_tracking_uri(f"http://{TRACKING_SERVER_HOST}:5000")

In [9]:
EXPERIMENT_NAME = "random-forest-best-models"
mlflow.set_experiment(EXPERIMENT_NAME)

<Experiment: artifact_location='s3://mlflows-artifacts-remote/3', creation_time=1720815677834, experiment_id='3', last_update_time=1720815677834, lifecycle_stage='active', name='random-forest-best-models', tags={}>

In [10]:
mlflow.__version__

'2.12.2'

In [11]:
print(f"tracking URI: '{mlflow.get_tracking_uri()}'")

tracking URI: 'http://ec2-13-53-192-42.eu-north-1.compute.amazonaws.com:5000'


In [13]:
run_id = os.getenv("RUN_ID", "57342ae687254eeeac28602bb8d42aca")
# run_id = "57342ae687254eeeac28602bb8d42aca"

In [14]:
input_file = "dreaddit-test.csv"
output_file = "output/stress_predictions.parquet"

In [16]:
# Create the output folder if needed; exist_ok makes the cell safe to re-run
# and avoids the check-then-create race of a separate os.path.exists() test.
os.makedirs("output", exist_ok=True)

In [17]:
def read_dataframe(filename, data_dir="../../data"):
    """Load a CSV file from the data directory into a DataFrame.

    Args:
        filename: Name of the CSV file to read, e.g. ``"dreaddit-test.csv"``.
        data_dir: Directory that contains ``filename``. Defaults to the
            notebook-relative ``../../data`` folder, so existing callers
            that pass only ``filename`` read from the same location as before.

    Returns:
        pandas.DataFrame: The parsed contents of the CSV file.
    """
    return pd.read_csv(os.path.join(data_dir, filename))
    
    

In [18]:
def process_numerical_features(df, numerical_columns, scaler):
    """Scale the selected numerical columns of ``df`` with ``scaler``.

    A scaler that has already been fitted (for example one persisted from
    training) is applied with ``transform`` only, so its learned statistics
    are not overwritten by the data being scored. An unfitted scaler is
    fitted on ``df`` first, which is what happens for callers that pass a
    freshly constructed scaler.

    Args:
        df: DataFrame containing at least ``numerical_columns``.
        numerical_columns: List of column names to scale.
        scaler: Object exposing ``fit_transform`` and ``transform``
            (e.g. ``sklearn.preprocessing.StandardScaler``).

    Returns:
        The scaled values as returned by the scaler.
    """
    X_numerical = df[numerical_columns]

    # scikit-learn estimators store learned state in instance attributes with
    # a trailing underscore (mean_, scale_, ...); their presence marks a
    # fitted estimator.
    already_fitted = any(
        attr.endswith("_") and not attr.startswith("__") for attr in vars(scaler)
    )
    if already_fitted:
        return scaler.transform(X_numerical)
    return scaler.fit_transform(X_numerical)
    

In [19]:
def removal(text):
    """Normalize a piece of text for bag-of-words vectorization.

    Steps, in order: cast to ``str`` and lowercase; strip URLs
    (``http(s)://...`` and ``www....``); strip ``<...>`` tag-like spans;
    drop any token that contains a digit; remove words found in the
    module-level ``stopwords`` collection; stem each remaining word with
    the module-level ``stemmer``.

    Args:
        text: Value to clean; non-string values are converted with ``str``.

    Returns:
        str: The cleaned, stemmed words joined by single spaces.
    """
    text = str(text).lower()
    # Raw strings keep regex escapes such as \S and \d from being parsed as
    # (invalid) Python string escapes.
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'<.*?>+', '', text)
    text = re.sub(r'\w*\d\w*', '', text)

    # Set membership is O(1) per token instead of scanning the stopword list.
    stop_words = set(stopwords)
    kept_words = [word for word in text.split(' ') if word not in stop_words]
    return " ".join(stemmer.stem(word) for word in kept_words)

In [21]:
def process_categorical_features(df, vect = None):
    """Clean the ``text`` column and turn it into a bag-of-words matrix.

    Args:
        df: DataFrame with a ``text`` column. It is not modified.
        vect: Optional already-fitted vectorizer. When given, it is only
            used to ``transform`` the text. When ``None``, a new
            ``CountVectorizer(stop_words="english")`` is fitted on the
            cleaned text.

    Returns:
        tuple: ``(X, vect)`` where ``X`` is the matrix produced by the
        vectorizer and ``vect`` is the vectorizer that produced it.
    """
    # Apply the cleaner to the Series itself rather than assigning into a
    # sliced frame, which raises SettingWithCopyWarning.
    cleaned_text = df["text"].apply(removal)

    # Explicit None check: the decision should not depend on the
    # vectorizer object's truthiness.
    if vect is not None:
        return vect.transform(cleaned_text), vect

    vect = CountVectorizer(stop_words="english")
    return vect.fit_transform(cleaned_text), vect
    

In [22]:
def prepare_features(df, vect=None):
    """Assemble the dense feature matrix for the model.

    The bag-of-words columns produced from ``df["text"]`` come first,
    followed by the scaled numerical columns.

    Args:
        df: DataFrame holding the ``text`` column and the numerical columns
            listed below.
        vect: Optional vectorizer forwarded to
            ``process_categorical_features``.

    Returns:
        numpy.ndarray: Horizontally stacked text and numerical features.
    """
    numeric_cols = ["lex_liwc_Tone", "lex_liwc_i", "lex_liwc_negemo", "lex_liwc_Clout", "sentiment"]

    text_matrix, _ = process_categorical_features(df, vect)
    # NOTE(review): a brand-new StandardScaler is created on every call, so the
    # scaling statistics come from ``df`` itself rather than from training data.
    numeric_matrix = process_numerical_features(df, numeric_cols, StandardScaler())

    return np.hstack((text_matrix.toarray(), numeric_matrix))
    
    

In [24]:
def load_model_n_vect(run_id):
    """Load the logged model and its vectorizer for an MLflow run.

    Args:
        run_id: ID of the MLflow run whose ``model`` and ``vectorizer``
            artifacts should be loaded.

    Returns:
        tuple: ``(loaded_model, vect)`` — the model loaded through
        ``mlflow.pyfunc`` and the unpickled vectorizer object.
    """
    # Load model as a PyFuncModel.
    loaded_model = mlflow.pyfunc.load_model(f'runs:/{run_id}/model')

    # download_artifacts returns the local path of what it fetched; use that
    # instead of assuming where the files landed relative to the working dir.
    client = MlflowClient()
    vect_dir = client.download_artifacts(run_id=run_id, path='vectorizer', dst_path='.')

    # SECURITY: pickle.load can execute arbitrary code. Only load artifacts
    # that come from a tracking server / artifact store you trust.
    with open(os.path.join(vect_dir, "vectorizer.b"), "rb") as f_in:
        vect = pickle.load(f_in)

    return loaded_model, vect
    

In [25]:
def apply_model(input_file, run_id, output_file, expected_n_features=9453):
    """Score a CSV file with a logged model and save the results as parquet.

    Args:
        input_file: CSV file name passed to ``read_dataframe``.
        run_id: MLflow run ID that identifies the model and vectorizer; it
            is also written to the ``model_version`` column.
        output_file: Path of the parquet file to write. Its parent
            directory is created if it does not exist.
        expected_n_features: Number of feature columns the model expects.
            Defaults to 9453, the value previously hard-coded here. Pass
            ``None`` to skip the check.

    Raises:
        AssertionError: If the feature matrix width differs from
            ``expected_n_features``.
    """
    df = read_dataframe(input_file)
    model, vect = load_model_n_vect(run_id)

    X_test = prepare_features(df, vect)
    if expected_n_features is not None:
        assert X_test.shape[1] == expected_n_features, "feature size does not match"

    y_pred = model.predict(X_test)

    # Carry the raw inputs alongside the predictions so each row of the
    # output can be inspected on its own.
    input_columns = ["text", "lex_liwc_Tone", "lex_liwc_i", "lex_liwc_negemo", "lex_liwc_Clout", "sentiment"]
    df_result = df[input_columns].copy()
    df_result['actual_label'] = df['label']
    df_result['predicted_stress'] = y_pred
    # actual minus predicted: 0 means the prediction matched the label.
    df_result['diff'] = df_result['actual_label'] - df_result['predicted_stress']
    df_result['model_version'] = run_id

    # Do not rely on another cell having created the target directory.
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    df_result.to_parquet(output_file, index=False)

In [26]:
apply_model(input_file=input_file, run_id=run_id, output_file=output_file)

Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/1 [00:00<?, ?it/s]

In [27]:
df_output = pd.read_parquet(output_file)

In [28]:
df_output

Unnamed: 0,text,lex_liwc_Tone,lex_liwc_i,lex_liwc_negemo,lex_liwc_Clout,sentiment,actual_label,predicted_stress,diff,model_version
0,"Its like that, if you want or not.“ ME: I have...",5.95,5.45,1.82,57.22,0.000000,0,0,0,57342ae687254eeeac28602bb8d42aca
1,I man the front desk and my title is HR Custom...,92.40,6.94,0.00,50.00,-0.065909,0,0,0,57342ae687254eeeac28602bb8d42aca
2,We'd be saving so much money with this new hou...,16.15,6.08,2.03,75.05,-0.036818,1,0,1,57342ae687254eeeac28602bb8d42aca
3,"My ex used to shoot back with ""Do you want me ...",12.51,12.00,2.00,38.19,-0.066667,1,1,0,57342ae687254eeeac28602bb8d42aca
4,I haven’t said anything to him yet because I’m...,89.84,11.54,0.00,50.00,0.141667,0,0,0,57342ae687254eeeac28602bb8d42aca
...,...,...,...,...,...,...,...,...,...,...
710,i have horrible vivid nightmares every night. ...,1.00,9.43,13.21,22.52,-0.075000,1,1,0,57342ae687254eeeac28602bb8d42aca
711,Also I can't think about both of them without ...,77.00,2.78,2.78,55.52,0.189286,1,0,1,57342ae687254eeeac28602bb8d42aca
712,"Furthermore, I told him before we got really s...",1.00,12.68,8.45,28.65,-0.044444,1,1,0,57342ae687254eeeac28602bb8d42aca
713,Here's the link to my amazon wish list where t...,96.76,5.00,0.00,25.24,0.090000,0,0,0,57342ae687254eeeac28602bb8d42aca
