# Stress Analysis in Social Media

This notebook uses the newly published, labelled Reddit dataset for stress analysis (the `dreaddit` train/test files) to develop and improve supervised learning methods for identifying stress, both neural and traditional, and to analyze the complexity and diversity of the data and the characteristics of each category.

In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from gensim.models import Word2Vec

In [2]:
import sys
# The helper modules imported below are not installed packages; make the
# parent directory importable so they can be resolved from this notebook.
sys.path.append('../')
from data_preprocess import Posts
from word_embedding_vectorizer import WordEmbeddingVectorizer

## Prepare Dataset

In [3]:
# Directory holding the dataset CSVs, relative to this notebook.
# When running on Colab, use '/content/Insight_Stress_Analysis/data/' instead.
path = '../../data/'

# Read the train and test splits with identical parser settings.
train, test = (
    pd.read_csv(path + filename, encoding="ISO-8859-1")
    for filename in ('dreaddit-train.csv', 'dreaddit-test.csv')
)

In [4]:
# Wrap the raw `text` column of each split in the project's Posts helper,
# which exposes the preprocessing step used in the next section.
train_text, test_text = Posts(train.text), Posts(test.text)

## Create Model for Production

In [None]:
# Build the preprocessed text directly from the raw columns instead of
# overwriting `train_text` / `test_text` with their own `.preprocess()` result.
# The self-referential form only works the first time the cell runs: on a
# re-run it would call `.preprocess()` on the already-processed output.
# Constructing from `train.text` / `test.text` keeps the cell idempotent while
# leaving the same names bound for the cells below.
train_text = Posts(train.text).preprocess()
test_text = Posts(test.text).preprocess()

  from pandas import Panel
100%|██████████| 2838/2838 [00:00<00:00, 32353.68it/s]
  0%|          | 6/2838 [00:00<00:48, 58.07it/s]

start preprocessing data...


 45%|████▌     | 1289/2838 [00:14<00:15, 98.88it/s] 

In [None]:
# Train word embeddings on the preprocessed training posts only.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4
# renamed them to `vector_size` and `epochs` - confirm the pinned version.
word2vec = Word2Vec(
    train_text,
    size=300,      # embedding dimensionality
    window=10,     # context window
    min_count=2,   # ignore words seen fewer than 2 times
    workers=10,    # training threads
    iter=100,      # passes over the corpus
)

# Wrap the trained embeddings in the project's vectorizer.
word_vectorizer = WordEmbeddingVectorizer(word2vec)

In [None]:
# Fit the vectorizer on the training posts ONLY, then reuse that fitted state
# for both splits. Refitting on the test posts would (a) let test-set
# statistics influence the test features and (b) leave `word_vectorizer` in a
# test-fitted state, which is the object later packed into the served model.
# NOTE(review): what `fit` learns is defined in word_embedding_vectorizer.py,
# not visible here; fitting once on train is correct regardless.
word_vectorizer.fit(train_text)
X_train = word_vectorizer.transform(train_text)
X_test = word_vectorizer.transform(test_text)

In [None]:
# Target vectors: the `label` column of each split.
y_train, y_test = train.label, test.label

In [None]:
# Random forest on the word-embedding features; the fixed random_state keeps
# the forest itself reproducible between runs.
word_embedding_rf = RandomForestClassifier(n_estimators=100, random_state=0)
word_embedding_rf.fit(X_train, y_train)

# Score the held-out split and show per-class precision / recall / F1.
y_pred = word_embedding_rf.predict(X_test)
report = classification_report(y_test, y_pred)
print(report)

In [None]:
# Overall accuracy of the random forest on the test split.
test_accuracy = accuracy_score(y_test, y_pred)
print(test_accuracy)

In [None]:
# Confusion matrix for the test split (rows: true labels, columns: predictions).
test_confusion = confusion_matrix(y_test, y_pred)
print(test_confusion)

## BentoService for model serving

In [None]:
%%writefile word_embedding_model.py
import pandas as pd
import bentoml
from bentoml.artifact import PickleArtifact
from bentoml.handlers import DataframeHandler
from data_preprocess import Posts
from word_embedding_vectorizer import WordEmbeddingVectorizer
from gensim.models import Word2Vec

@bentoml.artifacts([PickleArtifact('word_vectorizer'),
                    PickleArtifact('word_embedding_rf')]) 

@bentoml.env(pip_dependencies=["pandas", "numpy", "gensim", "scikit-learn", "nltk"])

class WordEmbeddingModel(bentoml.BentoService):
        
    @bentoml.api(DataframeHandler, typ='series')
    def data_preprocess(self, series):
        preprocess_series = Posts(series).preprocess()
        input_matrix = self.artifacts.word_vectorizer.fit(preprocess_series).transform(preprocess_series)
        return input_matrix
    
    @bentoml.api(DataframeHandler, typ='series')
    def predict(self, series):
        input_matrix = self.data_preprocess(series)
        pred_labels = self.artifacts.word_embedding_rf.predict(input_matrix)
        pred_proba = self.artifacts.word_embedding_rf.predict_proba(input_matrix)
        confidence_score = [prob[1] for prob in pred_proba]
        output = pd.DataFrame({'text': series, 'confidence_score': confidence_score, 'labels': pred_labels})
        output['labels'] = output['labels'].map({1: 'stress', 0: 'non-stress'})
        
        return output

In [None]:
# Keep the parent directory importable (the service module imports the local
# helper modules from there) without appending a duplicate entry on re-runs.
if '../' not in sys.path:
    sys.path.append('../')

# Import the service class from the module written by the %%writefile cell
# above (word_embedding_model.py defines WordEmbeddingModel). The previous
# import targeted `word_embedding_rf_model.WordEmbeddingRFModel`, which this
# notebook never creates, so the service defined here was not the one packed.
from word_embedding_model import WordEmbeddingModel

# Initialize the BentoML service and attach the trained artifacts under the
# names declared in its @bentoml.artifacts decorator.
bento_model = WordEmbeddingModel()
bento_model.pack('word_vectorizer', word_vectorizer)
bento_model.pack('word_embedding_rf', word_embedding_rf)

# Save the BentoML service bundle; `save()` returns the directory it wrote to.
saved_path = bento_model.save()

In [None]:
# Print the directory containing the exported model archive returned by
# `bento_model.save()` (prefixed with model name and version); the next
# section loads the service back from this path.
print(saved_path)

## Load BentoService from saved bundle

In [None]:
import bentoml

# Restore the BentoService from the bundle directory written by `save()`.
bento_model = bentoml.load(saved_path)

# Smoke-test the restored service on the first ten test posts; the resulting
# frame is the cell's displayed output.
series = test.text.head(10)
bento_model.predict(series)

In [None]:
# Compose the "<name>:<version>" tag that identifies the saved service.
tag_fields = {'name': bento_model.name, 'version': bento_model.version}
bento_tag = '{name}:{version}'.format(**tag_fields)
bento_tag

## Deploy BentoService with Google Cloud Run
Tutorial: https://github.com/bentoml/BentoML/blob/master/guides/deployment/deploy-with-google-cloud-run/deploy-with-google-cloud-run.ipynb