In [1]:
#import required packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import re
import numpy as np
from sklearn.metrics import f1_score,classification_report, confusion_matrix,precision_score, recall_score
from tensorflow import keras

In [2]:
# Load the manually checked post predictions.
# `data` is kept as the untouched raw frame (ground-truth columns are read from
# it later and the predictions are written back to it); `df` is the working
# copy that the cleaning steps below modify.
data = pd.read_csv('input_data/Post_ModelPredictions_ManualChecking.csv')
df = data.copy()
df.head()

Unnamed: 0,post,prefLabel,compulsion_ml,obs-com_ml,obsession_ml,Document_Keywords,compulsion_bert,obs-com_bert,obsession_bert,obsession_first_checker,obs-com_first_checker,compulsion_first_checker
0,"I grew up with obsessive thoughts, or more apt...","['site', 'compulsion', 'obsession', 'rash', 'p...",1,0,1,"['grow', 'obsessive_thought', 'aptly', 'intrus...",0.0,0.0,1.0,1.0,1.0,1.0
1,I have been thinking so much about Covid becau...,"['singing', 'washing hands', 'anxiety', 'anxie...",0,0,1,"['think', 'covid', 'job', 'concern', 'encourag...",0.0,0.0,1.0,0.0,0.0,1.0
2,"I fed up, I send all day checking excessive ch...","['OCD', 'checking', 'Thought']",0,0,1,"['send', 'day', 'check', 'excessive', 'checkin...",0.0,0.0,1.0,0.0,0.0,1.0
3,I’ve noticed that I’ve been washing my hands m...,"['washing hands', 'hope', 'depression', 'OCD']",1,0,1,"['notice', 'washing_hand', 'thing', 'trigger',...",0.0,0.0,1.0,1.0,0.0,0.0
4,This sounds so stupid\nAt the moment I want to...,['Thought'],0,0,1,"['sounds_stupid', 'moment', 'download', 'windo...",0.0,0.0,1.0,1.0,1.0,1.0


In [3]:
import re
#define the process of text cleaning
def deEmojify(text):
    """Replace every run of emoji characters in ``text`` with one space.

    Only the four code-point ranges listed below are treated as emoji;
    any other character is left untouched.
    """
    emoji_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # flags (iOS)
    )
    emoji_run = re.compile("[" + "".join(emoji_ranges) + "]+", flags=re.UNICODE)
    return emoji_run.sub(" ", text)
#Clean Text
def clean_text(data):
    """Add a normalised ``clean_text`` column derived from ``data['post']``.

    Each post goes through these steps, in this order:
      1. lowercase
      2. remove URLs (``http`` followed by non-whitespace)
      3. remove every character that is neither a word character nor whitespace
      4. remove digit runs
      5. replace emoji runs with a space (``deEmojify``)
      6. collapse whitespace runs, newlines included, into one space
      7. replace a single ASCII letter surrounded by whitespace with one space

    Missing posts (NaN/None) are cleaned to an empty string; previously they
    reached ``re.sub`` as a float and raised ``TypeError``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``post`` column of text.

    Returns
    -------
    pandas.DataFrame
        The same frame object, modified in place, with ``clean_text`` added.
    """
    # Compile once per call instead of once per row per step.
    url_re = re.compile(r"http\S+")
    non_word_re = re.compile(r"[^\w\s]")
    digits_re = re.compile(r"\d+")
    whitespace_re = re.compile(r"\s+")
    lone_letter_re = re.compile(r"\s+[a-zA-Z]\s+")

    def _clean_one(post):
        post = url_re.sub("", post)
        post = non_word_re.sub("", post)
        # The former "/n" removal step is dropped: "/" has already been
        # stripped by the punctuation step, so that pattern could never match.
        # Real newlines are whitespace and are handled by the collapse below.
        post = digits_re.sub("", post)
        # Kept in its original position so the output is unchanged for valid text.
        post = deEmojify(post)
        post = whitespace_re.sub(" ", post)
        return lone_letter_re.sub(" ", post)

    # .str.lower() yields NaN for missing values; turn those into "" so the
    # regex steps always receive a string.
    lowered = data['post'].str.lower().fillna("")
    data['clean_text'] = lowered.apply(_clean_one)
    return data

In [4]:
# Run the text-cleaning pipeline on the working copy of the posts
# (adds the `clean_text` column used by every later step).
df = clean_text(df)

In [5]:
#for the text pre-processing (text cleaning)
import nltk
nltk.download('punkt')
import re #regular expression
import string
from nltk.tokenize import word_tokenize # word tokenization
from nltk.stem import PorterStemmer # word stemming

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Cache of stop-word sets keyed by language, filled on first use so the
# corpus is not re-loaded for every row.
_STOP_WORDS_BY_LANG = {}


def remove_stop_words(text):
  """Return ``text`` without English stop words.

  ``text`` is split on whitespace; every kept token is followed by a single
  space, so a non-empty result ends with a trailing space (unchanged from the
  previous behaviour). Matching is case-sensitive against NLTK's English
  stop-word list.
  """
  stop_set = _STOP_WORDS_BY_LANG.get('english')
  if stop_set is None:
    # A frozenset gives O(1) membership tests instead of scanning a list.
    stop_set = frozenset(stopwords.words('english'))
    _STOP_WORDS_BY_LANG['english'] = stop_set
  return "".join(token + " " for token in text.split() if token not in stop_set)

In [8]:
# Strip English stop words from every cleaned post
df['clean_text'] = df['clean_text'].apply(remove_stop_words)

In [9]:
def stem(ch):
  """Porter-stem each whitespace-separated token of ``ch`` and re-join the stems with single spaces."""
  porter = PorterStemmer()
  stemmed_tokens = []
  for token in ch.split():
    stemmed_tokens.append(porter.stem(token))
  return " ".join(stemmed_tokens)

# Stem every post in the `clean_text` column
df['clean_text'] = df['clean_text'].apply(stem)

In [10]:
# Model input: the cleaned, stop-word-filtered, stemmed text of every post
X_test = df['clean_text']

In [13]:
import pickle

# Size of the Embedding input dimension used by get_model().
num_words = 2000

# NOTE(review): pickle.load can execute arbitrary code; only load tokenizer
# files that come from a trusted source.
tokenizer_path = 'model/lstm_enrich_tockenizer_compulsion.pkl'
with open(tokenizer_path, 'rb') as f:
    vect_compulsion = pickle.load(f)

# NOTE(review): vocab_size is only printed; the model is built with num_words.
# Presumably the tokenizer was fitted with the same cap - confirm, otherwise
# token ids could exceed the Embedding input dimension.
vocab_size = 1 + len(vect_compulsion.word_index)
print(vocab_size)

21242


In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed sequence length fed to the network (also read by get_model()).
MAX_LEN = 100

# Map each post to token ids with the compulsion tokenizer, then pad at the
# front to exactly MAX_LEN ids per post.
encoded_docs_test = vect_compulsion.texts_to_sequences(X_test)
padded_docs_test = pad_sequences(
    encoded_docs_test,
    maxlen=MAX_LEN,
    padding='pre',
)
len(padded_docs_test)

23674

In [15]:
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

In [16]:
def get_model():
    """Build the untrained stacked-LSTM binary classifier and print its summary.

    Layers, in order: Embedding(num_words -> 16) over MAX_LEN tokens,
    LSTM(16) returning the full sequence, Dropout(0.5), LSTM(8), Dense(4),
    Dense(1, sigmoid).

    Reads the notebook-level ``num_words`` and ``MAX_LEN`` when called, so
    both must be defined first. Weights are not loaded here; the caller does
    that with ``load_weights``.

    Returns
    -------
    keras.Sequential
        The freshly constructed model.
    """
    model = Sequential()
    model.add(Embedding(num_words, output_dim=16, input_length=MAX_LEN))
    model.add(LSTM(16, return_sequences=True))
    # Dropout between the two recurrent layers
    model.add(Dropout(0.5))
    model.add(LSTM(8))
    # No activation is passed here; the layer stack must stay as-is so the
    # saved weight files keep matching it.
    model.add(Dense(4))
    # Single sigmoid unit: output is a probability-like score in [0, 1]
    model.add(Dense(1, activation='sigmoid'))
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line.
    model.summary()
    return model

In [17]:
# Build a fresh network, then restore the compulsion weights from disk
compulsion_model = get_model()
compulsion_model.load_weights('model/lstm_compulsion_enrich_model.h5')

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 16)           32000     
                                                                 
 lstm (LSTM)                 (None, 100, 16)           2112      
                                                                 
 dropout (Dropout)           (None, 100, 16)           0         
                                                                 
 lstm_1 (LSTM)               (None, 8)                 800       
                                                                 
 dense (Dense)               (None, 4)                 36        
                                                                 
 dense_1 (Dense)             (None, 1)                 5         
                                                                 
Total params: 34,953
Trainable params: 34,953
Non-traina

In [18]:
# Score every padded post with the compulsion model (sigmoid output, one value per post)
compulsion_predictions=compulsion_model.predict([padded_docs_test])



In [19]:
import pickle

# Size of the Embedding input dimension used by get_model().
num_words = 2000

# NOTE(review): pickle.load can execute arbitrary code; only load tokenizer
# files that come from a trusted source.
tokenizer_path = 'model/lstm_enrich_tockenizer_obsession.pkl'
with open(tokenizer_path, 'rb') as f:
    vect_obsession = pickle.load(f)

# NOTE(review): vocab_size is only printed; the model is built with num_words.
# Presumably the tokenizer was fitted with the same cap - confirm, otherwise
# token ids could exceed the Embedding input dimension.
vocab_size = 1 + len(vect_obsession.word_index)
print(vocab_size)

21242


In [20]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed sequence length fed to the network (also read by get_model()).
MAX_LEN = 100

# Re-encode the posts with the obsession tokenizer. This overwrites
# encoded_docs_test / padded_docs_test from the compulsion step.
encoded_docs_test = vect_obsession.texts_to_sequences(X_test)
padded_docs_test = pad_sequences(
    encoded_docs_test,
    maxlen=MAX_LEN,
    padding='pre',
)
len(padded_docs_test)

23674

In [21]:
# Build a fresh network, then restore the obsession weights from disk
obsession_model = get_model()
obsession_model.load_weights('model/lstm_obsession_enrich_model.h5')

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 100, 16)           32000     
                                                                 
 lstm_2 (LSTM)               (None, 100, 16)           2112      
                                                                 
 dropout_1 (Dropout)         (None, 100, 16)           0         
                                                                 
 lstm_3 (LSTM)               (None, 8)                 800       
                                                                 
 dense_2 (Dense)             (None, 4)                 36        
                                                                 
 dense_3 (Dense)             (None, 1)                 5         
                                                                 
Total params: 34,953
Trainable params: 34,953
Non-trai

In [22]:
# Score every padded post with the obsession model (sigmoid output, one value per post)
obsession_predictions=obsession_model.predict([padded_docs_test])



In [23]:
# Ground truth for compulsion: the first manual checker's labels, read from the raw frame
compulsion_gt = data['compulsion_first_checker']
compulsion_gt.head()

0    1.0
1    1.0
2    1.0
3    0.0
4    1.0
Name: compulsion_first_checker, dtype: float64

In [24]:
# Peek at the first five compulsion scores for comparison with the labels above
compulsion_predictions[:5]

array([[0.9533845 ],
       [0.15155517],
       [0.12229809],
       [0.95484114],
       [0.07827639]], dtype=float32)

In [25]:
# Ground truth for obsession: the first manual checker's labels, read from the raw frame
obsession_gt = data['obsession_first_checker']
obsession_gt.head()

0    1.0
1    0.0
2    0.0
3    1.0
4    1.0
Name: obsession_first_checker, dtype: float64

In [26]:
# Peek at the first five obsession scores for comparison with the labels above
obsession_predictions[:5]

array([[0.98839104],
       [0.07851634],
       [0.04445652],
       [0.0439509 ],
       [0.56093556]], dtype=float32)

In [27]:
# Evaluate on the first 306 rows only.
# NOTE(review): 306 is presumably the number of manually checked posts - confirm.
compulsion_gt_head = np.array(compulsion_gt.values[:306], dtype=np.float32)
compulsion_pred_head = np.array(compulsion_predictions.squeeze()[:306], dtype=np.float32)

# Rows without a manual label (NaN ground truth) are excluded from the report.
compulsion = pd.DataFrame({
    'gt': compulsion_gt_head,
    'pred': compulsion_pred_head,
}).dropna(axis=0)

# A score above 0.5 counts as a positive prediction.
compulsion_labels = compulsion['gt'].tolist()
compulsion_decisions = (compulsion['pred'] > 0.5).tolist()
report = classification_report(compulsion_labels, compulsion_decisions)
print('--------------------compulsion report--------------------')
print(report)

--------------------compulsion report--------------------
              precision    recall  f1-score   support

         0.0       0.90      0.95      0.92       274
         1.0       0.07      0.03      0.04        30

    accuracy                           0.86       304
   macro avg       0.48      0.49      0.48       304
weighted avg       0.82      0.86      0.84       304



In [28]:
# Evaluate on the first 306 rows only.
# NOTE(review): 306 is presumably the number of manually checked posts - confirm.
obsession_gt_head = np.array(obsession_gt.values[:306], dtype=np.float32)
obsession_pred_head = np.array(obsession_predictions.squeeze()[:306], dtype=np.float32)

# Rows without a manual label (NaN ground truth) are excluded from the report.
obsession = pd.DataFrame({
    'gt': obsession_gt_head,
    'pred': obsession_pred_head,
}).dropna(axis=0)

# A score above 0.7 counts as a positive prediction.
# NOTE(review): the compulsion report uses 0.5 - confirm the different cut-off is intended.
obsession_labels = obsession['gt'].tolist()
obsession_decisions = (obsession['pred'] > 0.7).tolist()
report = classification_report(obsession_labels, obsession_decisions)
print('--------------------obsession report--------------------')
print(report)

--------------------obsession report--------------------
              precision    recall  f1-score   support

         0.0       0.61      0.60      0.61       187
         1.0       0.38      0.38      0.38       117

    accuracy                           0.52       304
   macro avg       0.49      0.49      0.49       304
weighted avg       0.52      0.52      0.52       304



In [29]:
# Attach each model's scores to the raw frame; squeeze() drops the singleton
# second axis of the (n, 1) prediction arrays so they fit a single column.
data['compulsion_enrich_lstm_pred'] = compulsion_predictions.squeeze()
data['obsession_enrich_lstm_pred'] = obsession_predictions.squeeze()

In [30]:
# Persist the raw frame together with the two new prediction columns.
# index=False: the frame already carries the original row ids in its
# 'Unnamed: 0' column, so writing the RangeIndex as well would add yet another
# unnamed column every time the file is saved and re-read.
data.to_csv('input_data/Post_ModelPredictions_lstm_enrich_separate_ManualChecking.csv', index=False)