In [1]:
#import required packages
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import re
import numpy as np
from sklearn.metrics import f1_score,classification_report, confusion_matrix,precision_score, recall_score
from tensorflow import keras

In [2]:
#post data
df = pd.read_csv('processed_data/enriched.csv',usecols=['enriched_post','compulsion','obs-com','obsession'])
df.head()

Unnamed: 0,compulsion,obs-com,obsession,enriched_post
0,0,0,1,Hi -\nSo I haven't been on here since December...
1,1,1,1,"Hi all, {Hoffnung} hope {espoir} {hope} you're..."
2,0,0,1,"Hi, \nFirst, I {Hoffnung} hope {espoir} {hope}..."
3,0,0,1,Hello everyone. I could really use your help r...
4,1,1,1,"Though it comes in many flavors, one of the mo..."


# **Data Preparation**

In [3]:
import re
#define the process of text cleaning
def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r' ',text)
#Clean Text
def clean_text(data):
    # convert catacter to lowercase
    data['clean_text']=data['enriched_post'].str.lower()
    #remove URLS
    data['clean_text'] = data['clean_text'].apply(lambda elem:re.sub(r"http\S+", "", elem))
    #remove ponctuation
    data['clean_text'] = data['clean_text'].apply(lambda elem:re.sub(r"[^\w\s]", "", elem))
    #remove
    data['clean_text'] = data['clean_text'].apply(lambda elem:re.sub(r'/n',"",elem))
    #remove degits
    data['clean_text'] = data['clean_text'].apply(lambda elem:re.sub(r'\d+',"",elem))
    #remove emojis
    data['clean_text'] = data['clean_text'].apply(lambda elem:deEmojify(elem))
    #remove multiple spaces
    data['clean_text'] = data['clean_text'].apply(lambda elem:re.sub(r'\s+'," ",elem))
    #remove single caracter
    data['clean_text'] = data['clean_text'].apply(lambda elem:re.sub(r'\s+[a-zA-Z]\s+'," ",elem))
    return data

In [4]:
#apply the process of cleaning for the train and test data
df = clean_text(df)

In [5]:
#for the text pre-processing (text cleaning)
import nltk
nltk.download('punkt')
import re #regular expression
import string
from nltk.tokenize import word_tokenize # word tokenization
from nltk.stem import PorterStemmer # word stemming

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
def remove_stop_words(text):
  stopW=stopwords.words('english')
  s=""
  for i in text.split():
    if i not in stopW:
        s=s+i+" "
  return s

In [8]:
df['clean_text']=df['clean_text'].apply(lambda x:remove_stop_words(x))

In [9]:
def stem(ch):
  stem = PorterStemmer()
  return " ".join([stem.stem(i) for i in ch.split()])

#apply the stem function to each row in the dataframe
df['clean_text'] = df['clean_text'].apply(lambda x:stem(x))

In [10]:
X, Y = df['clean_text'], df[['obsession']]
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [11]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((5235,), (5235, 1), (1309,), (1309, 1))

In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer
num_words = 2000
vect=Tokenizer(num_words=num_words)
vect.fit_on_texts(X_train)
vocab_size = len(vect.word_index) + 1
print(vocab_size)

21242


In [13]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
encoded_docs_train = vect.texts_to_sequences(X_train)
MAX_LEN = 150
padded_docs_train = pad_sequences(encoded_docs_train, maxlen=MAX_LEN, padding='pre')
padded_docs_train.shape

(5235, 150)

In [14]:
encoded_docs_test =  vect.texts_to_sequences(X_test)
padded_docs_test = pad_sequences(encoded_docs_test, maxlen=MAX_LEN, padding='pre')

In [15]:
from keras import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout

In [16]:
def get_model():
    model = Sequential()
    # Configuring the parameters
    model.add(Embedding(num_words, output_dim=16, input_length=MAX_LEN))
    model.add(LSTM(16, return_sequences=True))  
    # Adding a dropout layer
    model.add(Dropout(0.5))
    model.add(LSTM(8))
    # model.add(Dropout(0.5))
    model.add(Dense(4))
    # model.add(Dropout(0.5))
    # Adding a dense output layer with sigmoid activation
    model.add(Dense(1, activation='sigmoid'))
    print(model.summary())
    return model

In [17]:
obsession_model = get_model()
obsession_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
obsession_history = obsession_model.fit(padded_docs_train, y_train, epochs=1000, batch_size=16,validation_split=0.2,
                    callbacks=[keras.callbacks.EarlyStopping(monitor='val_loss',patience=20, min_delta=1e-7),
                              keras.callbacks.ReduceLROnPlateau(factor=0.2, patience=10),
                              keras.callbacks.ModelCheckpoint(filepath='model/lstm_obsession_enrich_model.h5', 
                                      monitor='val_loss', 
                                      save_best_only=True)])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 150, 16)           32000     
                                                                 
 lstm (LSTM)                 (None, 150, 16)           2112      
                                                                 
 dropout (Dropout)           (None, 150, 16)           0         
                                                                 
 lstm_1 (LSTM)               (None, 8)                 800       
                                                                 
 dense (Dense)               (None, 4)                 36        
                                                                 
 dense_1 (Dense)             (None, 1)                 5         
                                                                 
Total params: 34,953
Trainable params: 34,953
Non-traina

In [18]:
obsession_model.load_weights('model/lstm_obsession_enrich_model.h5')
obsession_predictions = obsession_model.predict([padded_docs_test])



In [19]:
print('Obsession Prediction Result')
thresholds=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9]
for val in thresholds:
    pred=obsession_predictions.copy()
  
    pred[pred>=val]=1
    pred[pred<val]=0
  
    precision = precision_score(y_test.values[:, :1], pred, average='micro')
    recall = recall_score(y_test.values[:, :1], pred, average='micro')
    f1 = f1_score(y_test.values[:, :1], pred, average='micro')
   
    print("Micro-average quality numbers for threshold", val)
    print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(precision, recall, f1))

Obsession Prediction Result
Micro-average quality numbers for threshold 0.1
Precision: 0.8709, Recall: 0.8709, F1-measure: 0.8709
Micro-average quality numbers for threshold 0.2
Precision: 0.8709, Recall: 0.8709, F1-measure: 0.8709
Micro-average quality numbers for threshold 0.3
Precision: 0.8762, Recall: 0.8762, F1-measure: 0.8762
Micro-average quality numbers for threshold 0.4
Precision: 0.8747, Recall: 0.8747, F1-measure: 0.8747
Micro-average quality numbers for threshold 0.5
Precision: 0.8724, Recall: 0.8724, F1-measure: 0.8724
Micro-average quality numbers for threshold 0.6
Precision: 0.8709, Recall: 0.8709, F1-measure: 0.8709
Micro-average quality numbers for threshold 0.7
Precision: 0.8617, Recall: 0.8617, F1-measure: 0.8617
Micro-average quality numbers for threshold 0.8
Precision: 0.8388, Recall: 0.8388, F1-measure: 0.8388
Micro-average quality numbers for threshold 0.9
Precision: 0.7815, Recall: 0.7815, F1-measure: 0.7815


In [21]:
Y_test = y_test.copy()
Y_test['obsession_pred'] = obsession_predictions

In [22]:
Y_test[:40]

Unnamed: 0,obsession,obsession_pred
1277,1,0.989207
4982,1,0.973672
1022,1,0.988665
5835,1,0.988643
1675,1,0.986666
2137,0,0.025268
48,1,0.891282
751,1,0.988762
1106,0,0.819443
3680,0,0.944341


In [23]:
import pickle
with open('model/lstm_enrich_tockenizer_obsession.pkl', 'wb') as f:
    pickle.dump(vect, f)