In [151]:
import os
os.listdir()

['.env',
 '.git',
 '.gitignore',
 '.ipynb_checkpoints',
 'cleaned_data.csv',
 'cleaned_data_relevant.csv',
 'cleaned_pro-ana-data.xls',
 'cleaned_pro-recovery-data.xls',
 'data.csv',
 'data.xls',
 'EDA_cleaning.ipynb',
 'gensim-data',
 'invalid_words.txt',
 'model_cleaned.ipynb',
 'model_pretrained.ipynb',
 'model_tuning.ipynb',
 'pro-ana-data.xls',
 'pro-recovery-data.xls',
 'proanavsprorecovery.zip',
 'tmpres.txt']

In [152]:
import pandas

data = pandas.read_csv('data.csv')

In [153]:
# filtered out
data_f = pandas.read_csv('cleaned_data_relevant.csv')
data = data[data['id'].isin(data_f['id'])]
data

#data = data_f
#cleaned_data = data[:]

Unnamed: 0,id,label,text
1,2_0,0,i had a long battle with anorexia
2,3_0,0,those thoughts telling me that if i just lost...
3,4_0,0,the trouble is that never happened
4,5_0,0,there was never a magic number
5,6_0,0,it was never enough
...,...,...,...
3364,2131_1,1,the last pro ana diet comes with a twist in at...
3365,2132_1,1,"in this diet, you can hardly eat any carbs bu..."
3366,2133_1,1,"with this diet, you will see a drastic loss i..."
3367,2134_1,1,"well, these were some of the best pro ana diet..."


In [154]:
data.head()

Unnamed: 0,id,label,text
1,2_0,0,i had a long battle with anorexia
2,3_0,0,those thoughts telling me that if i just lost...
3,4_0,0,the trouble is that never happened
4,5_0,0,there was never a magic number
5,6_0,0,it was never enough


In [165]:
import string
import wordninja
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer

def clean_text(text: str, stem: bool = True):
    text = text.lower()   
    remove_string = 'tcb-script]'  # random string appearing in some text samples
    text = text.replace(remove_string, '')
    
    tokens = separate_words(text)
    
    # remove punctuation from each token
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    
    tokens = [word for word in tokens if word.isalpha()]

    stop_words = set(stopwords.words('english'))
    tokens = [w for w in tokens if not w in stop_words]
    
    if stem:
        porter = PorterStemmer()
        tokens = [porter.stem(word) for word in tokens]
    
    return tokens
    
def separate_words(text):
    return wordninja.split(text)

[nltk_data] Downloading package stopwords to /home/gabi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [166]:
from functools import partial

cleaned_data = data[:]

cleaned_data['text'] = cleaned_data['text'].apply(partial(clean_text, stem=False))
cleaned_data['text'] = cleaned_data['text'].apply(lambda t: " ".join(t))
cleaned_data.head()

Unnamed: 0,id,label,text
1,2_0,0,long battle anorexia
2,3_0,0,thoughts telling lost x amount weight would ac...
3,4_0,0,trouble never happened
4,5_0,0,never magic number
5,6_0,0,never enough


# Model

In [167]:
max_seq_len = cleaned_data['text'].apply(lambda t: len(t)).max()
max_seq_len

692

In [168]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(cleaned_data, test_size=0.1)
train_X, train_y = train['text'], train['label']
test_X, test_y = test['text'], test['label']

In [169]:
import keras
from keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_X)

train_X = tokenizer.texts_to_sequences(train_X)
train_X = keras.preprocessing.sequence.pad_sequences(train_X,
                                                         maxlen=max_seq_len, padding='pre')

test_X = tokenizer.texts_to_sequences(test_X)
test_X = keras.preprocessing.sequence.pad_sequences(test_X,
                                                         maxlen=max_seq_len, padding='pre')

In [170]:
import tensorflow as tf
tf.test.is_gpu_available()

False

## Load embedding

In [43]:
import gensim.downloader as api
path = api.load("word2vec-google-news-300", return_path=True)
print(path)

/home/gabi/gensim-data/word2vec-google-news-300/word2vec-google-news-300.gz


In [51]:
import os
from gensim.models import KeyedVectors
from gensim.downloader import base_dir


def load_data():
    path = os.path.join(base_dir, 'word2vec-google-news-300', "word2vec-google-news-300.gz")
    model = KeyedVectors.load_word2vec_format(path, binary=True)
    return model

pretrained_emb = load_data()

In [48]:
import sys
sys.path.append('/home/gabi/gensim-data/word2vec-google-news-300/')

from . import load_data
pretrained_emb = load_data()

ImportError: cannot import name 'load_data'

In [52]:
pretrained_emb

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x7f2c4ecedba8>

In [171]:
import numpy as np

embedding_dim = 300
vocab_size = len(tokenizer.word_index)

embedding_matrix = np.zeros((vocab_size + 1, embedding_dim))
for word, i in tokenizer.word_index.items():
    try:
        embedding_vector = pretrained_emb.get_vector(word)
        embedding_matrix[i] = embedding_vector
    except:
        # words not found in embedding index will be all-zeros.
        pass

In [172]:
from keras.layers import Embedding

w2v_embedding = Embedding(vocab_size + 1,
                          embedding_dim,
                          weights=[embedding_matrix],
                          input_length=max_seq_len,
                          trainable=False)
orig_embedding = pretrained_emb.get_keras_embedding()

## Model from word2vec keras

In [173]:

# Keras
from tensorflow import keras
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Embedding, LSTM
from keras import utils
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

from sklearn.preprocessing import LabelEncoder


def train_model(x_train, y_train, embedding,
                k_batch_size=128, k_epochs=32, k_lstm_neurons=128,
                k_hidden_layer_neurons=(128, 64, 32), verbose=1):
    
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train)
    num_classes = len(label_encoder.classes_)
    y_train = utils.to_categorical(y_train, num_classes)


    k_model = Sequential()
    k_model.add(embedding)
    k_model.add(LSTM(k_lstm_neurons, dropout=0.5, recurrent_dropout=0.2))
    for hidden_layer in k_hidden_layer_neurons:
        k_model.add(Dense(hidden_layer, activation='relu'))
        k_model.add(Dropout(0.2))
    if num_classes > 1:
        k_model.add(Dense(num_classes, activation='softmax'))
    else:
        k_model.add(Dense(num_classes, activation='sigmoid'))

    k_model.compile(loss='categorical_crossentropy' if num_classes > 1 else 'binary_crossentropy',
                    optimizer='adam',
                    metrics=['accuracy'])

    # Callbacks
    early_stopping = EarlyStopping(monitor='accuracy', patience=6, verbose=0, mode='max')
    rop = ReduceLROnPlateau(monitor='accuracy', factor=0.1, patience=3, verbose=1, epsilon=1e-4, mode='max')
    callbacks = [early_stopping, rop]
    #callbacks = [rop]
    
    k_model.fit(x_train, y_train,
                batch_size=k_batch_size,
                epochs=k_epochs,
                callbacks=callbacks,
                verbose=verbose)

    return k_model

In [174]:
import time

def predict(k_model, tt, label_enc, threshold=.0):
        """
        Predict raw text label
        :param text: raw text
        :param threshold: cut-off threshold, if confidence il less than given value return __OTHER__ as label
        :return: {label: LABEL, confidence: CONFIDENCE, elapsed_time: TIME}
        """
        if not k_model:
            raise RuntimeError("Model not in memory, please load it train new model")
            
        start_at = time.time()
        
        # Predict
        confidences = k_model.predict(tt)[0]
        # Get mex prediction
        idx = np.argmax(confidences)
        elapsed_time = time.time() - start_at
        if float(confidences[idx]) > threshold:
            return {"label": label_encoder.classes_[idx], "confidence": float(confidences[idx]),
                    "elapsed_time": elapsed_time}
        return {"label": "__OTHER__", "confidence": float(confidences[idx]), "elapsed_time": elapsed_time}

In [194]:
model = train_model(train_X, train_y, w2v_embedding, k_lstm_neurons=20, k_epochs=300, k_batch_size=32)



Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300

Epoch 00037: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300

Epoch 00044: ReduceLROnPlateau reducing learning rate to 1.0000000474974514e-05.
Epoch 45/300
Epoch 46/300
Epoch 47/300

Epoch 00047: ReduceLROnPlateau reducing learning rate to 1.0000000656873453e-06.


In [195]:
res = []

for text, label in zip(test_X, test_y):
    try:
        pred = predict(model, text[np.newaxis], label_encoder)
        res.append(pred)
        print(f"{pred}, orig: {label}")
    except IndexError as e:
        print(e)
        res.append(None)

{'label': 1, 'confidence': 0.9999911785125732, 'elapsed_time': 0.12320137023925781}, orig: 1
{'label': 1, 'confidence': 0.9744738340377808, 'elapsed_time': 0.022378206253051758}, orig: 1
{'label': 1, 'confidence': 0.988387405872345, 'elapsed_time': 0.017545700073242188}, orig: 1
{'label': 1, 'confidence': 0.9946430921554565, 'elapsed_time': 0.022649049758911133}, orig: 1
{'label': 0, 'confidence': 0.999932050704956, 'elapsed_time': 0.024684667587280273}, orig: 0
{'label': 1, 'confidence': 0.9818726181983948, 'elapsed_time': 0.02177715301513672}, orig: 1
{'label': 1, 'confidence': 0.9414276480674744, 'elapsed_time': 0.01757979393005371}, orig: 1
{'label': 1, 'confidence': 0.9999692440032959, 'elapsed_time': 0.017933368682861328}, orig: 1
{'label': 1, 'confidence': 0.9692163467407227, 'elapsed_time': 0.017621755599975586}, orig: 0
{'label': 1, 'confidence': 0.9342998266220093, 'elapsed_time': 0.02166604995727539}, orig: 1
{'label': 1, 'confidence': 0.9858518838882446, 'elapsed_time': 0.0

{'label': 0, 'confidence': 0.6231398582458496, 'elapsed_time': 0.02597522735595703}, orig: 1
{'label': 0, 'confidence': 0.8514736890792847, 'elapsed_time': 0.01810741424560547}, orig: 1
{'label': 1, 'confidence': 0.9178150296211243, 'elapsed_time': 0.01949477195739746}, orig: 1
{'label': 0, 'confidence': 0.9999990463256836, 'elapsed_time': 0.022029399871826172}, orig: 0
{'label': 0, 'confidence': 0.7337194085121155, 'elapsed_time': 0.023634910583496094}, orig: 1
{'label': 1, 'confidence': 0.9996300935745239, 'elapsed_time': 0.02506279945373535}, orig: 1
{'label': 1, 'confidence': 0.9831568598747253, 'elapsed_time': 0.021947622299194336}, orig: 1
{'label': 0, 'confidence': 0.9743329286575317, 'elapsed_time': 0.017139673233032227}, orig: 0
{'label': 0, 'confidence': 0.9812036156654358, 'elapsed_time': 0.021477937698364258}, orig: 0
{'label': 1, 'confidence': 0.5147914886474609, 'elapsed_time': 0.02216196060180664}, orig: 0
{'label': 1, 'confidence': 0.9870744943618774, 'elapsed_time': 0.

{'label': 1, 'confidence': 0.999955415725708, 'elapsed_time': 0.029483318328857422}, orig: 1
{'label': 1, 'confidence': 0.9287601113319397, 'elapsed_time': 0.018526554107666016}, orig: 1
{'label': 1, 'confidence': 0.9086763262748718, 'elapsed_time': 0.02225327491760254}, orig: 1
{'label': 1, 'confidence': 0.9041798114776611, 'elapsed_time': 0.017763376235961914}, orig: 1
{'label': 0, 'confidence': 0.9999773502349854, 'elapsed_time': 0.023235797882080078}, orig: 0
{'label': 1, 'confidence': 0.9993973970413208, 'elapsed_time': 0.02513408660888672}, orig: 1
{'label': 1, 'confidence': 0.9649067521095276, 'elapsed_time': 0.017750263214111328}, orig: 1
{'label': 0, 'confidence': 0.9753398299217224, 'elapsed_time': 0.018018722534179688}, orig: 0
{'label': 1, 'confidence': 0.848393976688385, 'elapsed_time': 0.017178058624267578}, orig: 0
{'label': 0, 'confidence': 0.9929090738296509, 'elapsed_time': 0.018288850784301758}, orig: 0
{'label': 0, 'confidence': 0.9373100399971008, 'elapsed_time': 0

{'label': 1, 'confidence': 0.8525994420051575, 'elapsed_time': 0.020960092544555664}, orig: 1
{'label': 1, 'confidence': 0.9833074808120728, 'elapsed_time': 0.03160858154296875}, orig: 1
{'label': 0, 'confidence': 0.9437058568000793, 'elapsed_time': 0.03161454200744629}, orig: 0
{'label': 1, 'confidence': 0.9461022615432739, 'elapsed_time': 0.020478248596191406}, orig: 0
{'label': 1, 'confidence': 0.9999171495437622, 'elapsed_time': 0.017576932907104492}, orig: 1
{'label': 1, 'confidence': 0.9801521897315979, 'elapsed_time': 0.017244577407836914}, orig: 1
{'label': 1, 'confidence': 0.9960607886314392, 'elapsed_time': 0.024509668350219727}, orig: 1
{'label': 0, 'confidence': 0.993857204914093, 'elapsed_time': 0.02179694175720215}, orig: 0
{'label': 1, 'confidence': 0.994852602481842, 'elapsed_time': 0.024984359741210938}, orig: 1
{'label': 1, 'confidence': 0.9999961853027344, 'elapsed_time': 0.017896652221679688}, orig: 1
{'label': 1, 'confidence': 0.999546468257904, 'elapsed_time': 0.0

In [196]:
from sklearn.metrics import accuracy_score

none_count = 0
res2 = []
test2 = []

for r, t in zip(res, test_y):
    if r is not None:
        res2.append(r['label'])
        test2.append(t)
    else:
        none_count += 1
        
print(none_count)
accuracy_score(test2, res2)

0


0.8598726114649682

In [197]:
label_encoder = LabelEncoder()
label_encoder.fit(train_y)

LabelEncoder()

In [198]:
import numpy as np

def dummy_val(r):
    return r['label'] if r is not None else -1

test2_arr = np.array(test_y)
res2_arr = np.array([dummy_val(r) for r in res])

filter_inds = np.not_equal(test2_arr, res2_arr)
incorrectly_clf_X = test['text'][filter_inds]
incorrectly_clf_y = test_y[filter_inds]

In [199]:
for i, text in enumerate(incorrectly_clf_X):
    print(f"Class: {incorrectly_clf_y.iloc[i]}, index: {incorrectly_clf_X.index[i]}")
    print(text)
    print()

Class: 0, index: 977
answer gained weight need new jeans exercise know

Class: 0, index: 335
necessary take break social media toss fashion magazines

Class: 0, index: 779
um know many calories slice never ready ashton kutcher appear tell middle prank

Class: 0, index: 397
fall misguided belief handle binge eating anorexia bulimia

Class: 0, index: 951
canada though sorry resources mentioned however available worldwide

Class: 0, index: 793
remember sitting around kitchen table trying explain dissatisfied

Class: 0, index: 345
outdoor activities especially good boosting sense well

Class: 1, index: 3007
everyone struggle different

Class: 1, index: 1560
patients also trained importance better eating patterns

Class: 0, index: 716
obesity caused bed lead heart disease diabetes hypertension related issues

Class: 1, index: 1739
mind

Class: 1, index: 2422
earliest

Class: 1, index: 1605
understand want lose weight anorexia

Class: 0, index: 179
say fine

Class: 0, index: 16
however true
