# Word embeddings for sentiment analysis: Suicide Sentiment Analysis

In [None]:
import os
from pathlib import Path
import sys

In [None]:
# Project identity and the root of the local checkout.
project_name = 'clpsych'
project_path = '/home/guerramarj/github/clpsych/'

# Choose the dataset / model / source locations for the current platform.
if sys.platform == "win32":
    # Raw strings: the previous literals depended on '\D', '\{', '\d' and
    # '\m' being unknown escapes that Python leaves alone, which triggers a
    # deprecation warning. The resulting path values are unchanged.
    data_path = r'D:\Dataset\{0}\dataset'.format(project_name)
    model_path = r'D:\Dataset\{0}\models'.format(project_name)
    # NOTE(review): this is the same '/Volumes/...' location used by the
    # macOS branch below; it looks like a copy-paste slip -- confirm the
    # intended Windows source folder.
    src_path = '/Volumes/Dataset/{0}/src'.format(project_name)

elif sys.platform == 'darwin':
    data_path = '/Volumes/Dataset/{0}/dataset'.format(project_name)
    model_path = '/Volumes/Dataset/{0}/models'.format(project_name)
    src_path = '/Volumes/Dataset/{0}/src'.format(project_name)

else:
    data_path = Path(project_path, 'dataset')
    model_path = Path(project_path, 'models')
    src_path = Path(project_path, 'src')

utils_path = str(Path(project_path, 'utils'))
# Make the project folder, the utils folder and the src folder importable.
# Each entry is checked against sys.path individually: a substring test on
# the concatenated path list can match unrelated entries, and re-running the
# cell must not append duplicates.
for extra_path in (str(project_path), utils_path, str(src_path)):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

print('project path = {0}'.format(project_path))
print('data path = {0}'.format(data_path))
print('model path = {0}'.format(model_path))
print('sys.path = {0}'.format(sys.path))

In [None]:
# Basic packages
import pandas as pd 
import numpy as np
import re
import collections
import matplotlib.pyplot as plt
from tqdm import tqdm

# Packages for data preparation
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Packages for modeling
from keras import models
from keras import layers
from keras import regularizers

# wei
from data_helpers import build_vocab
from data_helpers import load_data_and_labels
from data_helpers import pad_sentences

In [None]:
def deep_model(model, x_train, y_train, x_valid, y_valid):
    '''
    Compile and fit a multi-class model, returning the training history.

    The model is compiled with the 'rmsprop' optimizer, categorical
    cross-entropy loss and the accuracy metric, then fitted silently
    (verbose=0). The number of epochs and the batch size are read from
    the notebook-level globals number_epochs and batch_size.

    Parameters:
        model : model with the chosen architecture
        x_train : training features
        y_train : training target
        x_valid : validation features
        y_valid : validation target
    Output:
        the history object returned by model.fit
    '''
    compile_options = dict(
        optimizer='rmsprop',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    model.compile(**compile_options)

    fit_options = dict(
        epochs=number_epochs,
        batch_size=batch_size,
        validation_data=(x_valid, y_valid),
        verbose=0,
    )
    return model.fit(x_train, y_train, **fit_options)


def eval_metric(history, metric_name):
    '''
    Plot a training metric and its validation counterpart per epoch.

    Parameters:
        history : model training history (its .history dict is read)
        metric_name : key in history.history, e.g. 'loss' or 'acc';
            the validation series is read from 'val_' + metric_name
    Output:
        line chart with epochs on the x-axis and the metric on the
        y-axis (shown with plt.show(); nothing is returned)
    '''
    metric = history.history[metric_name]
    val_metric = history.history['val_' + metric_name]

    # Take the x-axis from the recorded history itself instead of the
    # global number_epochs, so the plot still works when the history holds
    # a different number of epochs than that global.
    epochs = range(1, len(metric) + 1)

    plt.plot(epochs, metric, 'bo', label='Train ' + metric_name)
    plt.plot(epochs, val_metric, 'b', label='Validation ' + metric_name)
    plt.legend()
    plt.show()

def test_model(model, x_train, y_train, x_test, y_test, epoch_stop):
    '''
    Fit the model on the training data for a fixed number of epochs and
    evaluate it on the test data.

    The batch size is read from the notebook-level global batch_size and
    fitting is silent (verbose=0).

    Parameters:
        model : model to fit and evaluate
        x_train : training features
        y_train : training target
        x_test : test features
        y_test : test target
        epoch_stop : number of epochs to fit for
    Output:
        whatever model.evaluate returns for the test data
    '''
    model.fit(
        x_train,
        y_train,
        epochs=epoch_stop,
        batch_size=batch_size,
        verbose=0,
    )
    return model.evaluate(x_test, y_test)

In [None]:
def preprocess_word(word):
    '''Normalise a single token: trim punctuation, squeeze repeats, drop - and '.'''
    # Trim punctuation characters from both ends of the token
    trimmed = word.strip('\'"?!,.():;')
    # Collapse any run of one repeated character down to two of them
    # (funnnnny --> funny)
    squeezed = re.sub(r'(.)\1+', r'\1\1', trimmed)
    # Delete hyphens and apostrophes that remain inside the token
    return re.sub(r'(-|\')', '', squeezed)


def is_valid_word(word):
    '''Return True when the token starts with a letter and contains only letters, digits, '.' or '_'.'''
    # Match objects are always truthy, so bool() mirrors an `is not None` test
    return bool(re.search(r'^[a-zA-Z][a-z0-9A-Z\._]*$', word))


def handle_emojis(tweet):
    '''Replace ASCII emoticons with the placeholders ' EMO_POS ' or ' EMO_NEG '.'''
    # Ordered (pattern, placeholder) rules, applied from top to bottom
    emoticon_rules = (
        # Smile -- :), : ), :-), (:, ( :, (-:, :')
        (r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMO_POS '),
        # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
        (r'(:\s?D|:-D|x-?D|X-?D)', ' EMO_POS '),
        # Love -- <3, :*
        (r'(<3|:\*)', ' EMO_POS '),
        # Wink -- ;-), ;), ;-D, ;D, (;,  (-;
        (r'(;-?\)|;-?D|\(-?;)', ' EMO_POS '),
        # Sad -- :-(, : (, :(, ):, )-:
        (r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMO_NEG '),
        # Cry -- :,(, :'(, :"(
        (r'(:,\(|:\'\(|:"\()', ' EMO_NEG '),
    )
    for pattern, placeholder in emoticon_rules:
        tweet = re.sub(pattern, placeholder, tweet)
    return tweet


def preprocess_tweet(tweet):
    '''
    Clean a raw tweet and return its valid tokens joined by single spaces.

    The text is lower-cased, URLs / mentions / hashtags / 'rt' / dot runs
    are rewritten, emoticons are replaced by placeholders, and each
    remaining token is normalised with preprocess_word and kept only if
    is_valid_word accepts it.
    '''
    # NOTE(review): the text is lower-cased before handle_emojis runs, so
    # emoticon patterns that contain an upper-case 'D' (e.g. ':D', 'xD')
    # cannot match here -- confirm whether that is intended.
    text = tweet.lower()

    # Ordered clean-up rules, applied from top to bottom
    substitutions = (
        # Replaces URLs with the word URL
        (r'((www\.[\S]+)|(https?://[\S]+))', ' URL '),
        # Replace @handle with the word USER_MENTION
        (r'@[\S]+', 'USER_MENTION'),
        # Replaces #hashtag with hashtag
        (r'#(\S+)', r' \1 '),
        # Remove RT (retweet)
        (r'\brt\b', ''),
        # Replace 2+ dots with space
        (r'\.{2,}', ' '),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    # Strip space, " and ' from both ends of the tweet
    text = text.strip(' "\'')
    # Replace emoticons with either EMO_POS or EMO_NEG
    text = handle_emojis(text)
    # Replace multiple spaces with a single space
    text = re.sub(r'\s+', ' ', text)

    # Normalise every token lazily, then keep only the valid ones
    candidates = (preprocess_word(token) for token in text.split())
    return ' '.join(token for token in candidates if is_valid_word(token))

In [None]:
# Notebook-wide hyper-parameters.
# NOTE(review): number_epochs and max_len are used by other cells of this
# notebook but are not assigned in the visible cells -- define them here.
batch_size = 512  # Size of the batches used in the mini-batch gradient descent
glove_dim = 200  # GloVe vector dimension; also selects the GloVe file name

In [None]:
# Load the CSV 'risk_title_body.csv' from the data folder and preview it.
suicide_data = pd.read_csv(Path(data_path, 'risk_title_body.csv'))
suicide_data.head()

In [None]:
# Store the cleaned version of the title_body column in a new 'text'
# column (cleaning is done row by row with preprocess_tweet), then preview.
suicide_data['text'] = suicide_data.title_body.apply(preprocess_tweet)
suicide_data.head()

In [None]:
# Load the texts and their class labels through the project helper.
x_train, y_class = load_data_and_labels(file_path=data_path)
# Pad the loaded texts. This call previously received `x_text`, a name that
# is never assigned in this notebook (NameError); the texts loaded above
# live in x_train.
sentences_padded, sequence_length = pad_sentences(x_train)
# Build the vocabulary from the padded sentences; its size is used as the
# tokenizer's word limit and the embedding input dimension in later cells.
vocabulary, vocabulary_inv = build_vocab(sentences_padded)
number_word = len(vocabulary_inv)

In [None]:
# Fit a Keras tokenizer limited to number_word words on the training
# texts and convert those texts to integer sequences.
tk = Tokenizer(num_words=number_word, split=" ")
tk.fit_on_texts(x_train)
x_train_seq = tk.texts_to_sequences(x_train)
# Test-set conversion is currently disabled (x_test is not assigned in the
# visible cells):
# x_test_seq = tk.texts_to_sequences(x_test)

In [None]:
# Encode the class labels as integers, then one-hot encode them.
# The encoder is fitted on the training labels only and reused for the
# test labels.
# NOTE(review): y_train and y_test are not assigned in the visible cells
# (the labels loaded above are named y_class) -- confirm where they come
# from.
le = LabelEncoder()
y_train_le = le.fit_transform(y_train)
y_test_le = le.transform(y_test)
y_train_oh = to_categorical(y_train_le)
y_test_oh = to_categorical(y_test_le)

In [None]:
# Hold out 10% of the training data as a validation set (fixed seed 37).
# NOTE(review): x_train_seq_trunc is not assigned in the visible cells;
# pad_sequences is imported but never called -- presumably a padding /
# truncation step is missing. Confirm.
x_train_emb, x_valid_emb, y_train_emb, y_valid_emb = train_test_split(
    x_train_seq_trunc, y_train_oh, test_size=0.1, random_state=37)

In [None]:
# Baseline model: an 8-dimensional embedding, flattened and fed into a
# 3-way softmax classifier.
emb_model = models.Sequential([
    layers.Embedding(number_word, 8, input_length=max_len),
    layers.Flatten(),
    layers.Dense(3, activation='softmax'),
])
emb_history = deep_model(
    emb_model, x_train_emb, y_train_emb, x_valid_emb, y_valid_emb)

In [None]:
# Fit emb_model for 6 more epochs and evaluate it on the test data.
# emb_model was already fitted by deep_model in the previous cell, so this
# fit continues from those weights.
emb_results = test_model(emb_model, x_train_seq_trunc, y_train_oh, x_test_seq_trunc, y_test_oh, 6)
print('')
# emb_results[1] is presumably the accuracy (the only metric the model is
# compiled with); it is printed as a percentage.
print('Test accuracy of word embeddings model: {0:.2f}%'.format(emb_results[1]*100))

In [None]:
# Read the pre-trained GloVe Twitter vectors into a {word: vector} dict.
glove_file = 'glove.twitter.27B.' + str(glove_dim) + 'd.txt'
emb_dict = {}
# Open with an explicit UTF-8 encoding (the platform default may differ
# and the vocabulary contains non-ASCII tokens) and use a context manager
# so the file is closed even if parsing fails.
with Path(model_path, glove_file).open(encoding='utf-8') as glove:
    for line in glove:
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0]
            continue
        # First field is the token, the rest are its vector components
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        emb_dict[word] = vector

In [None]:
# Build the embedding matrix: row i holds the GloVe vector of the word with
# tokenizer index i; words without a GloVe vector keep an all-zero row.
emb_matrix = np.zeros((number_word, glove_dim))
for w, i in tk.word_index.items():
    # Stop at the first index outside the matrix
    if i >= number_word:
        break
    vect = emb_dict.get(w)
    if vect is not None:
        emb_matrix[i] = vect

In [None]:
# Same architecture as the baseline, but with a glove_dim-wide embedding so
# that the pre-trained GloVe matrix can be loaded into it.
glove_model = models.Sequential([
    layers.Embedding(number_word, glove_dim, input_length=max_len),
    layers.Flatten(),
    layers.Dense(3, activation='softmax'),
])

In [None]:
# Load the GloVe matrix into the embedding layer and freeze it, so that
# only the layers after it are trained.
glove_model.layers[0].set_weights([emb_matrix])
glove_model.layers[0].trainable = False
# deep_model compiles the model after the layer has been frozen.
glove_history = deep_model(glove_model, x_train_emb, y_train_emb, x_valid_emb, y_valid_emb)
# Last recorded value of the 'acc' series in the training history
glove_history.history['acc'][-1]

In [None]:
# Plot training vs. validation loss of the GloVe model per epoch.
eval_metric(glove_history, 'loss')

In [None]:
# Plot the 'acc' / 'val_acc' series of the GloVe model per epoch.
eval_metric(glove_history, 'acc')

In [None]:
# Fit glove_model for 3 more epochs and evaluate it on the test data.
glove_results = test_model(glove_model, x_train_seq_trunc, y_train_oh, x_test_seq_trunc, y_test_oh, 3)
# Print a blank separator line. The previous print('/n') emitted the
# literal text "/n"; print('') matches the equivalent emb_model cell.
print('')
print('Test accuracy of word glove model: {0:.2f}%'.format(glove_results[1]*100))

In [None]:
# Trainable embedding with the same width as GloVe (glove_dim), learned
# from scratch, for comparison with the frozen GloVe model.
emb_model2 = models.Sequential([
    layers.Embedding(number_word, glove_dim, input_length=max_len),
    layers.Flatten(),
    layers.Dense(3, activation='softmax'),
])
emb_history2 = deep_model(
    emb_model2, x_train_emb, y_train_emb, x_valid_emb, y_valid_emb)

In [None]:
# Plot training vs. validation loss of the second embedding model per epoch.
eval_metric(emb_history2, 'loss')

In [None]:
# Fit emb_model2 for 3 more epochs and evaluate it on the test data.
emb_results2 = test_model(emb_model2, x_train_seq_trunc, y_train_oh, x_test_seq_trunc, y_test_oh, 3)
# Print a blank separator line. The previous print('/n') emitted the
# literal text "/n"; print('') matches the equivalent emb_model cell.
print('')
print('Test accuracy of word embedding model 2: {0:.2f}%'.format(emb_results2[1]*100))