# Word embeddings for sentiment analysis: Suicide Sentiment Analysis

Stanford CoreNLP

In [None]:
import os
from pathlib import Path
import sys

In [None]:
project_name = 'clpsych'
# The project root is taken to be the parent of the current working directory.
project_path = Path(os.getcwd()).parent

# Pick the dataset/model/source locations for the current platform.
if sys.platform == "win32":
    # Raw string: the backslashes are path separators, not escape sequences
    # (the non-raw form relied on invalid escapes such as "\D" and "\d").
    data_path = r'D:\Dataset\{0}\dataset'.format(project_name)
    model_path = Path(project_path, 'models')
    src_path = Path(project_path, 'src')

elif sys.platform == 'darwin':
    data_path = '/Volumes/Dataset/{0}/dataset'.format(project_name)
    model_path = '/Volumes/Dataset/{0}/models'.format(project_name)
    src_path = '/Volumes/Dataset/{0}/src'.format(project_name)

else:
    data_path = Path(project_path, 'dataset')
    model_path = Path(project_path, 'models')
    src_path = Path(project_path, 'src')

utils_path = str(Path(project_path, 'utils'))
# Make the project, utils and src folders importable. Each folder is checked
# against the entries of sys.path individually: a substring test on the joined
# path list can give false positives, and re-running the cell must not add
# duplicates.
for extra_path in (str(project_path), utils_path, str(src_path)):
    if extra_path not in sys.path:
        sys.path.append(extra_path)

print('project path = {0}'.format(project_path))
print('data path = {0}'.format(data_path))
print('model path = {0}'.format(model_path))
print('utils path = {0}'.format(utils_path))
print('sys.path = {0}'.format(sys.path))

In [None]:
# Basic packages
import pandas as pd 
import numpy as np
import re
import collections
import matplotlib.pyplot as plt
from tqdm import tqdm

# Packages for data preparation
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# NLP
from pycorenlp import StanfordCoreNLP

# Packages for modeling
from keras import models
from keras import layers
from keras import regularizers

# wei
from data_helpers import build_vocab
from data_helpers import load_data_and_labels
from data_helpers import pad_sentences

from statistics import mean
from itertools import islice

In [None]:
def preprocess_word(word):
    """Normalise a single token.

    Punctuation is stripped from both ends, every run of a repeated character
    is squeezed to exactly two occurrences (``funnnnny`` -> ``funny``), and
    hyphens and apostrophes are removed.

    Args:
        word: The raw token.

    Returns:
        The cleaned token (possibly an empty string).
    """
    trimmed = word.strip('\'"?!,.():;')
    squeezed = re.sub(r'(.)\1+', r'\1\1', trimmed)
    return re.sub(r'(-|\')', '', squeezed)


def is_valid_word(word):
    """Tell whether *word* is an acceptable token.

    A token is accepted when it starts with an ASCII letter and the rest
    consists only of ASCII letters, digits, dots or underscores.

    Args:
        word: The token to check.

    Returns:
        True if the token matches, False otherwise.
    """
    pattern = r'^[a-zA-Z][a-z0-9A-Z\._]*$'
    return bool(re.search(pattern, word))


def handle_emojis(tweet):
    """Replace text emoticons with sentiment markers.

    Each rule below is applied in order to the whole text; positive emoticons
    become ' EMO_POS ' and negative ones ' EMO_NEG ' (both space-padded).

    Args:
        tweet: The text to scan.

    Returns:
        The text with every matched emoticon replaced by its marker.
    """
    emoticon_rules = (
        # Smile -- :), : ), :-), (:, ( :, (-:, :')
        (r'(:\s?\)|:-\)|\(\s?:|\(-:|:\'\))', ' EMO_POS '),
        # Laugh -- :D, : D, :-D, xD, x-D, XD, X-D
        (r'(:\s?D|:-D|x-?D|X-?D)', ' EMO_POS '),
        # Love -- <3, :*
        (r'(<3|:\*)', ' EMO_POS '),
        # Wink -- ;-), ;), ;-D, ;D, (;,  (-;
        (r'(;-?\)|;-?D|\(-?;)', ' EMO_POS '),
        # Sad -- :-(, : (, :(, ):, )-:
        (r'(:\s?\(|:-\(|\)\s?:|\)-:)', ' EMO_NEG '),
        # Cry -- :,(, :'(, :"(
        (r'(:,\(|:\'\(|:"\()', ' EMO_NEG '),
    )
    for pattern, marker in emoticon_rules:
        tweet = re.sub(pattern, marker, tweet)
    return tweet


def preprocess_tweet(tweet):
    """Clean a raw tweet/post and return its valid words as one string.

    The text is lower-cased, URLs and @mentions are replaced by the
    placeholders ``URL`` and ``USER_MENTION``, the ``#`` is dropped from
    hashtags, retweet markers and stand-alone ``nan`` tokens are removed,
    emoticons are mapped by ``handle_emojis``, and every remaining token is
    cleaned with ``preprocess_word`` and kept only if ``is_valid_word``
    accepts it.

    Args:
        tweet: The raw text.

    Returns:
        The surviving words joined by single spaces.
    """
    # Convert to lower case
    # NOTE(review): because this runs before handle_emojis, emoticons written
    # with an uppercase "D" (":D", "xD", ";D") no longer match its patterns.
    tweet = tweet.lower()
    # Replace URLs with the word URL
    tweet = re.sub(r'((www\.[\S]+)|(https?://[\S]+))', ' URL ', tweet)
    # Replace @handle with the word USER_MENTION. This consumes every "@"
    # that is followed by a non-space character, so no separate
    # "strip @mentions" pass is needed afterwards.
    tweet = re.sub(r'@[\S]+', 'USER_MENTION', tweet)
    # Replace #hashtag with hashtag
    tweet = re.sub(r'#(\S+)', r' \1 ', tweet)
    # Remove RT (retweet)
    tweet = re.sub(r'\brt\b', '', tweet)
    # Replace 2+ dots with space
    tweet = re.sub(r'\.{2,}', ' ', tweet)
    # Remove stand-alone "nan" tokens. Match whole words only, so that words
    # merely containing "nan" ("banana", "finance") are left intact.
    tweet = re.sub(r'\bnan\b', '', tweet)
    # Strip space, " and ' from tweet
    tweet = tweet.strip(' "\'')
    # Replace emojis with either EMO_POS or EMO_NEG
    tweet = handle_emojis(tweet)

    # str.split() with no argument already splits on any run of whitespace.
    cleaned_words = (preprocess_word(word) for word in tweet.split())
    return ' '.join(word for word in cleaned_words if is_valid_word(word))

In [None]:
# Load the risk-labelled posts from the dataset folder and preview them.
risk_csv = Path(data_path) / 'risk_title_body.csv'
suicide_data = pd.read_csv(risk_csv)
suicide_data.head()

In [None]:
# Assign explicit names to the loaded columns by position, then preview.
renamed_columns = ['index', 'risk_label', 'title_body']
suicide_data.columns = renamed_columns
suicide_data.head()

In [None]:
# suicide_data['text'] = suicide_data.title_body.apply(preprocess_tweet)
# suicide_data.head()

In [None]:
# Example: send a few short sentences to the CoreNLP server at localhost:9000
# and print the sentiment reported for each sentence.
nlp = StanfordCoreNLP('http://localhost:9000')
example_props = {
    'annotators': 'sentiment',
    'outputFormat': 'json',
}
res = nlp.annotate("I love you. I hate him. You are nice. He is dumb",
                   properties=example_props)
for sentence in res["sentences"]:
    sentence_text = " ".join([t["word"] for t in sentence["tokens"]])
    print("{0}: {1}: {2} {3}".format(sentence["index"], sentence_text,
                                     sentence["sentimentValue"], sentence["sentiment"]))

In [None]:
def remove_mentions(text):
    """Return *text* with every ``@handle`` mention removed.

    Defined here because this notebook neither defines nor imports a
    ``remove_mentions`` helper; it uses the same ``@\\w+`` rule that
    ``preprocess_tweet`` applies to mentions.
    """
    return re.sub(r'@\w+', '', text)


# Load the airline tweets and add a mention-free copy of the text as 'prop'.
tweets = pd.read_csv(Path(data_path, 'Tweets.csv'))
tweets['prop'] = tweets['text'].apply(remove_mentions)
tweets.head()

In [None]:
# Annotate the first 12 tweets with CoreNLP sentence-level sentiment and
# collect one row per tweet.
nlp = StanfordCoreNLP('http://localhost:9000')

sentiment_list = list()  # not used in this cell
sentiment_columns = ['text', 'sentiments', 'sentiment_dist', 'y']
# Rows are gathered in a list and turned into a DataFrame once at the end:
# DataFrame.append copied the whole frame on every call and no longer exists
# in pandas 2.x.
sentiment_rows = []

# islice(iterable, start, stop): only rows 0..11 are processed.
for _, tweet in islice(tweets.iterrows(), 0, 12):
    res = nlp.annotate(tweet['prop'], properties={'annotators': 'sentiment', 'outputFormat': 'json'})
    sentiments = []
    sentiment_dist = []
    for s in res["sentences"]:
        print("{0}: {1}: {2} {3}".format(s["index"], " ".join([t["word"] for t in s["tokens"]]), s["sentimentValue"], s["sentiment"]))
        sentiments.append(s["sentiment"])
        sentiment_dist.append(s['sentimentDistribution'])
    # The key must match the declared 'sentiment_dist' column; the former
    # 'sent dist' key left that column empty and created a stray extra one.
    sentiment_rows.append({'text': tweet['prop'], 'sentiments': sentiments,
                           'sentiment_dist': sentiment_dist, 'y': tweet['airline_sentiment']})

averaged_sentiment = pd.DataFrame(sentiment_rows, columns=sentiment_columns)

In [None]:
# Annotate posts with CoreNLP sentence-level sentiment and collect one row
# per post.
nlp = StanfordCoreNLP('http://localhost:9000')

sentiment_list = list()  # not used in this cell
sentiment_columns = ['text', 'sentiments', 'sentiment_dist', 'y']
# Rows are gathered in a list and turned into a DataFrame once at the end:
# DataFrame.append copied the whole frame on every call and no longer exists
# in pandas 2.x.
sentiment_rows = []

total_len = suicide_data.shape[0]

for ix, post in suicide_data.iterrows():
    print('processing {0}/{1}'.format(ix, total_len))
    res = nlp.annotate(post['title_body'], properties={'timeout': 600000, 'annotators': 'sentiment', 'outputFormat': 'json'})
    sentiments = []
    sentiment_dist = []
    for s in res["sentences"]:
        print("{0}: {1}: {2} {3}".format(s["index"], " ".join([t["word"] for t in s["tokens"]]), s["sentimentValue"], s["sentiment"]))
        sentiments.append(s["sentiment"])
        sentiment_dist.append(s['sentimentDistribution'])
    # The key must match the declared 'sentiment_dist' column; the former
    # 'sent dist' key left that column empty and created a stray extra one.
    sentiment_rows.append({'text': post['title_body'], 'sentiments': sentiments,
                           'sentiment_dist': sentiment_dist, 'y': post['risk_label']})
    # Stop once the row labelled 2 has been processed (the first three posts,
    # assuming the default 0-based index from read_csv).
    if ix == 2:
        break

sentiment_df = pd.DataFrame(sentiment_rows, columns=sentiment_columns)

In [None]:
# Write the posts out as five CSV shards: four fixed slices of 11403 rows
# each, followed by whatever rows remain.
shards = [
    (0, 11403, 'risk_title_body_1.csv'),
    (11403, 22806, 'risk_title_body_2.csv'),
    (22806, 34209, 'risk_title_body_3.csv'),
    (34209, 45612, 'risk_title_body_4.csv'),
    (45612, None, 'risk_title_body_5.csv'),
]
for start, stop, shard_name in shards:
    suicide_data[start:stop].to_csv(Path(data_path, shard_name), index=False)