<a href="https://colab.research.google.com/github/gamecicn/sample_jupyter/blob/main/NLP_FP_LSTM_Emotional_analysis_W2V_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# NLP FP Emotional analysis

In [None]:
# Install
!pip install numpy==1.16.2


Collecting numpy==1.16.2
[?25l  Downloading https://files.pythonhosted.org/packages/35/d5/4f8410ac303e690144f0a0603c4b8fd3b986feb2749c435f7cdbb288f17e/numpy-1.16.2-cp36-cp36m-manylinux1_x86_64.whl (17.3MB)
[K     |████████████████████████████████| 17.3MB 173kB/s 
[31mERROR: umap-learn 0.4.6 has requirement numpy>=1.17, but you'll have numpy 1.16.2 which is incompatible.[0m
[31mERROR: datascience 0.10.6 has requirement folium==0.2.1, but you'll have folium 0.8.3 which is incompatible.[0m
[31mERROR: albumentations 0.1.12 has requirement imgaug<0.2.7,>=0.2.5, but you'll have imgaug 0.2.9 which is incompatible.[0m
[?25hInstalling collected packages: numpy
  Found existing installation: numpy 1.18.5
    Uninstalling numpy-1.18.5:
      Successfully uninstalled numpy-1.18.5
Successfully installed numpy-1.16.2


In [None]:
# Setup



# All the imports!

import numpy as np
from numpy import array

import tensorflow as tf 
from tensorflow.keras.preprocessing import sequence
from keras.callbacks import LambdaCallback
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.layers import Dense, Activation
from keras.models import Sequential

 
# Supress deprecation warnings
import logging
logging.getLogger('tensorflow').disabled = True


In [None]:
# Mount google drive
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


## Sklearn import data


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("/content/drive/My Drive/DS_data/ISEAR_aug.csv", sep=",")
df.dropna(inplace=True)

labels = df.emotion.factorize()
labels_index = labels[1]
df['emotion'] = labels[0]

training_data, testing_data, y_train, y_test = train_test_split(df.text, df.emotion, test_size=0.1, random_state=123, shuffle=True)

In [None]:
training_data.head(5)

4844     The teacher of one course gave me 10 questions...
3130     I felt disgust with alcohol in general when a ...
14152    When I was taking my girlfriend out, she seeme...
9014     For example, to feel bad because of one's atti...
12479    My friend replying to my letter and again sayi...
Name: text, dtype: object

In [None]:
y_train.head(5)

4844     4
3130     4
14152    4
9014     5
12479    0
Name: emotion, dtype: int64

## Pre-processing Data

### TF-IDF

In [None]:
'''
from sklearn.feature_extraction.text import TfidfVectorizer

# extract features
vectorizer1 = TfidfVectorizer(stop_words = "english")
x_train = vectorizer1.fit_transform(training_data)
# Use training data's vocabulary to create test tf-idf matrix
vectorizer2 = TfidfVectorizer(stop_words = "english",vocabulary=vectorizer1.vocabulary_)
x_test = vectorizer2.fit_transform(testing_data)

vocab_size = x_train.shape[1]
review_length = 500

print("vocab_size : {}".format(vocab_size))
'''



'\nfrom sklearn.feature_extraction.text import TfidfVectorizer\n\n# extract features\nvectorizer1 = TfidfVectorizer(stop_words = "english")\nx_train = vectorizer1.fit_transform(training_data)\n# Use training data\'s vocabulary to create test tf-idf matrix\nvectorizer2 = TfidfVectorizer(stop_words = "english",vocabulary=vectorizer1.vocabulary_)\nx_test = vectorizer2.fit_transform(testing_data)\n\nvocab_size = x_train.shape[1]\nreview_length = 500\n\nprint("vocab_size : {}".format(vocab_size))\n'

### Wrod2Vec

In [None]:
import gensim

print('\nTraining word2vec...')
 

max_sentence_len = max([ len(x) for x in df["text"] ])

word_model = gensim.models.Word2Vec(df["text"], size=100, min_count=1, window=5, iter=100)
pretrained_weights = word_model.wv.syn0
vocab_size, emdedding_size = pretrained_weights.shape
print('Result embedding shape:', pretrained_weights.shape)

def word2idx(word):
  return word_model.wv.vocab[word].index
def idx2word(idx):
  return word_model.wv.index2word[idx]

print('\nPreparing the data for LSTM...')

x_train = np.zeros([len(training_data), max_sentence_len], dtype=np.int32)
for i, sentence in enumerate(training_data):
  for t, word in enumerate(sentence):
    x_train[i, t] = word2idx(word)
print('x_train shape:', x_train.shape)


x_test = np.zeros([len(testing_data), max_sentence_len], dtype=np.int32)
for i, sentence in enumerate(testing_data):
  for t, word in enumerate(sentence):
    x_test[i, t] = word2idx(word)
print('x_test shape:', x_test.shape)




Training word2vec...


  if __name__ == '__main__':


Result embedding shape: (85, 100)

Preparing the data for LSTM...
x_train shape: (15111, 890)
x_test shape: (1679, 890)


In [None]:
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

print('vocab_size: {}'.format(vocab_size))
print('emdedding_size: {}'.format(emdedding_size))

review_length = max_sentence_len


y_train shape: (15111,)
y_test shape: (1679,)
vocab_size: 85
emdedding_size: 100


## Create and build LSTM Recurrent Neural Network

In [None]:
# We begin by defining the a empty stack. We'll use this for building our 
# network, later by layer.
model = tf.keras.models.Sequential()

# The Embedding Layer provides a spatial mapping (or Word Embedding) of all the 
# individual words in our training set. Words close to one another share context 
# and or meaning. This spatial mapping is learning during the training process.

 
model.add(
    tf.keras.layers.Embedding(
        input_dim = vocab_size, # The size of our vocabulary 
        output_dim = emdedding_size, # Dimensions to which each words shall be mapped
        weights=[pretrained_weights]
    )
)
 
# Dropout layers fight overfitting and forces the model to learn multiple 
# representations of the same data by randomly disabling neurons in the 
# learning phase.
model.add(
    tf.keras.layers.Dropout(
        rate=0.25 # Randomly disable 25% of neurons
    )
)

# We are using a fast version of LSTM whih is optimised for GPUs. This layer 
# looks at the sequence of words in the review, along with their word embeddings
# and uses both of these to determine to sentiment of a given review.
model.add(
    tf.compat.v1.keras.layers.CuDNNLSTM(
        units=emdedding_size  
    )
)

# Add a second dropout layer with the same aim as the first.
model.add(
    tf.keras.layers.Dropout(
        rate=0.25 # Randomly disable 25% of neurons
    )
)

# All LSTM units are connected to a single node in the dense layer. A sigmoid 
# activation function determines the output from this node - a value 
# between 0 and 1. Closer to 0 indicates a negative review. Closer to 1 
# indicates a positive review.
model.add(
    tf.keras.layers.Dense(
        units=vocab_size, # Single unit
        activation='softmax' # Sigmoid activation function (output from 0 to 1)
    )
)

# Compile the model
model.compile(
    loss=tf.keras.losses.binary_crossentropy, # loss function
    optimizer=tf.keras.optimizers.Adam(), # optimiser function
    metrics=['accuracy']) # reporting metric

# Display a summary of the models structure
model.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, None, 100)         8500      
_________________________________________________________________
dropout_13 (Dropout)         (None, None, 100)         0         
_________________________________________________________________
cu_dnnlstm_9 (CuDNNLSTM)     (None, 100)               80800     
_________________________________________________________________
dropout_14 (Dropout)         (None, 100)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 85)                8585      
Total params: 97,885
Trainable params: 97,885
Non-trainable params: 0
_________________________________________________________________


In [None]:
#x_train = np.array(x_train)
#y_train = np.array(y_train)
type(x_train)

numpy.ndarray

In [None]:
type(y_train)

numpy.ndarray

## Train the LSTM

In [None]:
# Train the LSTM on the training data
history = model.fit(

    # Training data : features (review) and classes (positive or negative)
    x_train, y_train,
                    
    # Number of samples to work through before updating the 
    # internal model parameters via back propagation. The 
    # higher the batch, the more memory you need.
    batch_size=64, 

    # An epoch is an iteration over the entire training data.
    epochs=50, 
    
    # The model will set apart his fraction of the training 
    # data, will not train on it, and will evaluate the loss
    # and any model metrics on this data at the end of 
    # each epoch.
    validation_split=0.2,
    
    verbose=1
) 

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Get Model Predictions for test data
from sklearn.metrics import classification_report

predicted_classes = model.predict_classes(x_test)
print(classification_report(y_test, predicted_classes, target_names=labels_index))

ValueError: ignored