<a href="https://colab.research.google.com/github/gamecicn/Deep-Learning-Papers-Reading-Roadmap/blob/master/NLP_FP_LSTM_Emotional_analysis_TF-IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# NLP FP Emotional analysis

In [3]:
# Install
!pip install numpy==1.16.2


Collecting numpy==1.16.2
[?25l  Downloading https://files.pythonhosted.org/packages/35/d5/4f8410ac303e690144f0a0603c4b8fd3b986feb2749c435f7cdbb288f17e/numpy-1.16.2-cp36-cp36m-manylinux1_x86_64.whl (17.3MB)
[K     |████████████████████████████████| 17.3MB 1.5MB/s 
[31mERROR: umap-learn 0.4.6 has requirement numpy>=1.17, but you'll have numpy 1.16.2 which is incompatible.[0m
[31mERROR: datascience 0.10.6 has requirement folium==0.2.1, but you'll have folium 0.8.3 which is incompatible.[0m
[31mERROR: albumentations 0.1.12 has requirement imgaug<0.2.7,>=0.2.5, but you'll have imgaug 0.2.9 which is incompatible.[0m
[?25hInstalling collected packages: numpy
  Found existing installation: numpy 1.18.5
    Uninstalling numpy-1.18.5:
      Successfully uninstalled numpy-1.18.5
Successfully installed numpy-1.16.2


In [3]:
# Setup



# All the imports!
import tensorflow as tf 
import numpy as np
from tensorflow.keras.preprocessing import sequence
from numpy import array

 
# Supress deprecation warnings
import logging
logging.getLogger('tensorflow').disabled = True


In [4]:
# Mount google drive
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
# Loca data from Google Drieve
TRAIN_DATA_URL = "/content/drive/My Drive/DS_data/ISEAR_train.csv"
TEST_DATA_URL = "/content/drive/My Drive/DS_data/ISEAR_test.csv"

train_file_path = tf.keras.utils.get_file(TRAIN_DATA_URL, TRAIN_DATA_URL)
test_file_path = tf.keras.utils.get_file(TRAIN_DATA_URL, TEST_DATA_URL)

LABEL_COLUMN = "emotion"


In [6]:
def get_dataset(file_path):
  dataset = tf.data.experimental.make_csv_dataset(
      file_path,
      batch_size=12, # Artificially small to make examples easier to show.
      label_name=LABEL_COLUMN,
      na_value="",
      num_epochs=1,
      ignore_errors=True)
  return dataset

raw_train_data = get_dataset(train_file_path)
raw_test_data = get_dataset(test_file_path)

In [8]:
examples, labels = next(iter(raw_train_data)) # Just the first batch.
print("EXAMPLES: \n", examples, "\n")
print("LABELS: \n", labels)

EXAMPLES: 
 OrderedDict([('text', <tf.Tensor: shape=(12,), dtype=string, numpy=
array([b'When I received my TE Score and my acceptance from University - \xc3\xa1 it had been a goal since I left the bank a year earlier.',
       b'During lunch with some classmates, a priest was also there.  He \xc3\xa1 was eating desperately as if someone was going to take his plate \xc3\xa1 away.  He did it with gluttony.  He wiped his hands on his suit \xc3\xa1 several times and it was full of crumbs.',
       b'When I was young, one day in the car we fell sick and ended up \xc3\xa1 vomitting on each other.',
       b'When the gear broke on my car.',
       b'On boat trip - saw mother giving young child (aprox 6-7 years) \xc3\xa1 cigarettes.',
       b'One night I had a feeling that somebody was stealing my car, \xc3\xa1 but I was too frightened to go and see.',
       b'I felt afraid when I smoked maryhuana for the first time in my \xc3\xa1 life with a lot of friends in the middle of the street at da

## Sklearn import data


In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split

df = pd.read_csv("/content/drive/My Drive/DS_data/ISEAR_clean.csv", sep=",")
df.dropna(inplace=True)

labels = df.emotion.factorize()
labels_index = labels[1]
df['emotion'] = labels[0]

training_data, testing_data, y_train, y_test = train_test_split(df.text, df.emotion, test_size=0.3, random_state=123, shuffle=True)

In [33]:
train_data.head(5)

4102                When I had an argument with a friend.
2284    One night, when I got out from the University ...
2061    My father died last year after an 8-week sever...
1998    The loss of my father as he died of a massive ...
1261    The breaking up of a relationship.  We decided...
Name: text, dtype: object

In [34]:
y_train.head(5)

4102    3
2284    1
2061    3
1998    3
1261    3
Name: emotion, dtype: int64

## Pre-processing Data

In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer

# extract features
vectorizer1 = TfidfVectorizer(stop_words = "english")
x_train = vectorizer1.fit_transform(training_data)
# Use training data's vocabulary to create test tf-idf matrix
vectorizer2 = TfidfVectorizer(stop_words = "english",vocabulary=vectorizer1.vocabulary_)
x_test = vectorizer2.fit_transform(testing_data)

vocab_size = x_train.shape[1]
review_length = 500

print("vocab_size : {}".format(vocab_size))




vocab_size : 7307


## Create and build LSTM Recurrent Neural Network

In [36]:
# We begin by defining the a empty stack. We'll use this for building our 
# network, later by layer.
model = tf.keras.models.Sequential()

# The Embedding Layer provides a spatial mapping (or Word Embedding) of all the 
# individual words in our training set. Words close to one another share context 
# and or meaning. This spatial mapping is learning during the training process.
model.add(
    tf.keras.layers.Embedding(
        input_dim = vocab_size, # The size of our vocabulary 
        output_dim = 32, # Dimensions to which each words shall be mapped
        input_length = review_length # Length of input sequences
    )
)

# Dropout layers fight overfitting and forces the model to learn multiple 
# representations of the same data by randomly disabling neurons in the 
# learning phase.
model.add(
    tf.keras.layers.Dropout(
        rate=0.25 # Randomly disable 25% of neurons
    )
)

# We are using a fast version of LSTM whih is optimised for GPUs. This layer 
# looks at the sequence of words in the review, along with their word embeddings
# and uses both of these to determine to sentiment of a given review.
model.add(
    tf.compat.v1.keras.layers.CuDNNLSTM(
        units=32 # 32 LSTM units in this layer
    )
)

# Add a second dropout layer with the same aim as the first.
model.add(
    tf.keras.layers.Dropout(
        rate=0.25 # Randomly disable 25% of neurons
    )
)

# All LSTM units are connected to a single node in the dense layer. A sigmoid 
# activation function determines the output from this node - a value 
# between 0 and 1. Closer to 0 indicates a negative review. Closer to 1 
# indicates a positive review.
model.add(
    tf.keras.layers.Dense(
        units=1, # Single unit
        activation='sigmoid' # Sigmoid activation function (output from 0 to 1)
    )
)

# Compile the model
model.compile(
    loss=tf.keras.losses.binary_crossentropy, # loss function
    optimizer=tf.keras.optimizers.Adam(), # optimiser function
    metrics=['accuracy']) # reporting metric

# Display a summary of the models structure
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 500, 32)           233824    
_________________________________________________________________
dropout_4 (Dropout)          (None, 500, 32)           0         
_________________________________________________________________
cu_dnnlstm_2 (CuDNNLSTM)     (None, 32)                8448      
_________________________________________________________________
dropout_5 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 33        
Total params: 242,305
Trainable params: 242,305
Non-trainable params: 0
_________________________________________________________________


In [49]:
#x_train = np.array(x_train)
#y_train = np.array(y_train)
type(x_train)

scipy.sparse.csr.csr_matrix

In [50]:
type(y_train)

numpy.ndarray

## Train the LSTM

In [51]:
# Train the LSTM on the training data
history = model.fit(

    # Training data : features (review) and classes (positive or negative)
    x_train.toarray(), y_train,
                    
    # Number of samples to work through before updating the 
    # internal model parameters via back propagation. The 
    # higher the batch, the more memory you need.
    batch_size=256, 

    # An epoch is an iteration over the entire training data.
    epochs=2, 
    
    # The model will set apart his fraction of the training 
    # data, will not train on it, and will evaluate the loss
    # and any model metrics on this data at the end of 
    # each epoch.
    validation_split=0.2,
    
    verbose=1
) 

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [54]:
# Get Model Predictions for test data
from sklearn.metrics import classification_report

predicted_classes = model.predict_classes(x_test.toarray())
print(classification_report(y_test, predicted_classes, target_names=labels_index))

              precision    recall  f1-score   support

         joy       0.00      0.00      0.00       322
        fear       0.14      1.00      0.24       315
       anger       0.00      0.00      0.00       337
     sadness       0.00      0.00      0.00       310
     disgust       0.00      0.00      0.00       339
       shame       0.00      0.00      0.00       338
       guilt       0.00      0.00      0.00       339

    accuracy                           0.14      2300
   macro avg       0.02      0.14      0.03      2300
weighted avg       0.02      0.14      0.03      2300



  _warn_prf(average, modifier, msg_start, len(result))
