<a href="https://colab.research.google.com/github/glenkalarikkal/CNN_Sentiment/blob/main/CNN_Sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **CNN for sentiment analysis**

Load data from Google Drive.

In [None]:
import pandas as pd
import tensorflow as tf
from bs4 import BeautifulSoup
import re
import tensorflow_datasets as tfds


In [None]:
from google.colab import drive
drive.mount("/content/drive")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [None]:
# Column names assigned to both CSV files (they are read with header=None).
cols = ["sentiment", "id", "date", "query", "user", "text"]

# Both files are parsed with exactly the same options.
csv_options = dict(header=None, names=cols, engine="python", encoding="latin1")

train_data = pd.read_csv(
    "/content/drive/My Drive/Colab Training Data/training.1600000.processed.noemoticon.csv",
    **csv_options,
)
test_data = pd.read_csv(
    "/content/drive/My Drive/Colab Training Data/testdata.manual.2009.06.14.csv",
    **csv_options,
)

In [None]:
train_data.columns

Index(['sentiment', 'id', 'date', 'query', 'user', 'text'], dtype='object')

In [None]:
def clean_data(df):
    """Turn a raw tweet frame into cleaned texts and a label array.

    The input frame is left untouched, so the calling cell can be re-run
    without errors (dropping columns in place made a second call fail
    because the columns were already gone).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame providing at least a ``text`` and a ``sentiment`` column.

    Returns
    -------
    tuple
        ``(data_clean, data_sentiment)`` where ``data_clean`` is a list with
        ``clean_tweet`` applied to every entry of ``df["text"]`` and
        ``data_sentiment`` is a NumPy array of the sentiment values with
        label 4 remapped to 1 (all other values are kept as they are).
    """
    data_clean = [clean_tweet(tweet) for tweet in df["text"]]

    # Work on a copy so the remapping never writes through into the
    # caller's DataFrame.
    data_sentiment = df["sentiment"].to_numpy(copy=True)
    data_sentiment[data_sentiment == 4] = 1
    return data_clean, data_sentiment

In [None]:
def clean_tweet(tweet):
    """Normalise one raw tweet into plain text for tokenisation.

    Processing order: extract the text from any HTML markup, blank out
    @mentions and URLs, replace every character that is not an ASCII letter
    or one of ``. ! ? '`` with a space, then collapse runs of spaces.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet. Leading/trailing single spaces are not stripped.
    """
    tweet = BeautifulSoup(tweet, "lxml").get_text()
    # Removing the @mentions. Handles may contain underscores; without "_" in
    # the class, "@some_user" left the fragment "user" in the text.
    tweet = re.sub(r"@[A-Za-z0-9_]+", ' ', tweet)
    # Removing the URL links. Consume up to the next whitespace so characters
    # such as "-", "_", "?" and "=" no longer cut the match short and leave
    # URL fragments behind.
    tweet = re.sub(r"https?://\S+", ' ', tweet)
    # Keeping only letters and basic sentence punctuation
    tweet = re.sub(r"[^a-zA-Z.!?']", ' ', tweet)
    # Removing additional whitespaces
    tweet = re.sub(r" +", ' ', tweet)
    return tweet

In [None]:
def tokenize_corpus(data, maxlen, encoder):
    """Encode every sentence and pad the results to a common length.

    Parameters
    ----------
    data : iterable of str
        Sentences to encode.
    maxlen : int or None
        Forwarded unchanged to ``pad_sequences`` as ``maxlen``.
    encoder : object
        Anything exposing ``encode(sentence)`` that returns a sequence of ids.

    Returns
    -------
    The value returned by ``pad_sequences``: the encoded sentences padded
    with 0 at the end ("post" padding).
    """
    encoded_sentences = list(map(encoder.encode, data))
    return tf.keras.preprocessing.sequence.pad_sequences(
        encoded_sentences,
        maxlen=maxlen,
        padding="post",
        value=0,
    )

In [None]:
# Clean both splits; each call returns (list of cleaned tweets, label array).
train_datapoints, train_labels = clean_data(train_data)
test_datapoints, test_labels = clean_data(test_data)

In [None]:
# Target size handed to the subword vocabulary builder.
vocab_size = 2**16
# Fit the vocabulary on the training tweets only: including the test tweets
# leaks evaluation data into the model's input representation (and builds a
# throw-away copy of the whole training list via list concatenation).
tokenizer = tfds.features.text.SubwordTextEncoder.build_from_corpus(
    train_datapoints, vocab_size
)

In [None]:
# Encode the training tweets first and let pad_sequences pad to the longest
# *token* sequence (maxlen=None). The previous len(sentence) measured
# characters, not tokens, which inflates every padded row.
tokenized_train = tokenize_corpus(train_datapoints, None, tokenizer)
MAX_LEN = tokenized_train.shape[1]
print(MAX_LEN)

# Pad/truncate the test split to the training width so both arrays match.
tokenized_test = tokenize_corpus(test_datapoints, MAX_LEN, tokenizer)

246


In [None]:
from tensorflow.keras import layers


class DCNN(tf.keras.Model):
    """Text CNN with parallel 2-, 3- and 4-gram convolution branches.

    Token ids are embedded, passed through three ``Conv1D`` layers with
    kernel sizes 2, 3 and 4, each followed by global max pooling; the pooled
    features are concatenated and fed through a dense layer, dropout and an
    output layer.

    Parameters
    ----------
    vocab_size : int
        Number of rows of the embedding table.
    embedding_dim : int
        Embedding width.
    nb_filters : int
        Number of filters in each convolution branch.
    FFN_units : int
        Units of the hidden dense layer.
    nb_classes : int
        With 2 the output is a single sigmoid unit, otherwise a softmax over
        ``nb_classes`` units.
    dropout_rate : float
        Rate of the dropout layer placed after the hidden dense layer.
    training : bool
        Unused; kept so existing callers that pass it keep working.
    name : str
        Keras model name.
    """

    def __init__(self,
                 vocab_size,
                 embedding_dim=128,
                 nb_filters=50,
                 FFN_units=512,
                 nb_classes=2,
                 dropout_rate=0.2,
                 training=False,
                 name="dcnn"):
        super(DCNN, self).__init__(name=name)
        self.embedding = layers.Embedding(vocab_size, embedding_dim)

        self.bigram = layers.Conv1D(filters=nb_filters, kernel_size=2, padding="valid", activation="relu")
        self.trigram = layers.Conv1D(filters=nb_filters, kernel_size=3, padding="valid", activation="relu")
        self.fourgram = layers.Conv1D(filters=nb_filters, kernel_size=4, padding="valid", activation="relu")

        self.pool = layers.GlobalMaxPooling1D()
        self.dense = layers.Dense(units=FFN_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if nb_classes == 2:
            self.last_dense = layers.Dense(units=1, activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=nb_classes, activation="softmax")

    def _apply_layers(self, inputs, training=None):
        """Run the shared forward pass used by both ``call`` and ``get_func_model``."""
        x = self.embedding(inputs)
        # Same order as before: bigram, trigram, fourgram, each max-pooled.
        pooled = [self.pool(conv(x))
                  for conv in (self.bigram, self.trigram, self.fourgram)]

        merged = tf.concat(pooled, axis=-1)
        merged = self.dense(merged)
        # training=None leaves the train/inference decision to Keras.
        merged = self.dropout(merged, training=training)
        return self.last_dense(merged)

    def get_func_model(self, training=False, input_length=138):
        """Wrap this object's layers in a functional Keras model.

        Parameters
        ----------
        training : bool
            Unused; kept for backward compatibility.
        input_length : int
            Length of each input sequence. Defaults to 138, the value that
            used to be hard-coded; pass the padded width of the training
            array when it differs.

        Returns
        -------
        tf.keras.models.Model
            Functional model sharing this object's layers.
        """
        inp = tf.keras.Input(shape=(input_length,))
        output = self._apply_layers(inp)
        return tf.keras.models.Model(inputs=inp, outputs=output)

    def call(self, inputs, training=None):
        """Forward pass for subclass-style use; ``training`` is forwarded to dropout."""
        return self._apply_layers(inputs, training=training)


In [None]:
class Config:
    """Plain holder for the hyper-parameters read by ``train``.

    Every constructor argument is stored unchanged on an attribute of the
    same name.
    """

    def __init__(self,
                 VOCAB_SIZE,
                 EMB_DIM,
                 NB_FILTERS,
                 FFN_UNITS,
                 NB_CLASSES,
                 DROPOUT_RATE,
                 BATCH_SIZE,
                 NB_EPOCHS):
        # Model architecture.
        self.VOCAB_SIZE, self.EMB_DIM = VOCAB_SIZE, EMB_DIM
        self.NB_FILTERS, self.FFN_UNITS = NB_FILTERS, FFN_UNITS
        self.NB_CLASSES = NB_CLASSES  # e.g. len(set(train_labels))

        # Regularisation.
        self.DROPOUT_RATE = DROPOUT_RATE

        # Training loop.
        self.BATCH_SIZE, self.NB_EPOCHS = BATCH_SIZE, NB_EPOCHS


In [None]:
def train(tokenized_train, train_labels, config):
    """Build, compile, fit and checkpoint a DCNN functional model.

    Parameters
    ----------
    tokenized_train
        Training inputs passed straight to ``model.fit``.
    train_labels
        Training targets passed straight to ``model.fit``.
    config
        Object exposing VOCAB_SIZE, EMB_DIM, NB_FILTERS, FFN_UNITS,
        NB_CLASSES, DROPOUT_RATE, BATCH_SIZE and NB_EPOCHS.

    Returns
    -------
    The fitted functional Keras model.
    """
    network = DCNN(
        vocab_size=config.VOCAB_SIZE,
        embedding_dim=config.EMB_DIM,
        nb_filters=config.NB_FILTERS,
        FFN_units=config.FFN_UNITS,
        nb_classes=config.NB_CLASSES,
        dropout_rate=config.DROPOUT_RATE,
    )
    # NOTE(review): the model's input length is decided inside
    # DCNN.get_func_model, not derived from tokenized_train - make sure the
    # two agree.
    model = network.get_func_model(training=True)

    # The binary and multi-class setups differ only in loss and metric.
    is_binary = config.NB_CLASSES == 2
    loss_name = "binary_crossentropy" if is_binary else "sparse_categorical_crossentropy"
    metric_name = "accuracy" if is_binary else "sparse_categorical_accuracy"
    model.compile(
        loss=loss_name,
        experimental_steps_per_execution=50,
        optimizer="adam",
        metrics=[metric_name],
    )

    # Resume from the newest checkpoint in ./ckpt when one exists.
    checkpoint = tf.train.Checkpoint(Dcnn=model)
    manager = tf.train.CheckpointManager(checkpoint, "ckpt", max_to_keep=5)
    latest = manager.latest_checkpoint
    if latest:
        checkpoint.restore(latest)
        print("Latest checkpoint restored!!")

    model.fit(
        tokenized_train,
        train_labels,
        batch_size=config.BATCH_SIZE,
        epochs=config.NB_EPOCHS,
    )
    model.summary()

    manager.save()
    # Direct model.save to Drive is disabled: per the original author's note,
    # the TPU requires the storage location to be a GCS bucket.
    #model.save('/content/drive/My Drive/Colab Training Data/output/output.tf')
    return model


In [None]:
# Hyper-parameters for this run; VOCAB_SIZE is taken from the fitted tokenizer.
config = Config(
    VOCAB_SIZE=tokenizer.vocab_size,
    EMB_DIM=200,
    NB_FILTERS=100,
    FFN_UNITS=256,
    NB_CLASSES=2,
    DROPOUT_RATE=0.2,
    BATCH_SIZE=512,
    NB_EPOCHS=5
)

In [None]:
# Sanity check: display the config object and the arrays handed to train().
config, tokenized_train, train_labels

(<__main__.Config at 0x7f5b43ec18d0>,
 array([[65335,  1570,   113, ...,     0,     0,     0],
        [   11,  1090,    23, ...,     0,     0,     0],
        [65335,     3, 41585, ...,     0,     0,     0],
        ...,
        [  926,    12,   229, ...,     0,     0,     0],
        [  366,   337,  1309, ...,     0,     0,     0],
        [  181, 51256,     0, ...,     0,     0,     0]], dtype=int32),
 array([0, 0, 0, ..., 1, 1, 1]))

In [None]:
 # List the GPU devices TensorFlow can see in this runtime.
 tf.config.experimental.list_physical_devices('GPU') 
 

[]

In [None]:
print("Tensorflow version " + tf.__version__)

try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
except ValueError as err:
    # RuntimeError (not bare BaseException) so ordinary `except Exception`
    # handlers and tooling treat this as a normal error; `from err` keeps the
    # resolver's original message in the traceback. The old text pointed to a
    # "previous cell" with instructions that this notebook does not contain.
    raise RuntimeError(
        'ERROR: Not connected to a TPU runtime; in Colab choose '
        'Runtime > Change runtime type > TPU and re-run this cell.'
    ) from err

tf.config.experimental_connect_to_cluster(tpu)
tf.tpu.experimental.initialize_tpu_system(tpu)
# NOTE(review): train() as written does not reference tpu_strategy; the model
# presumably has to be created inside `tpu_strategy.scope()` to actually be
# placed on the TPU - confirm.
tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)

Tensorflow version 2.3.0
Running on TPU  ['10.82.195.162:8470']




INFO:tensorflow:Initializing the TPU system: grpc://10.82.195.162:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.82.195.162:8470


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


In [None]:
# Fit the model with the hyper-parameters in `config`; train() returns the fitted Keras model.
model = train(tokenized_train, train_labels, config)

Epoch 1/5




















Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Model: "functional_25"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_13 (InputLayer)           [(None, 138)]        0                                            
__________________________________________________________________________________________________
embedding_12 (Embedding)        (None, 138, 200)     13111800    input_13[0][0]                   
__________________________________________________________________________________________________
conv1d_36 (Conv1D)              (None, 137, 100)     40100       embedding_12[0][0]               
__________________________________________________________________________________________________
conv1d_37 (Conv1D)              (None, 136, 100)     60100       embedding_12[0][0]               
______________________________________________

In [None]:
def convert_model_to_tflite(
        model=None,
        output_path='/content/drive/My Drive/Colab Training Data/output/tflite/dcnn.tflite',
        saved_model_path="output.tf"):
    """Convert a Keras model to TensorFlow Lite and write it to disk.

    Parameters
    ----------
    model : tf.keras.Model or None
        Model to convert. When ``None`` the model is loaded from
        ``saved_model_path``.
    output_path : str
        Destination of the ``.tflite`` file. Defaults to the location that
        used to be hard-coded.
    saved_model_path : str
        Location passed to ``tf.keras.models.load_model`` when ``model`` is
        ``None``. Defaults to the previously hard-coded ``"output.tf"``.

    Returns
    -------
    None
    """
    import os

    # Compare against None explicitly instead of relying on the truthiness
    # of a model object.
    if model is not None:
        model_to_convert = model
    else:
        model_to_convert = tf.keras.models.load_model(saved_model_path)

    # Convert the model, allowing selected TF ops next to the TFLite builtins.
    converter = tf.lite.TFLiteConverter.from_keras_model(model_to_convert)
    converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS,
                                           tf.lite.OpsSet.SELECT_TF_OPS]
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    tflite_model = converter.convert()

    # Make sure the target folder exists; opening the file for writing fails
    # otherwise.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        tf.io.gfile.makedirs(output_dir)

    # Save the TF Lite model.
    with tf.io.gfile.GFile(output_path, 'wb') as f:
        f.write(tflite_model)
                                                                                             

In [None]:
# Export the trained model as a TensorFlow Lite file.
convert_model_to_tflite(model)

INFO:tensorflow:Assets written to: /tmp/tmp6tfdbcjo/assets


INFO:tensorflow:Assets written to: /tmp/tmp6tfdbcjo/assets
