## Loading the Tweets dataset:

In [306]:
## Importing libraries:
import pickle
import pandas as pd
import random 
import math

In [307]:
import numpy as np
!pip install bert-for-tf2
!pip install sentencepiece
import tensorflow as tf
import tensorflow_hub as hub
from tensorflow.keras import layers
import bert



In [308]:
print(tf.__version__)

2.3.0


In [309]:
with open(r"/content/sample_data/Entire_Dataset.pkl", "rb") as input_file:
      df_1= pickle.load(input_file)

In [310]:
with open(r"/content/sample_data/tweets_100_Dec_3.pkl", "rb") as input_file:
      df_2= pickle.load(input_file)

In [311]:
df_1 = df_1[df_1.labels!=''] ##Ignoring the irrelevant labels
len(df_1)

89

In [312]:
df_2 = df_2[df_2.labels!=''] ##Ignoring the irrelevant labels
len(df_2)

141

In [313]:
len(df_1.full_text.unique())

88

In [314]:
len(df_2.full_text.unique())

141

In [315]:
df = df_2.append(df_1,ignore_index=False) 
len(df)

230

In [316]:
df.reset_index(drop=True).drop(['created_at','url'],axis=1,inplace=True)

In [317]:
df.head()

Unnamed: 0,tweet_id,created_at,full_text,url,disp_url,labels,articles
0,1334673337591754755,Fri Dec 04 01:38:15 +0000 2020,Introducing Salesforce Hyperforce: the world’s...,https://www.salesforce.com/news/press-releases...,salesforce.com/news/press-rel…,press releases,"The world’s #1 CRM, available on major public ..."
1,1334636358414049282,Thu Dec 03 23:11:18 +0000 2020,Salesforce Introduces Service Cloud Workforce ...,https://www.salesforce.com/news/press-releases...,salesforce.com/news/press-rel…,press releases,Service leaders will be able to predict custom...
2,1334608137018630146,Thu Dec 03 21:19:10 +0000 2020,The Biggest News from Dreamforce to You https:...,https://www.salesforce.com/news/stories/the-bi...,salesforce.com/news/stories/t…,case studies,Every part of our lives have been transformed ...
5,1334576945602981890,Thu Dec 03 19:15:13 +0000 2020,Today we recognize the International Day of Pe...,https://sforce.co/2VAYQOS,sforce.co/2VAYQOS,case studies,
7,1334533636205449216,Thu Dec 03 16:23:07 +0000 2020,Fortune Names Salesforce a Future 50 Company f...,https://www.salesforce.com/news/stories/fortun...,salesforce.com/news/stories/f…,case studies,Fortune Magazine just announced its Future 50 ...


In [318]:
len(df)

230

In [319]:
len(df.full_text.unique())

166

In [320]:
dataset = df.drop_duplicates() 

In [321]:
len(dataset)

230

In [322]:
dataset.labels.value_counts()

press releases    122
case studies      108
Name: labels, dtype: int64

In [323]:
dataset.shape

(230, 7)

In [324]:
dataset.columns

Index(['tweet_id', 'created_at', 'full_text', 'url', 'disp_url', 'labels',
       'articles'],
      dtype='object')

## Prepping data before making the model:

In [325]:
data = dataset[['full_text','labels']].copy(deep=True).reset_index(drop=True)

In [326]:
data.shape

(230, 2)

In [327]:
data.head(2)

Unnamed: 0,full_text,labels
0,Introducing Salesforce Hyperforce: the world’s...,press releases
1,Salesforce Introduces Service Cloud Workforce ...,press releases


## Remove the punctuations and cleaning the tweets a bit:

In [328]:
data['cleaned_tweets'] = ''

In [329]:
data.full_text = data.full_text.str.replace(r'https\S+','')
data['cleaned_tweets'] = data.full_text.str.replace(r'[^\w\s]+', ' ')

In [330]:
data.cleaned_tweets = data.cleaned_tweets.str.replace('\d+','')
data.cleaned_tweets = data.cleaned_tweets.str.lower()
data.cleaned_tweets = data.cleaned_tweets.str.replace('  ',' ')


In [331]:
data.cleaned_tweets[2]

'the biggest news from dreamforce to you via salesforcenews'

In [332]:
data.labels.unique()

array(['press releases', 'case studies'], dtype=object)

In [333]:
#Target Label - y 
y = data.labels

y = np.array(list(map(lambda x: 1 if x=="case studies" else 0, y))) ## We are encoding case studies as 1 and press releases as 0

## We will use BERT's tokenizer and we are using a pre-trained model for English:

In [334]:
BertTokenizer = bert.bert_tokenization.FullTokenizer
bert_layer = hub.KerasLayer("https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3", \
                            trainable=False)
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = BertTokenizer(vocabulary_file, to_lower_case)

In [349]:
data.head()

Unnamed: 0,full_text,labels,cleaned_tweets
0,Introducing Salesforce Hyperforce: the world’s...,press releases,introducing salesforce hyperforce the world s ...
1,Salesforce Introduces Service Cloud Workforce ...,press releases,salesforce introduces service cloud workforce ...
2,The Biggest News from Dreamforce to You via @...,case studies,the biggest news from dreamforce to you via sa...
3,Today we recognize the International Day of Pe...,case studies,today we recognize the international day of pe...
4,Fortune Names Salesforce a Future 50 Company f...,case studies,fortune names salesforce a future company for ...


In [350]:
tokenizer.convert_tokens_to_ids(tokenizer.tokenize(data.cleaned_tweets[1][:50]))

[4341, 14821, 13999, 2326, 6112, 14877, 25540, 2050]

## Tokenizing the tweets and preprocessing them:

In [351]:
def tokenize_tweets(text):
    return tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))

In [352]:
tokenized_tweets = [tokenize_tweets(tweet) for tweet in data.cleaned_tweets]

In [353]:
tweets_len = [[tweet, y[i], len(tweet)] for i, tweet in enumerate(tokenized_tweets)]

In [354]:
random.shuffle(tweets_len)

In [355]:
tweets_len.sort(key=lambda x: x[2])

In [356]:
sorted_tweets_labels = [(tw_label[0], tw_label[1]) for tw_label in tweets_len]

## Converting tweets data so that it can be used for Tf2.0 models:

In [357]:
processed_dataset = tf.data.Dataset.from_generator(lambda: sorted_tweets_labels, output_types=(tf.int32, tf.int32))

## We will make batch size of 3 that means that the weights of model will change after every 3 training samples:

In [358]:
BATCH_SIZE = 3
batched_dataset = processed_dataset.padded_batch(BATCH_SIZE, padded_shapes=((None, ), ()))

In [359]:
next(iter(batched_dataset))

(<tf.Tensor: shape=(3, 11), dtype=int32, numpy=
 array([[ 4341, 14821,  2638,  9333, 13856,  1996,  4256,  8144,  2400,
          4791,     0],
        [13580,  4328,  4608,  1996,  2327,  5312,  1997,  1040, 11263,
          3959, 14821],
        [13580,  4328,  4608,  1996,  2327,  5312,  1997,  1040, 11263,
          3959, 14821]], dtype=int32)>,
 <tf.Tensor: shape=(3,), dtype=int32, numpy=array([1, 1, 1], dtype=int32)>)

In [360]:
TOTAL_BATCHES = math.ceil(len(sorted_tweets_labels) / BATCH_SIZE)
TEST_BATCHES = TOTAL_BATCHES // 2 ## 50% of data we are taking for testing
batched_dataset.shuffle(TOTAL_BATCHES)
test_data = batched_dataset.take(TEST_BATCHES)
train_data = batched_dataset.skip(TEST_BATCHES)

In [361]:
TOTAL_BATCHES

77

In [362]:
TEST_BATCHES

38

## Using 3 conv. layers with Global Max pooling with ReLU activation function and Sigmoid outut function since we have 2 output classes:

In [363]:
class TEXT_MODEL(tf.keras.Model):
    
    def __init__(self,
                 vocabulary_size,
                 embedding_dimensions=128,
                 cnn_filters=50,
                 dnn_units=512,
                 model_output_classes=2,
                 dropout_rate=0.1,
                 training=False,
                 name="text_model"):
      
        super(TEXT_MODEL, self).__init__(name=name)
        
        self.embedding = layers.Embedding(vocabulary_size,
                                          embedding_dimensions)
        self.cnn_layer1 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=2,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer2 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=3,
                                        padding="valid",
                                        activation="relu")
        self.cnn_layer3 = layers.Conv1D(filters=cnn_filters,
                                        kernel_size=4,
                                        padding="valid",
                                        activation="relu")
        self.pool = layers.GlobalMaxPool1D()
        
        self.dense_1 = layers.Dense(units=dnn_units, activation="relu")
        self.dropout = layers.Dropout(rate=dropout_rate)
        if model_output_classes == 2:
            self.last_dense = layers.Dense(units=1,
                                           activation="sigmoid")
        else:
            self.last_dense = layers.Dense(units=model_output_classes,
                                           activation="softmax")
    
    def call(self, inputs, training):
        l = self.embedding(inputs)
        l_1 = self.cnn_layer1(l) 
        l_1 = self.pool(l_1) 
        l_2 = self.cnn_layer2(l) 
        l_2 = self.pool(l_2)
        l_3 = self.cnn_layer3(l)
        l_3 = self.pool(l_3) 
        
        concatenated = tf.concat([l_1, l_2, l_3], axis=-1) # (batch_size, 3 * cnn_filters)
        concatenated = self.dense_1(concatenated)
        concatenated = self.dropout(concatenated, training)
        model_output = self.last_dense(concatenated)
        
        return model_output

In [364]:
VOCAB_LENGTH = len(tokenizer.vocab)
EMB_DIM = 200
CNN_FILTERS = 50
DNN_UNITS = 256
OUTPUT_CLASSES = 2
DROPOUT_RATE = 0.1
NB_EPOCHS = 5

In [365]:
text_model = TEXT_MODEL(vocabulary_size=VOCAB_LENGTH,
                        embedding_dimensions=EMB_DIM,
                        cnn_filters=CNN_FILTERS,
                        dnn_units=DNN_UNITS,
                        model_output_classes=OUTPUT_CLASSES,
                        dropout_rate=DROPOUT_RATE)

## We use Adam optimizer and Binary Cross-Entropy as the loss function since ou problem statement is a two-class problem. 

In [366]:
if OUTPUT_CLASSES == 2:
    text_model.compile(loss="binary_crossentropy",
                       optimizer="adam",
                       metrics=["accuracy"])
else:
    text_model.compile(loss="sparse_categorical_crossentropy",
                       optimizer="adam",
                       metrics=["sparse_categorical_accuracy"])

In [367]:
train_model = text_model.fit(train_data, epochs=NB_EPOCHS)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## Evaluating our model on test data:

In [368]:
next(iter(test_data))

(<tf.Tensor: shape=(3, 11), dtype=int32, numpy=
 array([[ 4341, 14821,  2638,  9333, 13856,  1996,  4256,  8144,  2400,
          4791,     0],
        [13580,  4328,  4608,  1996,  2327,  5312,  1997,  1040, 11263,
          3959, 14821],
        [13580,  4328,  4608,  1996,  2327,  5312,  1997,  1040, 11263,
          3959, 14821]], dtype=int32)>,
 <tf.Tensor: shape=(3,), dtype=int32, numpy=array([1, 1, 1], dtype=int32)>)

In [369]:
results = text_model.evaluate(test_data)
print(results)

[0.11137957125902176, 0.9649122953414917]


## We observe that the model accuracy is ~96% on test data.

## Conclusion:<br>We need to scrape more tweets as our training model seems to have over-fitted a bit. With more data our model would be more generalized. So, for 230 rows, our training model seems to give 100% accuracy with testing model giving ~96% accuracy.
