In [1]:
import tensorflow as tf

**Stacking using SimpleRNN**

  1. return_sequences=True:
  
  For all recurrent layers except the final one in a stacked RNN, the return_sequences parameter must be set to True. This ensures that each layer outputs a 3D tensor (batch\_size, timesteps, features) which is required as input by the subsequent recurrent layer. If return_sequences is False, the layer would only output the final hidden state (a 2D tensor), leading to an error when feeding it to another recurrent layer.

  2. Layer Types:
  
  You can use various recurrent layer types in a stacked configuration, such as SimpleRNN, LSTM, or GRU. For example, you might stack multiple LSTM layers to create a deep LSTM network.

  3. Input Shape:
  
  The first recurrent layer in the stack needs to know the input shape. You can specify this with the input_shape argument, typically in the format (timesteps, features); recent Keras versions prefer an explicit Input(shape=(timesteps, features)) layer at the start of the model instead. For subsequent layers, Keras automatically infers the input shape from the preceding layer's output.

  4. Output Layer:
  
  After the final recurrent layer (which typically has return_sequences=False if you're performing a sequence-to-vector task like classification), you would usually add a Dense layer for the final output.
  
  5. Sequence-to-Sequence:
  
  If your task is sequence-to-sequence (e.g., generating a sequence of outputs), you might set return_sequences=True for the final RNN layer and wrap the Dense layer with a TimeDistributed layer to apply it at each timestep.

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

In [2]:
import pandas as pd
import numpy as np

# Load the labelled tweets (columns: id, label, tweet) from /content.
train_csv_path = "/content/train.csv"
df = pd.read_csv(train_csv_path)
df.head()


Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


In [3]:
# `id` appears to be a plain row counter with no predictive value, so drop it.
# Reassigning (rather than inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second execution no longer raises KeyError for "id".
df = df.drop(columns=["id"], errors="ignore")
df.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


In [4]:
# Class balance check. The output shows label 0 heavily outnumbering label 1
# (29720 vs 2242), so accuracy alone is a weak measure of model quality here.
df["label"].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,29720
1,2242


**Text Preprocessing**

In [5]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the tokenizer data packages before word_tokenize is called below.
for resource in ('punkt', 'punkt_tab'):
  nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [6]:
# Peek at the raw text of the first tweet (row label 0).
df.loc[0, "tweet"]

' @user when a father is dysfunctional and is so selfish he drags his kids into his dysfunction.   #run'

In [7]:
# Tokenize every tweet and keep only purely alphabetic tokens, which drops
# punctuation, numbers, '@' / '#' symbols and emoji fragments.
# Iterating over the column values (instead of df["tweet"][i] with a positional
# counter) avoids label-based lookups, so this also works when the DataFrame
# index is not the default 0..n-1 range (e.g. after filtering or shuffling).
# NOTE(review): word_tokenize splits contractions ("can't" -> "ca", "n't") and the
# isalpha() filter then discards "n't", so negations are lost -- see the "ca"
# token in the preview output. Confirm this is acceptable for the task.
tweets = [
  [token for token in word_tokenize(text) if token.isalpha()]
  for text in df["tweet"]
]


In [8]:
# Preview the token lists of the first five tweets.
tweets[:5]


[['user',
  'when',
  'a',
  'father',
  'is',
  'dysfunctional',
  'and',
  'is',
  'so',
  'selfish',
  'he',
  'drags',
  'his',
  'kids',
  'into',
  'his',
  'dysfunction',
  'run'],
 ['user',
  'user',
  'thanks',
  'for',
  'lyft',
  'credit',
  'i',
  'ca',
  'use',
  'cause',
  'they',
  'do',
  'offer',
  'wheelchair',
  'vans',
  'in',
  'pdx',
  'disapointed',
  'getthanked'],
 ['bihday', 'your', 'majesty'],
 ['model', 'i', 'love', 'u', 'take', 'with', 'u', 'all', 'the', 'time', 'in'],
 ['factsguide', 'society', 'now', 'motivation']]

# Generate embeddings

In [9]:
 # !pip install gensim

In [None]:
import gensim.downloader

# Pretrained GloVe vectors trained on Twitter text (the 25-d variant, per the
# model name). gensim downloads and caches the model on first use.
glove_model_name = 'glove-twitter-25'
glove_vectors = gensim.downloader.load(glove_model_name)

# Sanity check: nearest neighbours of a word that should be well represented.
glove_vectors.most_similar('twitter')



In [None]:
# Inspect one word from the first tweet: its raw vector and its nearest neighbours.
probe_word = "dysfunction"
glove_vectors[probe_word], glove_vectors.most_similar(probe_word)

Find word vectors for each word in the tweets

In [None]:
# Width of each word vector, read from the loaded model rather than hard-coded.
embedding_dim = glove_vectors.vector_size

In [None]:
# Display the vector width (presumably 25 for 'glove-twitter-25' -- confirm against the output).
embedding_dim

In [None]:
# Map every token to its GloVe vector. If no vector exists for a word, use an
# all-zero placeholder so each tweet still has exactly one vector per token.
# The placeholder is built once (it is loop-invariant) and is float32 so it does
# not mix NumPy's float64 default with the pretrained vectors (gensim loads
# those as float32 by default).
oov_vector = np.zeros(embedding_dim, dtype=np.float32)
X = [
    [glove_vectors[word] if word in glove_vectors else oov_vector for word in tweet]
    for tweet in tweets
]

In [None]:
# Target labels, one per tweet, in the same row order used to build `tweets`.
y = df["label"]

There is one issue: the number of words per document (tweet) differs, but the model needs inputs of equal length. This can be resolved by padding every tweet to the length of the longest one.

In [None]:
# Length of the longest tweet, counted in kept tokens; used as the padding target.
max_words_per_doc = max(map(len, tweets))
max_words_per_doc

In [None]:
# Pad every tweet with trailing all-zero vectors up to max_words_per_doc.
# dtype must be passed explicitly: pad_sequences defaults to "int32", which would
# cast the float GloVe components to integers and destroy the embeddings.
X = tf.keras.preprocessing.sequence.pad_sequences(
    X, maxlen=max_words_per_doc, padding='post', dtype='float32'
)

In [None]:
# Shape of one padded sample; expected to be (max_words_per_doc, embedding_dim).
X[0].shape

In [None]:
# Check the container type returned by pad_sequences (a NumPy array per the Keras docs).
type(X)

In [None]:
# Make sure features and labels are both plain NumPy arrays before training.
X, y = np.array(X), np.array(y)
(X.shape, y.shape)

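y is expected to be 2-D, with shape (n_samples, 1), to match the model's single-unit output (the F1 metric in particular requires 2-D labels), so reshape it.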

In [None]:
# Confirm y is a NumPy array (it was converted with np.array above) before reshaping.
type(y)

In [None]:
# Turn the flat label vector into a single-column matrix of shape (n_samples, 1).
y = np.reshape(y, (-1, 1))
y.shape

In [None]:
from tensorflow.keras.layers import Dense, SimpleRNN
from  tensorflow.keras.models import Sequential

In [None]:
# Two stacked SimpleRNN layers followed by a sigmoid unit for binary classification.
model = Sequential()
# Declare the input with an explicit Input layer; passing input_shape= to the
# first layer is deprecated in recent Keras releases and emits a warning.
model.add( tf.keras.Input(shape=(max_words_per_doc, embedding_dim)))
# The tweets are post-padded with all-zero vectors. Masking makes the recurrent
# layers skip timesteps whose features are all zero, so the final hidden state
# reflects the last real word instead of a long run of padding. Out-of-vocabulary
# words (also encoded as all-zero vectors) are skipped in the same way.
model.add( tf.keras.layers.Masking(mask_value=0.0))
# return_sequences=True: emit the hidden state at every timestep (3D output) so
# the next recurrent layer receives a full sequence.
model.add( SimpleRNN(50, return_sequences=True))
# Last recurrent layer: only the final hidden state (2D output) feeds the classifier.
model.add( SimpleRNN(25, return_sequences=False))
model.add( Dense(1, activation='sigmoid'))
model.summary()

In [None]:
# Track F1 alongside accuracy because the classes are heavily imbalanced.
# The metric is built explicitly instead of via the "f1_score" string: the default
# F1Score has threshold=None, which takes the argmax over the output axis, and with
# a single sigmoid unit that marks every sample as positive. threshold=0.5 turns
# the probability into a proper 0/1 prediction. average="micro" reduces the
# per-class result to one scalar; with a single output it equals the binary F1.
f1_metric = tf.keras.metrics.F1Score(average="micro", threshold=0.5, name="f1_score")
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy', f1_metric])

# validation_split holds out the last 20% of the samples (taken before shuffling).
model.fit(X, y, epochs=15, validation_split=0.2)