# Import required libraries

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.models import Sequential
import warnings
warnings.filterwarnings("ignore")

# Read the dataset

In [4]:
import os

# The default below is the author's machine-specific location. Set the
# SENTIMENT140_CSV environment variable to point at your own copy of the file
# instead of editing this cell.
DATA_PATH = os.environ.get(
    "SENTIMENT140_CSV",
    "C:/Users/Administrator/Downloads/archive (2)/training.1600000.processed.noemoticon.csv",
)
df = pd.read_csv(DATA_PATH, encoding='latin1')

FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/Administrator/Downloads/archive (2)/training.1600000.processed.noemoticon.csv'

# Get the details of Dataset

In [None]:
print(df.shape)

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe()

# Check for null values if any

In [None]:
df.isnull().sum()

# Drop the unnecessary columns

In [None]:
# Discard the metadata columns; only the label and the tweet text are used
# by the rest of the notebook.
unused_columns = ['id of the tweet', 'date of the tweet', 'query', 'user']
df.drop(columns=unused_columns, inplace=True)
df.head()

# Get the information of the remaining columns

In [None]:
# Positional rename: this assigns names to the two columns left after the
# drop, in their existing order — presumably label first, then tweet text
# (TODO confirm against the CSV header).
df.columns = ['sentiment', 'text']

In [None]:
df.info()

In [None]:
df['sentiment'].value_counts()

# Visualize the data

In [None]:
# Distribution of the sentiment labels; kde=True overlays a kernel density
# estimate on the histogram.
sns.histplot(df['sentiment'], kde=True, color = 'r')

In [None]:
# Bar chart of how many rows carry each sentiment label.
sns.countplot(data=df, x='sentiment', color='b')

# Tokenizing the words

In [None]:
# Split every tweet into NLTK word tokens (one token list per row).
word_tokens = list(map(word_tokenize, df["text"]))

In [None]:
word_tokens[:3]

# Applying a regular-expression tokenizer (`\w+`) that keeps runs of word characters (letters, digits, underscore) and drops punctuation and other symbols

In [None]:
# \w+ matches runs of word characters (letters, digits, underscore), so this
# tokenizer keeps those runs and discards punctuation and other symbols.
removal = RegexpTokenizer(r"\w+")

In [None]:
# Re-tokenize each tweet with the regex tokenizer to strip non-word characters.
new_sentiments = list(map(removal.tokenize, df["text"]))

In [None]:
new_sentiments[:3]

# Applying Join on list of strings to form a complete sentence

In [None]:
# Glue each token list back into a single space-separated string.
new_strings = list(map(" ".join, new_sentiments))

In [None]:
new_strings[:3]

# Removing Stopwords

In [None]:
# English stopword list, held as a set for fast membership tests.
# Needs the NLTK "stopwords" corpus to be available locally.
sw = set(stopwords.words("english"))

In [None]:
# Tokenize the punctuation-free sentences again so stopwords can be filtered
# word by word.
new_str_tokens = list(map(word_tokenize, new_strings))

In [None]:
new_str_tokens[:3]

In [None]:
# Compare case-insensitively: NLTK's English stopword list is lowercase, so a
# plain membership test would let capitalised stopwords ("The", "I", "And")
# through untouched.
updated_strings = [
    [word for word in sentence if word.lower() not in sw]
    for sentence in new_str_tokens
]

In [None]:
updated_strings[:3]

# Applying join to form a complete sentence

In [None]:
# Rebuild one cleaned sentence per tweet from its surviving tokens.
dummy_str = list(map(" ".join, updated_strings))
dummy_str[:3]

In [None]:
# Overwrite the raw tweet text with the cleaned version (non-word characters
# and stopwords removed by the preceding cells).
df["text"] = dummy_str

In [None]:
df.head()

In [None]:
df["sentiment"].value_counts()

In [None]:
df["sentiment"].unique()

# Applying Tokenizer

In [None]:
# Keras tokenizer with default settings (no num_words cap is passed here).
tokenizer = Tokenizer()

In [None]:
# Build the word -> integer index vocabulary from the cleaned tweets.
tokenizer.fit_on_texts(df["text"])

In [None]:
# Encode each tweet as a list of integer word indices.
tokens = tokenizer.texts_to_sequences(df["text"])

In [None]:
tokens[:5]

# Finding the Vocab size

In [None]:
# +1 because Keras word indices start at 1; index 0 is reserved (it is the
# value used for padding).
vocab_size = len(tokenizer.word_index)+1

In [None]:
vocab_size

# Converting a series of text data into a series of sequences of integer indices using a tokenizer object

In [None]:
# NOTE(review): same call on the same input as `tokens` earlier in the
# notebook, so `seq` duplicates that result; `seq` is the one that feeds the
# padding step.
seq = tokenizer.texts_to_sequences(df["text"])

In [None]:
seq[:5]

#  Padding a series of sequences of integer indices.

In [None]:
# Force every sequence to length 35: shorter ones get zeros appended
# (padding="post"); longer ones are truncated using the Keras default
# (truncating="pre", i.e. leading tokens are dropped).
pad_seq = pad_sequences(seq, maxlen=35, padding="post")

In [None]:
pad_seq

# Applying LabelEncoder

In [None]:
from sklearn.preprocessing import LabelEncoder

In [None]:
le = LabelEncoder()

In [None]:
# Map the raw sentiment values to consecutive integers 0..n_classes-1
# (LabelEncoder orders classes by sorted value).
labels = le.fit_transform(df["sentiment"])

In [None]:
labels

In [None]:
df.head()

# Applying Simple RNN with Sequential Model and Softmax activation 

In [None]:
# Build the classifier: embedding -> single SimpleRNN layer -> softmax output.
model = Sequential()
# input_dim must cover every index the tokenizer can emit (0..vocab_size-1).
# The tokenizer was built without a num_words cap, so a fixed 20000 is too
# small whenever the vocabulary is larger and the embedding lookup would go
# out of range.
model.add(Embedding(input_dim=vocab_size, output_dim=5))
model.add(SimpleRNN(32, return_sequences=False))
# NOTE(review): 3 output units — confirm this matches the number of distinct
# classes in `labels`.
model.add(Dense(3, activation='softmax'))
model.summary()

# Compiling Model using adam as optimizer

In [None]:
# sparse_categorical_crossentropy takes integer class labels directly, which
# matches the LabelEncoder output used as targets.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", 
              metrics=["accuracy"])

In [None]:
# Train on the full padded dataset for 10 epochs.
# NOTE(review): no validation_data/validation_split is passed, so the accuracy
# reported during fitting is training accuracy only.
model.fit(pad_seq, labels, epochs=10)

# Checking if the model works correctly

In [None]:
new_text = "I am happy, As I scored good marks in test"

In [None]:
# Run the new text through the same cleaning steps as the training text
# (regex tokenization -> word_tokenize -> stopword removal) before encoding;
# otherwise the model sees words at inference time that were stripped from
# its training data.
cleaned_words = word_tokenize(" ".join(removal.tokenize(new_text)))
cleaned_text = " ".join(word for word in cleaned_words if word not in sw)
new_tokens = tokenizer.texts_to_sequences([cleaned_text])

In [None]:
new_tokens

In [None]:
# Pad with the same maxlen/padding as the training sequences so the input
# shape matches what the model was trained on.
new_pad_seq = pad_sequences(new_tokens, maxlen=35, padding="post")

In [None]:
new_pad_seq

In [None]:
# Score the single padded example and take the most probable class index.
predictions = model.predict(new_pad_seq)
predicted_class_index = predictions.argmax(axis=-1)
# Index 0 is the smallest raw sentiment value after label encoding
# (presumably the negative class — confirm against le.classes_); every other
# index is reported as positive.
if predicted_class_index[0] == 0:
    print("Negative Sentiment")
else:
    print("Positive Sentiment")