### Imports

In [8]:
import numpy as np
import tensorflow as tf

# Set seed for NumPy
np.random.seed(42)

# Set seed for TensorFlow
tf.random.set_seed(42)


In [9]:
import pandas as pd
import time
import os

### Globals

In [10]:
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"

### DATASET

In [11]:
for dirname, _, filenames in os.walk('/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
df = pd.read_csv('data.csv',encoding =DATASET_ENCODING , names=DATASET_COLUMNS)


In [12]:
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [13]:
# Map target to string
decode_map = {0: "NEGATIVE", 2: "NEUTRAL", 4: "POSITIVE"}
def decode_sentiment(label):
    return decode_map[int(label)]
df.target = df.target.apply(lambda x: decode_sentiment(x))

In [14]:
df.head()

Unnamed: 0,target,ids,date,flag,user,text
0,NEGATIVE,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,NEGATIVE,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,NEGATIVE,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,NEGATIVE,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,NEGATIVE,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [14]:
# import nltk
# import nltk.corpus
# from nltk.corpus import stopwords
# nltk.download("stopwords")
# stop_words = set(stopwords.words("english"))

In [15]:
import re
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense


In [16]:

# Data Cleaning and Preprocessing
def preprocess_text(text):
    text = re.sub(r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+", ' ', text)
    return text.lower().strip()

df['text'] = df['text'].apply(preprocess_text)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Tokenization and Padding
max_words = 10000  
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(train_df['text'])

train_sequences = tokenizer.texts_to_sequences(train_df['text'])
test_sequences = tokenizer.texts_to_sequences(test_df['text'])

max_length = 50 
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post', truncating='post')
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post', truncating='post')

# Model Definition
embedding_dim = 32 
model = Sequential([
    Embedding(input_dim=max_words, output_dim=embedding_dim, input_length=max_length),
    LSTM(64, return_sequences=True),
    LSTM(64),
    Dense(1, activation='sigmoid')
])

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Model Training
X_train, y_train = train_padded, (train_df['target'] == 'POSITIVE').astype(int)
X_test, y_test = test_padded, (test_df['target'] == 'POSITIVE').astype(int)

model.fit(X_train, y_train, epochs=5, validation_data=(X_test, y_test))

#Evaluate the Model
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"Test Accuracy: {test_acc}")

# the following lines will only be for testing 




Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Accuracy: 0.8306968808174133


In [23]:
train_df['text']
train_df[['text']].to_csv('tokens.csv', index=False)


In [17]:
model.save("text.h5")

  saving_api.save_model(


In [18]:
model.save("text.keras")

In [24]:

# Assuming you have a new input text
new_text = "i dont like this movie"

# Preprocess the new text
new_text = preprocess_text(new_text)

# Tokenize and pad the new text
new_sequence = tokenizer.texts_to_sequences([new_text])
new_padded = pad_sequences(new_sequence, maxlen=max_length, padding='post', truncating='post')

# Make predictions
prediction = model.predict(new_padded)
print(prediction[0][0])
# Interpret the prediction
if prediction[0][0] >= 0.5:
    sentiment = 'POSITIVE'
else:
    sentiment = 'NEGATIVE'

print(f"The sentiment of the input text is: {sentiment}")


0.040225506
The sentiment of the input text is: NEGATIVE
