## creating the dataframe in pandas

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import contractions
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
import gensim.downloader as api

In [3]:
import re
import nltk
import contractions
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

train_df = pd.read_csv('train_data.csv')
test_df = pd.read_csv('test_data.csv')

data = pd.concat([train_df, test_df], ignore_index=True)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## regular expressions to clean the data and normalize the text

In [4]:
def clean_text(text):
    text = text.lower()
    text = contractions.fix(text)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)
    text = re.sub(r'\@\w+|\#', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])
    text = ' '.join([WordNetLemmatizer().lemmatize(word) for word in text.split()])
    text = re.sub(r'\s+', ' ', text).strip()
    return text

data['Processed_Review'] = data['Review'].apply(clean_text)

## ngrams

In [20]:
from nltk import ngrams
from nltk.tokenize import word_tokenize

nltk.download('punkt')  # Necessary for nltk's word_tokenize function, if not already installed.


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [21]:
def generate_ngrams(text, n=2):
    # First, clean the text
    cleaned_text = clean_text(text)

    # Tokenize the cleaned text
    tokens = word_tokenize(cleaned_text)
    
    # Generate and return n-grams
    n_grams = list(ngrams(tokens, n))
    return [' '.join(grams) for grams in n_grams]

# Example usage
sample_text = "I can't believe how interesting this movie is, but the plot isn't great!"
bigrams = generate_ngrams(sample_text, 2)  # Generate bigrams
trigrams = generate_ngrams(sample_text, 3)  # Generate trigrams

print("Bigrams:", bigrams)
print("Trigrams:", trigrams)


Bigrams: ['can not', 'not believe', 'believe interesting', 'interesting movie', 'movie plot', 'plot great']
Trigrams: ['can not believe', 'not believe interesting', 'believe interesting movie', 'interesting movie plot', 'movie plot great']


In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

data['cleaned_reviews'] = data['Review'].apply(clean_text)

vectorizer = TfidfVectorizer(max_features=1000)  
X = vectorizer.fit_transform(data['cleaned_reviews'])
y = data['Sentiment'].map({'Positive': 1, 'Negative': 0})  


## naive bayes

In [13]:
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(data['Processed_Review'])
y = data['Sentiment'].map({'Positive': 1, 'Negative': 0})

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = MultinomialNB()
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
cv_scores = cross_val_score(clf, X, y, cv=5, scoring='accuracy')
print("CV Average Accuracy: %0.2f (+/- %0.2f)" % (cv_scores.mean(), cv_scores.std() * 2))

Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       124
           1       1.00      1.00      1.00       116

    accuracy                           1.00       240
   macro avg       1.00      1.00      1.00       240
weighted avg       1.00      1.00      1.00       240

CV Average Accuracy: 0.91 (+/- 0.35)


## vectors

In [15]:
word_vectors = api.load("glove-wiki-gigaword-100")

def document_vector(word_vecs, doc):
    doc = [word for word in doc if word in word_vecs.key_to_index]
    if doc:
        return np.mean([word_vecs[word] for word in doc], axis=0)
    else:
        return np.zeros(word_vecs.vector_size)

data['vector'] = data['Processed_Review'].apply(lambda x: document_vector(word_vectors, x.split()))

X = np.vstack(data['vector'].values)
y = data['Sentiment'].map({'Positive': 1, 'Negative': 0}).values

## neural networks

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = Sequential([
    Dense(128, activation='relu', input_dim=100),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_test, y_test))

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1cec0ec59e8>

In [19]:
scores = model.evaluate(X_test, y_test, verbose=1)
print(f"Test Loss: {scores[0]}")
print(f"Test Accuracy: {scores[1]}")

Test Loss: 2.4985042728076223e-06
Test Accuracy: 1.0


In [17]:
def predict_sentiment(text, model, word_vectors):
    processed_text = clean_text(text)
    vector = document_vector(word_vectors, processed_text.split())
    vector = vector.reshape(1, -1)
    prediction = model.predict(vector)[0][0]
    return "Positive" if prediction >= 0.5 else "Negative"

In [18]:
sample_review = "This movie was a fantastic journey through the realms of science fiction."
predicted_sentiment = predict_sentiment(sample_review, model, word_vectors)
print("Predicted Sentiment:", predicted_sentiment)

Predicted Sentiment: Positive
