# **Topic: Sentiment Analysis**
---

# **Connecting to drive**
---


In [None]:
from google.colab import drive

# Mount Google Drive at the lowercase path that the rest of the notebook reads
# from ('/content/drive/...'). The previous '/content/Drive' mount point was
# never referenced again.
drive.mount("/content/drive")

Mounted at /content/Drive


In [None]:
import os

# Name of the dataset file to look for, and the Drive folder the search starts from
file_name = 'final_data.csv'
root_dir = '/content/drive/My Drive/'

# Make Google Drive available under /content/drive so the search can see it
drive.mount('/content/drive')

# Function to recursively search for the file in all directories and subdirectories
def find_file(directory, target_name=None):
    """Return the first directory at or below ``directory`` that contains ``target_name``.

    Args:
        directory: Path of the directory where the search starts.
        target_name: Name of the file to look for. When omitted, the
            notebook-level ``file_name`` is used, so ``find_file(root_dir)``
            keeps working unchanged.

    Returns:
        The path of the directory holding the file, or ``None`` when the file
        is not found or ``directory`` cannot be listed.
    """
    if target_name is None:
        target_name = file_name
    try:
        # Sorted so the answer is deterministic when several copies exist
        entries = sorted(os.listdir(directory))
    except OSError:
        # Missing or unreadable folders are skipped instead of aborting the search
        return None
    for item in entries:
        item_path = os.path.join(directory, item)
        if os.path.isfile(item_path):
            if item == target_name:
                return directory
        elif os.path.isdir(item_path):
            result = find_file(item_path, target_name)
            if result:
                return result
    return None

# Locate the folder that holds the dataset and report the outcome
file_directory = find_file(root_dir)

if not file_directory:
    print("File not found in Google Drive.")
else:
    print("File directory:", file_directory)

Mounted at /content/drive
File directory: /content/drive/My Drive/AI_Desicion_Scineces2_endterm


In [None]:
# Work from the directory located by find_file() instead of a path copied by
# hand from its printed output, so the notebook keeps running if the Drive
# folder is renamed or moved.
if not file_directory:
    raise FileNotFoundError(f"{file_name!r} was not found under {root_dir!r}")
os.chdir(file_directory)

# **Importing required Libraries**
---

In [None]:
# Importing the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, recall_score, precision_score, classification_report, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from tqdm import tqdm
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Embedding, LSTM,Dense, SpatialDropout1D, Dropout
from keras.initializers import Constant

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# **Reading the final dataset**
---

In [None]:
# Load the prepared dataset from the current working directory into a DataFrame
df = pd.read_csv(filepath_or_buffer='final_data.csv')

In [None]:
# Preview the first rows to sanity-check that the file loaded as expected
df.head()

Unnamed: 0,IDLink,Title,Headline,Topic,PublishDate,SentimentTitle,SentimentHeadline,Facebook,GooglePlus,LinkedIn,PublishTime,Weekday,Facebook_scaled,GooglePlus_scaled,LinkedIn_scaled,SentimentTitle_Category,SentimentHeadline_Category,Source_type,Hour
0,80690.0,"Monday, 29 Feb 2016","RAMALLAH, February 25, 2016 (WAFA) - Palestine...",palestine,2016-02-28,0.0,-0.005906,1.0,1.0,1.0,14:03:00,Sunday,-0.64969,-0.617774,-0.563154,neutral,negative,D,14
1,28854.0,Buffett: Politicians 'Dead Wrong' on Economy,Warren Buffett has a message for presidential ...,economy,2016-02-28,0.051031,-0.037921,0.0,0.0,0.0,19:17:00,Sunday,0.0,0.0,0.0,positive,negative,D,19
2,81052.0,"Monday, 29 Feb 2016","RAMALLAH, February 29, 2016 (WAFA) - The Gover...",palestine,2016-03-01,0.0,0.048546,1.0,1.0,1.0,09:29:00,Tuesday,-0.64969,-0.617774,-0.563154,neutral,positive,D,9
3,80994.0,"Tuesday, 1 Mar 2016","RAMALLAH, February 29, 2016 (WAFA) - The Gover...",palestine,2016-03-01,-0.243068,0.048546,1.0,1.0,1.0,00:15:00,Tuesday,-0.64969,-0.617774,-0.563154,negative,positive,D,0
4,946.0,Microsoft Takes Six Billion Dollars From Android,"A long time ago, Microsoft MSFT +0.00% purchas...",microsoft,2015-11-01,0.0,0.115928,0.0,0.0,0.0,00:00:00,Sunday,0.0,0.0,0.0,neutral,positive,D,0


In [None]:
# (number of rows, number of columns) of the loaded data
df.shape

(92808, 19)

In [None]:
# Column names, non-null counts and dtypes — used to check for missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 92808 entries, 0 to 92807
Data columns (total 19 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   IDLink                      92808 non-null  float64
 1   Title                       92808 non-null  object 
 2   Headline                    92808 non-null  object 
 3   Topic                       92808 non-null  object 
 4   PublishDate                 92808 non-null  object 
 5   SentimentTitle              92808 non-null  float64
 6   SentimentHeadline           92808 non-null  float64
 7   Facebook                    92808 non-null  float64
 8   GooglePlus                  92808 non-null  float64
 9   LinkedIn                    92808 non-null  float64
 10  PublishTime                 92808 non-null  object 
 11  Weekday                     92808 non-null  object 
 12  Facebook_scaled             92808 non-null  float64
 13  GooglePlus_scaled           928

# **Preprocessing**
----

Cleaning the text columns (`Title`, `Headline` and the combined `title_headline`) by:
- Removing stop words
- Converting text to lowercase
- Removing punctuation and numbers
- Tokenizing
- Stemming and
- Lemmatization

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
import nltk
import re

In [None]:
# Fetch the NLTK data used by preprocess_text.
# 'punkt_tab' is the tokenizer data that word_tokenize loads on recent NLTK
# releases (older releases use 'punkt'). 'omw-1.4' is included because some
# NLTK releases look it up when WordNet is loaded — TODO confirm it is needed
# on the pinned version. Requesting a package an installed release does not
# know about is expected to only print a message.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Shared NLP resources used by preprocess_text
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

# English stop words, kept in a set for membership tests
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise one document into a space-separated string of stemmed tokens.

    Steps: lowercase, replace every non-letter character with a space,
    tokenize, drop English stop words, then lemmatize and stem each token.

    Args:
        text: The raw text. ``None`` and NaN are treated as empty text; any
            other non-string value is converted with ``str()``.

    Returns:
        The cleaned text as a single string ('' when nothing is left).
    """
    # Missing values would crash on .lower(); `text != text` is True only for NaN
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Convert to lowercase, then blank out punctuation and numbers
    text = re.sub('[^a-zA-Z]', ' ', str(text).lower())

    # Tokenize once: the former second word_tokenize pass over the re-joined
    # tokens produced the same tokens again, so it only cost time.
    return ' '.join(
        stemmer.stem(lemmatizer.lemmatize(w))
        for w in word_tokenize(text)
        if w not in stop_words
    )

In [None]:
# Combined text column: title and headline joined by a single space
df['title_headline'] = df['Title'].str.cat(df['Headline'], sep=' ')

In [None]:
# Clean each text column in place (same order as before: combined, headline, title)
for text_column in ('title_headline', 'Headline', 'Title'):
    df[text_column] = df[text_column].apply(preprocess_text)

0        monday feb ramallah februari wafa palestin lib...
1        buffett politician dead wrong economi warren b...
2        monday feb ramallah februari wafa govern japan...
3        tuesday mar ramallah februari wafa govern japa...
4        microsoft take six billion dollar android long...
                               ...                        
92803    stock rise investor key u economi ahead friday...
92804    russian pm propos use conserv tough scenario a...
92805    palestinian govern us foreign aid pay terroris...
92806    palestin youth orchestra prepar first uk tour ...
92807    sausalito businesswoman win microsoft window g...
Name: title_headline, Length: 92808, dtype: object


In [None]:
# Spot-check the cleaned text columns on the first two rows
df.head(2)

Unnamed: 0,IDLink,Title,Headline,Topic,PublishDate,SentimentTitle,SentimentHeadline,Facebook,GooglePlus,LinkedIn,PublishTime,Weekday,Facebook_scaled,GooglePlus_scaled,LinkedIn_scaled,SentimentTitle_Category,SentimentHeadline_Category,Source_type,Hour,title_headline
0,80690.0,monday feb,ramallah februari wafa palestin liber organ se...,palestine,2016-02-28,0.0,-0.005906,1.0,1.0,1.0,14:03:00,Sunday,-0.64969,-0.617774,-0.563154,neutral,negative,D,14,monday feb ramallah februari wafa palestin lib...
1,28854.0,buffett politician dead wrong economi,warren buffett messag presidenti candid other ...,economy,2016-02-28,0.051031,-0.037921,0.0,0.0,0.0,19:17:00,Sunday,0.0,0.0,0.0,positive,negative,D,19,buffett politician dead wrong economi warren b...


In [None]:
# Inputs and targets for the two classification tasks:
#   title text    -> SentimentTitle_Category
#   headline text -> SentimentHeadline_Category
X_title, y_title = df['Title'], df['SentimentTitle_Category']
X_headline, y_headline = df['Headline'], df['SentimentHeadline_Category']

# **Splitting the data**

In [None]:
# Hold out 20% of the rows as a test set for each task (same seed for both)
X_train_title, X_test_title, y_train_title, y_test_title = train_test_split(
    X_title,
    y_title,
    random_state=42,
    test_size=0.2,
)

X_train_headline, X_test_headline, y_train_headline, y_test_headline = train_test_split(
    X_headline,
    y_headline,
    random_state=42,
    test_size=0.2,
)

# **TF-IDF Vectorization**

In [None]:
# One TF-IDF vectorizer per task, each capped at 10,000 features
tfidf_vectorizer_title = TfidfVectorizer(stop_words='english', max_features=10000)
tfidf_vectorizer_headline = TfidfVectorizer(stop_words='english', max_features=10000)

# Fit on the training split only, then reuse the fitted vectorizer on the test split
# Title
X_train_tfidf_title = tfidf_vectorizer_title.fit_transform(X_train_title)
X_test_tfidf_title = tfidf_vectorizer_title.transform(X_test_title)

# Headline
X_train_tfidf_headline = tfidf_vectorizer_headline.fit_transform(X_train_headline)
X_test_tfidf_headline = tfidf_vectorizer_headline.transform(X_test_headline)

# **Model Building**
---

## **Logistic Regression**

### **Title**

In [None]:
# Fit Logistic Regression on the title TF-IDF features (fit() returns the estimator)
clf = LogisticRegression(max_iter=1000).fit(X_train_tfidf_title, y_train_title)

# Predict the held-out titles
y_pred_clf = clf.predict(X_test_tfidf_title)

# Overall accuracy plus per-class precision / recall / F1
accuracy_clf = accuracy_score(y_true=y_test_title, y_pred=y_pred_clf)
report_clf = classification_report(y_true=y_test_title, y_pred=y_pred_clf)

print(f'Accuracy: {accuracy_clf}')
print('Classification Report:')
print(report_clf)

Accuracy: 0.6612972740006465
Classification Report:
              precision    recall  f1-score   support

    negative       0.68      0.74      0.71      7593
     neutral       0.58      0.42      0.48      3742
    positive       0.67      0.71      0.69      7227

    accuracy                           0.66     18562
   macro avg       0.64      0.62      0.63     18562
weighted avg       0.66      0.66      0.66     18562



### **Headline**

In [None]:
# Fit Logistic Regression on the headline TF-IDF features (fit() returns the estimator)
clf = LogisticRegression(max_iter=1000).fit(X_train_tfidf_headline, y_train_headline)

# Predict the held-out headlines
y_pred_clf_head = clf.predict(X_test_tfidf_headline)

# Overall accuracy plus per-class precision / recall / F1
accuracy_clf_head = accuracy_score(y_true=y_test_headline, y_pred=y_pred_clf_head)
report_clf_head = classification_report(y_true=y_test_headline, y_pred=y_pred_clf_head)

print(f'Accuracy: {accuracy_clf_head}')
print('Classification Report:')
print(report_clf_head)

Accuracy: 0.7143626764357289
Classification Report:
              precision    recall  f1-score   support

    negative       0.73      0.81      0.77     10550
     neutral       0.67      0.01      0.03       571
    positive       0.68      0.63      0.65      7441

    accuracy                           0.71     18562
   macro avg       0.69      0.48      0.48     18562
weighted avg       0.71      0.71      0.70     18562



## **Decision Tree**

### **Title**

In [None]:
# Initialize the Decision tree classifier model.
# random_state is fixed (same seed as the train/test split) so the fitted tree
# and the metrics printed below are reproducible from run to run.
dt = DecisionTreeClassifier(random_state=42)

# Train the model on the TF-IDF transformed training data
dt.fit(X_train_tfidf_title, y_train_title)

# Make predictions on the TF-IDF transformed test data
y_pred_dt = dt.predict(X_test_tfidf_title)

# Evaluate the model
accuracy_dt = accuracy_score(y_test_title, y_pred_dt)
print(f'Accuracy: {accuracy_dt}')

# Print classification report for more detailed metrics
report_dt = classification_report(y_test_title, y_pred_dt)
print('Classification Report:')
print(report_dt)

Accuracy: 0.6322055812951191
Classification Report:
              precision    recall  f1-score   support

    negative       0.67      0.67      0.67      7593
     neutral       0.51      0.51      0.51      3742
    positive       0.65      0.66      0.66      7227

    accuracy                           0.63     18562
   macro avg       0.61      0.61      0.61     18562
weighted avg       0.63      0.63      0.63     18562



### **Headline**

In [None]:
# Initialize the Decision tree classifier model.
# random_state is fixed (same seed as the train/test split) so the fitted tree
# and the metrics printed below are reproducible from run to run.
dt = DecisionTreeClassifier(random_state=42)

# Train the model on the TF-IDF transformed training data
dt.fit(X_train_tfidf_headline, y_train_headline)

# Make predictions on the TF-IDF transformed test data
y_pred_dt_head = dt.predict(X_test_tfidf_headline)

# Evaluate the model
accuracy_dt_head = accuracy_score(y_test_headline, y_pred_dt_head)
print(f'Accuracy: {accuracy_dt_head}')

# Print classification report for more detailed metrics
report_dt_head = classification_report(y_test_headline, y_pred_dt_head)
print('Classification Report:')
print(report_dt_head)

Accuracy: 0.6448658549725245
Classification Report:
              precision    recall  f1-score   support

    negative       0.70      0.70      0.70     10550
     neutral       0.25      0.19      0.22       571
    positive       0.58      0.60      0.59      7441

    accuracy                           0.64     18562
   macro avg       0.51      0.50      0.50     18562
weighted avg       0.64      0.64      0.64     18562



## **Random Forest**

### **Title**

In [None]:
# Initialize the Random Forest classifier.
# random_state makes the bootstrap samples / feature subsets (and therefore the
# printed metrics) reproducible; n_jobs=-1 trains the trees on all CPU cores
# and does not change the fitted model.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# Train the model on the TF-IDF transformed training data
rf.fit(X_train_tfidf_title, y_train_title)

# Make predictions on the TF-IDF transformed test data
y_pred_rf = rf.predict(X_test_tfidf_title)

# Evaluate the model
accuracy_rf = accuracy_score(y_test_title, y_pred_rf)
print(f'Accuracy: {accuracy_rf}')

# Print classification report for more detailed metrics
report_rf = classification_report(y_test_title, y_pred_rf)
print('Classification Report:')
print(report_rf)

Accuracy: 0.6958301907122078
Classification Report:
              precision    recall  f1-score   support

    negative       0.71      0.76      0.73      7593
     neutral       0.64      0.52      0.57      3742
    positive       0.71      0.73      0.72      7227

    accuracy                           0.70     18562
   macro avg       0.68      0.67      0.67     18562
weighted avg       0.69      0.70      0.69     18562



### **Headline**

In [None]:
# Initialize the Random Forest classifier.
# random_state makes the bootstrap samples / feature subsets (and therefore the
# printed metrics) reproducible; n_jobs=-1 trains the trees on all CPU cores
# and does not change the fitted model.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# Train the model on the TF-IDF transformed training data
rf.fit(X_train_tfidf_headline, y_train_headline)

# Make predictions on the TF-IDF transformed test data
y_pred_rf_head = rf.predict(X_test_tfidf_headline)

# Evaluate the model
accuracy_rf_head = accuracy_score(y_test_headline, y_pred_rf_head)
print(f'Accuracy: {accuracy_rf_head}')

# Print classification report for more detailed metrics
report_rf_head = classification_report(y_test_headline, y_pred_rf_head)
print('Classification Report:')
print(report_rf_head)

Accuracy: 0.720558129511906
Classification Report:
              precision    recall  f1-score   support

    negative       0.73      0.84      0.78     10550
     neutral       0.80      0.14      0.24       571
    positive       0.70      0.60      0.65      7441

    accuracy                           0.72     18562
   macro avg       0.74      0.53      0.56     18562
weighted avg       0.72      0.72      0.71     18562



# **Word Embeddings**

In [None]:
import gensim.downloader as api
from gensim.models import Word2Vec

### **Title**

In [None]:
# Pre-trained Google News Word2Vec vectors and their dimensionality
word2vec_model = api.load("word2vec-google-news-300")
embedding_size = word2vec_model.vector_size


def _mean_title_vector(text):
    """Average the Word2Vec vectors of the in-vocabulary tokens of one title."""
    vectors = [word2vec_model[token] for token in text.split() if token in word2vec_model]
    # A title without any known token is represented by an all-zero row
    return sum(vectors) / len(vectors) if vectors else [0.0] * embedding_size


# NOTE(review): the titles were stemmed by preprocess_text, so stems such as
# 'economi' are presumably missing from the pre-trained vocabulary — consider
# embedding the un-stemmed text instead; verify the out-of-vocabulary rate.
X_train_embeddings = [_mean_title_vector(text) for text in X_train_title]
X_test_embeddings = [_mean_title_vector(text) for text in X_test_title]

# Logistic Regression on the averaged embeddings
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_embeddings, y_train_title)
y_pred = lr.predict(X_test_embeddings)

# Report test accuracy
accuracy = accuracy_score(y_test_title, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.5321


### **Headline**

In [None]:
# Dimensionality of the Word2Vec vectors
embedding_size = word2vec_model.vector_size


def _mean_headline_vector(text):
    """Average the Word2Vec vectors of the in-vocabulary tokens of one headline."""
    vectors = [word2vec_model[token] for token in text.split() if token in word2vec_model]
    # A headline without any known token is represented by an all-zero row
    return sum(vectors) / len(vectors) if vectors else [0.0] * embedding_size


# NOTE(review): the headlines were stemmed by preprocess_text, so many stems are
# presumably missing from the pre-trained vocabulary — verify the OOV rate.
X_train_embeddings = [_mean_headline_vector(text) for text in X_train_headline]
X_test_embeddings = [_mean_headline_vector(text) for text in X_test_headline]

# Logistic Regression on the averaged embeddings
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_embeddings, y_train_headline)
y_pred = lr.predict(X_test_embeddings)

# Report test accuracy
accuracy = accuracy_score(y_test_headline, y_pred)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.6175


In [None]:
# Pre-trained GloVe vectors in three sizes (50, 100 and 200 dimensions)
glove_50d, glove_100d, glove_200d = (
    api.load(model_name)
    for model_name in (
        "glove-wiki-gigaword-50",
        "glove-wiki-gigaword-100",
        "glove-wiki-gigaword-200",
    )
)



### **Title**

In [None]:
# Compute the average word embedding of one text
def compute_avg_embedding(review, model):
    """Average the embedding vectors of the in-vocabulary words of ``review``.

    Args:
        review: Whitespace-separated text to embed.
        model: Embedding lookup supporting ``word in model``, ``model[word]``
            and ``model.vector_size``.

    Returns:
        A 1-D NumPy array of length ``model.vector_size``. When no word of
        ``review`` is in the vocabulary (including empty text) the array is all
        zeros — previously this case returned a plain list, so the return type
        depended on the input.
    """
    vectors = [model[word] for word in review.split() if word in model]
    if not vectors:
        return np.zeros(model.vector_size)
    return sum(vectors) / len(vectors)

# Averaged GloVe embeddings of the title train/test splits, one pair per vector size
X_train_embeddings_50d, X_test_embeddings_50d = (
    [compute_avg_embedding(text, glove_50d) for text in split]
    for split in (X_train_title, X_test_title)
)

X_train_embeddings_100d, X_test_embeddings_100d = (
    [compute_avg_embedding(text, glove_100d) for text in split]
    for split in (X_train_title, X_test_title)
)

X_train_embeddings_200d, X_test_embeddings_200d = (
    [compute_avg_embedding(text, glove_200d) for text in split]
    for split in (X_train_title, X_test_title)
)

# Train Logistic Regression on average embeddings and return its test accuracy
def train_and_report_accuracy(X_train_embeddings, X_test_embeddings, y_train=None, y_test=None):
    """Fit Logistic Regression on embedding features and return the test accuracy.

    Args:
        X_train_embeddings: Training feature rows (one embedding per sample).
        X_test_embeddings: Test feature rows.
        y_train: Training labels. Defaults to the notebook-level
            ``y_train_title`` so existing two-argument calls behave as before.
        y_test: Test labels. Defaults to the notebook-level ``y_test_title``.

    Returns:
        The accuracy of the fitted model on the test rows.
    """
    # Explicit labels let the same helper serve other tasks without relying on
    # whichever definition of this function happened to run last.
    if y_train is None:
        y_train = y_train_title
    if y_test is None:
        y_test = y_test_title
    lr = LogisticRegression(max_iter=1000)
    lr.fit(X_train_embeddings, y_train)
    return accuracy_score(y_test, lr.predict(X_test_embeddings))

# Test accuracy for each GloVe vector size (evaluated in 50D, 100D, 200D order)
accuracy_50d, accuracy_100d, accuracy_200d = (
    train_and_report_accuracy(train_rows, test_rows)
    for train_rows, test_rows in (
        (X_train_embeddings_50d, X_test_embeddings_50d),
        (X_train_embeddings_100d, X_test_embeddings_100d),
        (X_train_embeddings_200d, X_test_embeddings_200d),
    )
)

print(f"Accuracy (50D): {accuracy_50d:.4f}")
print(f"Accuracy (100D): {accuracy_100d:.4f}")
print(f"Accuracy (200D): {accuracy_200d:.4f}")

Accuracy (50D): 0.4761
Accuracy (100D): 0.4976
Accuracy (200D): 0.5134


### **Headline**

In [None]:
# Compute the average word embedding of one text
def compute_avg_embedding(review, model):
    """Average the embedding vectors of the in-vocabulary words of ``review``.

    Args:
        review: Whitespace-separated text to embed.
        model: Embedding lookup supporting ``word in model``, ``model[word]``
            and ``model.vector_size``.

    Returns:
        A 1-D NumPy array of length ``model.vector_size``. When no word of
        ``review`` is in the vocabulary (including empty text) the array is all
        zeros — previously this case returned a plain list, so the return type
        depended on the input.
    """
    vectors = [model[word] for word in review.split() if word in model]
    if not vectors:
        return np.zeros(model.vector_size)
    return sum(vectors) / len(vectors)

# Averaged GloVe embeddings of the headline train/test splits, one pair per vector size
X_train_embeddings_50d, X_test_embeddings_50d = (
    [compute_avg_embedding(text, glove_50d) for text in split]
    for split in (X_train_headline, X_test_headline)
)

X_train_embeddings_100d, X_test_embeddings_100d = (
    [compute_avg_embedding(text, glove_100d) for text in split]
    for split in (X_train_headline, X_test_headline)
)

X_train_embeddings_200d, X_test_embeddings_200d = (
    [compute_avg_embedding(text, glove_200d) for text in split]
    for split in (X_train_headline, X_test_headline)
)

# Train Logistic Regression on average embeddings and return its test accuracy
def train_and_report_accuracy(X_train_embeddings, X_test_embeddings, y_train=None, y_test=None):
    """Fit Logistic Regression on embedding features and return the test accuracy.

    Args:
        X_train_embeddings: Training feature rows (one embedding per sample).
        X_test_embeddings: Test feature rows.
        y_train: Training labels. Defaults to the notebook-level
            ``y_train_headline`` so existing two-argument calls behave as before.
        y_test: Test labels. Defaults to the notebook-level ``y_test_headline``.

    Returns:
        The accuracy of the fitted model on the test rows.
    """
    # Explicit labels let the same helper serve other tasks without relying on
    # whichever definition of this function happened to run last.
    if y_train is None:
        y_train = y_train_headline
    if y_test is None:
        y_test = y_test_headline
    lr = LogisticRegression(max_iter=1000)
    lr.fit(X_train_embeddings, y_train)
    return accuracy_score(y_test, lr.predict(X_test_embeddings))

# Test accuracy for each GloVe vector size (evaluated in 50D, 100D, 200D order)
accuracy_50d, accuracy_100d, accuracy_200d = (
    train_and_report_accuracy(train_rows, test_rows)
    for train_rows, test_rows in (
        (X_train_embeddings_50d, X_test_embeddings_50d),
        (X_train_embeddings_100d, X_test_embeddings_100d),
        (X_train_embeddings_200d, X_test_embeddings_200d),
    )
)

print(f"Accuracy (50D): {accuracy_50d:.4f}")
print(f"Accuracy (100D): {accuracy_100d:.4f}")
print(f"Accuracy (200D): {accuracy_200d:.4f}")

Accuracy (50D): 0.5784
Accuracy (100D): 0.5907
Accuracy (200D): 0.6056


# **Deep Learning Models**

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional, Dropout,SimpleRNN
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

In [None]:
# Hyper-parameters shared by the recurrent models
vocab_size, max_sequence_length = 10000, 100
embedding_dim, num_epochs = 100, 5

# Word-index vocabulary learned from the training titles only
tokenizer = Tokenizer(oov_token="<OOV>", num_words=vocab_size)
tokenizer.fit_on_texts(X_train_title)
sequences = tokenizer.texts_to_sequences(X_train_title)

# Pad / truncate at the end so every sequence has max_sequence_length entries
X_train_padded = pad_sequences(
    sequences,
    maxlen=max_sequence_length,
    padding='post',
    truncating='post',
)

# Integer-encode the title labels, then one-hot encode them (3 classes)
label_encoder = LabelEncoder().fit(y_train_title)
y_train_encoded = label_encoder.transform(y_train_title)
y_train_one_hot = to_categorical(y_train_encoded, num_classes=3)

# **RNN**

### **Title**

In [None]:
# Create the RNN model.
# mask_zero=True makes the recurrent layers skip the zero padding that
# pad_sequences(padding='post') appends. Without it the last RNN state is
# computed after a long run of padding steps; the run recorded with the
# unmasked model never moved off the majority-class accuracy (~0.40), which is
# presumably caused by this.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                    input_length=max_sequence_length, mask_zero=True))
model.add(SimpleRNN(64, return_sequences=True))
model.add(SimpleRNN(64))
model.add(Dense(3, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model (20% of the training rows are held out for validation)
history = model.fit(X_train_padded, y_train_one_hot, epochs=num_epochs, validation_split=0.2, batch_size=32, verbose=2)

# Evaluate the model on the title test split
sequences_test = tokenizer.texts_to_sequences(X_test_title)
X_test_padded = pad_sequences(sequences_test, maxlen=max_sequence_length, truncating='post', padding='post')
y_pred = model.predict(X_test_padded)

# Convert the predicted class probabilities to labels
y_pred_labels_encoded = np.argmax(y_pred, axis=1)
y_pred_labels = label_encoder.inverse_transform(y_pred_labels_encoded)

# Encode test labels and convert to one-hot encoding
y_test_encoded = label_encoder.transform(y_test_title)
y_test_one_hot = to_categorical(y_test_encoded, num_classes=3)

# Evaluate accuracy
accuracy = np.mean(y_pred_labels_encoded == y_test_encoded)
print(f'Accuracy: {accuracy}')

Epoch 1/5
1857/1857 - 71s - loss: 1.0601 - accuracy: 0.4041 - val_loss: 1.0594 - val_accuracy: 0.3966 - 71s/epoch - 38ms/step
Epoch 2/5
1857/1857 - 67s - loss: 1.0577 - accuracy: 0.4000 - val_loss: 1.0592 - val_accuracy: 0.3966 - 67s/epoch - 36ms/step
Epoch 3/5
1857/1857 - 79s - loss: 1.0575 - accuracy: 0.4015 - val_loss: 1.0637 - val_accuracy: 0.3966 - 79s/epoch - 43ms/step
Epoch 4/5
1857/1857 - 66s - loss: 1.0591 - accuracy: 0.4028 - val_loss: 1.0599 - val_accuracy: 0.3920 - 66s/epoch - 36ms/step
Epoch 5/5
1857/1857 - 67s - loss: 1.0577 - accuracy: 0.4008 - val_loss: 1.0570 - val_accuracy: 0.4084 - 67s/epoch - 36ms/step
Accuracy: 0.40884602952268073


### **Headline**

In [None]:
# Headline-specific inputs.
# This cell previously trained on X_train_padded / y_train_one_hot, which hold
# the *title* sequences and labels, and tokenized the headline test set with
# the title vocabulary — so the "Headline" score came from a title model.
# Separate *_headline names are used so the title variables that the following
# title cells rely on are left untouched.
tokenizer_headline = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer_headline.fit_on_texts(X_train_headline)
X_train_padded_headline = pad_sequences(
    tokenizer_headline.texts_to_sequences(X_train_headline),
    maxlen=max_sequence_length, truncating='post', padding='post')

label_encoder_headline = LabelEncoder()
y_train_one_hot_headline = to_categorical(
    label_encoder_headline.fit_transform(y_train_headline), num_classes=3)

# Create the RNN model.
# mask_zero=True makes the recurrent layers skip the zero padding appended by
# pad_sequences(padding='post') instead of ending on a run of padding steps.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                    input_length=max_sequence_length, mask_zero=True))
model.add(SimpleRNN(64, return_sequences=True))
model.add(SimpleRNN(64))
model.add(Dense(3, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model on the headline data
history = model.fit(X_train_padded_headline, y_train_one_hot_headline, epochs=num_epochs, validation_split=0.2, batch_size=32, verbose=2)

# Evaluate the model on the headline test split
sequences_test = tokenizer_headline.texts_to_sequences(X_test_headline)
X_test_padded = pad_sequences(sequences_test, maxlen=max_sequence_length, truncating='post', padding='post')
y_pred = model.predict(X_test_padded)

# Convert the predicted class probabilities to labels
y_pred_labels_encoded = np.argmax(y_pred, axis=1)
y_pred_labels = label_encoder_headline.inverse_transform(y_pred_labels_encoded)

# Encode test labels and convert to one-hot encoding
y_test_encoded = label_encoder_headline.transform(y_test_headline)
y_test_one_hot = to_categorical(y_test_encoded, num_classes=3)

# Evaluate accuracy
accuracy = np.mean(y_pred_labels_encoded == y_test_encoded)
print(f'Accuracy: {accuracy}')

Epoch 1/5
1857/1857 - 68s - loss: 1.0621 - accuracy: 0.4004 - val_loss: 1.0656 - val_accuracy: 0.3966 - 68s/epoch - 36ms/step
Epoch 2/5
1857/1857 - 67s - loss: 1.0589 - accuracy: 0.4012 - val_loss: 1.0595 - val_accuracy: 0.3966 - 67s/epoch - 36ms/step
Epoch 3/5
1857/1857 - 66s - loss: 1.0579 - accuracy: 0.4019 - val_loss: 1.0578 - val_accuracy: 0.3947 - 66s/epoch - 36ms/step
Epoch 4/5
1857/1857 - 66s - loss: 1.0568 - accuracy: 0.4046 - val_loss: 1.0572 - val_accuracy: 0.3966 - 66s/epoch - 36ms/step
Epoch 5/5
1857/1857 - 67s - loss: 1.0575 - accuracy: 0.4041 - val_loss: 1.0625 - val_accuracy: 0.3966 - 67s/epoch - 36ms/step
Accuracy: 0.5683654778579894


# **LSTM**

### **Title**

In [None]:
# Create the LSTM model.
# mask_zero=True makes the LSTM layers skip the zero padding that
# pad_sequences(padding='post') appends. The run recorded with the unmasked
# model stayed at the majority-class accuracy (~0.40), which is presumably
# caused by the final state being computed after a long run of padding steps.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                    input_length=max_sequence_length, mask_zero=True))
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(64))
model.add(Dense(3, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model (20% of the training rows are held out for validation)
history = model.fit(X_train_padded, y_train_one_hot, epochs=num_epochs, validation_split=0.2, batch_size=32, verbose=2)

# Evaluate the model on the title test split
sequences_test = tokenizer.texts_to_sequences(X_test_title)
X_test_padded = pad_sequences(sequences_test, maxlen=max_sequence_length, truncating='post', padding='post')
y_pred = model.predict(X_test_padded)

# Convert the predicted class probabilities to labels
y_pred_labels_encoded = np.argmax(y_pred, axis=1)
y_pred_labels = label_encoder.inverse_transform(y_pred_labels_encoded)

# Encode test labels and convert to one-hot encoding
y_test_encoded = label_encoder.transform(y_test_title)
y_test_one_hot = to_categorical(y_test_encoded, num_classes=3)

# Evaluate accuracy
accuracy = np.mean(y_pred_labels_encoded == y_test_encoded)
print(f'Accuracy: {accuracy}')

Epoch 1/5
1857/1857 - 166s - loss: 1.0560 - accuracy: 0.4043 - val_loss: 1.0575 - val_accuracy: 0.3966 - 166s/epoch - 90ms/step
Epoch 2/5
1857/1857 - 166s - loss: 1.0552 - accuracy: 0.4057 - val_loss: 1.0578 - val_accuracy: 0.3966 - 166s/epoch - 89ms/step
Epoch 3/5
1857/1857 - 164s - loss: 1.0551 - accuracy: 0.4078 - val_loss: 1.0585 - val_accuracy: 0.3966 - 164s/epoch - 88ms/step
Epoch 4/5
1857/1857 - 161s - loss: 1.0551 - accuracy: 0.4060 - val_loss: 1.0575 - val_accuracy: 0.3966 - 161s/epoch - 87ms/step
Epoch 5/5
1857/1857 - 163s - loss: 1.0549 - accuracy: 0.4069 - val_loss: 1.0571 - val_accuracy: 0.3966 - 163s/epoch - 88ms/step
Accuracy: 0.4090615235427217


### **Headline**

In [None]:
# Headline-specific inputs.
# This cell previously trained on X_train_padded / y_train_one_hot, which hold
# the *title* sequences and labels, and tokenized the headline test set with
# the title vocabulary — so the "Headline" score came from a title model.
# Separate *_headline names are used so the title variables that the following
# title cells rely on are left untouched.
tokenizer_headline = Tokenizer(num_words=vocab_size, oov_token="<OOV>")
tokenizer_headline.fit_on_texts(X_train_headline)
X_train_padded_headline = pad_sequences(
    tokenizer_headline.texts_to_sequences(X_train_headline),
    maxlen=max_sequence_length, truncating='post', padding='post')

label_encoder_headline = LabelEncoder()
y_train_one_hot_headline = to_categorical(
    label_encoder_headline.fit_transform(y_train_headline), num_classes=3)

# Create the LSTM model.
# mask_zero=True makes the LSTM layers skip the zero padding appended by
# pad_sequences(padding='post') instead of ending on a run of padding steps.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                    input_length=max_sequence_length, mask_zero=True))
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(64))
model.add(Dense(3, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model on the headline data
history = model.fit(X_train_padded_headline, y_train_one_hot_headline, epochs=num_epochs, validation_split=0.2, batch_size=32, verbose=2)

# Evaluate the model on the headline test split
sequences_test = tokenizer_headline.texts_to_sequences(X_test_headline)
X_test_padded = pad_sequences(sequences_test, maxlen=max_sequence_length, truncating='post', padding='post')
y_pred = model.predict(X_test_padded)

# Convert the predicted class probabilities to labels
y_pred_labels_encoded = np.argmax(y_pred, axis=1)
y_pred_labels = label_encoder_headline.inverse_transform(y_pred_labels_encoded)

# Encode test labels and convert to one-hot encoding
y_test_encoded = label_encoder_headline.transform(y_test_headline)
y_test_one_hot = to_categorical(y_test_encoded, num_classes=3)

# Evaluate accuracy
accuracy = np.mean(y_pred_labels_encoded == y_test_encoded)
print(f'Accuracy: {accuracy}')

Epoch 1/5
1857/1857 - 169s - loss: 1.0560 - accuracy: 0.4050 - val_loss: 1.0573 - val_accuracy: 0.3966 - 169s/epoch - 91ms/step
Epoch 2/5
1857/1857 - 168s - loss: 1.0554 - accuracy: 0.4059 - val_loss: 1.0571 - val_accuracy: 0.3966 - 168s/epoch - 91ms/step
Epoch 3/5
1857/1857 - 163s - loss: 1.0551 - accuracy: 0.4071 - val_loss: 1.0571 - val_accuracy: 0.3966 - 163s/epoch - 88ms/step
Epoch 4/5
1857/1857 - 161s - loss: 1.0550 - accuracy: 0.4066 - val_loss: 1.0582 - val_accuracy: 0.3966 - 161s/epoch - 87ms/step
Epoch 5/5
1857/1857 - 161s - loss: 1.0551 - accuracy: 0.4063 - val_loss: 1.0569 - val_accuracy: 0.3966 - 161s/epoch - 87ms/step
Accuracy: 0.5683654778579894


## **BiLSTM**
---


### **Title**

In [None]:
# Two stacked bidirectional LSTM layers, each followed by 50% dropout,
# ending in a 3-way softmax classifier
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_sequence_length),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(64)),
    Dropout(0.5),
    Dense(3, activation='softmax'),
])

# Adam optimizer with categorical cross-entropy, tracking accuracy
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Fit on the padded training titles, holding out 20% of them for validation
history = model.fit(
    X_train_padded,
    y_train_one_hot,
    epochs=num_epochs,
    validation_split=0.2,
    batch_size=32,
    verbose=2,
)

# Turn the test titles into padded index sequences and predict class probabilities
sequences_test = tokenizer.texts_to_sequences(X_test_title)
X_test_padded = pad_sequences(
    sequences_test,
    maxlen=max_sequence_length,
    padding='post',
    truncating='post',
)
y_pred = model.predict(X_test_padded)

# Most probable class per row, as an integer code and as the original label
y_pred_labels_encoded = y_pred.argmax(axis=1)
y_pred_labels = label_encoder.inverse_transform(y_pred_labels_encoded)

# Integer and one-hot encodings of the true test labels
y_test_encoded = label_encoder.transform(y_test_title)
y_test_one_hot = to_categorical(y_test_encoded, num_classes=3)

# Share of test rows whose predicted class matches the true class
accuracy = (y_pred_labels_encoded == y_test_encoded).mean()
print(f'Accuracy: {accuracy}')

Epoch 1/5
1857/1857 - 320s - loss: 0.8458 - accuracy: 0.6195 - val_loss: 0.7708 - val_accuracy: 0.6743 - 320s/epoch - 172ms/step
Epoch 2/5
1857/1857 - 307s - loss: 0.7003 - accuracy: 0.7154 - val_loss: 0.7535 - val_accuracy: 0.6831 - 307s/epoch - 165ms/step
Epoch 3/5
1857/1857 - 311s - loss: 0.6114 - accuracy: 0.7572 - val_loss: 0.7850 - val_accuracy: 0.6838 - 311s/epoch - 167ms/step
Epoch 4/5
1857/1857 - 306s - loss: 0.5345 - accuracy: 0.7890 - val_loss: 0.8152 - val_accuracy: 0.6852 - 306s/epoch - 165ms/step
Epoch 5/5
1857/1857 - 308s - loss: 0.4658 - accuracy: 0.8151 - val_loss: 0.9050 - val_accuracy: 0.6827 - 308s/epoch - 166ms/step
Accuracy: 0.6786984161189527


### **Headline**

In [None]:
# Map the Headline training labels to integers, then to 3-way one-hot vectors
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train_headline)
y_train_one_hot = to_categorical(y_train_encoded, num_classes=3)

# Fit a tokenizer on the Headline training text (num_words=vocab_size, with an
# explicit "<OOV>" token) and convert each text to a sequence of integers
tokenizer = Tokenizer(oov_token="<OOV>", num_words=vocab_size)
tokenizer.fit_on_texts(X_train_headline)
sequences = tokenizer.texts_to_sequences(X_train_headline)

# Pad / truncate at the end so every sequence has length max_sequence_length
X_train_padded = pad_sequences(
    sequences,
    maxlen=max_sequence_length,
    padding='post',
    truncating='post',
)

In [None]:
# Fresh BiLSTM classifier for the Headline text, declared as a single layer list:
# embedding -> two bidirectional LSTM layers (dropout after each) -> 3-unit softmax
model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=max_sequence_length,
    ),
    # return_sequences=True hands the per-timestep outputs to the second BiLSTM
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.5),
    Bidirectional(LSTM(64)),
    Dropout(0.5),
    Dense(3, activation='softmax'),
])

# Configure training: Adam optimizer, categorical cross-entropy loss, accuracy metric
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Fit the network; validation_split=0.2 reserves 20% of the training arrays
# for the per-epoch validation metrics
history = model.fit(
    x=X_train_padded,
    y=y_train_one_hot,
    batch_size=32,
    epochs=num_epochs,
    validation_split=0.2,
    verbose=2,
)

# Integer-encode the true Headline test labels and also build their one-hot form
y_test_encoded = label_encoder.transform(y_test_headline)
y_test_one_hot = to_categorical(y_test_encoded, num_classes=3)

# Turn the Headline test texts into padded integer sequences and score them
sequences_test = tokenizer.texts_to_sequences(X_test_headline)
X_test_padded = pad_sequences(
    sequences_test,
    maxlen=max_sequence_length,
    padding='post',
    truncating='post',
)
y_pred = model.predict(X_test_padded)

# Index of the highest score in each row -> encoded label -> original label value
y_pred_labels_encoded = y_pred.argmax(axis=1)
y_pred_labels = label_encoder.inverse_transform(y_pred_labels_encoded)

# Accuracy = share of test rows whose predicted and true encoded labels match
accuracy = np.mean(y_test_encoded == y_pred_labels_encoded)
print(f'Accuracy: {accuracy}')

Epoch 1/5
1857/1857 - 315s - loss: 0.6815 - accuracy: 0.6796 - val_loss: 0.6288 - val_accuracy: 0.7179 - 315s/epoch - 170ms/step
Epoch 2/5
1857/1857 - 308s - loss: 0.5701 - accuracy: 0.7518 - val_loss: 0.6272 - val_accuracy: 0.7221 - 308s/epoch - 166ms/step
Epoch 3/5
1857/1857 - 307s - loss: 0.4980 - accuracy: 0.7893 - val_loss: 0.6583 - val_accuracy: 0.7151 - 307s/epoch - 165ms/step
Epoch 4/5
1857/1857 - 307s - loss: 0.4267 - accuracy: 0.8215 - val_loss: 0.7096 - val_accuracy: 0.7132 - 307s/epoch - 165ms/step
Epoch 5/5
1857/1857 - 307s - loss: 0.3606 - accuracy: 0.8529 - val_loss: 0.7716 - val_accuracy: 0.7098 - 307s/epoch - 165ms/step
Accuracy: 0.7070897532593471


# **Model Performance**
----
*Sentiment Analysis:*

- The Random Forest model performed best, with an accuracy of 70% on Title and 72% on Headline.


- While the RNN and LSTM models performed very poorly, the BiLSTM model showed a significant improvement and gave much better results (accuracy of 68% on Title and 71% on Headline).


# **Conclusions**
---

**Business Problem Solutions:**

> **Based on inferences from EDA, it is recommended to post:**
-	News on Facebook on the topic Obama between 14:00 and midnight on a Saturday, to improve the chances of it being popular.
-	News on LinkedIn on the topic Microsoft, on a Monday, to improve the chances of it being popular.

> **Based on inferences from the models:**
-	Before publishing a news item, the models can be used to check its popularity on a particular platform and how it will be perceived there.
-	The higher the popularity score, the better. Although news with negative sentiment is seen to be more popular, news with positive sentiment would tend to have a greater impact.
