<a href="https://colab.research.google.com/github/fahminmahili/sentiment-analysis-of-text-using-naive-bayes/blob/main/NaiveBayestoSentimentAnalysisofText.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


TASK 1: Implementing Naive Bayes Algorithm from Scratch

In [None]:
import os

# Define folder paths
neg_folder_path = '/content/drive/MyDrive/txt_sentoken/neg'
pos_folder_path = '/content/drive/MyDrive/txt_sentoken/pos'

def load_data(folder_path, label):
    data = []
    for filename in os.listdir(folder_path):
        with open(os.path.join(folder_path, filename), 'r', encoding='latin1') as file:
            text = file.read()
            data.append((text.lower(), label))
    return data

# Load positive and negative data
pos_data = load_data(pos_folder_path, 'positive')
neg_data = load_data(neg_folder_path, 'negative')

# Combine datasets
full_data = pos_data + neg_data


In [None]:
from sklearn.model_selection import train_test_split

# Split data
train_data, test_data = train_test_split(full_data, test_size=0.2, random_state=42)


In [None]:
from collections import defaultdict
import numpy as np

class NaiveBayes:
    def __init__(self):
        self.class_probabilities = defaultdict(float)
        self.word_given_class_probabilities = defaultdict(lambda: defaultdict(float))
        self.vocabulary = set()

    def train(self, data):
        # Count class occurrences and word occurrences
        class_counts = defaultdict(int)
        word_counts = defaultdict(lambda: defaultdict(int))

        for text, label in data:
            class_counts[label] += 1
            for word in text.split():
                word_counts[label][word] += 1
                self.vocabulary.add(word)

        # Calculate probabilities
        total_documents = len(data)
        for label, count in class_counts.items():
            self.class_probabilities[label] = count / total_documents
            for word in self.vocabulary:
                self.word_given_class_probabilities[label][word] = (
                    (word_counts[label][word] + 1) /
                    (sum(word_counts[label].values()) + len(self.vocabulary))
                )

    def predict(self, text):
        # Calculate class scores
        scores = {label: np.log(self.class_probabilities[label]) for label in self.class_probabilities}

        for word in text.split():
            if word in self.vocabulary:
                for label in scores:
                    scores[label] += np.log(self.word_given_class_probabilities[label][word])

        # Return the label with the highest score
        return max(scores, key=scores.get)


In [None]:
# Initialize and train the model
nb_model = NaiveBayes()
nb_model.train(train_data)


In [None]:
from sklearn.metrics import accuracy_score

# Make predictions on test data
test_texts, test_labels = zip(*test_data)
predictions = [nb_model.predict(text) for text in test_texts]

# Evaluate accuracy
accuracy = accuracy_score(test_labels, predictions)
print(f'Accuracy From Scratch: {accuracy * 100:.2f}%')


Accuracy From Scratch: 79.60%


TASK 2: Implementing Naive Bayes Algorithm using Python Library

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Function to load data from a folder
def load_data(folder_path, label):
    data = []
    for filename in os.listdir(folder_path):
        with open(os.path.join(folder_path, filename), 'r', encoding='utf-8') as file:
            text = file.read()
            data.append((text, label))
    return data

# Paths to the "neg" and "pos" folders
neg_folder_path = '/content/drive/MyDrive/txt_sentoken/neg'
pos_folder_path = '/content/drive/MyDrive/txt_sentoken/pos'

# Load positive and negative data
pos_data = load_data(pos_folder_path, 'positive')
neg_data = load_data(neg_folder_path, 'negative')

# Combine datasets
full_data = pos_data + neg_data

# Split data
train_data, test_data = train_test_split(full_data, test_size=0.2, random_state=42)

# Extract texts and labels
train_texts, train_labels = zip(*train_data)
test_texts, test_labels = zip(*test_data)

# Create a pipeline with CountVectorizer and Multinomial Naive Bayes
nb_pipeline = make_pipeline(CountVectorizer(), MultinomialNB())

# Train the model
nb_pipeline.fit(train_texts, train_labels)

# Make predictions
nb_predictions = nb_pipeline.predict(test_texts)

# Evaluate accuracy
nb_accuracy = accuracy_score(test_labels, nb_predictions)
print(f'Accuracy using Python library: {nb_accuracy * 100:.2f}%')


Accuracy using Python library: 78.86%
