# Assignment 2

## Group members
- Axel Gustafsson - gusgusaxa@student.gu.se
- Erdem Halil - gushaliler@student.gu.se

#### 1. Reading the review data

In [336]:
def read_documents(doc_file):
    """Read a whitespace-tokenised review file into documents and labels.

    Each non-blank line is split on whitespace. The second field is used
    as the label and every field from the fourth onwards as the document's
    tokens; the first and third fields are not used by this function.

    Blank lines are skipped so that a stray empty line (for example at the
    end of the file) does not raise an ``IndexError``.

    Args:
        doc_file: Path of a UTF-8 encoded text file, one review per line.

    Returns:
        A ``(docs, labels)`` tuple of parallel lists: ``docs[i]`` is the
        token list of the i-th review and ``labels[i]`` is its label.

    Raises:
        IndexError: If a non-blank line has fewer than two fields.
    """
    docs = []
    labels = []
    with open(doc_file, encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Nothing to parse on an empty/whitespace-only line.
                continue
            labels.append(fields[1])
            docs.append(fields[3:])
    return docs, labels

# Load every review (token lists and their labels) from the corpus file.
all_docs, all_labels = read_documents('reviews.txt')

# Use the first 80% of the reviews (in file order) for training and hold
# out the remaining 20% for validation.
split_point = int(0.80*len(all_docs))
train_docs = all_docs[:split_point]
train_labels = all_labels[:split_point]  
val_docs = all_docs[split_point:]
val_labels = all_labels[split_point:]

#### 2. Estimating parameters for the Naive Bayes classifier

In [337]:
from collections import Counter
import numpy as np

def train_nb(documents, labels):
    """Estimate Naive Bayes parameters with add-one (Laplace) smoothing.

    Args:
        documents: Sequence of token lists, parallel to ``labels``.
        labels: Class label of each document.

    Returns:
        A ``(prior, likelihood)`` tuple. ``prior`` is a ``Counter`` mapping
        each label to its log prior probability. ``likelihood`` is a dict
        such that ``likelihood[word][label]`` is the smoothed log
        probability of ``word`` given ``label``; every word seen in
        training has an entry for every label.
    """
    n_docs = len(labels)

    # Log prior: relative frequency of each class among the documents.
    prior = Counter(labels)
    classes = list(prior)
    for cls in classes:
        prior[cls] = np.log(prior[cls] / n_docs)

    # Raw counts: occurrences of each token per class, and the total
    # number of tokens observed in each class.
    tokens_per_class = dict.fromkeys(classes, 0)
    counts = {}
    for tokens, cls in zip(documents, labels):
        for token in tokens:
            per_class = counts.get(token)
            if per_class is None:
                per_class = counts[token] = dict.fromkeys(classes, 0)
            per_class[cls] += 1
            tokens_per_class[cls] += 1

    # Every distinct training token is a key of ``counts``.
    vocab_size = len(counts)

    # Add-one smoothing: +1 in the numerator, +|V| in the denominator.
    likelihood = {
        token: {
            cls: np.log((seen + 1) / (tokens_per_class[cls] + vocab_size))
            for cls, seen in per_class.items()
        }
        for token, per_class in counts.items()
    }

    return prior, likelihood

# Fit the classifier on the training split; `prior` and `likelihood` are
# reused by the sanity checks and the evaluation cells below.
prior, likelihood = train_nb(train_docs, train_labels)

#### 3. Classifying new documents

In [338]:
def score_doc_label(document, label, prior, likelihood):
    """Return the log score of ``document`` under class ``label``.

    The score is the log prior of ``label`` plus the log likelihood of
    every token of ``document`` that has an entry in ``likelihood``.
    Tokens without an entry contribute nothing.

    Args:
        document: Iterable of tokens.
        label: Class to score the document against.
        prior: Mapping from label to log prior.
        likelihood: Mapping ``word -> {label: log likelihood}``.

    Returns:
        The accumulated log score.
    """
    total = prior[label]
    for token in document:
        per_class = likelihood.get(token)
        if per_class is None:
            # Token has no entry in the model: skip it.
            continue
        total += per_class[label]
    return total

# Sanity check 1
# Score single-word documents against both classes. np.exp converts the
# log score back to prior * likelihood, so the two values for one word can
# be compared directly.
print("Sanity check 1")
great_pos_score = np.exp(score_doc_label(['great'], 'pos', prior, likelihood))
print(f"Great with positive label: {great_pos_score}")
great_neg_score = np.exp(score_doc_label(['great'], 'neg', prior, likelihood))
print(f"Great with negative label: {great_neg_score}")

bad_pos_score = np.exp(score_doc_label(['bad'], 'pos', prior, likelihood))
print(f"Bad with positive label: {bad_pos_score}")
bad_neg_score = np.exp(score_doc_label(['bad'], 'neg', prior, likelihood))
print(f"Bad with negative label: {bad_neg_score}")

# Sanity check 2
# Score a short multi-word document against both classes; tokens without
# an entry in `likelihood` are skipped by score_doc_label instead of
# raising an error.
print("\nSanity check 2")
crash_pos_score = np.exp(score_doc_label(['a', 'top-quality', 'performance'], 'pos', prior, likelihood))
print(crash_pos_score)
crash_neg_score = np.exp(score_doc_label(['a', 'top-quality', 'performance'], 'neg', prior, likelihood))
print(crash_neg_score)

Sanity check 1
Great with positive label: 0.0013212141496043825
Great with negative label: 0.0005283997934747295
Bad with positive label: 0.00017230368700664423
Bad with negative label: 0.0004547440646873432

Sanity check 2
2.739162445438219e-06
1.3890482802798863e-06


In [339]:
def classify_nb(document, prior, likelihood):
    """Return the label with the highest Naive Bayes score for ``document``.

    Each label in ``prior`` is scored with ``score_doc_label``. On a tie
    the label that comes first in ``prior`` wins, because a later label
    must score strictly higher to replace the current best. ``None`` is
    returned if no label scores above negative infinity (for instance
    when ``prior`` is empty).
    """
    winner, top_score = None, float('-inf')
    for candidate in prior:
        candidate_score = score_doc_label(document, candidate, prior, likelihood)
        if candidate_score > top_score:
            winner, top_score = candidate, candidate_score
    return winner

# Sanity check
# Classify two hand-written token lists: the first mixes positive and
# negative words, the second uses only positive-sounding words.
test_label = classify_nb(["good", "movie", "best", "don't", "disappointing", "watch", "fantastic"], prior, likelihood)
print(test_label)

test_label = classify_nb(["I", "love", "this", "movie", "definitely", "recommend", "fantastic"], prior, likelihood)
print(test_label)

neg
pos


#### 4. Evaluating the classifier

In [340]:
def classify_documents(documents, prior, likelihood):
    """Classify every document and return the predicted labels in order.

    Args:
        documents: Iterable of token lists.
        prior: Mapping from label to log prior.
        likelihood: Mapping ``word -> {label: log likelihood}``.

    Returns:
        A list with one predicted label per input document.
    """
    return [classify_nb(tokens, prior, likelihood) for tokens in documents]

# Sanity check
# Predict a label for every validation document; the predictions are kept
# in `test_classification` for the accuracy and F1 cells below.
test_classification = classify_documents(val_docs, prior, likelihood)
print(test_classification)

['neg', 'pos', 'neg', 'pos', 'pos', 'neg', 'pos', 'neg', 'neg', 'pos', 'pos', 'pos', 'pos', 'neg', 'neg', 'neg', 'pos', 'neg', 'neg', 'pos', 'pos', 'pos', 'neg', 'pos', 'neg', 'neg', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'neg', 'pos', 'pos', 'neg', 'neg', 'pos', 'pos', 'pos', 'pos', 'neg', 'neg', 'neg', 'pos', 'neg', 'neg', 'pos', 'pos', 'pos', 'pos', 'pos', 'pos', 'neg', 'neg', 'neg', 'pos', 'neg', 'pos', 'neg', 'neg', 'neg', 'pos', 'neg', 'neg', 'neg', 'pos', 'neg', 'neg', 'pos', 'neg', 'pos', 'pos', 'neg', 'neg', 'neg', 'neg', 'neg', 'pos', 'neg', 'neg', 'pos', 'pos', 'pos', 'neg', 'neg', 'pos', 'pos', 'neg', 'pos', 'pos', 'neg', 'pos', 'pos', 'pos', 'neg', 'neg', 'pos', 'neg', 'neg', 'neg', 'neg', 'pos', 'pos', 'neg', 'pos', 'pos', 'pos', 'neg', 'pos', 'neg', 'neg', 'pos', 'pos', 'neg', 'neg', 'neg', 'neg', 'neg', 'pos', 'neg', 'neg', 'pos', 'neg', 'neg', 'neg', 'pos', 'pos', 'neg', 'pos', 'pos', 'neg', 'pos', 'neg', 'pos', 'pos', 'pos', 'neg'

In [341]:
def accuracy(true_labels, guessed_labels):
    """Return the fraction of ``true_labels`` that were guessed correctly.

    Labels are compared pairwise in order; the number of matches is
    divided by ``len(true_labels)``.

    Raises:
        ZeroDivisionError: If ``true_labels`` is empty.
    """
    matches = sum(
        1
        for expected, predicted in zip(true_labels, guessed_labels)
        if expected == predicted
    )
    return matches / len(true_labels)

# Accuracy of the validation predictions against the gold labels.
print(accuracy(val_labels, test_classification))

0.8153587914393622


In [342]:
# The positive class defaults to 'pos'; pass `positive_label` to score a
# different class.
def f1_score(true_labels, guessed_labels, positive_label='pos'):
    """Return the F1 score of ``positive_label``.

    F1 is the harmonic mean of precision and recall for the positive
    class. Every label other than ``positive_label`` is treated as
    negative, so the function also works when the label set is not
    exactly ``{'pos', 'neg'}``.

    Args:
        true_labels: Gold labels.
        guessed_labels: Predicted labels, parallel to ``true_labels``.
        positive_label: The label counted as the positive class.

    Returns:
        The F1 score as a float, or ``0.0`` when there are no true
        positives (precision and/or recall would be zero or undefined).
    """
    # tp -> true positive, fp -> false positive, fn -> false negative
    tp = fp = fn = 0
    for true, guessed in zip(true_labels, guessed_labels):
        if true == positive_label:
            if guessed == positive_label:
                tp += 1
            else:
                fn += 1
        elif guessed == positive_label:
            fp += 1

    # Without a single true positive the ratios below would either be 0
    # or divide by zero; report 0.0 instead of raising.
    if tp == 0:
        return 0.0

    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    return 2 * precision * recall / (precision + recall)

# F1 score of the 'pos' class for the same validation predictions.
print(f1_score(val_labels, test_classification))

0.8047914818101153


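Accuracy is the fraction of all validation reviews that received the correct label, so correct `pos` and correct `neg` predictions count equally. The F1 score is the harmonic mean of precision and recall for the `pos` class only, so it ignores true negatives and penalises false positives and false negatives. On this validation split the two measures are close (accuracy ≈ 0.815, F1 ≈ 0.805).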

#### 5. Error analysis

#### 6. Cross validation

In [343]:
def k_fold_cross_validation(all_docs, all_labels, N=10):
    """Run N-fold cross-validation and return the accuracy of each fold.

    The data is cut into ``N`` contiguous folds in its given order. Each
    fold is used once as the validation set while a model is trained on
    all remaining documents.

    Args:
        all_docs: List of token lists.
        all_labels: List of labels, parallel to ``all_docs``.
        N: Number of folds.

    Returns:
        A list with ``N`` accuracies, one per fold.
    """
    total = len(all_docs)
    fold_accuracies = []
    for k in range(N):
        # Fold k covers the half-open index range [lo, hi).
        lo = int(k / N * total)
        hi = int((k + 1) / N * total)

        held_out_docs = all_docs[lo:hi]
        held_out_labels = all_labels[lo:hi]
        fit_docs = all_docs[:lo] + all_docs[hi:]
        fit_labels = all_labels[:lo] + all_labels[hi:]

        # Train on everything outside the fold, evaluate on the fold.
        prior, likelihood = train_nb(fit_docs, fit_labels)
        predictions = classify_documents(held_out_docs, prior, likelihood)
        fold_accuracies.append(accuracy(held_out_labels, predictions))

    return fold_accuracies
# Run 10-fold cross-validation over the full data set.
N = 10
acc = k_fold_cross_validation(all_docs, all_labels, N)
# Get the average accuracy (N = 10)
sum(acc) / len(acc)

0.8076216900805256

In [344]:
def leave_one_out_cross_validation(all_docs, all_labels, iterations=10):
    """Hold out single documents in turn and return each run's accuracy.

    Only the first ``iterations`` documents are held out, one at a time;
    for each of them a model is trained on all other documents and used
    to classify the held-out one.

    Args:
        all_docs: List of token lists.
        all_labels: List of labels, parallel to ``all_docs``.
        iterations: How many leading documents to hold out.

    Returns:
        A list of ``iterations`` accuracies, each computed on a single
        document (1.0 for a correct prediction, 0.0 otherwise).
    """
    results = []
    for held_out in range(iterations):
        # Everything except the document at index `held_out` is training data.
        rest_docs = all_docs[:held_out] + all_docs[held_out + 1:]
        rest_labels = all_labels[:held_out] + all_labels[held_out + 1:]
        single_doc = [all_docs[held_out]]
        single_label = [all_labels[held_out]]

        prior, likelihood = train_nb(rest_docs, rest_labels)
        prediction = classify_documents(single_doc, prior, likelihood)
        results.append(accuracy(single_label, prediction))

    return results

# Uses the default iterations=10, i.e. only the first 10 documents are
# held out (one per run).
acc = leave_one_out_cross_validation(all_docs, all_labels)
# Get the average accuracy
sum(acc) / len(acc)

0.8

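N-fold cross-validation splits the data into N contiguous folds and uses each fold once for validation while training on the rest, so every document is evaluated exactly once. Leave-one-out is the extreme case in which each validation set is a single document and the model is retrained for every held-out document. Note that the run above only holds out the first 10 documents (`iterations=10`), so its average of 0.8 is based on ten predictions and is a much noisier estimate than the 10-fold average of about 0.808.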

#### 7. Domain sensitivity

In [346]:
def read_documents(doc_file, topics):
    """Read the reviews of selected topics, grouped by topic.

    Each non-blank line is split on whitespace. The first field is the
    topic, the second the label, and every field from the fourth onwards
    is a token of the document. Lines whose topic is not in ``topics``
    are ignored.

    Blank lines are skipped so that a stray empty line does not raise an
    ``IndexError``.

    Args:
        doc_file: Path of a UTF-8 encoded text file, one review per line.
        topics: Iterable of topic names to keep.

    Returns:
        A ``(docs, labels)`` tuple of dicts keyed by topic:
        ``docs[topic]`` is the list of token lists for that topic and
        ``labels[topic]`` is the parallel list of labels. Every requested
        topic has an entry, which is empty if no review matched.

    Raises:
        IndexError: If a kept, non-blank line has fewer than two fields.
    """
    # dict -> {topic: [list of documents]}
    docs = {topic: [] for topic in topics}
    labels = {topic: [] for topic in topics}
    with open(doc_file, encoding='utf-8') as f:
        for line in f:
            fields = line.split()
            if not fields:
                # Nothing to parse on an empty/whitespace-only line.
                continue
            topic = fields[0]
            if topic not in docs:
                continue
            docs[topic].append(fields[3:])
            labels[topic].append(fields[1])
    return docs, labels

# Get only camera and books reviews
# NOTE: from here on `all_docs` / `all_labels` are dicts keyed by topic,
# not the flat lists used in the earlier cells.
all_docs, all_labels = read_documents('reviews.txt', ["camera", "books"])

training_data = {}
# For each topic, split the data into train and validation sets
# (first 80% of the topic's reviews for training, last 20% for validation)
for topic in all_docs:
    split_point = int(0.80*len(all_docs[topic]))
    train_docs = all_docs[topic][:split_point]
    train_labels = all_labels[topic][:split_point]  
    val_docs = all_docs[topic][split_point:]
    val_labels = all_labels[topic][split_point:]
    # Store the data in the dictionary
    # dict -> {topic: (train_docs, train_labels, val_docs, val_labels)}
    # e.g. training_data["camera"][0] = train_docs
    #      training_data["camera"][1] = train_labels
    #      training_data["camera"][2] = val_docs
    #      training_data["camera"][3] = val_labels
    training_data[topic] = (train_docs, train_labels, val_docs, val_labels)

# In-domain baseline: train on the camera training split and classify the
# camera validation split.
prior, likelihood = train_nb(training_data["camera"][0], training_data["camera"][1])
guessed_labels = classify_documents(training_data["camera"][2], prior, likelihood)
print(f'Accuracy for camera: {accuracy(training_data["camera"][3], guessed_labels)}')

# Cross-domain: train on the books training split, then classify the same
# camera validation split.
# NOTE(review): the message below says "software", but the model is
# trained on "books" (only camera and books are loaded above) - confirm
# the intended wording.
prior, likelihood = train_nb(training_data["books"][0], training_data["books"][1])
guessed_labels = classify_documents(training_data["camera"][2], prior, likelihood)
print(f'Accuracy for camera applied to a software : {accuracy(training_data["camera"][3], guessed_labels)}')

Accuracy for camera: 0.8675
Accuracy for camera applied to a software : 0.65


#### 8. Naive Bayes for numerical data

In [347]:
# Expected file layout (one header row, then one sample per line):
# sepal_length,sepal_width,petal_length,petal_width,species
# 5.1,3.5,1.4,0.2,setosa
def read_data(data_file):
    """Read a comma-separated data file into feature rows and labels.

    The first line is treated as a header and skipped. On every other
    non-blank line the last comma-separated field is the label and all
    preceding fields are the features. Values are returned as the strings
    found in the file; no numeric conversion is done here.

    Blank lines are skipped (they would otherwise yield an empty label
    with no features), and an empty file yields two empty lists instead
    of raising ``StopIteration``.

    Args:
        data_file: Path of a UTF-8 encoded, comma-separated text file.

    Returns:
        A ``(parts, labels)`` tuple of parallel lists: ``parts[i]`` is the
        list of feature strings of the i-th sample and ``labels[i]`` is
        its label.
    """
    parts = []
    labels = []
    with open(data_file, encoding='utf-8') as f:
        # Skip the header; the default keeps an empty file from raising.
        next(f, None)
        for line in f:
            row = line.strip()
            if not row:
                continue
            fields = row.split(',')
            labels.append(fields[-1])
            parts.append(fields[:-1])
    return parts, labels

# Load the feature rows and their labels from the CSV file.
parts, labels = read_data('iris.csv')

In [348]:
def train_nb(parts, labels):
    """Estimate Naive Bayes parameters from rows of feature values.

    Every feature value is treated as a discrete token, exactly like a
    word in the text classifier: values are counted per class and turned
    into add-one smoothed log likelihoods.

    NOTE(review): values are compared as-is (no binning and no per-column
    distinction), so identical values in different columns share one
    count - confirm this is the intended model for numerical data.

    Args:
        parts: Sequence of feature-value lists, parallel to ``labels``.
        labels: Class label of each row.

    Returns:
        A ``(prior, likelihood)`` tuple. ``prior`` is a ``Counter``
        mapping each label to its log prior. ``likelihood`` maps each
        observed value to a ``Counter`` holding, for *every* label, the
        smoothed log probability of that value given the label.
    """
    n_examples = len(labels)
    class_counts = Counter(labels)

    # Raw counts: occurrences of each value per class, and the total
    # number of values observed in each class.
    value_counts = {}
    totals = Counter()
    for features, label in zip(parts, labels):
        for value in features:
            if value not in value_counts:
                value_counts[value] = Counter()
            value_counts[value][label] += 1
            totals[label] += 1

    # Number of distinct values seen in training.
    vocab = len(value_counts)

    # Smooth for every (value, label) pair, including pairs that never
    # co-occurred. Leaving those out would make the Counter return its
    # default 0, i.e. log-probability 0 (probability 1), for them.
    likelihood = {}
    for value, per_class in value_counts.items():
        smoothed = Counter()
        for label in class_counts:
            smoothed[label] = np.log(
                (per_class[label] + 1) / (totals[label] + vocab)
            )
        likelihood[value] = smoothed

    # Log prior: relative frequency of each class among the rows.
    prior = Counter()
    for label, count in class_counts.items():
        prior[label] = np.log(count / n_examples)

    return prior, likelihood