<a href="https://colab.research.google.com/github/georgeto20/Naive_Bayes/blob/master/Naive_Bayes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Mount Google Drive at /content/gdrive; the dataset paths used by the
# experiment cells all live under that mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
def get_vocabulary(D):
    """
    Build the vocabulary for a collection of tokenized documents.

    D is a list of documents, each of which is a list of tokens. The
    result is the set of tokens that occur more than once across the whole
    collection, together with the special "<unk>" token.
    """
    frequency = {}
    for tokens in D:
        for tok in tokens:
            frequency[tok] = frequency.get(tok, 0) + 1
    vocabulary = {tok for tok, seen in frequency.items() if seen > 1}
    # "<unk>" is always part of the vocabulary, however often it occurs in D.
    vocabulary.add("<unk>")
    return vocabulary

In [0]:
class BBoWFeaturizer(object):
    def convert_document_to_feature_dictionary(self, doc, vocab):
        """
        Compute the binary bag-of-words representation of a document.

        doc is a list of tokens and vocab is a set of tokens. Every token
        that is in vocab becomes a feature with value 1; all other tokens
        share the single feature "<unk>", also with value 1. The result maps
        feature name to feature value.
        """
        return {(tok if tok in vocab else "<unk>"): 1 for tok in doc}

In [0]:
class CBoWFeaturizer(object):
    def convert_document_to_feature_dictionary(self, doc, vocab):
        """
        Compute the count bag-of-words representation of a document.

        doc is a list of tokens and vocab is a set of tokens. The result maps
        each in-vocabulary token to the number of times it occurs in doc;
        occurrences of all out-of-vocabulary tokens are pooled under the
        "<unk>" feature.
        """
        counts = {}
        for tok in doc:
            feature = tok if tok in vocab else "<unk>"
            counts[feature] = counts.get(feature, 0) + 1
        return counts

In [0]:
import numpy as np

def compute_idf(D, vocab):
    """
    Compute inverse document frequencies over a document collection.

    Each document in D is treated as a sequence of token sequences, which
    are flattened into one token list. Every distinct token of a document is
    counted once for that document; tokens outside vocab are pooled under
    "<unk>", which is incremented at most once per document on their behalf.

    Returns a dictionary mapping each counted token (or "<unk>") to
    log(len(D) / number of documents it was counted in). Vocabulary tokens
    that never occur in D get no entry.
    """
    doc_freq = {}
    for doc in D:
        flattened = []
        for content in doc:
            flattened.extend(content)
        counted_unknown = False
        for tok in set(flattened):
            if tok in vocab:
                doc_freq[tok] = doc_freq.get(tok, 0) + 1
            elif not counted_unknown:
                doc_freq["<unk>"] = doc_freq.get("<unk>", 0) + 1
                counted_unknown = True
    n_docs = len(D)
    log_ratios = np.log([n_docs / df for df in doc_freq.values()])
    return dict(zip(doc_freq.keys(), log_ratios))
    
class TFIDFFeaturizer(object):
    def __init__(self, idf):
        """Store the IDF scores, a mapping from feature name to IDF value."""
        self.idf = idf

    def convert_document_to_feature_dictionary(self, doc, vocab):
        """
        Compute the TF-IDF representation of a document.

        doc is a list of tokens and vocab is a set of tokens. Term counts are
        gathered as in the count bag-of-words (out-of-vocabulary tokens are
        pooled under "<unk>"), then each count is multiplied by the stored
        IDF of that feature. A feature without a stored IDF raises KeyError.
        """
        term_counts = {}
        for tok in doc:
            feature = tok if tok in vocab else "<unk>"
            term_counts[feature] = term_counts.get(feature, 0) + 1
        return {
            feature: count * self.idf[feature]
            for feature, count in term_counts.items()
        }

In [0]:
import pandas as pd
import ast
from collections import Counter

def load_dataset(file_path, counts=False, hashtag=False, at=False):
    """
    Read a CSV file into parallel lists of documents and labels.

    Every row produces one document of the form
    [text tokens, hashtag count, hashtag tokens, at count, at tokens]:

    * text tokens: the 'text' column split on whitespace ([] if not a str).
    * hashtag count / at count: str() of the 'hashcount' / 'atcount'
      columns when counts is true, otherwise "".
    * hashtag tokens: when hashtag is true and 'hashcontent' is a str, the
      column is parsed as a Python literal and each element is stripped.
    * at tokens: when at is true and 'atcontent' is a str, the column with
      '.' and ',' removed, split on whitespace.

    Returns (D, y) where y holds the 'label' column values in row order.
    """
    table = pd.read_csv(file_path)
    documents = []
    labels = []
    for _, record in table.iterrows():
        text = record['text']
        tokens = text.split() if type(text) is str else []

        if counts:
            n_hashtags = str(record['hashcount'])
            n_mentions = str(record['atcount'])
        else:
            n_hashtags = ""
            n_mentions = ""

        hashtags = []
        if hashtag and type(record['hashcontent']) is str:
            hashtags = [tag.strip() for tag in ast.literal_eval(record['hashcontent'])]

        mentions = []
        if at and type(record['atcontent']) is str:
            mentions = record['atcontent'].replace('.', '').replace(',', '').split()

        documents.append([tokens, n_hashtags, hashtags, n_mentions, mentions])
        labels.append(record['label'])
    return documents, labels

def convert_to_features(D, featurizer, vocab, hashtag_vocab, at_vocab):
    """
    Turn every document in D into a single feature dictionary.

    Each document is indexed as [text, hashtag count, hashtag tokens,
    at count, at tokens]. The text is always featurized against vocab. A
    non-empty count string is added as a feature of its own with value 1.
    Hashtag / at tokens are featurized only when the corresponding
    vocabulary has more than one entry. All parts are merged by Counter
    addition, so features sharing a name are summed.
    """
    X = []
    for doc in D:
        features = Counter(
            featurizer.convert_document_to_feature_dictionary(doc[0], vocab))
        extras = []
        if len(doc[1]) > 0:
            extras.append(Counter({doc[1]: 1}))
        if len(hashtag_vocab) > 1:
            extras.append(Counter(
                featurizer.convert_document_to_feature_dictionary(doc[2], hashtag_vocab)))
        if len(doc[3]) > 0:
            extras.append(Counter({doc[3]: 1}))
        if len(at_vocab) > 1:
            extras.append(Counter(
                featurizer.convert_document_to_feature_dictionary(doc[4], at_vocab)))
        # In-place Counter addition also discards entries whose total is not
        # positive, exactly as the chained += did.
        for extra in extras:
            features += extra
        X.append(dict(features))
    return X

In [0]:
from collections import Counter
# Scratch check of Counter addition: counts of shared keys are summed and
# keys present in only one operand are kept.
a = Counter({1: 2, 2: 3})
b = Counter({1: 3, 3: 4})
dict(a + b)

{1: 5, 2: 3, 3: 4}

In [0]:
def train_naive_bayes(X, y, k, vocab):
    """
    Compute the statistics for the Naive Bayes classifier.

    X is a list of feature representations, where each representation
    is a dictionary that maps from the feature name to the value.
    y is a list of labels, one per entry of X.
    k is a float which is the smoothing parameter.
    vocab is the set of vocabulary tokens.

    Returns two values:
        p_y: A dictionary from the label to the corresponding p(y) score.
        p_v_y: A nested dictionary where the outer dictionary's key is
            the label and the inner dictionary maps from a token of vocab
            to the smoothed probability p(v|y). For example,
            `p_v_y[1]["hello"]` is p(v="hello"|y=1).

    The per-class normaliser sums the values of every feature seen for that
    class, including features that are not in vocab; only tokens in vocab
    receive an entry in p_v_y.
    """
    unique_y = set(y)
    total = len(y)

    # Prior: every training label contributes 1/len(y) to its class.
    p_y = {label: 0 for label in unique_y}
    for label in y:
        p_y[label] += 1 / total

    # Per-class feature totals and the overall feature mass of each class.
    feature_mass = {label: {} for label in unique_y}
    class_mass = {label: 0 for label in unique_y}
    for idx, feat_rep in enumerate(X):
        label = y[idx]
        per_class = feature_mass[label]
        for token, value in feat_rep.items():
            if token in per_class:
                per_class[token] += value
            else:
                per_class[token] = value
            class_mass[label] += value

    # Add-k smoothing over the vocabulary.
    vocab_size = len(vocab)
    p_v_y = {}
    for label, per_class in feature_mass.items():
        normalizer = class_mass[label] + k * vocab_size
        p_v_y[label] = {
            token: ((k + per_class[token]) if token in per_class else k) / normalizer
            for token in vocab
        }
    return p_y, p_v_y

In [0]:
def predict_naive_bayes(D, p_y, p_v_y):
    """
    Run the prediction rule for Naive Bayes.

    D is a list of documents. Each document is iterated as a sequence of
    token sequences, which are flattened into one token list before scoring.
    p_y and p_v_y are the outputs of `train_naive_bayes`.

    Any token which is not in p_v_y[label] is scored with that label's
    "<unk>" probability. All arithmetic is done on log-probabilities, for
    both the arg-max and the confidence, so long documents do not underflow.

    Returns two values:
        predictions: A list of labels, one for each document. On ties the
            label that comes first in p_y wins.
        confidences: A list of floats, one for each document, that is
            p(y|d) for the corresponding label that is returned.
    """
    labels = list(p_y.keys())
    predictions = []
    confidences = []
    for doc in D:
        tokens = [token for content in doc for token in content]

        # joint[i] = log p(d | labels[i]) + log p(labels[i])
        joint = []
        for label in labels:
            likelihood = p_v_y[label]
            log_likelihood = 0
            for token in tokens:
                if token in likelihood:
                    log_likelihood += np.log(likelihood[token])
                else:
                    log_likelihood += np.log(likelihood["<unk>"])
            joint.append(log_likelihood + np.log(p_y[label]))

        # Strict ">" keeps the earliest label on ties.
        best = 0
        for idx in range(1, len(labels)):
            if joint[idx] > joint[best]:
                best = idx
        predictions.append(labels[best])

        peak = joint[best]
        if np.isneginf(peak):
            # Every label has probability exactly zero, so the posterior is
            # undefined; keep the historical fallback value of 1.
            confidences.append(1.0)
        else:
            # p(y*|d) = exp(peak) / sum_y exp(joint_y), evaluated relative
            # to the peak so no exponent is positive and nothing underflows
            # to a zero denominator.
            confidences.append(float(1.0 / np.sum(np.exp(np.array(joint) - peak))))
    return predictions, confidences

In [0]:
# FIRST EXPERIMENT
# Load every split with the count, hashtag and @-mention fields enabled.
D_train, y_train = load_dataset(
    '/content/gdrive/Shared drives/CIS519/data/new data/train_20_now.csv',
    counts=True, hashtag=True, at=True)
D_valid, y_valid = load_dataset(
    '/content/gdrive/Shared drives/CIS519/data/new data/validation_20_now.csv',
    counts=True, hashtag=True, at=True)
D_test, y_test = load_dataset(
    '/content/gdrive/Shared drives/CIS519/data/new data/test_20_now.csv',
    counts=True, hashtag=True, at=True)

# One vocabulary per token field of the training documents:
# index 0 is the text, index 2 the hashtags, index 4 the @-mentions.
vocab = get_vocabulary([document[0] for document in D_train])
hashtag_vocab = get_vocabulary([document[2] for document in D_train])
at_vocab = get_vocabulary([document[4] for document in D_train])

In [0]:
# Tune the smoothing constant on the validation split and report test
# accuracy, once per featurizer. The three runs are identical apart from the
# featurizer, so they share one loop body.
smoothing_values = [0.001, 0.01, 0.1, 1.0, 10.0]

# (header printed for the run, zero-argument factory for the featurizer).
# Factories are used so that each header is printed before its featurizer
# (and, for TF-IDF, the IDF table) is built.
featurizer_factories = [
    ("BBoW", BBoWFeaturizer),
    ("CBoW", CBoWFeaturizer),
    ("TFIDF", lambda: TFIDFFeaturizer(
        compute_idf(D_train, vocab.union(hashtag_vocab).union(at_vocab)))),
]

for featurizer_name, make_featurizer in featurizer_factories:
    print("%s:" % featurizer_name)
    featurizer = make_featurizer()
    X_train = convert_to_features(D_train, featurizer, vocab, hashtag_vocab, at_vocab)
    best_k = 0
    best_accuracy = 0
    best_p_y = {}
    best_p_v_y = {}
    for k in smoothing_values:
        # NOTE(review): only `vocab` is passed here, so features that come
        # from hashtag_vocab / at_vocab get no p(v|y) entry - confirm this
        # is intended.
        p_y, p_v_y = train_naive_bayes(X_train, y_train, k, vocab)
        predictions, confidences = predict_naive_bayes(D_valid, p_y, p_v_y)
        accuracy = sum(
            1 for predicted, gold in zip(predictions, y_valid) if predicted == gold
        ) / len(y_valid)
        print("k: %g, accuracy: %.4f" %(k, accuracy))
        # Strict ">" keeps the smallest k among equally accurate models.
        if accuracy > best_accuracy:
            best_k = k
            best_accuracy = accuracy
            best_p_y = p_y
            best_p_v_y = p_v_y
    # Score the held-out test split with the best validation model.
    predictions, confidences = predict_naive_bayes(D_test, best_p_y, best_p_v_y)
    accuracy = sum(
        1 for predicted, gold in zip(predictions, y_test) if predicted == gold
    ) / len(y_test)
    average_confidence = sum(confidences) / len(confidences)
    print("best k: %g, test accuracy: %.4f, average confidence: %.4f" %(best_k, accuracy, average_confidence))

BBoW:
k: 0.001, accuracy: 0.3205
k: 0.01, accuracy: 0.3207
k: 0.1, accuracy: 0.3214
k: 1, accuracy: 0.3349
k: 10, accuracy: 0.2936
best k: 1, test accuracy: 0.3499, average confidence: 0.6686
CBoW:
k: 0.001, accuracy: 0.3200
k: 0.01, accuracy: 0.3205
k: 0.1, accuracy: 0.3223
k: 1, accuracy: 0.3336
k: 10, accuracy: 0.2946
best k: 1, test accuracy: 0.3497, average confidence: 0.6671
TFIDF:
k: 0.001, accuracy: 0.3150
k: 0.01, accuracy: 0.3164
k: 0.1, accuracy: 0.3191
k: 1, accuracy: 0.3237
k: 10, accuracy: 0.3313
best k: 10, test accuracy: 0.3551, average confidence: 0.7332


In [0]:
# SECOND EXPERIMENT
# Text only: counts, hashtags and @-mentions keep their disabled defaults.
D_train, y_train = load_dataset('/content/gdrive/Shared drives/CIS519/data/new data/train_20_now.csv')
D_valid, y_valid = load_dataset('/content/gdrive/Shared drives/CIS519/data/new data/validation_20_now.csv')

# Unlabelled test set: one document per line. Each document is wrapped in a
# one-element list because predict_naive_bayes flattens one level of nesting.
# The context manager closes the file once it has been read.
D_test = []
with open('/content/gdrive/Shared drives/CIS519/human_test.txt') as f:
    for line in f:
        D_test.append([line.split()])

vocab = get_vocabulary([doc[0] for doc in D_train])
hashtag_vocab = get_vocabulary([doc[2] for doc in D_train])
at_vocab = get_vocabulary([doc[4] for doc in D_train])

In [0]:
# Tune the smoothing constant on the validation split, then print the labels
# predicted for the unlabelled test documents, once per featurizer. The three
# runs are identical apart from the featurizer, so they share one loop body.
smoothing_values = [0.001, 0.01, 0.1, 1.0, 10.0]

# (header printed for the run, zero-argument factory for the featurizer).
# Factories are used so that each header is printed before its featurizer
# (and, for TF-IDF, the IDF table) is built.
featurizer_factories = [
    ("BBoW", BBoWFeaturizer),
    ("CBoW", CBoWFeaturizer),
    ("TFIDF", lambda: TFIDFFeaturizer(
        compute_idf(D_train, vocab.union(hashtag_vocab).union(at_vocab)))),
]

for featurizer_name, make_featurizer in featurizer_factories:
    print("%s:" % featurizer_name)
    featurizer = make_featurizer()
    X_train = convert_to_features(D_train, featurizer, vocab, hashtag_vocab, at_vocab)
    best_k = 0
    best_accuracy = 0
    best_p_y = {}
    best_p_v_y = {}
    for k in smoothing_values:
        p_y, p_v_y = train_naive_bayes(X_train, y_train, k, vocab)
        predictions, confidences = predict_naive_bayes(D_valid, p_y, p_v_y)
        accuracy = sum(
            1 for predicted, gold in zip(predictions, y_valid) if predicted == gold
        ) / len(y_valid)
        print("k: %g, accuracy: %.4f" %(k, accuracy))
        # Strict ">" keeps the smallest k among equally accurate models.
        if accuracy > best_accuracy:
            best_k = k
            best_accuracy = accuracy
            best_p_y = p_y
            best_p_v_y = p_v_y
    # The test documents have no labels, so only the predictions are shown.
    predictions, confidences = predict_naive_bayes(D_test, best_p_y, best_p_v_y)
    print(predictions)

BBoW:
k: 0.001, accuracy: 0.3338
k: 0.01, accuracy: 0.3377
k: 0.1, accuracy: 0.3414
k: 1, accuracy: 0.3482
k: 10, accuracy: 0.2950
[1, 1, 1, 2, 1, 3, 1, 1, 1, 3, 3, 1, 3, 1, 3, 3, 2, 1, 1, 1, 3, 3, 3, 2, 3, 2, 3, 1, 3, 1, 3, 1, 3, 1, 1, 3, 1, 2, 3, 2, 1, 2, 3, 1, 1, 1, 1, 1, 1, 3, 3, 3, 3, 2, 3, 3, 2, 1, 3, 1, 1, 1, 3, 1, 3, 3, 3, 3, 3, 3, 1, 1, 3, 3, 3, 3, 3, 1, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 1, 3, 3, 3, 1, 3, 1, 1, 3, 3, 9, 2, 3, 4, 3, 3, 1, 3, 1, 3, 1, 3, 1, 3, 7, 2, 3, 1, 1, 3, 3, 1, 1, 1, 3, 1, 1, 2, 1, 3, 3, 19, 3, 1, 3, 3, 3, 1, 3, 1, 4, 2, 3, 3, 3, 2, 3, 1, 1, 1, 4, 4, 3, 3, 1, 3, 3, 1, 3, 3, 1, 1, 2, 3, 6, 1, 3, 3, 3, 1, 1, 1, 3, 3, 3, 3, 3, 3, 1, 1, 2, 3, 1, 3, 4, 1, 1, 2, 3, 1]
CBoW:
k: 0.001, accuracy: 0.3324
k: 0.01, accuracy: 0.3361
k: 0.1, accuracy: 0.3395
k: 1, accuracy: 0.3503
k: 10, accuracy: 0.2950
[1, 1, 1, 2, 1, 3, 1, 1, 1, 3, 3, 1, 3, 1, 3, 3, 2, 1, 1, 1, 3, 3, 3, 2, 3, 2, 3, 1, 3, 1, 3, 1, 3, 1, 1, 3, 1, 2, 3, 2, 1, 2, 3, 1, 1, 