<a href="https://colab.research.google.com/github/chuma9/cs229-project/blob/jzqin/NB_logistic_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import collections

import numpy as np
import csv

In [13]:
# if using Google Colab
# Mount Google Drive at /gdrive; the dataset-loading cell further down in this
# notebook reads its CSV from under that mount point.
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [0]:
def load_2013_dataset(csv_path):
    """Load the 2013 Climate Change Tweets dataset from a CSV file

    The file is expected to have one header row followed by rows of exactly
    three columns: tweet text, label string, label confidence.

    Args:
         csv_path: Path to CSV file containing dataset.

    Returns:
        messages: Numpy array of strings containing the text of each tweet.
        labels: Numpy array of integer labels (0, 1, or 2) for each message. A 0 indicates
            the tweet does not believe climate change is human-caused. A 1 indicates the tweet
            believes climate change is human-caused. A 2 indicates the tweet is N/A for the task.
        labelsConf: Numpy array of floats giving the confidence value attached to each label.

    Raises:
        ValueError: If a row carries a label string that is not one of
            "N/A", "NA", "N", "No", "Yes", "Y", or if the confidence column
            cannot be parsed as a float.
    """

    # determined a priori which types of labels are in the dataset
    labelCodes = {
        'Y': 1, 'Yes': 1,
        'N': 0, 'No': 0,
        'N/A': 2, 'NA': 2,
    }

    messages = []
    labels = []
    labelsConf = []

    with open(csv_path, 'r', newline='', encoding='utf8') as csv_file:
        reader = csv.reader(csv_file, delimiter=',')

        # skip header line; the default keeps an empty file from raising StopIteration
        next(reader, None)

        for message, label, labelConf in reader:
            if label not in labelCodes:
                raise ValueError('Unknown labels in dataset! Please check that labels are "N/A", "NA", "N", "No", "Yes", "Y".')

            messages.append(message)
            labels.append(labelCodes[label])
            # np.float was only an alias for the builtin float and has been
            # removed from NumPy, so use the builtin directly.
            labelsConf.append(float(labelConf))

    return np.array(messages), np.array(labels), np.array(labelsConf)

def split_dataset(messages, labels, testFrac = 0.2):
    """Randomly partition a dataset into a training part and a test part

    Args:
        messages: Numpy array of messages in dataset
        labels: Numpy array of labels for each message in dataset
        testFrac: fraction of the dataset to place in the test set; the test set
            size is int(testFrac * number of messages), i.e. rounded down

    Returns:
        trainMessages: Numpy array of the messages kept for training, in their original order.
        trainLabels: Labels matching trainMessages, element for element.
        testMessages: Numpy array of the messages drawn for the test set, in the order drawn.
        testLabels: Labels matching testMessages, element for element.

    """
    total = len(messages)
    testCount = int(testFrac*total)

    # draw distinct positions for the test set from numpy's global random state
    testInd = np.random.choice(total, testCount, replace=False)

    # every position that was not drawn stays in the training set
    isTrain = np.ones(total, dtype=bool)
    isTrain[testInd] = False

    return(messages[isTrain], labels[isTrain], messages[testInd], labels[testInd])
    
    
def filter_data(messages, labels):
    """Keep only the examples whose label is 0 or 1

    Examples with any other label (e.g. 2, the N/A class) are not usable for
    determining climate change sentiment and are dropped.

    Args:
        messages: Numpy array of messages in dataset
        labels: Numpy array of labels for each message in dataset

    Returns:
        filteredMessages: Numpy array of the messages whose label is 0 or 1
        filteredLabels: Numpy array of the corresponding labels

    """

    isNegative = labels == 0
    isPositive = labels == 1
    keep = np.logical_or(isNegative, isPositive)

    return(messages[keep], labels[keep])

In [0]:
# Load every tweet, its integer label and its label confidence from the CSV on
# the mounted Google Drive.
messages, labels, labelsConf = load_2013_dataset('/gdrive/My Drive/twitter_2013.csv')
# Drop tweets whose label is not 0 or 1 (i.e. the N/A class, label 2).
filteredMessages, filteredLabels = filter_data(messages, labels)
# Hold out 20% of the remaining tweets as the test set. The split is random and
# no seed is set in the code shown here, so it differs from run to run.
trainMessages, trainLabels, testMessages, testLabels = split_dataset(filteredMessages, filteredLabels, 0.2)

# Alternative call using a relative local path (kept for reference):
# dataset2013 = load_2013_dataset('../data/twitter_2013.csv')

In [0]:
def get_words(message):
    """Split a message into lowercase words.

    The message is lowercased and then split on single space characters only.
    Consecutive spaces therefore produce empty-string entries, and other
    whitespace (tabs, newlines) is left inside the words.

    Args:
        message: A string containing the text of one message

    Returns:
       The list of normalized words from the message.
    """

    lowered = message.lower()
    return lowered.split(' ')


def create_dictionary(messages, minMessageCount=5):
    """Create a dictionary mapping words to integer indices.

    Each message is tokenized with get_words. A word is included only if it
    occurs in at least minMessageCount different messages (repeats inside one
    message count once).

    Indices are consecutive integers starting at 0, assigned in the order in
    which the kept words are first encountered while scanning the messages, so
    the mapping is the same on every run for the same input.

    Args:
        messages: A list (or array) of strings, one per message
        minMessageCount: Minimum number of messages a word must appear in to be
            kept. Defaults to 5.

    Returns:
        A python dict mapping words to integers.
    """

    # number of messages each word appears in, keyed in first-seen order
    messageCount = collections.Counter()

    for message in messages:
        # dict.fromkeys removes repeats within the message while preserving word
        # order; a set would make the final index order depend on string hashing,
        # which varies between interpreter runs.
        for word in dict.fromkeys(get_words(message)):
            messageCount[word] += 1

    # drop words seen in too few messages, then number the survivors
    frequentWords = (word for word, num in messageCount.items() if num >= minMessageCount)
    return {word: index for index, word in enumerate(frequentWords)}


def transform_text(messages, word_dictionary):
    """Turn messages into a matrix of per-message word counts.

    Entry (i, j) of the result is the number of times the word with index j in
    word_dictionary occurs in message i, using get_words to tokenize. Words
    that are not in word_dictionary are ignored.

    Args:
        messages: A list (or array) of strings, one per message.
        word_dictionary: A python dict mapping words to integers.

    Returns:
        A numpy array of shape (number of messages, number of dictionary words)
        holding the word counts.
    """

    counts = np.zeros((len(messages), len(word_dictionary)))

    for row, message in enumerate(messages):
        for token in get_words(message):
            column = word_dictionary.get(token)
            if column is not None:
                counts[row, column] += 1

    return counts


def fit_naive_bayes_model(matrix, labels):
    """Fit a naive bayes model.

    Estimates the class prior and, for each class, the per-word probabilities
    from word counts, applying Laplace (add-one) smoothing to the word
    probabilities.

    Args:
        matrix: A numpy array containing word counts for the training data
            (one row per message, one column per vocabulary word)
        labels: The binary (0 or 1) labels for that training data

    Returns: The trained model as a tuple (theta_y, theta_k1, theta_k0):
        theta_y is the sum of the labels divided by the number of labels,
        theta_k1 is the array of smoothed word probabilities given label 1,
        theta_k0 is the array of smoothed word probabilities given label 0.
    """

    vocabSize = matrix.shape[1]

    def smoothed_word_probs(classValue):
        # rows belonging to the requested class
        rows = matrix[labels == classValue, :]
        perWordCounts = np.sum(rows, axis=0)
        totalCount = np.sum(rows)
        # MLE with laplace smoothing: add 1 to each word count and the
        # vocabulary size to the total
        return (1 + perWordCounts) / (totalCount + vocabSize)

    # prior probability of the positive class
    theta_y = np.sum(labels) / len(labels)

    theta_k1 = smoothed_word_probs(1)
    theta_k0 = smoothed_word_probs(0)

    return((theta_y, theta_k1, theta_k0))


def predict_from_naive_bayes_model(model, matrix):
    """Use a Naive Bayes model to compute predictions for a target matrix.

    Predicts on the models that fit_naive_bayes_model outputs. A message is
    predicted as 1 when its posterior probability of class 1 is at least 0.5,
    which is the same as its class-1 log-score being at least its class-0
    log-score.

    Args:
        model: A trained model from fit_naive_bayes_model, i.e. the tuple
            (theta_y, theta_k1, theta_k0)
        matrix: A numpy array containing word counts

    Returns: A numpy array of floats (0.0 or 1.0) containing the predictions
        from the model, one per row of matrix
    """
    theta_y, theta_k1, theta_k0 = model

    # unnormalized log-posterior of each class: log prior + sum of count * log word prob
    posLogScore = np.dot(matrix, np.log(theta_k1)) + np.log(theta_y)
    negLogScore = np.dot(matrix, np.log(theta_k0)) + np.log(1 - theta_y)

    # Compare in log space. Exponentiating first underflows to 0 for messages
    # with many words, which turns the normalized probability into 0/0 = NaN
    # and leaves NaN in the predictions.
    predictions = (posLogScore >= negLogScore).astype(float)

    return(predictions)

In [85]:
# Build the vocabulary from the training tweets only, then turn both splits
# into word-count matrices over that same vocabulary.
wordDict = create_dictionary(trainMessages)
trainWordMatrix = transform_text(trainMessages, wordDict)
testWordMatrix = transform_text(testMessages, wordDict)
# Fit Naive Bayes on the training counts and predict labels for the test counts.
NBModel = fit_naive_bayes_model(trainWordMatrix, trainLabels)
preds = predict_from_naive_bayes_model(NBModel, testWordMatrix)
# Test accuracy: fraction of test tweets whose prediction equals the true label.
np.mean(testLabels == preds)

0.8011834319526627