In [1]:
# Binary sentiment analysis
# Input: a tweet given as a single string.
# Output: 0 for negative sentiment or 1 for positive sentiment.

def predict_from_scratch(tweet, train_path=r"D:\SLU\AI MSc\Fall 22\NLP\train-v2.tsv"):
    """Classify a tweet as positive (1) or negative (0) with Naive Bayes.

    The classifier is trained from scratch on every call: the training file
    is read, per-class token counts are collected, and the tweet is scored
    with add-one (Laplace) smoothed likelihoods. Each distinct word of the
    tweet contributes once, however often it is repeated.

    Parameters
    ----------
    tweet : str
        The text to classify. It is tokenized by splitting on whitespace.
    train_path : str, optional
        Path to a headerless, tab-separated training file whose first column
        is the label (0 = negative, 1 = positive) and whose second column is
        the document text. Defaults to the location the notebook was
        originally written against.

    Returns
    -------
    int
        1 if the positive class scores strictly higher, otherwise 0. Ties
        (for example an empty tweet with equal priors) resolve to 0, so the
        function always returns a label.

    Notes
    -----
    * Words that are not in the training vocabulary, and the placeholder
      tokens ``{URL}`` / ``@USER``, are skipped when scoring.
    * Scores are accumulated as log-probabilities, because a product of many
      small likelihoods underflows to 0.0 for long tweets.
    * Each class's token total counts every whitespace token found in that
      class's documents; the vocabulary size excludes the placeholder tokens.
    """
    import csv
    import math
    from collections import Counter

    import pandas as pd

    # Placeholder tokens that carry no sentiment. The set can be extended later.
    ignored_tokens = {'{URL}', '@USER'}

    # QUOTE_NONE keeps quote characters inside tweets as ordinary text.
    df = pd.read_csv(train_path, sep="\t", header=None, quoting=csv.QUOTE_NONE)
    df = df.rename(columns={0: "Label", 1: "Document"})

    classes = (0, 1)
    token_counts = {label: Counter() for label in classes}
    doc_counts = {label: 0 for label in classes}
    vocabulary = set()

    for label, doc in zip(df["Label"], df["Document"]):
        # A missing document is read as NaN (a float); treat it as empty.
        tokens = doc.split() if isinstance(doc, str) else []
        # The vocabulary is built from every row, whatever its label.
        vocabulary.update(tokens)
        if label in token_counts:
            token_counts[label].update(tokens)
            doc_counts[label] += 1

    vocabulary -= ignored_tokens
    vocab_size = len(vocabulary)
    total_docs = len(df)
    token_totals = {label: sum(token_counts[label].values()) for label in classes}

    # Start each score from the log prior; a class with no training documents
    # has probability zero, i.e. a log score of -inf.
    log_scores = {}
    for label in classes:
        prior = doc_counts[label] / total_docs
        log_scores[label] = math.log(prior) if prior > 0 else float("-inf")

    for word in set(tweet.split()):
        # Skip words the model knows nothing about; otherwise they would tilt
        # the result toward whichever class has fewer tokens.
        if word not in vocabulary:
            continue
        for label in classes:
            likelihood = (token_counts[label][word] + 1) / (token_totals[label] + vocab_size)
            log_scores[label] += math.log(likelihood)

    return 1 if log_scores[1] > log_scores[0] else 0


In [2]:
# Smoke test: classify one sample tweet; the cell output below is the returned label
# (per the function's header comment, 1 = positive and 0 = negative).
predict_from_scratch('@USER Ac i tithau')

1