# CS 585 - HW 3 - EDUARDO GALEOTE - A20552496

In [1]:
#NLTK setup - uncomment and run first time you import NLTK
import nltk
nltk.download('punkt')

import pandas as pd
from nltk.tokenize import word_tokenize
from csv import QUOTE_NONE

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/edugaleote/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
df_sst = pd.read_csv("train.tsv",delimiter="\t")
df_sst.head(3)

Unnamed: 0,sentence,label
0,hide new secretions from the parental units,0
1,"contains no wit , only labored gags",0
2,that loves its characters and communicates som...,1


#### PROBLEM 1 – Reading the data

In [3]:
# Splitting the data
val_df = df_sst.iloc[:100]
test_df = df_sst.iloc[100:200]
train_df = df_sst.iloc[200:]

# If you want to check the sizes:
print("Validation size:", len(val_df))
print("Test size:", len(test_df))
print("Training size:", len(train_df))


Validation size: 100
Test size: 100
Training size: 67149


In [4]:
# Calculating the number of occurrences of each label in the training set
num_positives = train_df['label'].sum()
num_negatives = len(train_df) - num_positives

# Calculating prior probabilities
prior_positive = num_positives / len(train_df)
prior_negative = num_negatives / len(train_df)

print("Prior probability for positive sentiment:", prior_positive)
print("Prior probability for negative sentiment:", prior_negative)


Prior probability for positive sentiment: 0.5579681007907787
Prior probability for negative sentiment: 0.44203189920922126


#### PROBLEM 2 – Tokenizing data

In [5]:
def tokenize_sentence(sentence):
    # Tokenize the sentence by splitting based on whitespace
    tokens = sentence.split()
    
    # Add start and end symbols to the tokenized sequence
    return ['<s>'] + tokens + ['</s>']

# Test the function
sentence = "hello class"
print(tokenize_sentence(sentence))


['<s>', 'hello', 'class', '</s>']


In [6]:
# Applying the function to the 'sentence' column
train_df['tokenized'] = train_df['sentence'].apply(tokenize_sentence)

# Displaying the tokenization of the first sentence
print(train_df['tokenized'].iloc[0])

['<s>', 'told', 'in', 'scattered', 'fashion', '</s>']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['tokenized'] = train_df['sentence'].apply(tokenize_sentence)


In [7]:
# Create a set to store the unique tokens
vocab = set()

# Iterate over each tokenized sentence and update the vocabulary set
for tokens in train_df['tokenized']:
    vocab.update(tokens)

# Determine the size of the vocabulary
vocab_size = len(vocab)

print("Vocabulary size (including <s> and </s> symbols):", vocab_size)


Vocabulary size (including <s> and </s> symbols): 14813


#### PROBLEM 3 – Bigram counts

In [8]:
def count_bigrams(tokenized_sequences):
    # Initialize an empty dictionary to store bigram counts
    bigram_counts = {}

    # Iterate over each tokenized sentence
    for tokens in tokenized_sequences:
        for i in range(len(tokens) - 1):  # -1 because we're looking at pairs of tokens
            # Get the current token and the next token
            wi, wj = tokens[i], tokens[i + 1]
            
            # If wi is not in the dictionary, add it with an empty dictionary as its value
            if wi not in bigram_counts:
                bigram_counts[wi] = {}
            
            # If wj is not in the wi dictionary, add it with count 0
            if wj not in bigram_counts[wi]:
                bigram_counts[wi][wj] = 0

            # Increment the count of the bigram (wi, wj)
            bigram_counts[wi][wj] += 1
            
    return bigram_counts


In [9]:
bigram_counts = count_bigrams(train_df['tokenized'].tolist())
# Now, you can retrieve the count of any bigram using bigram_counts[wi][wj]
print(bigram_counts["academy"]["award"])

15


In [10]:
count_start_the = bigram_counts.get('<s>', {}).get('the', 0)
print(f"The number of times a sentence starts with 'the' is: {count_start_the}")

The number of times a sentence starts with 'the' is: 4450


#### PROBLEM 4 – Smoothing

In [11]:
import math

def smoothed_log_probability(wm, wm_1, bigram_counts, alpha, vocab_size):
    # Count of all bigrams starting with wm_1
    wm_1_count = sum(bigram_counts.get(wm_1, {}).values())
    # Get the bigram count, if it doesn't exist in the dictionary, default to 0
    bigram_count = bigram_counts.get(wm_1, {}).get(wm, 0)
    # Implementing the formula for smoothed probability
    p_smooth = (bigram_count + alpha) / (wm_1_count + vocab_size * alpha)

    # Returning the negative log-probability
    return math.log(p_smooth)

# Calculating and printing the log probabilities for the given alphas
for alpha in [0.001, 0.5]:
    log_prob = smoothed_log_probability("award", "academy", bigram_counts, alpha, vocab_size)
    print(f"Log probability for alpha={alpha}: {log_prob}")


Log probability for alpha=0.001: -1.0250904304166832
Log probability for alpha=0.5: -6.172912066128204


#### PROBLEM 5 – Sentence log-probability

In [12]:
def sentence_log_probability(sentence, bigram_counts, alpha, vocab_size):
    # Tokenize the sentence and add start and end symbols
    tokens = ['<s>'] + sentence.split() + ['</s>']
    
    # Compute the sum of the log probabilities for each bigram in the sentence
    log_prob = 0
    for i in range(1, len(tokens)):
        # Extract the current word and the previous word
        wm, wm_1 = tokens[i], tokens[i-1]
        
        # Use the smoothed_log_probability to get the log-probability for each bigram
        log_prob += smoothed_log_probability(wm, wm_1, bigram_counts, alpha, vocab_size)
    
    return log_prob

# Sentences
sentences = [
    "this was a really great movie but it was a little too long.",
    "long too little a was it but movie great really a was this."
]

# Compute the log-probability for each sentence using the given alpha value
alpha = 0.001
for s in sentences:
    log_prob = sentence_log_probability(s, bigram_counts, alpha, vocab_size)
    print(f"Log probability of the sentence '{s}': {log_prob}")



Log probability of the sentence 'this was a really great movie but it was a little too long.': -85.38817424066075
Log probability of the sentence 'long too little a was it but movie great really a was this.': -163.18320211328515


#### PROBLEM 6 – Tuning Alpha

In [13]:
def validation_log_likelihood(alpha, validation_data, bigram_counts, vocab_size):
    # Calculate the sum of the log probabilities for all sentences in the validation set
    total_log_prob = sum(sentence_log_probability(sentence, bigram_counts, alpha, vocab_size) for sentence in validation_data)
    return total_log_prob

# Given validation data
validation_data = val_df

# Different alpha values
alphas = [0.001, 0.01, 0.1]

# Calculate and print the log-likelihood estimates for each alpha
log_likelihoods = {}
for alpha in alphas:
    log_likelihood = validation_log_likelihood(alpha, validation_data, bigram_counts, vocab_size)
    log_likelihoods[alpha] = log_likelihood
    print(f"Log-likelihood estimate for alpha {alpha}: {log_likelihood}")

# Select the alpha with the highest log-likelihood (least negative value)
selected_alpha = max(log_likelihoods, key=log_likelihoods.get)

print(f"The selected alpha is: {selected_alpha}")


Log-likelihood estimate for alpha 0.001: -47.23596387105087
Log-likelihood estimate for alpha 0.01: -44.09395975180249
Log-likelihood estimate for alpha 0.1: -41.70366084780157
The selected alpha is: 0.1


#### PROBLEM 7 – Applying Language Models

In [14]:
positive_sentences=pd.DataFrame()
negative_sentences=pd.DataFrame()

positive_sentences['sentence'] = df_sst[df_sst['label'] == 1]['sentence'].tolist()
negative_sentences['sentence'] = df_sst[df_sst['label'] == 0]['sentence'].tolist()

positive_sentences['tokenized'] = positive_sentences['sentence'].apply(tokenize_sentence)
negative_sentences['tokenized'] = negative_sentences['sentence'].apply(tokenize_sentence)

positive_bigram_counts = count_bigrams(positive_sentences['tokenized'].tolist())
negative_bigram_counts = count_bigrams(negative_sentences['tokenized'].tolist())

posvocab = set()

for tokens in positive_sentences['tokenized']:
    posvocab.update(tokens)

posvocab_size = len(posvocab)

negvocab = set()

for tokens in negative_sentences['tokenized']:
    negvocab.update(tokens)

negvocab_size = len(negvocab)

predicted_labels = []

for _, row in test_df.iterrows():
    sentence = row['sentence']
    
    positive_score = sentence_log_probability(sentence, positive_bigram_counts, selected_alpha, posvocab_size) + prior_positive
    negative_score = sentence_log_probability(sentence, negative_bigram_counts, selected_alpha, negvocab_size) + prior_negative
    
    if positive_score > negative_score:
        predicted_labels.append(1)
    else:
        predicted_labels.append(0)

test_df['predicted_label'] = predicted_labels

# Count the number of positive and negative predicted labels
num_predicted_positives = test_df[test_df['predicted_label'] == 1].shape[0]
num_predicted_negatives = test_df[test_df['predicted_label'] == 0].shape[0]

print(f"Number of Predicted Positives: {num_predicted_positives}")
print(f"Number of Predicted Negatives: {num_predicted_negatives}")

# Count the number of correctly predicted labels
correctly_predicted_positives = test_df[(test_df['label'] == 1) & (test_df['predicted_label'] == 1)].shape[0]
correctly_predicted_negatives = test_df[(test_df['label'] == 0) & (test_df['predicted_label'] == 0)].shape[0]

print(f"Number of Correctly Predicted Positives: {correctly_predicted_positives}")
print(f"Number of Correctly Predicted Negatives: {correctly_predicted_negatives}")

# Count the number of incorrectly predicted labels
incorrectly_predicted_positives = num_predicted_positives - correctly_predicted_positives
incorrectly_predicted_negatives = num_predicted_negatives - correctly_predicted_negatives

print(f"Number of Incorrectly Predicted Positives: {incorrectly_predicted_positives}")
print(f"Number of Incorrectly Predicted Negatives: {incorrectly_predicted_negatives}")

correct_predictions = (test_df['label'] == test_df['predicted_label']).sum()
total_predictions = len(test_df)
accuracy = correct_predictions / total_predictions * 100

print(f"Accuracy: {accuracy:.2f}%")



Number of Predicted Positives: 50
Number of Predicted Negatives: 50
Number of Correctly Predicted Positives: 50
Number of Correctly Predicted Negatives: 47
Number of Incorrectly Predicted Positives: 0
Number of Incorrectly Predicted Negatives: 3
Accuracy: 97.00%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['predicted_label'] = predicted_labels
