In [None]:
#Dawn Schmidt
#Machine Learning 2
#Assignment 3
#August 10, 2024

In [65]:
#import libraries
import json
import string
import numpy as np
import warnings
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

#filter specific warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="transformers.tokenization_utils_base")

# Load the data set

In [129]:
#load the data file and remove <start> and <end> tokens; replace with ''
data=json.load(open('data_assignment3.json'))
with open('data_assignment3.json') as f:
    data = json.load(f)
cleaned_data = [sentence.replace('<start>', '').replace('<end>', '').strip() for sentence in data]


In [131]:
#show top 5 rows of data of cleaned data
cleaned_data[0:5]

['A man rows a boat against a night sky .',
 'Two dogs running through a field .',
 'A small boy wearing a red helmet rides his bicycle down a patterned path .',
 'A black dog standing in a shallow area of water on a rocky beach .',
 'Two white dogs are running together .']

# Data Cleaning

In [133]:
#initialize the stemmer
stemmer = PorterStemmer()

#function for cleaning text
def clean_text(text):
    #normalize case to lower case
    text = text.lower()
    #remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    #tokenization
    tokens = word_tokenize(text)
    #remove stopwords
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

#apply cleaning to all sentences
cleaned_data = [clean_text(sentence) for sentence in cleaned_data]

#print the first 5 cleaned sentences
print(cleaned_data[0:5])


['man rows boat night sky', 'two dogs running field', 'small boy wearing red helmet rides bicycle patterned path', 'black dog standing shallow area water rocky beach', 'two white dogs running together']


# Sentence Embeddings

In [74]:
#load a pre-trained model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

#generate embeddings for each sentence
embeddings = model.encode(cleaned_data)

#print the shape of embeddings to verify
print(embeddings.shape)


(500, 384)


# Group Sentences by Similarity

In [98]:
#calculate cosine similarity between all pairs
similarity_matrix = cosine_similarity(embeddings)

#create groups ensuring each group has exactly 5 sentences
used_sentences = set()
groups = []

for i in range(len(cleaned_data)):
    if i in used_sentences:
        continue
    
    #get the indices of the most similar sentences
    similar_indices = np.argsort(-similarity_matrix[i])[1:6]
    
    #ensure we don't use already used sentences and that we form a group of 5
    group = [i]
    for idx in similar_indices:
        if idx not in used_sentences and len(group) < 5:
            group.append(idx)
    
    if len(group) == 5:
        groups.append(group)
        used_sentences.update(group)
        
print('Grouping complete')


Grouping complete


# Export results .txt File

In [100]:
#write the grouped sentences to a text file
with open('grouped_sentences_output.txt', 'w') as f:
    for group in groups:
        for idx in group:
            f.write(cleaned_data[idx] + '\n')
        f.write('---\n')

print("Output written to grouped_sentences_output.txt")

Output written to grouped_sentences_output.txt


# Check submission file

In [114]:
# Define the filename directly, since sys.argv doesn't apply in a notebook environment
file_name = 'TEST.txt'

def check_if_my_submission_is_correct(file_name):
    try:
        with open(file_name, 'r') as f:
            count = 0
            line_set = set()
            for line in f.read().splitlines():
                break_line = ''.join(set(line)) == '-'
                if break_line:
                    if count != 5:
                        msg = f'The number of lines after last break line is {count}, but it should be 5'
                        msg += '\n' + 'There needs to be a break line after every 5 entries, and the break line should consist only of "-"'
                        print('\033[31m' + '-----------------FAIL-----------------' + '\033[m')
                        print(msg)
                        break
                    count = 0
                else:
                    if line in line_set:
                        msg = f'Duplicate entry: {line}'
                        msg += '\n' + 'There should be no duplicate entry'
                        print('\033[31m' + '-----------------FAIL-----------------' + '\033[m')
                        print(msg)
                        break
                    line_set.add(line)
                    count += 1
            else:
                print('\033[32m' + '-----------------PASS-----------------' + '\033[m')
    except FileNotFoundError:
        print(f"File not found: {file_name}. Please check the file path and try again.")

# Check the submission file
check_if_my_submission_is_correct(file_name)


[32m-----------------PASS-----------------[m


In [None]:
###TEST###

import json
import string
import numpy as np
import warnings
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Suppress specific warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="transformers.tokenization_utils_base")

### Section 1: Load and Clean the Data ###

# Load the data file
with open('data_assignment3.json') as f:
    data = json.load(f)

# Remove <start> and <end> tokens from original data
original_sentences = [sentence.replace('<start>', '').replace('<end>', '').strip() for sentence in data]

# Remove exact duplicates
unique_original_sentences = list(set(original_sentences))

# Initialize the stemmer
stemmer = PorterStemmer()

# Function to clean text
def clean_text(text):
    # Normalize case to lower case
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

# Create a mapping of cleaned sentences to original sentences
cleaned_to_original = {clean_text(sentence): sentence for sentence in unique_original_sentences}

# Create a list of cleaned sentences
cleaned_data = list(cleaned_to_original.keys())

# Optional: Check the first 5 cleaned sentences
print("Sample of cleaned data:", cleaned_data[:5])

### Section 2: Generate Sentence Embeddings ###

# Load a pre-trained SentenceTransformer model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Generate embeddings for each cleaned sentence
embeddings = model.encode(cleaned_data)

# Optional: Print the shape of embeddings to verify
print("Shape of embeddings:", embeddings.shape)

### Section 3: Group Sentences by Similarity, Avoiding Duplicates ###

# Calculate cosine similarity between all pairs of embeddings
similarity_matrix = cosine_similarity(embeddings)

# Create groups ensuring each group has exactly 5 sentences without duplicates
used_sentences = set()
groups = []

for i in range(len(cleaned_data)):
    if i in used_sentences:
        continue
    
    # Get the indices of the most similar sentences
    similar_indices = np.argsort(-similarity_matrix[i])[1:]
    
    # Form a group of 5 sentences, avoiding duplicates
    group = [i]
    for idx in similar_indices:
        if idx not in used_sentences and len(group) < 5:
            group.append(idx)
    
    if len(group) == 5:
        groups.append(group)
        used_sentences.update(group)

### Section 4: Print Top 10 Lines Before Exporting ###

# Prepare the first 10 lines for display
output_lines = []
for group in groups[:2]:  # 2 groups of 5 sentences = 10 lines
    for idx in group:
        output_lines.append(cleaned_to_original[cleaned_data[idx]])
    output_lines.append('---')

# Print the top 10 lines
print("\n".join(output_lines))

### Section 5: Export the Groups to a TXT File ###

# Write the grouped original sentences to a text file
with open('grouped_sentences_output.txt', 'w') as f:
    for group in groups:
        for idx in group:
            f.write(cleaned_to_original[cleaned_data[idx]] + '\n')
        f.write('---\n')

print("Output written to grouped_sentences_output.txt")
