Modules needed

In [2]:
import logging
import gensim
from gensim.models import FastText
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

### Vectorize with FastText

In [3]:
# Emit gensim's progress messages (vocabulary scan, per-epoch training) to the cell output.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Load the unlabeled Korean corpus: one sentence per line, tokens separated by whitespace.
# The file is streamed line by line instead of being materialised with readlines(),
# so the raw text is never held in memory alongside the tokenised copy.
# Blank lines are dropped so that empty sentences are not counted as training examples.
with open("data/processed/processed_wiki_ko.txt", "r", encoding="utf-8") as f:
    korean_corpus = [tokens for tokens in (line.split() for line in f) if tokens]

# Train a FastText model on the unlabeled Korean corpus.
# NOTE(review): no `seed` is set and workers=4, so repeated runs will presumably
# not produce identical vectors — confirm whether reproducibility matters here.
fasttext_model = FastText(sentences=korean_corpus, vector_size=100, window=5, min_count=5, workers=4, epochs=10)

# Save the trained FastText model so later cells can reload it without retraining
fasttext_model.save("trained_fasttext_model")

2023-05-17 23:21:01,419 : INFO : collecting all words and their counts
2023-05-17 23:21:01,420 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2023-05-17 23:21:04,262 : INFO : PROGRESS: at sentence #10000, processed 7211557 words, keeping 1397320 word types
2023-05-17 23:21:06,405 : INFO : PROGRESS: at sentence #20000, processed 12865814 words, keeping 2275034 word types
2023-05-17 23:21:08,478 : INFO : PROGRESS: at sentence #30000, processed 17970224 words, keeping 2933833 word types
2023-05-17 23:21:10,113 : INFO : PROGRESS: at sentence #40000, processed 22567265 words, keeping 3463284 word types
2023-05-17 23:21:11,639 : INFO : PROGRESS: at sentence #50000, processed 26577175 words, keeping 3890028 word types
2023-05-17 23:21:13,408 : INFO : PROGRESS: at sentence #60000, processed 30535457 words, keeping 4311796 word types
2023-05-17 23:21:15,147 : INFO : PROGRESS: at sentence #70000, processed 34518022 words, keeping 4744297 word types
2023-05-17 23:21:16

### Find cosine similarity ranking with songs

In [3]:
# Load the saved FastText model from the path written by the training cell
# ("trained_fasttext_model"), so the ranking code below works from the
# persisted model rather than the in-memory `fasttext_model` object.
# NOTE(review): gensim's load is presumably pickle-based — only load model
# files from a trusted source; confirm against the gensim documentation.
loaded_fasttext_model = FastText.load("trained_fasttext_model")

# Function to compute average FastText vector for a text
def text_to_avg_vector(text, model):
    """Return the mean word vector of `text`, or None if no vector can be built.

    The text is split on whitespace and every token found in the model's
    vocabulary (`model.wv.key_to_index`) contributes its vector to the average.

    Args:
        text: The text to embed. Non-string values (e.g. the float NaN that
            pandas produces for an empty CSV cell, or None) are treated as
            "no text" and yield None instead of raising.
        model: An object exposing `wv.key_to_index` (membership test) and
            `wv[word]` (vector lookup), such as a trained gensim FastText model.

    Returns:
        The element-wise average of the matched word vectors, or None when
        `text` is not a string or none of its tokens are in the vocabulary.
    """
    # Missing values read from a DataFrame are not strings; there is nothing to split.
    if not isinstance(text, str):
        return None

    # NOTE(review): only in-vocabulary tokens are used. FastText can presumably
    # also build vectors for out-of-vocabulary tokens from character n-grams —
    # confirm whether skipping them here is intentional.
    vocabulary = model.wv.key_to_index
    word_vectors = [model.wv[word] for word in text.split() if word in vocabulary]

    if not word_vectors:
        return None

    return sum(word_vectors) / len(word_vectors)

def _ordinal_suffix(rank):
    """Return the English ordinal suffix ("st", "nd", "rd" or "th") for a positive integer rank."""
    # 11th, 12th and 13th (and 111th, ...) are irregular and always take "th".
    if 11 <= rank % 100 <= 13:
        return "th"
    return {1: "st", 2: "nd", 3: "rd"}.get(rank % 10, "th")


# Load lyrics from the CSV file (columns used below: 'lyrics', 'title', 'artist')
lyrics_df = pd.read_csv('data/lyrics.csv')

# Compute average FastText vectors for lyrics and store them with song identifier
song_vectors = {}
for index, row in lyrics_df.iterrows():
    lyrics = row['lyrics']
    # Empty CSV cells are read as NaN (a float); such rows have no lyrics to embed.
    if not isinstance(lyrics, str):
        continue
    lyrics_vector = text_to_avg_vector(lyrics, loaded_fasttext_model)
    if lyrics_vector is not None:
        # An f-string keeps this working when pandas parses a title or artist
        # as a number (or NaN), where `+` concatenation would raise TypeError.
        song_identifier = f"{row['title']} - {row['artist']}"
        song_vectors[song_identifier] = lyrics_vector

# Read the diary entry
with open("data/diary.txt", "r", encoding="utf-8") as f:
    diary_entry = f.read().strip()

# Compute the average FastText vector for the diary entry
diary_vector = text_to_avg_vector(diary_entry, loaded_fasttext_model)
if diary_vector is None:
    # Without this guard the failure surfaces later as an opaque
    # "'NoneType' object has no attribute 'reshape'".
    raise ValueError(
        "data/diary.txt contains no words known to the FastText model; "
        "cannot compute similarities."
    )

# Compute cosine similarity between diary entry and song lyrics.
# cosine_similarity expects 2-D inputs, hence the reshape to a single row each.
diary_row = diary_vector.reshape(1, -1)
similarities = {}
for song_identifier, lyrics_vector in song_vectors.items():
    similarity = cosine_similarity(diary_row, lyrics_vector.reshape(1, -1))
    similarities[song_identifier] = similarity[0][0]

# Rank songs based on similarity and recommend top N songs
N = 10
recommended_songs = sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:N]

# Format the output; the header follows N instead of hard-coding "10".
print(f"Top {N} similar songs:\n")
print("Rank\tSong Name - Artist\tSimilarity")
for i, (song_identifier, similarity) in enumerate(recommended_songs, start=1):
    similarity_percentage = round(similarity * 100)
    ordinal = _ordinal_suffix(i)

    print(f"{i}{ordinal}\t{song_identifier}\t{similarity_percentage}% similar")