In [None]:
import argparse
import pickle

import numpy as np
import requests
import re
from bs4 import BeautifulSoup
import pandas as pd
import time
import os
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import TreebankWordTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [None]:
# lyrics.com artist overview pages; the cells below download these pages
# and search them for "/lyric/<id>" song links.
url_nelly = 'https://www.lyrics.com/artist/Nelly-Furtado/451535'
url_russ = 'https://www.lyrics.com/artist/Russ-Morgan/7176'

In [None]:
# Download the Nelly Furtado artist page and cache the raw HTML in nelly.txt.
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
response_nelly = requests.get(url_nelly, timeout=30)
if response_nelly.status_code == 200:
    furtado_ly = response_nelly.text
    with open("nelly.txt", "w", encoding="utf-8") as file:
        file.write(furtado_ly)
    print("Nelly Furtado page saved.")
else:
    # Report the failure instead of silently leaving nelly.txt missing or stale.
    print(f"Could not fetch Nelly Furtado page (HTTP {response_nelly.status_code}); nelly.txt was not updated.")

In [None]:
# Download the Russ Morgan artist page and cache the raw HTML in russmorgan.txt.
# The timeout keeps the cell from hanging indefinitely on a stalled connection.
response_russ = requests.get(url_russ, timeout=30)
if response_russ.status_code == 200:
    russmorgan = response_russ.text
    with open("russmorgan.txt", "w", encoding="utf-8") as file:
        file.write(russmorgan)
    print("Russ Morgan page saved.")
else:
    # Report the failure instead of silently leaving russmorgan.txt missing or stale.
    print(f"Could not fetch Russ Morgan page (HTTP {response_russ.status_code}); russmorgan.txt was not updated.")

In [None]:
# Relative song links on an artist page look like "/lyric/<numeric id>".
song_link_pattern = r'/lyric/\d+'

# Both cached pages were written as UTF-8, so both must be read back as UTF-8;
# relying on the platform default encoding can fail or garble text on
# systems whose default is not UTF-8.
with open('nelly.txt', 'r', encoding="utf-8") as my_file:
    meep = my_file.read()
songs = re.findall(song_link_pattern, meep)

with open('russmorgan.txt', 'r', encoding="utf-8") as my_file:
    meep2 = my_file.read()
songs2 = re.findall(song_link_pattern, meep2)

In [None]:
# Turn the relative "/lyric/<id>" paths into absolute song-page URLs,
# one list per artist.
sub_url = 'https://www.lyrics.com'
lyur = [sub_url + relative_path for relative_path in songs]
lyur2 = [sub_url + relative_path for relative_path in songs2]

In [None]:
# Download every Nelly Furtado song page, cache the raw HTML next to the
# notebook, and store the extracted lyrics under "Nelly Furtado/".
# The request header and output directory do not change between songs,
# so they are set up once before the loop.
header = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0'}
os.makedirs('Nelly Furtado', exist_ok=True)  # Create directory if it doesn't exist

lyrics_texts = []
for index, element in enumerate(lyur):
    # Timeout so one stalled request cannot hang the whole scrape.
    response_nelly_lyric = requests.get(element, headers=header, timeout=30)
    furtado_ly_ly = response_nelly_lyric.text

    # Keep a copy of the raw page.
    with open(f'nelly_{index}.txt', "w", encoding="utf-8") as file:
        file.write(furtado_ly_ly)

    song_soup = BeautifulSoup(furtado_ly_ly, 'html.parser')

    title_element = song_soup.find('h1', class_='lyric-title')
    title = title_element.text.strip() if title_element is not None else ''

    # 'Lyrics' stays empty when the page has no lyric body.
    song_record = {'Links': element, 'Title': title, 'Lyrics': ''}
    lyrics_texts.append(song_record)

    lyrics_div = song_soup.find('pre', id='lyric-body-text')

    if lyrics_div:
        # get_text(strip=True) strips every text fragment and joins them with
        # no separator, which glues words together wherever the lyric body
        # contains inline tags. Strip only the outer whitespace instead.
        lyrics = lyrics_div.get_text().strip()
        with open(f'Nelly Furtado/nelly_lyrics_{index}.txt', 'w', encoding='utf-8') as file:
            file.write(lyrics)

        song_record['Lyrics'] = lyrics

    # Pause between requests to avoid hammering the site.
    time.sleep(0.5)

In [None]:
# Download every Russ Morgan song page, cache the raw HTML next to the
# notebook, and store the extracted lyrics under "russ_morgan/".
# The request header and output directory do not change between songs,
# so they are set up once before the loop.
header = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:24.0) Gecko/20100101 Firefox/24.0'}
os.makedirs('russ_morgan', exist_ok=True)  # Create directory if it doesn't exist

lyrics_texts2 = []
for index, element in enumerate(lyur2):
    # Timeout so one stalled request cannot hang the whole scrape.
    response_russ_lyric = requests.get(element, headers=header, timeout=30)
    russ_ly_ly = response_russ_lyric.text

    # Keep a copy of the raw page.
    with open(f'russ_{index}.txt', "w", encoding="utf-8") as file:
        file.write(russ_ly_ly)

    song_soup = BeautifulSoup(russ_ly_ly, 'html.parser')

    title_element = song_soup.find('h1', class_='lyric-title')
    title = title_element.text.strip() if title_element is not None else ''

    # 'Lyrics' stays empty when the page has no lyric body.
    song_record = {'Links': element, 'Title': title, 'Lyrics': ''}
    lyrics_texts2.append(song_record)

    lyrics_div = song_soup.find('pre', id='lyric-body-text')

    if lyrics_div:
        # get_text(strip=True) strips every text fragment and joins them with
        # no separator, which glues words together wherever the lyric body
        # contains inline tags. Strip only the outer whitespace instead.
        lyrics = lyrics_div.get_text().strip()
        with open(f'russ_morgan/russ_lyrics_{index}.txt', 'w', encoding='utf-8') as file:
            file.write(lyrics)

        song_record['Lyrics'] = lyrics

    # Pause between requests to avoid hammering the site.
    time.sleep(0.5)

In [None]:
# Read every saved lyrics file back into memory.
# corpus[i] is the text of one song and labels[i] is the folder (artist)
# it was read from.
lyrics_folders = ["Nelly Furtado", "russ_morgan"]
corpus = []
labels = []

for artist_folder in lyrics_folders:
    # os.listdir returns entries in arbitrary order; sorting keeps the corpus
    # order (and therefore the seeded train/test split) reproducible.
    for filename in sorted(os.listdir(artist_folder)):
        if not filename.endswith(".txt"):
            continue
        file_path = os.path.join(artist_folder, filename)
        with open(file_path, "r", encoding="utf-8") as file:
            lyrics = file.read()
        corpus.append(lyrics)
        labels.append(artist_folder)

In [None]:
# Fetch the NLTK resources used by the lemmatizer and the stopword list.
nltk.download("wordnet")
nltk.download('stopwords')

# Shared text-processing helpers (also used as defaults by tokenize_lemmatize).
tokenizer = TreebankWordTokenizer()
lemmatizer = WordNetLemmatizer()

# Lower-case every document.
corpus = [document.lower() for document in corpus]

# Tokenize each document, lemmatize every token, and re-join with spaces.
CLEAN_corpus = [
    " ".join(lemmatizer.lemmatize(token) for token in tokenizer.tokenize(text=document))
    for document in corpus
]

STOPWORDS = stopwords.words('english')

LABELS = ['Nelly Furtado', 'Russ Morgan']

def tokenize_lemmatize(text, stopwords=STOPWORDS, tokenizer=tokenizer, lemmatizer=lemmatizer):
    """Strip punctuation from ``text``, tokenize it, and lemmatize the tokens.

    Args:
        text: Input string. It is not lower-cased here; callers do that.
        stopwords: Iterable of tokens to drop before lemmatizing.
        tokenizer: Object with a ``tokenize(str) -> list[str]`` method.
        lemmatizer: Object with a ``lemmatize(str) -> str`` method.

    Returns:
        List of lemmatized tokens with stopwords removed.
    """
    text = ''.join(ch for ch in text if ch not in string.punctuation)
    # Use the tokenizer that was passed in. The previous call to
    # nltk.word_tokenize ignored this parameter and needs the "punkt"
    # resource, which this notebook never downloads.
    tokens = tokenizer.tokenize(text)
    # A set makes the per-token stopword check constant-time.
    stopword_set = set(stopwords)
    return [lemmatizer.lemmatize(token) for token in tokens if token not in stopword_set]

In [None]:
# Bag-of-words features. Fit on the lemmatized CLEAN_corpus rather than the
# raw corpus, so the lemmatization computed above is actually used and the
# training text is lemmatized like the text predict_artist feeds the model.
vectorizer = CountVectorizer(stop_words=STOPWORDS)
X = vectorizer.fit_transform(CLEAN_corpus)

# Dense word-count table: one row per song (indexed by artist label),
# one column per vocabulary word.
X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out(), index=labels)

X = X_df.values  # Features (word frequencies)
y = X_df.index.values  # Labels (artist names)

# Hold out 20% of the songs for evaluation; fixed seed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit a multinomial Naive Bayes classifier on the training word counts
# (fit() returns the estimator itself) and report held-out accuracy.
model = MultinomialNB().fit(X_train, y_train)

accuracy = model.score(X_test, y_test)
print("Accuracy:", accuracy)

In [None]:
def predict_artist(text, vectorizer=vectorizer, model=model):
    """Return the artist label the model predicts for a piece of text.

    The text is lower-cased, passed through ``tokenize_lemmatize``, re-joined
    with spaces, vectorized with ``vectorizer`` and classified by ``model``.
    """
    cleaned_tokens = tokenize_lemmatize(text.lower())
    feature_matrix = vectorizer.transform([' '.join(cleaned_tokens)])
    return model.predict(feature_matrix)[0]


# Try the classifier on a short lyric snippet and print the predicted artist.
# NOTE(review): the sample contains fused words ("Thebirdsabove",
# "agentlesweet") - presumably copied from scraped text; confirm it is intended.
text = "Thebirdsabove all sing of love, agentlesweet"
predicted_artist = predict_artist(text)
print("Predicted Artist:", predicted_artist)

In [None]:
# Persist the trained classifier (pickle is imported in the first cell).
with open('trained_model.pkl', 'wb') as pickle_file:
    pickle.dump(model, pickle_file)

# The classifier only accepts vectors built with the fitted vocabulary, so
# save the vectorizer too; without it the pickled model cannot be used on
# new text.
with open('trained_vectorizer.pkl', 'wb') as pickle_file:
    pickle.dump(vectorizer, pickle_file)