In [43]:
import pandas as pd

#genius = pd.read_csv('raw_data/song_lyrics.csv', skiprows=range(1, 5_000_001), nrows=1_000_000)

In [58]:
first_million = pd.read_csv('raw_data/new_songs_first_1m.csv')

# clean data

In [59]:
import re
import string
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from textblob import TextBlob
import unicodedata


def preprocessing(sentence):
        
    sentence = sentence.strip()
    
    # Convert to lowercase
    sentence = sentence.lower()
    
    # Replace newline characters with space
    sentence = sentence.replace("\n", " ")
    
    # Remove text within brackets and the brackets themselves
    sentence = re.sub(r'\[.*?\]', '', sentence)
    
    # Remove single quotes
    sentence = sentence.replace("\'", "")
    
    # Remove numbers
    sentence = re.sub(r'\b\d+\b', '', sentence)
    
    # Remove punctuation
    sentence = ''.join(char for char in sentence if char not in string.punctuation)
    
    # Remove extra spaces
    sentence = re.sub(r'\s+', ' ', sentence).strip()
    
    # Tokenize sentence
    tokens = word_tokenize(sentence)
    
    # Remove punctuation from tokens
    filtered_tokens = [word for word in tokens if word.isalpha()]
    
    # Lemmatize the tokens
    wnl = WordNetLemmatizer()
    lemmatized_words = [wnl.lemmatize(word, pos='v') for word in filtered_tokens]
    lemmatized_words = [wnl.lemmatize(word, pos='n') for word in lemmatized_words]
    
    return lemmatized_words


In [60]:
def apply_cleaning_in_batches(df, clean_func, batch_size=100):
    """
    Apply a cleaning function to a DataFrame in batches.
    
    Args:
        df (pd.DataFrame): The input DataFrame containing a column to be cleaned.
        clean_func (function): The cleaning function to apply.
        batch_size (int): Number of rows to process in each batch.

    Returns:
        pd.DataFrame: The DataFrame with the cleaned data.
    """
    # Initialize a list to hold processed batches
    processed_chunks = []

    # Calculate number of batches
    num_batches = (len(df) + batch_size - 1) // batch_size

    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(df))
        
        # Extract the current batch
        batch = df.iloc[start_idx:end_idx].copy()
        
        # Apply the cleaning function to the 'lyrics' column
        batch['clean_text'] = batch['lyrics'].apply(clean_func)
        
        # Append processed batch to the list
        processed_chunks.append(batch)
        
        # Print progress
        print(f"✅ Batch {i + 1}/{num_batches} processed ({batch.shape[0]} rows)")

    # Concatenate all processed chunks
    full_data = pd.concat(processed_chunks, ignore_index=True)
    
    print("🥳 All batches processed.")
    return full_data

In [62]:
first_million_clean = apply_cleaning_in_batches(first_million, preprocessing, batch_size=10_000)

✅ Batch 1/78 processed (10000 rows)
✅ Batch 2/78 processed (10000 rows)
✅ Batch 3/78 processed (10000 rows)
✅ Batch 4/78 processed (10000 rows)
✅ Batch 5/78 processed (10000 rows)
✅ Batch 6/78 processed (10000 rows)
✅ Batch 7/78 processed (10000 rows)
✅ Batch 8/78 processed (10000 rows)
✅ Batch 9/78 processed (10000 rows)
✅ Batch 10/78 processed (10000 rows)
✅ Batch 11/78 processed (10000 rows)
✅ Batch 12/78 processed (10000 rows)
✅ Batch 13/78 processed (10000 rows)
✅ Batch 14/78 processed (10000 rows)
✅ Batch 15/78 processed (10000 rows)
✅ Batch 16/78 processed (10000 rows)
✅ Batch 17/78 processed (10000 rows)
✅ Batch 18/78 processed (10000 rows)
✅ Batch 19/78 processed (10000 rows)
✅ Batch 20/78 processed (10000 rows)
✅ Batch 21/78 processed (10000 rows)
✅ Batch 22/78 processed (10000 rows)
✅ Batch 23/78 processed (10000 rows)
✅ Batch 24/78 processed (10000 rows)
✅ Batch 25/78 processed (10000 rows)
✅ Batch 26/78 processed (10000 rows)
✅ Batch 27/78 processed (10000 rows)
✅ Batch 28

In [64]:
type(first_million_clean['clean_text'][0])

list

## 1ST INTERMEDIARY SAVE TO FEATHER

In [65]:
import pickle

# Export Pipeline as pickle file
with open("raw_data/first_million_clean.pkl", "wb") as file:
    pickle.dump(first_million_clean, file)

In [68]:
#load lda model from picklefile // model was already trained on all twitter data and data is static
#with open('raw_data/first_million_clean.pkl', 'rb') as pickle_file:
 #   pickle_first_million_clean = pickle.load(pickle_file)

In [69]:
!pip install pyarrow



In [71]:
import pyarrow.feather as feather

feather.write_feather(first_million_clean, 'data.feather')

In [17]:
import pyarrow.feather as feather

first_million_clean = feather.read_feather('data.feather')

In [20]:
first_million_clean['clean_text'] = list(first_million_clean['clean_text'])

In [22]:
first_million_clean

Unnamed: 0,title,artist,year,lyrics,language_cld3,clean_text
0,Killa Cam,Cam'ron,2004,"[Chorus: Opera Steve & Cam'ron] Killa Cam, Kil...",en,"[killa, cam, killa, cam, cam, killa, cam, kill..."
1,Can I Live,JAY-Z,1996,"[Produced by Irv Gotti] [Intro] Yeah, hah, yea...",en,"[yeah, hah, yeah, rocafella, we, invite, you, ..."
2,Forgive Me Father,Fabolous,2003,Maybe cause I'm eatin And these bastards fiend...,en,"[maybe, cause, im, eatin, and, these, bastard,..."
3,Down and Out,Cam'ron,2004,[Produced by Kanye West and Brian Miller] [Int...,en,"[ugh, killa, baby, kanye, this, that, heron, f..."
4,Fly In,Lil Wayne,2005,"[Intro] So they ask me ""Young boy What you gon...",en,"[so, they, ask, me, young, boy, what, you, gon..."
...,...,...,...,...,...,...
779754,The Ballad of Sweeney Todd: “Lift Your Razor H...,Stephen Sondheim,1979,"[COMPANY] Lift your razor high, Sweeney! Hear ...",en,"[lift, your, razor, high, sweeney, hear, it, s..."
779755,Progress live,Knife Fight,2015,Don't give me that shit Cause I don't care wha...,en,"[dont, give, me, that, shit, cause, i, dont, c..."
779756,Dont Come For Me,Ari Gold,2001,Let me just ask you How you could do it How co...,en,"[let, me, just, ask, you, how, you, could, do,..."
779757,Benton Harbor Blues reprise,The Fiery Furnaces,2006,As I try to fill all of my empty days I stumbl...,en,"[a, i, try, to, fill, all, of, my, empty, day,..."


In [18]:
import pandas as pd


words = pd.read_csv('raw_data/top_30_words.csv')

In [19]:
words

Unnamed: 0,Topic,Words
0,Topic 0,"haha, girl, add, rock, guess, dont, dude, cd, ..."
1,Topic 1,"lol, ok, oh, like, thats, hot, exam, forward, ..."
2,Topic 2,"know, back, glad, tomorrow, welcome, school, r..."
3,Topic 3,"help, something, write, early, link, line, ema..."
4,Topic 4,"love, everyone, cute, summer, run, class, hope..."
5,Topic 5,"morning, good morning, good, sleep, go, goodni..."
6,Topic 6,"hope, tweet, wow, hour, yesterday, may, anyone..."
7,Topic 7,"twitter, two, phone, true, almost, every, supp..."
8,Topic 8,"play, home, ah, yet, face, hand, sunny, bet, c..."
9,Topic 9,"start, sure, bed, give, time, coffee, least, g..."


# CREATE UNIQUE LIST

In [61]:
import pandas as pd

def generate_unique_words_from_csv(file_path):
    words_df = pd.read_csv(file_path)

    if 'Words' not in words_df.columns:
        raise ValueError("CSV file must contain a 'Words' column.")

    words_column = words_df['Words'].dropna().astype(str).tolist()
    split_words = [word.strip() for entry in words_column for word in entry.split(',')]
    unique_words = set(split_words)
    unique_words_sorted = sorted(unique_words)

    return unique_words_sorted

# Example usage
file_path = 'raw_data/top_30_words.csv'
unique_words_list = generate_unique_words_from_csv(file_path)
print(unique_words_list)

['aa', 'able', 'absolutely', 'account', 'actually', 'ad', 'add', 'addict', 'advice', 'afternoon', 'age', 'ago', 'agree', 'ah', 'ahh', 'aint', 'airport', 'album', 'alex', 'alive', 'almost', 'alone', 'along', 'alot', 'already', 'alright', 'also', 'always', 'amaze', 'angel', 'annoy', 'another', 'answer', 'anyone', 'anything', 'anyway', 'apparently', 'apple', 'application', 'appreciate', 'around', 'arrive', 'art', 'ask', 'asleep', 'australia', 'avatar', 'aw', 'award', 'away', 'awesome', 'aww', 'babe', 'baby', 'back', 'back work', 'background', 'bad', 'bag', 'bake', 'band', 'bar', 'bath', 'bbq', 'bday', 'beach', 'bear', 'beat', 'beautiful', 'beautiful day', 'become', 'bed', 'beer', 'begin', 'behind', 'believe', 'best', 'best friend', 'bet', 'better', 'bf', 'big', 'bike', 'birthday', 'bitch', 'bite', 'black', 'blast', 'bless', 'blog', 'blood', 'blue', 'boo', 'book', 'bore', 'bos', 'bottle', 'bout', 'box', 'boy', 'boyfriend', 'brain', 'brand', 'break', 'breakfast', 'bright', 'brilliant', 'bri

## QUERY TAGS

In [80]:
def extract_tags(clean_text, unique_words_list):
    unique_words_set = set(unique_words_list)  # Convert to set for faster lookup
    tags = [word for word in clean_text if word in unique_words_set]
    return tags if tags else None

# Define process_dataframe_in_batches function
def process_dataframe_in_batches(df, unique_words_list, batch_size=100):
    processed_chunks = []
    num_batches = (len(df) + batch_size - 1) // batch_size

    for i in range(num_batches):
        start_idx = i * batch_size
        end_idx = min((i + 1) * batch_size, len(df))
        batch = df.iloc[start_idx:end_idx].copy()
        
        # Apply the extract_tags function to the 'clean_text' column
        batch['tags'] = batch['clean_text'].apply(lambda x: extract_tags(x, unique_words_list))
        
        processed_chunks.append(batch)
        print(f"✅ Batch {i + 1}/{num_batches} processed ({batch.shape[0]} rows)")
    
    # Concatenate all processed chunks into a single DataFrame
    full_data = pd.concat(processed_chunks, ignore_index=True)
    return full_data

In [81]:
first_million_clean_tags = process_dataframe_in_batches(first_million_clean, unique_words_list, batch_size=10_000)

✅ Batch 1/78 processed (10000 rows)
✅ Batch 2/78 processed (10000 rows)
✅ Batch 3/78 processed (10000 rows)
✅ Batch 4/78 processed (10000 rows)
✅ Batch 5/78 processed (10000 rows)
✅ Batch 6/78 processed (10000 rows)
✅ Batch 7/78 processed (10000 rows)
✅ Batch 8/78 processed (10000 rows)
✅ Batch 9/78 processed (10000 rows)
✅ Batch 10/78 processed (10000 rows)
✅ Batch 11/78 processed (10000 rows)
✅ Batch 12/78 processed (10000 rows)
✅ Batch 13/78 processed (10000 rows)
✅ Batch 14/78 processed (10000 rows)
✅ Batch 15/78 processed (10000 rows)
✅ Batch 16/78 processed (10000 rows)
✅ Batch 17/78 processed (10000 rows)
✅ Batch 18/78 processed (10000 rows)
✅ Batch 19/78 processed (10000 rows)
✅ Batch 20/78 processed (10000 rows)
✅ Batch 21/78 processed (10000 rows)
✅ Batch 22/78 processed (10000 rows)
✅ Batch 23/78 processed (10000 rows)
✅ Batch 24/78 processed (10000 rows)
✅ Batch 25/78 processed (10000 rows)
✅ Batch 26/78 processed (10000 rows)
✅ Batch 27/78 processed (10000 rows)
✅ Batch 28

In [82]:
first_million_clean_tags['clean_text'][0], first_million_clean_tags['tags'][0]

(array(['killa', 'cam', 'killa', 'cam', 'cam', 'killa', 'cam', 'killa',
        'cam', 'killa', 'cam', 'cam', 'killa', 'cam', 'killa', 'cam',
        'cam', 'killa', 'killa', 'killa', 'cam', 'killa', 'cam', 'cam',
        'killa', 'killa', 'killa', 'cam', 'killa', 'cam', 'cam', 'base',
        'load', 'killa', 'cam', 'killa', 'cam', 'uhhuh', 'killa', 'cam',
        'cam', 'santana', 'on', 'second', 'jim', 'on', 'third', 'killa',
        'cam', 'killa', 'cam', 'cam', 'im', 'at', 'bat', 'killa', 'killa',
        'killa', 'cam', 'killa', 'cam', 'cam', 'killa', 'im', 'bout', 'to',
        'hit', 'this', 'shit', 'out', 'the', 'world', 'killa', 'cam',
        'ugh', 'heatmakerz', 'killa', 'cam', 'cam', 'killa', 'cam',
        'killa', 'cam', 'killa', 'cam', 'cam', 'hahahaha', 'killa', 'cam',
        'killa', 'cam', 'cam', 'killa', 'killa', 'killa', 'cam', 'killa',
        'cam', 'cam', 'killa', 'we', 'make', 'this', 'shit', 'clap',
        'killa', 'cam', 'killa', 'cam', 'cam', 'killa', 'cam

In [83]:
first_million_clean_tags.dtypes

title            object
artist           object
year              int64
lyrics           object
language_cld3    object
clean_text       object
tags             object
dtype: object

# AMEND DATAFRAME

In [85]:
first_million_clean_tags['tags']

0         [load, second, im, im, bout, hit, shit, world,...
1         [yeah, hah, yeah, you, you, know, well, sort, ...
2         [maybe, cause, im, like, you, see, cant, so, i...
3         [ugh, baby, huh, yeah, let, ugh, im, back, ugh...
4         [so, ask, young, boy, what, you, second, time,...
                                ...                        
779754    [high, hear, sing, yes, laugh, often, smile, s...
779755    [dont, give, shit, cause, dont, care, what, yo...
779756    [let, ask, you, you, could, could, you, turn, ...
779757    [try, fill, day, round, past, stay, go, ride, ...
779758    [no, good, no, light, understand, try, hurt, f...
Name: tags, Length: 779759, dtype: object

In [86]:
first_million_clean_tags['tag_len'] = first_million_clean_tags['tags'].str.len()

In [87]:
first_million_clean_tags

Unnamed: 0,title,artist,year,lyrics,language_cld3,clean_text,tags,tag_len
0,Killa Cam,Cam'ron,2004,"[Chorus: Opera Steve & Cam'ron] Killa Cam, Kil...",en,"[killa, cam, killa, cam, cam, killa, cam, kill...","[load, second, im, im, bout, hit, shit, world,...",103.0
1,Can I Live,JAY-Z,1996,"[Produced by Irv Gotti] [Intro] Yeah, hah, yea...",en,"[yeah, hah, yeah, rocafella, we, invite, you, ...","[yeah, hah, yeah, you, you, know, well, sort, ...",145.0
2,Forgive Me Father,Fabolous,2003,Maybe cause I'm eatin And these bastards fiend...,en,"[maybe, cause, im, eatin, and, these, bastard,...","[maybe, cause, im, like, you, see, cant, so, i...",137.0
3,Down and Out,Cam'ron,2004,[Produced by Kanye West and Brian Miller] [Int...,en,"[ugh, killa, baby, kanye, this, that, heron, f...","[ugh, baby, huh, yeah, let, ugh, im, back, ugh...",165.0
4,Fly In,Lil Wayne,2005,"[Intro] So they ask me ""Young boy What you gon...",en,"[so, they, ask, me, young, boy, what, you, gon...","[so, ask, young, boy, what, you, second, time,...",157.0
...,...,...,...,...,...,...,...,...
779754,The Ballad of Sweeney Todd: “Lift Your Razor H...,Stephen Sondheim,1979,"[COMPANY] Lift your razor high, Sweeney! Hear ...",en,"[lift, your, razor, high, sweeney, hear, it, s...","[high, hear, sing, yes, laugh, often, smile, s...",13.0
779755,Progress live,Knife Fight,2015,Don't give me that shit Cause I don't care wha...,en,"[dont, give, me, that, shit, cause, i, dont, c...","[dont, give, shit, cause, dont, care, what, yo...",36.0
779756,Dont Come For Me,Ari Gold,2001,Let me just ask you How you could do it How co...,en,"[let, me, just, ask, you, how, you, could, do,...","[let, ask, you, you, could, could, you, turn, ...",94.0
779757,Benton Harbor Blues reprise,The Fiery Furnaces,2006,As I try to fill all of my empty days I stumbl...,en,"[a, i, try, to, fill, all, of, my, empty, day,...","[try, fill, day, round, past, stay, go, ride, ...",31.0


In [97]:
first_million_clean_tags.drop(columns=['year', 'lyrics', 'language_cld3', 'tag_len'], inplace=True)

In [98]:
first_million_clean_tags

Unnamed: 0,title,artist,clean_text,tags
0,Killa Cam,Cam'ron,"[killa, cam, killa, cam, cam, killa, cam, kill...","[load, second, im, im, bout, hit, shit, world,..."
1,Can I Live,JAY-Z,"[yeah, hah, yeah, rocafella, we, invite, you, ...","[yeah, hah, yeah, you, you, know, well, sort, ..."
2,Forgive Me Father,Fabolous,"[maybe, cause, im, eatin, and, these, bastard,...","[maybe, cause, im, like, you, see, cant, so, i..."
3,Down and Out,Cam'ron,"[ugh, killa, baby, kanye, this, that, heron, f...","[ugh, baby, huh, yeah, let, ugh, im, back, ugh..."
4,Fly In,Lil Wayne,"[so, they, ask, me, young, boy, what, you, gon...","[so, ask, young, boy, what, you, second, time,..."
...,...,...,...,...
779754,The Ballad of Sweeney Todd: “Lift Your Razor H...,Stephen Sondheim,"[lift, your, razor, high, sweeney, hear, it, s...","[high, hear, sing, yes, laugh, often, smile, s..."
779755,Progress live,Knife Fight,"[dont, give, me, that, shit, cause, i, dont, c...","[dont, give, shit, cause, dont, care, what, yo..."
779756,Dont Come For Me,Ari Gold,"[let, me, just, ask, you, how, you, could, do,...","[let, ask, you, you, could, could, you, turn, ..."
779757,Benton Harbor Blues reprise,The Fiery Furnaces,"[a, i, try, to, fill, all, of, my, empty, day,...","[try, fill, day, round, past, stay, go, ride, ..."


# FINAL EXPORT TO FEATHER

In [99]:
import pyarrow.feather as feather

feather.write_feather(first_million_clean_tags, 'first_million_clean_tags.feather')

In [100]:
import pyarrow.feather as feather

first_million_clean_tags = feather.read_feather('first_million_clean_tags.feather')

In [30]:
import pyarrow.feather as feather

first_million_clean_tags = feather.read_feather('raw_data/first_million_clean_tags.feather')

In [31]:
type(first_million_clean_tags)

pandas.core.frame.DataFrame

In [32]:
first_million_clean_tags.isnull().sum()

title         13
artist         0
clean_text     0
tags          79
dtype: int64

In [33]:
first_million_clean_tags = first_million_clean_tags.dropna(axis=0)

In [35]:
first_million_clean_tags.shape

(779667, 4)

In [36]:
feather.write_feather(first_million_clean_tags, 'first_million_clean_tags_02.feather')

In [37]:
first_million_clean_tags_02 = feather.read_feather('first_million_clean_tags_02.feather')

In [38]:
first_million_clean_tags_02.isnull().sum()

title         0
artist        0
clean_text    0
tags          0
dtype: int64

In [106]:
first_million_clean_tags['tags'][0]

array(['load', 'second', 'im', 'im', 'bout', 'hit', 'shit', 'world',
       'ugh', 'hahahaha', 'make', 'shit', 'stay', 'tune', 'what', 'like',
       'damn', 'since', 'still', 'man', 'fam', 'bitch', 'want', 'want',
       'nothing', 'new', 'im', 'land', 'say', 'time', 'mine', 'get',
       'hand', 'line', 'call', 'red', 'red', 'ear', 'call', 'end', 'ice',
       'like', 'you', 'could', 'say', 'im', 'friend', 'you', 'whats',
       'go', 'get', 'bitch', 'say', 'im', 'daddy', 'look', 'like', 'sing',
       'sing', 'sing', 'know', 'yall', 'know', 'yo', 'im', 'get', 'fuck',
       'hit', 'kitchen', 'fresh', 'fuck', 'so', 'park', 'dont', 'care',
       'car', 'home', 'welcome', 'you', 'welcome', 'problem', 'get',
       'know', 'stand', 'like', 'soon', 'sing', 'sing', 'come', 'movie',
       'ring', 'ooh', 'bring', 'ooh', 'ring', 'bad', 'put', 'im', 'king',
       'keep', 'ya', 'fresh', 'let'], dtype=object)

## CHECK UNIQUE VALUES IN THE LENGHT OF THE TAGS

In [93]:
import matplotlib.pyplot as plt

first_million_clean_tags['tag_len'].unique()

array([103., 145., 137., 165., 157., 167., 177., 138., 163., 220., 162.,
       182., 143., 194., 161., 197., 180., 140., 191., 130., 172., 166.,
       206., 118., 181., 117., 173., 115., 201., 188., 158., 149., 155.,
       202., 227., 144., 175., 156., 133., 207., 152., 146., 123., 153.,
       245., 178., 215., 159., 176., 209., 168., 195., 212., 184., 214.,
       186., 170., 150., 169., 204., 126., 192., 151., 124., 147., 171.,
       190., 233., 183., 200., 164., 236.,  47., 208., 160., 223., 154.,
       211., 199., 187., 174., 198.,  14., 114., 196.,  77., 125., 232.,
        80., 142., 122., 213., 139., 205., 148., 219., 210., 179., 119.,
       203., 185., 218., 221., 120., 132.,  81., 111., 116., 121.,  89.,
        96.,  53., 129., 127., 226., 113.,  57., 234., 255., 136., 193.,
       189., 134., 131., 229., 217., 141., 224., 225., 108., 241., 237.,
       235.,  32., 135.,  31.,  37.,  34.,  85.,  51., 249.,  61.,  92.,
        99.,  97., 128.,  63.,  42., 310.,  93.,  5

In [94]:
def check_tags_in_text_and_list(dataframe, unique_list):
    # Specific variables
    tags = dataframe['tags'][0]
    clean_text = dataframe['clean_text'][0]
    
    # Check if all tags are in clean_text
    all_tags_in_clean_text = all(tag in clean_text for tag in tags)
    
    # Check if all tags are in unique_words_list
    all_tags_in_unique_words_list = all(tag in unique_list for tag in tags)
    
    return all_tags_in_clean_text, all_tags_in_unique_words_list

# Check if all tags are in clean_text and unique_words_list for the first row
all_tags_in_clean_text, all_tags_in_unique_words_list = check_tags_in_text_and_list(first_million_clean_tags, unique_words_list)

# Print results
print(f"All tags in clean_text: {all_tags_in_clean_text}")
print(f"All tags in unique_words_list: {all_tags_in_unique_words_list}")

All tags in clean_text: True
All tags in unique_words_list: True


### DEBUG

In [63]:
def extract_tags(clean_text):
    # Ensure clean_text is a list of strings
    if not isinstance(clean_text, list):
        raise ValueError("clean_text must be a list of words")
    
    # Debug: Print out the clean_text to see the input
    print(f"Clean text input: {clean_text}")

    # Filter words that are in unique_words_list and remove duplicates
    tags = list(set(word for word in clean_text if word in unique_words_list))
    
    # Debug: Print out the tags to see the output
    print(f"Extracted tags: {tags}")

    return tags if tags else None

In [64]:
example_clean_text = ['so', 'they', 'ask', 'me', 'young', 'b']

# Run the function with example input
tags = extract_tags(example_clean_text)
print(f"Tags: {tags}")

Clean text input: ['so', 'they', 'ask', 'me', 'young', 'b']
Extracted tags: ['so', 'young', 'ask']
Tags: ['so', 'young', 'ask']


In [69]:
import pandas as pd
import numpy as np

def check_tags_in_text_and_list_per_row(dataframe, unique_list):
    # Lists to store results
    all_tags_in_clean_text_list = []
    all_tags_in_unique_words_list = []

    # Iterate over each row in the DataFrame
    for index, row in dataframe.iterrows():
        tags = row['tags']
        clean_text = row['clean_text']
        
        # Check if all tags are in clean_text
        all_tags_in_clean_text = all(tag in clean_text for tag in tags)
        
        # Check if all tags are in unique_words_list
        all_tags_in_unique_words = all(tag in unique_list for tag in tags)
        
        # Append results to lists
        all_tags_in_clean_text_list.append(all_tags_in_clean_text)
        all_tags_in_unique_words_list.append(all_tags_in_unique_words)
    
    return all_tags_in_clean_text_list, all_tags_in_unique_words_list

# Example unique words list

# Check if all tags are in clean_text and unique_words_list for each row
all_tags_in_clean_text_list, all_tags_in_unique_words_list = check_tags_in_text_and_list_per_row(first_million_clean_tags, unique_words_list)

# Print results for each row
for idx, (in_clean_text, in_unique_list) in enumerate(zip(all_tags_in_clean_text_list, all_tags_in_unique_words_list)):
    print(f"Row {idx}: All tags in clean_text: {in_clean_text}, All tags in unique_words_list: {in_unique_list}")



Row 0: All tags in clean_text: True, All tags in unique_words_list: True
Row 1: All tags in clean_text: True, All tags in unique_words_list: True
Row 2: All tags in clean_text: True, All tags in unique_words_list: True
Row 3: All tags in clean_text: True, All tags in unique_words_list: True
Row 4: All tags in clean_text: True, All tags in unique_words_list: True
Row 5: All tags in clean_text: True, All tags in unique_words_list: True
Row 6: All tags in clean_text: True, All tags in unique_words_list: True
Row 7: All tags in clean_text: True, All tags in unique_words_list: True
Row 8: All tags in clean_text: True, All tags in unique_words_list: True
Row 9: All tags in clean_text: True, All tags in unique_words_list: True


In [57]:
first_million_clean_tags['clean_text'][0]

array(['killa', 'cam', 'killa', 'cam', 'cam', 'killa', 'cam', 'killa',
       'cam', 'killa', 'cam', 'cam', 'killa', 'cam', 'killa', 'cam',
       'cam', 'killa', 'killa', 'killa', 'cam', 'killa', 'cam', 'cam',
       'killa', 'killa', 'killa', 'cam', 'killa', 'cam', 'cam', 'base',
       'load', 'killa', 'cam', 'killa', 'cam', 'uhhuh', 'killa', 'cam',
       'cam', 'santana', 'on', 'second', 'jim', 'on', 'third', 'killa',
       'cam', 'killa', 'cam', 'cam', 'im', 'at', 'bat', 'killa', 'killa',
       'killa', 'cam', 'killa', 'cam', 'cam', 'killa', 'im', 'bout', 'to',
       'hit', 'this', 'shit', 'out', 'the', 'world', 'killa', 'cam',
       'ugh', 'heatmakerz', 'killa', 'cam', 'cam', 'killa', 'cam',
       'killa', 'cam', 'killa', 'cam', 'cam', 'hahahaha', 'killa', 'cam',
       'killa', 'cam', 'cam', 'killa', 'killa', 'killa', 'cam', 'killa',
       'cam', 'cam', 'killa', 'we', 'make', 'this', 'shit', 'clap',
       'killa', 'cam', 'killa', 'cam', 'cam', 'killa', 'cam', 'killa',
  

In [58]:
first_million_clean_tags['tags'][0]

['load',
 'second',
 'im',
 'im',
 'bout',
 'hit',
 'shit',
 'world',
 'ugh',
 'hahahaha',
 'make',
 'shit',
 'stay',
 'tune',
 'what',
 'like',
 'damn',
 'since',
 'still',
 'man',
 'fam',
 'bitch',
 'want',
 'want',
 'nothing',
 'new',
 'im',
 'land',
 'say',
 'time',
 'mine',
 'get',
 'hand',
 'line',
 'call',
 'red',
 'red',
 'ear',
 'call',
 'end',
 'ice',
 'like',
 'you',
 'could',
 'say',
 'im',
 'friend',
 'you',
 'whats',
 'go',
 'get',
 'bitch',
 'say',
 'im',
 'daddy',
 'look',
 'like',
 'sing',
 'sing',
 'sing',
 'know',
 'yall',
 'know',
 'yo',
 'im',
 'get',
 'fuck',
 'hit',
 'kitchen',
 'fresh',
 'fuck',
 'so',
 'park',
 'dont',
 'care',
 'car',
 'home',
 'welcome',
 'you',
 'welcome',
 'problem',
 'get',
 'know',
 'stand',
 'like',
 'soon',
 'sing',
 'sing',
 'come',
 'movie',
 'ring',
 'ooh',
 'bring',
 'ooh',
 'ring',
 'bad',
 'put',
 'im',
 'king',
 'keep',
 'ya',
 'fresh',
 'let']

### DEBUG

In [75]:
def check_words_in_unique_list(words_list, unique_list):
    # Check if all words in words_list are in unique_list
    all_words_in_unique_list = all(word in unique_list for word in words_list)
    
    return all_words_in_unique_list

# Check if all test_lda words are in unique_words_list
all_words_in_unique_list = check_words_in_unique_list(test_lda, unique_words_list)

# Print result
print(f"All words in unique_words_list: {all_words_in_unique_list}")

All words in unique_words_list: True


# WORK ON LOGIC TO GET SONGS

## LDA TEST RESULT

In [96]:
top_words = ['last', 'feel', 'last night', 'issue', 'ear']

In [70]:
unique_words_list

['aa',
 'able',
 'absolutely',
 'account',
 'actually',
 'ad',
 'add',
 'addict',
 'advice',
 'afternoon',
 'age',
 'ago',
 'agree',
 'ah',
 'ahh',
 'aint',
 'airport',
 'album',
 'alex',
 'alive',
 'almost',
 'alone',
 'along',
 'alot',
 'already',
 'alright',
 'also',
 'always',
 'amaze',
 'angel',
 'annoy',
 'another',
 'answer',
 'anyone',
 'anything',
 'anyway',
 'apparently',
 'apple',
 'application',
 'appreciate',
 'around',
 'arrive',
 'art',
 'ask',
 'asleep',
 'australia',
 'avatar',
 'aw',
 'award',
 'away',
 'awesome',
 'aww',
 'babe',
 'baby',
 'back',
 'back work',
 'background',
 'bad',
 'bag',
 'bake',
 'band',
 'bar',
 'bath',
 'bbq',
 'bday',
 'beach',
 'bear',
 'beat',
 'beautiful',
 'beautiful day',
 'become',
 'bed',
 'beer',
 'begin',
 'behind',
 'believe',
 'best',
 'best friend',
 'bet',
 'better',
 'bf',
 'big',
 'bike',
 'birthday',
 'bitch',
 'bite',
 'black',
 'blast',
 'bless',
 'blog',
 'blood',
 'blue',
 'boo',
 'book',
 'bore',
 'bos',
 'bottle',
 'bout

In [107]:
first_million_clean_tags[0:1500]

Unnamed: 0,title,artist,clean_text,tags
0,Killa Cam,Cam'ron,"[killa, cam, killa, cam, cam, killa, cam, kill...","[load, second, im, im, bout, hit, shit, world,..."
1,Can I Live,JAY-Z,"[yeah, hah, yeah, rocafella, we, invite, you, ...","[yeah, hah, yeah, you, you, know, well, sort, ..."
2,Forgive Me Father,Fabolous,"[maybe, cause, im, eatin, and, these, bastard,...","[maybe, cause, im, like, you, see, cant, so, i..."
3,Down and Out,Cam'ron,"[ugh, killa, baby, kanye, this, that, heron, f...","[ugh, baby, huh, yeah, let, ugh, im, back, ugh..."
4,Fly In,Lil Wayne,"[so, they, ask, me, young, boy, what, you, gon...","[so, ask, young, boy, what, you, second, time,..."
...,...,...,...,...
1495,Dont Stop,Child Rebel Soldier,"[ugh, what, explain, yourself, how, you, sound...","[ugh, what, you, sound, like, know, youre, pla..."
1496,Wild Horses,The Rolling Stones,"[childhood, live, be, easy, to, do, the, thing...","[live, easy, thing, you, want, buy, you, lady,..."
1497,Thieves in the Night,Black Star,"[yo, d, what, come, on, yeah, what, what, come...","[yo, what, come, yeah, what, what, come, yeah,..."
1498,Windows Media Player,Charles Hamilton,"[somewhere, on, the, desktop, live, the, zip, ...","[live, next, pop, kid, let, kid, clear, you, s..."


In [None]:
all(word in first_million_clean_tags[0:1500] for word in words)

In [8]:
import pandas as pd
import pyarrow.feather as feather

# Load the DataFrame from the Feather file
first_million_clean_tags = feather.read_feather('first_million_clean_tags.feather')

# Define top words to check
top_words = ['last', 'feel', 'last night', 'issue', 'ear']

# Define the range of rows to filter
random_sample = first_million_clean_tags.sample(n=500) 

def any_top_word_in_tags(tags, top_words):
    """
    Check if any word in `top_words` is present in `tags`.
    Assumes `tags` is a list-like object (e.g., list or numpy array).
    """
    return any(word in tags for word in top_words)

# Apply boolean indexing to filter rows
filtered_df = random_sample[random_sample['tags'].apply(lambda tags: any_top_word_in_tags(tags, top_words))]

# Print the shape of the filtered DataFrame to verify
print(f"Filtered DataFrame shape: {filtered_df.shape}")

Filtered DataFrame shape: (194, 4)


In [9]:
filtered_df.he

Unnamed: 0,title,artist,clean_text,tags
310261,Ghetto Life,DMX,"[nigga, say, im, bug, and, ask, me, whats, tha...","[say, im, bug, ask, whats, shit, im, cause, wr..."
356550,Ridin,Classic & 86,"[let, s, get, this, party, start, we, ll, get,...","[let, get, party, start, get, crazy, tonight, ..."
373561,Celebrate You,Steven Curtis Chapman,"[im, no, stranger, to, your, gentle, reminder,...","[im, no, world, around, no, you, speak, word, ..."
633567,Lost Connection With The Head,Showbread,"[oh, lord, im, sick, of, myself, id, rather, b...","[oh, im, sick, rather, im, help, mean, follow,..."
178016,Clarissa Volume 1 Letter 12,Samuel Richardson,"[miss, howe, to, miss, clarissa, harlowe, thur...","[miss, miss, thursday, morning, indeed, you, w..."


### IMPLEMENT LOGIC

# LOGIC OMPLETED FOR MUSIXMATCH FUNCTION

In [40]:
#import libraries

import pandas as pd
import pyarrow.feather as feather
import numpy as np


#load feather file
first_million_clean_tags = feather.read_feather('first_million_clean_tags.feather')


top_words = ['im', 'mean', 'kill', 'science', 'today']



def search_lyrics(top_words):
    
    #create lists of song details
    list_of_artists = []
    list_of_tracks = []
    list_of_lyrics = []

    #get a random sample of 500 songs:
    
    random_sample = first_million_clean_tags.sample(n=500) 
    
    def any_top_word_in_tags(tags, top_words):
        return any(word in tags for word in top_words)
    
    
    filtered_data = random_sample[random_sample['tags'].apply(lambda tags: any_top_word_in_tags(tags, top_words))]


    list_of_artists = filtered_data['artist'].tolist()
    list_of_tracks = filtered_data['title'].tolist()
    list_of_lyrics = filtered_data['clean_text'].tolist()

    #turn in dataframe
    new_lyrics_df = pd.DataFrame({
                        'Artist': list_of_artists,
                        'Track': list_of_tracks,
                        'Lyric_Snippet': list_of_lyrics})
    
    return new_lyrics_df

print(search_lyrics(['im', 'mean', 'kill', 'science', 'today']))

               Artist                                            Track  \
0            Dr. Hook                                         Loveline   
1       Kristin Hersh                                   Vanishing Twin   
2           Don Cisco                                 Just Like Mexico   
3             Unwoman                                      Fugue Fugue   
4     New Found Glory                                  Head Over Heels   
..                ...                                              ...   
280     George Strait              A Little Heaven’s Rubbing Off on Me   
281        Soulsister                                          Changes   
282     Doug E. Fresh                           Everybody Loves a Star   
283  Jasmine Mulliken  Maria Edgeworth’s “Practical Education Chap. 1”   
284           KRS-One                                  Down the Charts   

                                         Lyric_Snippet  
0    [if, you, ever, need, me, call, me, on, your, ...

In [80]:
def join_lyrics(text):
    
    if isinstance(text, np.ndarray):
        
        list_of_strings = text.astype(str).tolist()
        
        return ' '.join(list_of_strings)
    
    return ''

In [81]:
result = first_million_clean_tags['clean_text'][0:5].apply(join_lyrics)

In [82]:
print(result)

0    killa cam killa cam cam killa cam killa cam ki...
1    yeah hah yeah rocafella we invite you to somet...
2    maybe cause im eatin and these bastard fiend f...
3    ugh killa baby kanye this that heron flow huh ...
4    so they ask me young boy what you gon do the s...
Name: clean_text, dtype: object
