In [3]:
%pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [13]:
import pandas as pd

# Load the data
df = pd.read_csv('kaggle_spotify_training_set/spotify_10000_sample.csv')

# Display the shape, head, and check for null values
print("Original shape:", df.shape)
print(df.head())
print(df.isnull().sum())

# Randomly sample 10,000 rows, 'random_state' ensures reproducibility
df_sampled = df.sample(n=10000, random_state=42)

# Display the shape and head of the reduced dataset
print("Reduced shape:", df_sampled.shape)
print(df_sampled.head())


Original shape: (10000, 4)
         artist                       song  \
0  Wishbone Ash             Right Or Wrong   
1     Aerosmith  This Little Light Of Mine   
2  Fall Out Boy               Dance, Dance   
3  Janis Joplin                 Easy Rider   
4   Moody Blues                  Peak Hour   

                                                link  \
0       /w/wishbone+ash/right+or+wrong_20147150.html   
1  /a/aerosmith/this+little+light+of+mine_2064448...   
2          /f/fall+out+boy/dance+dance_10113666.html   
3           /j/janis+joplin/easy+rider_10147381.html   
4             /m/moody+blues/peak+hour_20291295.html   

                                                text  
0  Like to have you 'round  \r\nWith all the lies...  
1  This Little Light of Mine (Light of Mine),  \r...  
2  She says she's no good with words but I'm wors...  
3  Hey mama, mama, come a look at sister,  \r\nSh...  
4  I see it all through my window it seems.  \r\n...  
artist    0
song      0
link 

In [14]:
# Randomly sample 1000 rows
df_sampled = df.sample(n=1000, random_state=42)

# Save the sampled data to a new CSV file
df_sampled.to_csv('spotify_1000_sample.csv', index=False)


In [17]:
# Load the data
df = pd.read_csv('kaggle_spotify_training_set/spotify_1000_sample.csv')

#we will get the shape, display the dataframe(first items(head)) and see if there is any null values.
df.shape
df.head()
#df.isnull().sum()

Unnamed: 0,artist,song,link,text
0,Kanye West,Pro Nails,/k/kanye+west/pro+nails_21034368.html,Got her toes done up with her fingernails matc...
1,Nat King Cole,I Must Be Dreaming,/n/nat+king+cole/i+must+be+dreaming_20824749.html,I must be dreaming \r\nI must be dreaming \r...
2,Hank Williams Jr.,All I Can Give You Is My Heart,/h/hank+williams+jr/all+i+can+give+you+is+my+h...,Let's take the final step and get married caus...
3,Talking Heads,People Like Us,/t/talking+heads/people+like+us_20135111.html,In 1950 when I was born \r\nPapa couldn't aff...
4,George Strait,I'll Always Be Loving You,/g/george+strait/ill+always+be+loving+you_2005...,"You can turn me loose, tear me up and run me d..."


In [18]:
#we will drop the link column as it is not useful for our analysis coze we only need the artist, song and text(lyrics)
df = df.drop(['link'], axis=1).reset_index(drop=True)
df.shape

(1000, 3)

In [19]:
df['text'] = df['text'].str.lower().replace(r'^\w\s', ' ', regex=True).replace(r'\n', ' ', regex=True)
df.head(20)

Unnamed: 0,artist,song,text
0,Kanye West,Pro Nails,got her toes done up with her fingernails matc...
1,Nat King Cole,I Must Be Dreaming,must be dreaming \r i must be dreaming \r i...
2,Hank Williams Jr.,All I Can Give You Is My Heart,let's take the final step and get married caus...
3,Talking Heads,People Like Us,in 1950 when i was born \r papa couldn't affo...
4,George Strait,I'll Always Be Loving You,"you can turn me loose, tear me up and run me d..."
5,ZZ Top,Cheap Sunglasses,when you get up in the morning and the light i...
6,Garth Brooks,Papa Loved Mama,papa drove a truck nearly all his life \r you...
7,W.A.S.P.,Sweet Cheetah,one night as i rolled down some long winding r...
8,Faith Hill,Better Days,hard times are fallin' on you \r even when yo...
9,Indiana Bible College,Forevermore,"verse \r forevermore, lord i will worship you..."


In [20]:
%pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [21]:
import nltk
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import PorterStemmer

# Create instances
tokenizer = TreebankWordTokenizer()
stemmer = PorterStemmer()

# Define the tokenize method
def tokenize(txt):
    tokens = tokenizer.tokenize(txt)
    # Use the stemmer to stem each token
    stemmed = [stemmer.stem(t) for t in tokens]
    return " ".join(stemmed)

# Test the tokenize method
print(tokenize("I love your beautiful, beauty and lovely face thats full of love"))
print(tokenize("will you be my lover, babe as my love for you is endless, even if you are not beautiful, baby, i promise to be a lovely husband"))
print(tokenize("I am a lover of beauty and lovely, beautiful things"))

i love your beauti , beauti and love face that full of love
will you be my lover , babe as my love for you is endless , even if you are not beauti , babi , i promis to be a love husband
i am a lover of beauti and love , beauti thing


In [22]:
%pip install scikit-learn




In [23]:
df['text'].apply(lambda x: tokenize(x))

0      got her toe done up with her fingernail matchi...
1      must be dream i must be dream i must be dream ...
2      let 's take the final step and get marri caus ...
3      in 1950 when i wa born papa could n't afford t...
4      you can turn me loos , tear me up and run me d...
                             ...                        
995    been here for day , i 'm amaz ( at ) thi teach...
996    ya better come insid when you 're readi to but...
997    have so much anticip for the futur gener i thi...
998    usual in the morn i 'm fill with sweet belong ...
999    the time ha come for you to make up your own m...
Name: text, Length: 1000, dtype: object

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
# Create a TfidfVectorizer instance
vectorizer = TfidfVectorizer(analyzer='word', stop_words='english')

# Transform the text (lyrics) into a matrix
lyrics_matrix = vectorizer.fit_transform(df['text'])

# Calculate the cosine similarity between the lyrics
cosine_sim = cosine_similarity(lyrics_matrix)
cosine_sim[0]

array([1.00000000e+00, 2.69666447e-03, 2.97164908e-02, 9.54812727e-03,
       2.55324066e-03, 7.99788920e-03, 5.44971240e-03, 1.66376721e-03,
       1.28272238e-02, 1.27392685e-03, 9.66062427e-03, 1.27337881e-02,
       4.80041947e-03, 4.78198835e-03, 2.95544138e-03, 7.17777381e-03,
       2.75036234e-03, 2.22401140e-02, 4.53202366e-02, 3.94356054e-03,
       2.06354624e-03, 9.55753810e-04, 5.97294090e-04, 7.15996525e-03,
       1.69622221e-02, 1.72000989e-02, 3.51090429e-03, 2.15891781e-03,
       2.21243392e-02, 1.34918828e-02, 1.81001580e-03, 8.49250145e-04,
       4.71583486e-03, 0.00000000e+00, 1.44304374e-02, 4.30424739e-04,
       1.15963370e-03, 2.41112453e-03, 2.35238935e-02, 1.57765436e-02,
       4.02786341e-03, 8.91120454e-03, 8.27471975e-03, 4.64765549e-03,
       2.37405292e-02, 2.30632072e-02, 8.59085871e-03, 2.04064344e-03,
       6.07042578e-03, 1.36079136e-02, 2.57290968e-02, 7.18498573e-04,
       4.50957289e-03, 7.38606344e-03, 2.14186932e-02, 8.25709558e-03,
      

In [27]:
df[df['song'] == 'Pro Nails']

Unnamed: 0,artist,song,text
0,Kanye West,Pro Nails,got her toes done up with her fingernails matc...


In [29]:
def recommendation(song_df):
    idx = df[df['song'] == song_df].index[0]
    distances = sorted(list(enumerate(cosine_sim[idx])),reverse=True,key=lambda x:x[1])
    
    songs = []
    for m_id in distances[1:21]:
        songs.append(df.iloc[m_id[0]].song)
        
    return songs

In [31]:
recommendation('Pro Nails')

['Be My Alibi',
 'Got My Mind Made Up',
 'Scatterbrain',
 'Something You Got',
 'Bad Disease',
 'Fallen Angel',
 'Pretty Noose',
 'Stand Down',
 'Best I Can',
 "It Don't Mean A Thing (If It Ain't Got That Swing)",
 'We Got The Love',
 'Pictures',
 'For Once In My Life',
 'Hangover',
 'Turnstyled, Junkpiled',
 "I'm Coming Out",
 'Fit To Be Tied',
 'Amen',
 'All You Get From Love Is A Love Song',
 "We've Got A Groovey Thing Goin'"]

In [33]:
import pickle
pickle.dump(cosine_sim,open('similarity.pkl','wb'))
pickle.dump(df,open('df.pkl','wb'))