In [2]:
import pandas as pd, numpy as np
import ast

In [3]:
# Load the song metadata CSV (presumably the scraper's output, given the path — confirm).
dataframe = pd.read_csv("../scraping/out.csv")

In [4]:
# Show the column labels of the loaded frame.
dataframe.columns

Index(['title', 'singers', 'directors', 'lyricist', 'genre', 'album',
       'download_link', 'poster', 'year'],
      dtype='object')

In [5]:
# (rows, columns) of the frame as loaded, before any filtering.
dataframe.shape

(3489, 9)

In [None]:
# Peek at the last rows of the loaded frame.
dataframe.tail()

In [6]:
# Count the missing entries in the two credit columns.
# Selecting the column first avoids building a null mask for the whole frame.
null_directors = dataframe['directors'].isna().sum()
null_lyricist = dataframe['lyricist'].isna().sum()
print(null_directors, null_lyricist)

58 384


In [7]:
# Drop every row whose title contains one of these keywords (case-insensitive
# substring match) — presumably alternate renditions of a song; confirm.
# na=False treats a missing title as "no match", so the mask is purely boolean;
# without it a NaN title makes `~mask` / the row filter raise. Rows with a
# missing title therefore pass this filter and are left for the later dropna
# on 'title'.
title_has_keyword = dataframe['title'].str.contains(
    r'reprise|title|theme|version|male|track|edit|redux|unplugged|duet',
    case=False,
    na=False,
)
dataframe = dataframe[~title_has_keyword]
dataframe.shape

(3199, 9)

In [8]:
# Keep only songs that have every field listed in `subset`, then drop rows that
# repeat a download link and renumber the index from 0.
# Titles are deliberately NOT de-duplicated: two different songs can share a name.
# The calls are chained instead of using inplace=True because `dataframe` was
# produced by boolean filtering, and mutating such a frame in place can trigger
# SettingWithCopyWarning.
dataframe = (
    dataframe
    .dropna(subset=['title', 'singers', 'download_link', 'year', 'poster'])
    .drop_duplicates(subset=['download_link'])
    .reset_index(drop=True)
)
dataframe.shape

(2145, 9)

In [11]:
def convert_to_words(words):
    """Turn a stringified list of names into a space-prefixed tag string.

    Each name is collapsed into a single lowercase token by ``one_word``, so a
    multi-word name contributes exactly one token.

    Args:
        words: Text holding a Python list literal, e.g. "['A B', 'C D']".

    Returns:
        str: " " followed by the space-joined tokens, or just " " when
        ``words`` is empty, missing (None/NaN) or cannot be parsed. The result
        is always a string so it can be concatenated onto other text.
    """
    if not isinstance(words, str) or not words:
        # None, NaN and "" carry no names; return a string (not a list) so
        # that `+` with other text keeps working.
        return " "
    try:
        names = ast.literal_eval(words)
        return " " + " ".join(one_word(name) for name in names)
    except Exception:
        # Best-effort: unparseable or unexpected content contributes no tags.
        return " "
def one_word(s):
    """Lowercase ``s`` and remove every whitespace character from it."""
    lowered = s.lower()
    return "".join(ch for ch in lowered if not ch.isspace())

In [12]:
# Build one free-text "tags" document per song: the title, then each credit /
# genre list collapsed to one-word tokens by convert_to_words, then the album.
# `album` is not among the columns whose missing rows were dropped, so a missing
# album is replaced with "" — otherwise str + NaN would turn the whole tag
# string into NaN.
new_df = pd.DataFrame({
    'tags': (
        dataframe['title']
        + dataframe['singers'].apply(convert_to_words)
        + dataframe['directors'].apply(convert_to_words)
        + dataframe['lyricist'].apply(convert_to_words)
        + dataframe['genre'].apply(convert_to_words)
        + " " + dataframe['album'].fillna("")
    )
})

In [13]:
# Show the tag string built for the first song.
new_df['tags'].iloc[0]

'Allah Teri Kya Shaan Hai jaannissarlone kamalkhan jaannissarlone sahilfatehpuri filmi sufi/qawwali 18.11 (A Code Of Secrecy)'

In [14]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer: vocabulary capped at 5000 terms, English stop words removed.
cv = CountVectorizer(max_features=5000,stop_words='english')

In [15]:
# Fit the vocabulary on the tag strings and materialise the term counts as a dense array.
vector = cv.fit_transform(new_df['tags']).toarray()

In [16]:
# Sanity check: the tags column is a pandas Series.
type(new_df['tags'])

pandas.core.series.Series

In [17]:
# One row per tag string, one column per vocabulary term.
vector.shape

(2145, 5000)

In [18]:
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity between every pair of rows of `vector`.
similarity = cosine_similarity(vector)

In [19]:
# Square matrix: one row and one column per row of `vector`.
similarity.shape

(2145, 2145)

In [22]:
# For every song, store the indices of its most similar songs, best first.
# The song's own index is removed explicitly instead of by skipping position 0:
# when another song ties with it (e.g. identical tags, or an all-zero row) the
# song itself is not guaranteed to sort first, so slicing [1:51] could keep the
# song and drop a genuine neighbour.
N_RECOMMENDATIONS = 50
n_songs = similarity.shape[0]
# kind="stable" keeps tied songs in index order, so the saved file is reproducible.
ranked = np.argsort(-similarity, axis=1, kind="stable")
# Each row of `ranked` contains its own row number exactly once; mask it out.
not_self = ranked != np.arange(n_songs)[:, None]
sorted_indices = ranked[not_self].reshape(n_songs, max(n_songs - 1, 0))[:, :N_RECOMMENDATIONS]
np.save('similarity.npy', sorted_indices)

In [None]:
# Reload the array saved in similarity.npy and show its first row as a list.
arr = np.load('similarity.npy')
list(arr[0])

In [None]:
def recommend(song, top_n=50):
    """Return the songs most similar to the one titled ``song``.

    Reads the module-level ``dataframe`` and ``similarity``; row ``i`` of
    ``similarity`` is taken to describe the song at position ``i`` of
    ``dataframe``.

    Args:
        song: Exact title to look up. If several rows share the title, the
            first one is used.
        top_n: Maximum number of recommendations to return (default 50).

    Returns:
        list[dict]: One ``{column: value}`` dict per recommended song, most
        similar first. The queried song itself is never included.

    Raises:
        IndexError: If no row has the given title.
    """
    # Work with row positions (not index labels) because `similarity` and
    # `.iloc` are both positional.
    positions = [pos for pos, title in enumerate(dataframe['title']) if title == song]
    if not positions:
        raise IndexError(f"no song titled {song!r} in dataframe")
    index = positions[0]

    # Exclude the song itself by position rather than by dropping the first
    # sorted entry: with tied scores the song is not guaranteed to sort first.
    scored = [
        (pos, score)
        for pos, score in enumerate(similarity[index])
        if pos != index
    ]
    # list.sort is stable, so equally similar songs keep their dataframe order.
    scored.sort(key=lambda pair: -pair[1])
    return [dataframe.iloc[pos].to_dict() for pos, _ in scored[:top_n]]

In [None]:
# Smoke-test the recommender with the title of the first song.
r = recommend(dataframe['title'].iloc[0])

In [23]:
# Columns to persist for each song.
save_df = dataframe.loc[:, ['title', 'download_link', 'year']]

In [120]:
# Show the last rows of the frame that gets saved.
# NOTE(review): the stored output of this cell lists a `poster` column, but the
# selection that builds save_df does not include it — the output looks stale;
# re-run to confirm.
save_df.tail()

Unnamed: 0,title,download_link,poster,year
2140,Tere Vaaste Falak Se Main Chand Laoonga,https://www.youtube.com/watch?v=g5WZLO8BAC8,https://is1-ssl.mzstatic.com/image/thumb/Music...,2023
2141,Baby Tujhe Paap Lagega,https://www.youtube.com/watch?v=-DkUPoe0HCw,https://is1-ssl.mzstatic.com/image/thumb/Music...,2023
2142,Saanjha Jo Bhi Tha Tera Mera Saanjha,https://www.youtube.com/watch?v=XvktHUWSnqY,https://is1-ssl.mzstatic.com/image/thumb/Music...,2023
2143,Zinda Dili 2.0,https://www.youtube.com/watch?v=iRGEjMPxwKA,https://is2-ssl.mzstatic.com/image/thumb/Music...,2023
2144,Yeh Raat Hi Subah Bulayegi,https://www.youtube.com/watch?v=nSCg_MEfaQM,https://is5-ssl.mzstatic.com/image/thumb/Music...,2023


In [24]:
# Persist the selected columns to songs.pkl in the working directory.
save_df.to_pickle('songs.pkl')

In [25]:
# Round-trip check: reload songs.pkl and display it.
# (Only unpickle files you trust; unpickling can execute arbitrary code.)
df = pd.read_pickle('songs.pkl')
df

Unnamed: 0,title,download_link,year
0,Allah Teri Kya Shaan Hai,https://www.youtube.com/watch?v=rF7JYlu2mhM,2014
1,Yeh Zamin Yeh Aasman Roshni Se Nahaane Lage,https://www.youtube.com/watch?v=6av6kNVx-0I,2014
2,Mera Yaar Thanedaar,https://www.youtube.com/watch?v=djznDrQ05wA,2014
3,Eagle Sa Ego Hai,https://www.youtube.com/watch?v=mI5qs83q6Z8,2014
4,Aandhi Jaisi Raftar Chal Chala Chal (Chal Chal...,https://www.youtube.com/watch?v=8R1rs5dvqyM,2014
...,...,...,...
2140,Tere Vaaste Falak Se Main Chand Laoonga,https://www.youtube.com/watch?v=g5WZLO8BAC8,2023
2141,Baby Tujhe Paap Lagega,https://www.youtube.com/watch?v=-DkUPoe0HCw,2023
2142,Saanjha Jo Bhi Tha Tera Mera Saanjha,https://www.youtube.com/watch?v=XvktHUWSnqY,2023
2143,Zinda Dili 2.0,https://www.youtube.com/watch?v=iRGEjMPxwKA,2023


### User search recommendations

In [None]:
def join_words(words):
    """Turn a stringified list of names into a space-prefixed string.

    The names are kept as written (original case and inner spaces) and joined
    with single spaces.

    Args:
        words: Text holding a Python list literal, e.g. "['A B', 'C D']".

    Returns:
        str: " " followed by the space-joined names, or just " " when ``words``
        is empty, missing (None/NaN) or cannot be parsed. The result is always
        a string so it can be concatenated onto other text.
    """
    if not isinstance(words, str) or not words:
        # None, NaN and "" carry no names; return a string (not a list) so
        # that `+` with other text keeps working.
        return " "
    try:
        names = ast.literal_eval(words)
        return " " + " ".join(names)
    except Exception:
        # Best-effort: unparseable or non-string content contributes nothing.
        return " "

In [None]:
user_search = 'aaj na jaana'

# Searchable text per song: title, credited people (kept as written), album
# and year. The frame is created under the same name it is filled in with;
# creating it as `song_title` made the assignments below raise NameError on a
# fresh kernel.
song_titles = pd.DataFrame()
song_titles['title'] = dataframe['title']
song_titles['tags'] = (
    dataframe['title']
    + dataframe['singers'].apply(join_words)
    + dataframe['directors'].apply(join_words)
    + dataframe['lyricist'].apply(join_words)
    # `album` is not in the dropna subset; "" keeps the tags a string when it is missing.
    + " " + dataframe['album'].fillna("")
    + " " + dataframe['year'].astype(str)
)
print(song_titles.shape)
song_titles.tail()

In [None]:
# Export the per-song search text to search_similarity.npy.
# NOTE(review): if this array has object dtype, np.save stores it via pickle and
# np.load will need allow_pickle=True — confirm against whatever reads the file.
search_words = np.array(song_titles['tags'])
print(len(search_words))
np.save('search_similarity.npy', search_words)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Fit a TF-IDF model on the search text; `all_transform` holds one row per tag string.
vectorizer = TfidfVectorizer()
all_transform = vectorizer.fit_transform(song_titles['tags'])

In [None]:
# Inspect the fitted TF-IDF matrix.
all_transform

In [None]:
# Encode the user's query with the already-fitted vocabulary (transform only, no refit).
query_transform = vectorizer.transform([user_search])

In [None]:
# Cosine similarity of the single query row against every row of `all_transform`.
similar_songs = cosine_similarity(query_transform, all_transform)

In [None]:
# Row positions of the 50 best matches for the query, best first.
songs = np.argsort(-similar_songs[0])[:50].tolist()

### Homepage songs

In [None]:
# Homepage selection: the last 50 rows after sorting by `year` in ascending order.
# NOTE(review): this rebinds `songs`, which the search section uses for a list
# of row positions — consider a distinct name.
songs = dataframe.sort_values(by="year").tail(50)

In [None]:
# Display the homepage selection.
songs