In [1]:
import json
import pandas as pd
from collections import Counter
import ast
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
%cd /content/drive/MyDrive/apollocate/4300-Final-Project-2023/

/content/drive/MyDrive/apollocate/4300-Final-Project-2023


In [3]:
f = open('backend/dataset/wiki_scraping/wiki_texts.json')
wiki_texts = json.load(f)

In [4]:
big_df = pd.read_csv('backend/dataset/big_df_edited.csv')

In [5]:
big_df.drop(columns=['Unnamed: 0'], inplace=True)

In [6]:
big_df['tknzd_lyrics'] = big_df['tknzd_lyrics'].apply(ast.literal_eval)
big_df['emotions'] = big_df['emotions'].apply(ast.literal_eval)
big_df['social_tags'] = big_df['social_tags'].apply(ast.literal_eval)

In [11]:
drop_rows = []
for row in big_df.iterrows():
    if row[1]['emotions'] == []:
        drop_rows.append(row[0])
big_df.drop(drop_rows, inplace=True)

In [13]:
# my_stop_words = text.ENGLISH_STOP_WORDS.union({'city'})
vectorizer = TfidfVectorizer(max_df = 0.8, min_df = 10, norm='l2', stop_words = list(my_stop_words))
X = vectorizer.fit_transform(big_df['tknzd_lyrics'].apply(lambda x: " ".join(x)))

In [14]:
with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

In [15]:
index_to_word = {i:v for i, v in enumerate(vectorizer.get_feature_names_out())}
word_to_index = {v:i for i, v in enumerate(vectorizer.get_feature_names_out())}

In [16]:
with open('word_to_index.pkl', 'wb') as f:
    pickle.dump(word_to_index, f)
with open('index_to_word.pkl', 'wb') as f:
    pickle.dump(index_to_word, f)

In [17]:
with open('song_tf_idf.pickle', 'wb') as f:
    pickle.dump(X, f)

In [18]:
song_to_index = {s:i for i, s in enumerate(big_df['title'])}
index_to_song = {i:s for i, s in enumerate(big_df['title'])}

In [19]:
with open('song_to_index.pkl', 'wb') as f:
    pickle.dump(song_to_index, f)
with open('index_to_song.pkl', 'wb') as f:
    pickle.dump(index_to_song, f)

# wiki

In [20]:
from dataset import wiki_scraping

In [22]:
print(len(wiki_texts['Tel Aviv']))

11191


In [23]:
wiki_corpus = []
for ls in list(wiki_texts.values()):
    wiki_corpus.append(" ".join(ls))

In [24]:
vec2 = vectorizer.transform(wiki_corpus)
vec2 = vec2.toarray()
X = X.toarray()

In [25]:
vec2.sum(axis=1)

array([12.76056599, 21.72531696,  8.19793084,  9.00100552,  9.32252566,
       10.03844417,  9.76060739,  3.37538411, 10.61707356, 11.88622367,
       10.46762739,  9.5804043 ,  2.92019587, 19.97348177,  9.74265325,
        9.45624746,  9.24074649,  9.27382606,  2.52462106, 10.36629091,
        8.93393302,  9.10392665,  2.92501758, 11.13005259,  9.17498756,
        8.83580635,  9.01964941, 16.56791844, 13.17944882, 11.22363697,
        7.93065913, 15.53027504, 18.02373864, 13.92065519, 16.39143812,
       17.22400158, 17.95251449, 18.12331534, 13.88649701, 20.67482374,
       17.43832817, 20.20093963, 10.28426069,  9.28731319, 10.50562482,
       19.59373312,  9.87494007])

In [26]:
print(vec2.shape)
print(X.shape)

(47, 13784)
(34554, 13784)


In [27]:
vec2[2,:]

array([0., 0., 0., ..., 0., 0., 0.])

In [28]:
loc_to_index = {cty:i for i, cty in enumerate(wiki_texts.keys())}

In [29]:
def cos_sim(city, song):
    city_i = loc_to_index[city]
    song_i = song_to_index[song]
    city_vec = vec2[city_i, :]
    song_vec = X[song_i, :]
    denom = np.linalg.norm(city_vec) * np.linalg.norm(song_vec)
    num = city_vec @ song_vec
    return (num ) /  (denom )

In [30]:
vec2[2,:].shape

(13784,)

In [31]:
def best_songs_for_city(city):
    best = []
    for song in song_to_index:
        sim = cos_sim(city, song)
        best.append((song, sim))
    srtd = sorted(best, key = lambda x: x[1], reverse=True)
    for t in srtd[:10]:
        print("Song: ", t[0], "  Score: {:.3f}".format(t[1]))

In [32]:
best_songs_for_city("New York City")

  return (num ) /  (denom )


Song:  I Love NYC   Score: 0.707
Song:  New York   Score: 0.685
Song:  King Of New York   Score: 0.517
Song:  New York City Cops   Score: 0.512
Song:  The Incumbent   Score: 0.430
Song:  Stranger Into Starman   Score: 0.333
Song:  Cocaine In My Brain   Score: 0.318
Song:  Feeling Good   Score: 0.298
Song:  The World I Know   Score: 0.240
Song:  New Noise   Score: 0.214


In [33]:
best_songs_for_city("London")

  return (num ) /  (denom )


Song:  London Bridge   Score: 0.762
Song:  London Is The Place For Me   Score: 0.727
Song:  London Calling   Score: 0.618
Song:  Glamorous Glue   Score: 0.523
Song:  London Loves   Score: 0.514
Song:  Street Fighting Man   Score: 0.281
Song:  Your Embrace   Score: 0.225
Song:  The Vanishing   Score: 0.185
Song:  Round Here   Score: 0.150
Song:  Delaney   Score: 0.143


In [34]:
best_songs_for_city("Tel Aviv")

  return (num ) /  (denom )


Song:  Victime de la mode   Score: 0.259
Song:  Redrum   Score: 0.209
Song:  Red Velvet   Score: 0.126
Song:  The Ballot or the Bullet   Score: 0.055
Song:  Hitten   Score: 0.040
Song:  100%   Score: 0.039
Song:  Oh Jerusalem   Score: 0.037
Song:  Poster Princess   Score: 0.037
Song:  Home Life   Score: 0.035
Song:  Intro   Score: 0.035


In [35]:
best_songs_for_city("Tokyo")

  return (num ) /  (denom )


Song:  Tokyo Witch   Score: 0.280
Song:  Panda Bear   Score: 0.163
Song:  Never Ending Summer   Score: 0.142
Song:  Award Tour   Score: 0.104
Song:  Back 4 U   Score: 0.075
Song:  Bodhisattva   Score: 0.072
Song:  Impossible Germany   Score: 0.063
Song:  Harajuku Girls   Score: 0.059
Song:  Da Joint   Score: 0.055
Song:  Losing My Edge   Score: 0.050


In [36]:
best_songs_for_city("Mumbai")

  return (num ) /  (denom )


Song:  Indian Girl   Score: 0.158
Song:  Citysong   Score: 0.129
Song:  The Ballot or the Bullet   Score: 0.124
Song:  Fireworks   Score: 0.112
Song:  30 Century Man   Score: 0.107
Song:  21st Century Digital Boy   Score: 0.088
Song:  Twentieth Century Fox   Score: 0.084
Song:  Suburban Home   Score: 0.083
Song:  The Kids   Score: 0.082
Song:  Flux   Score: 0.080


In [36]:
best_songs_for_city("Seoul")

  return (num ) /  (denom )


Song:  The Ballot or the Bullet   Score: 0.132
Song:  Luton To Lisbon   Score: 0.130
Song:  The World Is Mine   Score: 0.114
Song:  Beechwood Park   Score: 0.110
Song:  We Built This City   Score: 0.100
Song:  Running The World   Score: 0.099
Song:  Da Joint   Score: 0.098
Song:  DVNO   Score: 0.095
Song:  The Hemp Museum   Score: 0.095
Song:  This Is How We Do It   Score: 0.092


In [35]:
with open('wiki_tf_idf.pkl', 'wb') as f:
    pickle.dump(vec2, f)

In [39]:
big_df.loc[big_df['title']=='DVNO']

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,song_id,emotions,social_tags,tknzd_lyrics
1779,DVNO,rock,Justice,2007,30010,"{""Mehdi Pinson""}",[Verse 1]\nIt's always the same\nAlways ashame...,100497,TRNPKRK128F429831C,"[(party, 13), (energetic, 1), (fun, 1), (sexy,...","[(electronic, 100), (dance, 67), (electro, 61)...","[its, always, the, same, always, ashamed, stor..."


In [2]:
vectorizer

NameError: name 'vectorizer' is not defined