In [1]:
import json
import pandas as pd
from collections import Counter
import ast
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text

In [2]:
# Load the scraped wiki texts: a mapping keyed by location name (e.g. 'Tel Aviv').
# The context manager closes the file handle once parsing is done, and the
# explicit encoding avoids depending on the platform default.
with open('backend/dataset/wiki_scraping/wiki_texts.json', encoding='utf-8') as f:
    wiki_texts = json.load(f)

In [3]:
# Load the song table; later cells use its 'title', 'tknzd_lyrics', 'emotions'
# and 'social_tags' columns.
big_df = pd.read_csv('backend/dataset/big_df_edited.csv')

In [4]:
# Remove the 'Unnamed: 0' column (presumably the index saved with the CSV — confirm).
# Rebinding with errors='ignore' instead of inplace=True keeps this cell safe to
# re-run: a second execution no longer raises KeyError for the missing column.
big_df = big_df.drop(columns=['Unnamed: 0'], errors='ignore')

In [5]:
# These columns are stored in the CSV as Python-literal strings; parse each cell
# back into the object it represents.
for literal_col in ('tknzd_lyrics', 'emotions', 'social_tags'):
    big_df[literal_col] = big_df[literal_col].apply(ast.literal_eval)

In [7]:
# Shape of the table before rows with an empty 'emotions' list are removed.
big_df.shape

(34462, 14)

In [8]:
# Collect the index labels of rows whose 'emotions' list is empty, then drop them.
drop_rows = [label for label, emotions in big_df['emotions'].items() if emotions == []]
big_df.drop(drop_rows, inplace=True)

In [9]:
# Shape after the empty-'emotions' filter; compare with the earlier shape to see
# how many rows were removed.
big_df.shape

(34462, 14)

In [25]:
# Optional custom stop words (currently disabled):
# my_stop_words = text.ENGLISH_STOP_WORDS.union({'city'})
# To enable, pass stop_words=list(my_stop_words) to TfidfVectorizer below.

# Re-join each song's token list into one space-separated document.
lyric_docs = big_df['tknzd_lyrics'].apply(" ".join)

# Unigrams and bigrams, L2-normalised rows; terms appearing in more than 80% of
# the documents or in fewer than 10 documents are excluded from the vocabulary.
vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=10, max_df=0.8, norm='l2')
X = vectorizer.fit_transform(lyric_docs)

# Persist the fitted vectorizer so the same vocabulary and weights can be reused.
with open('vectorizer.pkl', 'wb') as out_file:
    pickle.dump(vectorizer, out_file)

In [27]:
# Two-way lookup between TF-IDF column positions and their vocabulary terms.
feature_names = vectorizer.get_feature_names_out()
index_to_word = dict(enumerate(feature_names))
word_to_index = {term: column for column, term in enumerate(feature_names)}

In [28]:
# Persist both vocabulary lookups, one pickle file per mapping.
for pkl_name, mapping in (('word_to_index.pkl', word_to_index),
                          ('index_to_word.pkl', index_to_word)):
    with open(pkl_name, 'wb') as out_file:
        pickle.dump(mapping, out_file)

In [29]:
# Persist the song TF-IDF matrix exactly as fit_transform returned it.
with open('song_tf_idf.pkl', 'wb') as out_file:
    pickle.dump(X, out_file)

In [30]:
# Two-way lookup between song titles and their positional row in big_df / X.
# Note: if two rows share a title, song_to_index keeps only the later position,
# while index_to_song still has an entry for every row.
titles = list(big_df['title'])
song_to_index = {title: position for position, title in enumerate(titles)}
index_to_song = dict(enumerate(titles))

In [31]:
# Persist both song lookups, one pickle file per mapping.
for pkl_name, mapping in (('song_to_index.pkl', song_to_index),
                          ('index_to_song.pkl', index_to_song)):
    with open(pkl_name, 'wb') as out_file:
        pickle.dump(mapping, out_file)

# Wiki: score songs against each location's wiki text

In [12]:
# NOTE(review): this import raises ImportError in the recorded output, and
# `wiki_scraping` is not referenced by any cell shown here — candidate for removal.
from dataset import wiki_scraping

ImportError: cannot import name 'wiki_scraping' from 'dataset' (unknown location)

In [26]:
# Sanity check: length of the 'Tel Aviv' entry in the loaded wiki texts.
print(len(wiki_texts['Tel Aviv']))

18721


In [32]:
# One document per location: join each wiki_texts value into a single
# space-separated string, keeping the dictionary's order.
wiki_corpus = [" ".join(pieces) for pieces in wiki_texts.values()]

In [33]:
# Project the wiki documents into the TF-IDF space already fitted on the lyrics
# (transform, not fit_transform), so vec2 and X share the same columns.
wiki_tfidf = vectorizer.transform(wiki_corpus)
with open('wiki_tf_idf.pkl', 'wb') as f:
    pickle.dump(wiki_tfidf, f)  # saved in the form returned by transform()

# cos_sim indexes rows of vec2 and X as dense arrays.
vec2 = wiki_tfidf.toarray()
# Guard the conversion so re-running this cell does not fail once X is already
# an ndarray (ndarrays have no .toarray()).
# NOTE(review): the dense X is large — with the (34462, 66969) shape printed in
# a later cell it is roughly 18 GB if the dtype is float64; confirm memory allows it.
if hasattr(X, 'toarray'):
    X = X.toarray()

In [34]:
# Sum of TF-IDF weights per wiki document (one value per row of vec2).
vec2.sum(axis=1)

array([15.95100142, 24.40802229, 24.52448814, 14.18638521, 12.89577233,
       15.79102557, 14.85266237,  4.29339868, 15.76406253, 17.6947217 ,
       16.16169354, 14.74885545,  3.4298487 , 25.06943079, 13.47167073,
       14.91462377, 15.10871523, 14.93484307,  2.96911613, 14.49839891,
       14.30121627, 12.12873148,  3.66473977, 16.59888398, 14.53352165,
       19.90849696, 14.96905919, 21.60481588, 17.0440095 , 17.17163077,
       13.18533379, 20.7252089 , 22.18181566, 18.59858504, 21.9349362 ,
       21.1844326 , 21.86923404, 22.5060142 , 20.18377731, 23.4091524 ,
       22.73092547, 22.40757845, 16.2917886 , 12.55032311, 20.50205988,
       24.00794558, 17.38231169])

In [35]:
# Both matrices come from the same fitted vectorizer, so their column counts
# should match; the row counts are wiki documents and songs respectively.
print(vec2.shape)
print(X.shape)

(47, 66969)
(34462, 66969)


In [36]:
# Peek at the dense TF-IDF row of the wiki document at position 2.
vec2[2,:]

array([0., 0., 0., ..., 0., 0., 0.])

In [37]:
# Map each location name to its row in vec2. wiki_corpus was built from
# wiki_texts in the same order, so position i here is row i there.
loc_to_index = {location: position for position, location in enumerate(wiki_texts)}

In [38]:
def cos_sim(city, song):
    """Cosine similarity between a city's wiki vector and a song's lyrics vector.

    Parameters
    ----------
    city : hashable
        Key of ``loc_to_index``; selects a row of ``vec2``.
    song : hashable
        Key of ``song_to_index``; selects a row of ``X``.

    Returns
    -------
    float
        ``(city_vec @ song_vec) / (||city_vec|| * ||song_vec||)``, or ``0.0``
        when either vector has zero norm.

    Raises
    ------
    KeyError
        If ``city`` or ``song`` is missing from its lookup table.
    """
    city_vec = vec2[loc_to_index[city], :]
    song_vec = X[song_to_index[song], :]
    denom = np.linalg.norm(city_vec) * np.linalg.norm(song_vec)
    # An all-zero vector would give 0/0 = NaN (plus a RuntimeWarning), and NaN
    # scores cannot be ordered meaningfully; report "no similarity" instead.
    if denom == 0:
        return 0.0
    return (city_vec @ song_vec) / denom

In [39]:
# Check the shape of a single row of vec2.
vec2[2,:].shape

(66969,)

In [40]:
def best_songs_for_city(city, top_n=10):
    """Print the songs most similar to ``city``, best match first.

    Every title in ``song_to_index`` is scored with ``cos_sim``; the ``top_n``
    highest-scoring songs are printed one per line.

    Parameters
    ----------
    city : hashable
        Location key understood by ``cos_sim``.
    top_n : int, optional
        Number of songs to print (default 10, the previous fixed value).

    Returns
    -------
    None
    """
    scored = []
    for song in song_to_index:
        sim = cos_sim(city, song)
        # A NaN score (e.g. from a 0/0 division on an all-zero vector) breaks
        # the ordering of a sort; treat it as "no similarity".
        if np.isnan(sim):
            sim = 0.0
        scored.append((song, sim))
    # Stable sort: songs with equal scores keep their song_to_index order.
    scored.sort(key=lambda pair: pair[1], reverse=True)
    for title, score in scored[:top_n]:
        print("Song: ", title, "  Score: {:.3f}".format(score))

In [41]:
# Print the top-ranked songs for New York City.
best_songs_for_city("New York City")

  return (num ) /  (denom )


Song:  I Love NYC   Score: 0.646
Song:  New York City Boy   Score: 0.634
Song:  New York City Cops   Score: 0.546
Song:  New York   Score: 0.537
Song:  King Of New York   Score: 0.396
Song:  The Incumbent   Score: 0.320
Song:  New York Slave   Score: 0.289
Song:  Cocaine In My Brain   Score: 0.256
Song:  Stranger Into Starman   Score: 0.251
Song:  Colour Green   Score: 0.236


In [42]:
# Print the top-ranked songs for London.
best_songs_for_city("London")

  return (num ) /  (denom )


Song:  London Bridge   Score: 0.478
Song:  London Calling   Score: 0.451
Song:  London Loves   Score: 0.311
Song:  Glamorous Glue   Score: 0.282
Song:  London Leatherboys   Score: 0.273
Song:  Streets of London   Score: 0.236
Song:  I Love London   Score: 0.221
Song:  Sweet Thames Flow Softly   Score: 0.175
Song:  Street Fighting Man   Score: 0.155
Song:  In the City   Score: 0.154


In [34]:
# Print the top-ranked songs for Tel Aviv.
best_songs_for_city("Tel Aviv")

  return (num ) /  (denom )


Song:  Victime de la mode   Score: 0.259
Song:  Redrum   Score: 0.209
Song:  Red Velvet   Score: 0.126
Song:  The Ballot or the Bullet   Score: 0.055
Song:  Hitten   Score: 0.040
Song:  100%   Score: 0.039
Song:  Oh Jerusalem   Score: 0.037
Song:  Poster Princess   Score: 0.037
Song:  Home Life   Score: 0.035
Song:  Intro   Score: 0.035


In [35]:
# Print the top-ranked songs for Tokyo.
best_songs_for_city("Tokyo")

  return (num ) /  (denom )


Song:  Tokyo Witch   Score: 0.280
Song:  Panda Bear   Score: 0.163
Song:  Never Ending Summer   Score: 0.142
Song:  Award Tour   Score: 0.104
Song:  Back 4 U   Score: 0.075
Song:  Bodhisattva   Score: 0.072
Song:  Impossible Germany   Score: 0.063
Song:  Harajuku Girls   Score: 0.059
Song:  Da Joint   Score: 0.055
Song:  Losing My Edge   Score: 0.050


In [36]:
# Print the top-ranked songs for Mumbai.
best_songs_for_city("Mumbai")

  return (num ) /  (denom )


Song:  Indian Girl   Score: 0.158
Song:  Citysong   Score: 0.129
Song:  The Ballot or the Bullet   Score: 0.124
Song:  Fireworks   Score: 0.112
Song:  30 Century Man   Score: 0.107
Song:  21st Century Digital Boy   Score: 0.088
Song:  Twentieth Century Fox   Score: 0.084
Song:  Suburban Home   Score: 0.083
Song:  The Kids   Score: 0.082
Song:  Flux   Score: 0.080


In [31]:
# Print the top-ranked songs for Seoul.
best_songs_for_city("Seoul")

  return (num ) /  (denom )


Song:  City of Night   Score: 0.263
Song:  City Of Love   Score: 0.263
Song:  Sex City   Score: 0.250
Song:  Thin Blue Flame   Score: 0.237
Song:  1% Of One   Score: 0.222
Song:  City of Devils   Score: 0.221
Song:  The Best of Times   Score: 0.221
Song:  We Built This City   Score: 0.219
Song:  The Buzz Kill   Score: 0.219
Song:  An Erotic Alchemy   Score: 0.217


In [33]:
# Persist the location-name -> vec2-row lookup.
with open('loc_to_index.pkl', 'wb') as out_file:
    pickle.dump(loc_to_index, out_file)