In [261]:
import json
import pandas as pd
from collections import Counter
import ast
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text

In [2]:
f = open('dataset/wiki_scraping/wiki_texts.json')
wiki_texts = json.load(f)

In [3]:
big_df = pd.read_csv('big_df.csv')

In [4]:
big_df.drop(columns=['Unnamed: 0'], inplace=True)

In [5]:
big_df['tknzd_lyrics'] = big_df['tknzd_lyrics'].apply(ast.literal_eval)
big_df['emotions'] = big_df['emotions'].apply(ast.literal_eval)
big_df['social_tags'] = big_df['social_tags'].apply(ast.literal_eval)

In [6]:
drop_rows = []
for row in big_df.iterrows():
    if row[1]['emotions'] == []:
        drop_rows.append(row[0])
big_df.drop(drop_rows, inplace=True)

In [262]:
my_stop_words = text.ENGLISH_STOP_WORDS.union({'city'})
vectorizer = TfidfVectorizer(max_df = 0.8, min_df = 10, norm='l2', stop_words = my_stop_words)
X = vectorizer.fit_transform(big_df['tknzd_lyrics'].apply(lambda x: " ".join(x)))

In [263]:
index_to_word = {i:v for i, v in enumerate(vectorizer.get_feature_names_out())}
word_to_index = {v:i for i, v in enumerate(vectorizer.get_feature_names_out())}

In [64]:
with open('word_to_index.pkl', 'wb') as f:
    pickle.dump(word_to_index, f)
with open('index_to_word.pkl', 'wb') as f:
    pickle.dump(index_to_word, f)

In [281]:
with open('song_tf_idf.pickle', 'wb') as f:
    pickle.dump(X, f)

In [264]:
song_to_index = {s:i for i, s in enumerate(big_df['title'])}
index_to_song = {i:s for i, s in enumerate(big_df['title'])}

In [67]:
with open('song_to_index.pkl', 'wb') as f:
    pickle.dump(song_to_index, f)
with open('index_to_song.pkl', 'wb') as f:
    pickle.dump(index_to_song, f)

# wiki

In [90]:
from dataset import wiki_scraping

In [265]:
wiki_corpus = []
for ls in list(wiki_texts.values()):
    wiki_corpus.append(" ".join(ls))

In [266]:
vec2 = vectorizer.transform(wiki_corpus)
vec2 = vec2.toarray()
X = X.toarray()

In [267]:
vec2.sum(axis=1)

array([12.7484876 , 21.72531696,  7.906622  , 17.22400158, 10.24045599,
       19.0786044 ])

In [247]:
print(vec2.shape)
print(X.shape)

(6, 13785)
(34554, 13785)


In [248]:
vec2[2,:]

array([0., 0., 0., ..., 0., 0., 0.])

In [271]:
X=X.toarray()
X[2,:]

array([0., 0., 0., ..., 0., 0., 0.])

In [268]:
loc_to_index = {cty:i for i, cty in enumerate(wiki_texts.keys())}

In [251]:
def cos_sim(city, song):
    city_i = loc_to_index[city]
    song_i = song_to_index[song]
    city_vec = vec2[city_i, :]
    song_vec = X[song_i, :]
    denom = np.linalg.norm(city_vec) * np.linalg.norm(song_vec)
    num = city_vec @ song_vec
    return (num ) /  (denom )

In [252]:
vec2[2,:].shape

(13785,)

In [269]:
def best_songs_for_city(city):
    best = []
    for song in song_to_index:
        sim = cos_sim(city, song)
        best.append((song, sim))
    srtd = sorted(best, key = lambda x: x[1], reverse=True)
    for t in srtd[:10]:
        print("Song: ", t[0], "  Score: {:.3f}".format(t[1]))

In [272]:
best_songs_for_city("New York City")

  return (num ) /  (denom )


Song:  I Love NYC   Score: 0.707
Song:  New York   Score: 0.686
Song:  King Of New York   Score: 0.517
Song:  New York City Cops   Score: 0.512
Song:  Stranger Into Starman   Score: 0.333
Song:  Cocaine In My Brain   Score: 0.319
Song:  Feeling Good   Score: 0.298
Song:  The World I Know   Score: 0.239
Song:  New Noise   Score: 0.214
Song:  Brand New Day   Score: 0.189


In [273]:
best_songs_for_city("London")

  return (num ) /  (denom )


Song:  London Bridge   Score: 0.761
Song:  London Is The Place For Me   Score: 0.727
Song:  London Calling   Score: 0.617
Song:  Glamorous Glue   Score: 0.523
Song:  London Loves   Score: 0.514
Song:  Street Fighting Man   Score: 0.280
Song:  Your Embrace   Score: 0.225
Song:  The Vanishing   Score: 0.185
Song:  Round Here   Score: 0.149
Song:  Delaney   Score: 0.143


In [274]:
best_songs_for_city("Toronto")

  return (num ) /  (denom )


Song:  I Love NYC   Score: 0.180
Song:  North American Scum   Score: 0.168
Song:  North To Alaska   Score: 0.165
Song:  New York   Score: 0.158
Song:  King Of New York   Score: 0.138
Song:  The Ballot or the Bullet   Score: 0.133
Song:  New York City Cops   Score: 0.122
Song:  Looking for Nothing   Score: 0.119
Song:  Shadowplay   Score: 0.118
Song:  Downtown   Score: 0.118


In [275]:
best_songs_for_city("Tokyo")

  return (num ) /  (denom )


Song:  Tokyo Witch   Score: 0.280
Song:  Panda Bear   Score: 0.163
Song:  Never Ending Summer   Score: 0.142
Song:  Award Tour   Score: 0.104
Song:  Back 4 U   Score: 0.075
Song:  Bodhisattva   Score: 0.072
Song:  Impossible Germany   Score: 0.063
Song:  Harajuku Girls   Score: 0.059
Song:  Da Joint   Score: 0.055
Song:  Losing My Edge   Score: 0.050


In [276]:
best_songs_for_city("Mumbai")

  return (num ) /  (denom )


Song:  Indian Girl   Score: 0.158
Song:  Citysong   Score: 0.129
Song:  The Ballot or the Bullet   Score: 0.124
Song:  Fireworks   Score: 0.112
Song:  30 Century Man   Score: 0.107
Song:  21st Century Digital Boy   Score: 0.088
Song:  Twentieth Century Fox   Score: 0.084
Song:  Suburban Home   Score: 0.083
Song:  The Kids   Score: 0.082
Song:  Flux   Score: 0.080


In [277]:
best_songs_for_city("Budapest")

  return (num ) /  (denom )


Song:  Island Home   Score: 0.174
Song:  Total Life Forever   Score: 0.164
Song:  The Ballot or the Bullet   Score: 0.150
Song:  Rock Island Line   Score: 0.140
Song:  Square Biz   Score: 0.139
Song:  Square Dance   Score: 0.126
Song:  Jackson Square   Score: 0.125
Song:  Island   Score: 0.122
Song:  30 Century Man   Score: 0.117
Song:  21st Century Life   Score: 0.114


In [280]:
with open('wiki_tf_idf.pkl', 'wb') as f:
    pickle.dump(vec2, f)

In [278]:
big_df.loc[big_df['title']=='Island Home']

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,song_id,emotions,social_tags,tknzd_lyrics
27228,Island Home,pop,Christine Anu,1995,1286,{},Six years ive been in the city\nAnd everynight...,854752,TRLWWET128F422729C,"[(intense, 50)]","[(90s, 100), (australian, 100), (chillout, 50)...","[six, years, ive, been, in, the, city, and, ev..."


In [282]:
print(len(wiki_texts['Toronto']))

12329


In [283]:
wiki_texts.keys()

dict_keys(['New York City', 'Budapest', 'Tokyo', 'Mumbai', 'London', 'Toronto'])