In [37]:
import json
import pandas as pd
from collections import Counter
import ast
import numpy as np
import matplotlib.pyplot as plt
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
f = open('dataset/wiki_scraping/wiki_texts.json')
wiki_texts = json.load(f)

In [3]:
big_df = pd.read_csv('big_df.csv')

In [4]:
big_df.drop(columns=['Unnamed: 0'], inplace=True)

In [5]:
big_df['tknzd_lyrics'] = big_df['tknzd_lyrics'].apply(ast.literal_eval)
big_df['emotions'] = big_df['emotions'].apply(ast.literal_eval)
big_df['social_tags'] = big_df['social_tags'].apply(ast.literal_eval)

In [6]:
drop_rows = []
for row in big_df.iterrows():
    if row[1]['emotions'] == []:
        drop_rows.append(row[0])
big_df.drop(drop_rows, inplace=True)

In [114]:
vectorizer = TfidfVectorizer(max_df = 0.8, min_df = 10, norm='l2', stop_words = 'english')
X = vectorizer.fit_transform(big_df['tknzd_lyrics'].apply(lambda x: " ".join(x)))

In [63]:
index_to_word = {i:v for i, v in enumerate(vectorizer.get_feature_names_out())}
word_to_index = {v:i for i, v in enumerate(vectorizer.get_feature_names_out())}

In [64]:
with open('word_to_index.pkl', 'wb') as f:
    pickle.dump(word_to_index, f)
with open('index_to_word.pkl', 'wb') as f:
    pickle.dump(index_to_word, f)

In [69]:
with open('song_tf_idf.pickle', 'wb') as f:
    pickle.dump(X, f)

In [66]:
song_to_index = {s:i for i, s in enumerate(big_df['title'])}
index_to_song = {i:s for i, s in enumerate(big_df['title'])}

In [67]:
with open('song_to_index.pkl', 'wb') as f:
    pickle.dump(song_to_index, f)
with open('index_to_song.pkl', 'wb') as f:
    pickle.dump(index_to_song, f)

# wiki

In [90]:
from dataset import wiki_scraping

In [115]:
vec2 = vectorizer.transform(wiki_texts.keys())
vec2 = vec2.toarray()
X = X.toarray()

In [93]:
loc_to_index = {cty:i for i, cty in enumerate(wiki_texts.keys())}

In [116]:
def cos_sim(city, song):
    city_i = loc_to_index[city]
    song_i = song_to_index[song]
    city_vec = vec2[city_i, :]
    song_vec = X[song_i, :]
    denom = np.linalg.norm(city_vec) * np.linalg.norm(song_vec)
    num = city_vec @ song_vec
    return (num ) /  (denom )

In [126]:
best_matches = []
for song in song_to_index:
    sim = cos_sim("New York City", song)
    best_matches.append((song, sim))

  return (num ) /  (denom )


In [127]:
print("New York City Top Songs")
srtd = sorted(best_matches, key = lambda x: x[1], reverse=True)
for t in srtd[:10]:
    print("Song: ", t[0], "  Score: {:.6f}".format(t[1]))

New York City Top Songs
Song:  I Love NYC   Score: 0.897926
Song:  New York City Boy   Score: 0.754490
Song:  New York   Score: 0.750671
Song:  New York City Cops   Score: 0.699394
Song:  King Of New York   Score: 0.519181
Song:  New York Tendaberry   Score: 0.503772
Song:  Sex City   Score: 0.435709
Song:  Suffragette City   Score: 0.373751
Song:  City Of Love   Score: 0.357172
Song:  Moving To New York   Score: 0.347174


In [135]:
best_matches_london = []
for song in song_to_index:
    sim = cos_sim("London", song)
    best_matches_london.append((song, sim))

  return (num ) /  (denom )


In [136]:
print("London Top Songs")
srtd = sorted(best_matches_london, key = lambda x: x[1], reverse=True)
for t in srtd[:10]:
    print("Song: ", t[0], "  Score: {:.6f}".format(t[1]))

London Top Songs
Song:  London Bridge   Score: 0.839286
Song:  London Is The Place For Me   Score: 0.785101
Song:  London Calling   Score: 0.677222
Song:  London Bombs   Score: 0.644769
Song:  The Angel Pool   Score: 0.622987
Song:  Glamorous Glue   Score: 0.576287
Song:  London Loves   Score: 0.567062
Song:  London Skies   Score: 0.525954
Song:  London Leatherboys   Score: 0.465345
Song:  I Love London   Score: 0.434061


In [139]:
with open('wiki_tf_idf.pkl', 'wb') as f:
    pickle.dump(X, f)

In [143]:
X.sum(axis=0)

array([2.27070681, 4.7210389 , 3.65440345, ..., 5.10556192, 0.97849738,
       2.11076308])