In [32]:
# Standard library
from collections import defaultdict

# Third-party
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import wordnet

# NLTK resources used below: the punkt tokenizer models plus the WordNet
# corpus and its multilingual companion (omw-1.4). Fetched in this order.
for nltk_resource in ('punkt', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\yukwa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\yukwa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\yukwa\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [33]:
# Locations of the two lookup tables, relative to the notebook's working directory
languages_csv = 'data/extracted_languages.csv'
genres_csv = 'data/extracted_superset_genres.csv'

# In both files the first CSV column is used as the row index
languages_df = pd.read_csv(languages_csv, index_col=0)
genres_df = pd.read_csv(genres_csv, index_col=0)


In [34]:
# Preview the first rows of the language table (columns: language_iso, language_name).
languages_df.head()

Unnamed: 0,language_iso,language_name
0,en,English
1,de,German
2,ja,Japanese
3,da,Danish
4,sl,Slovenian


In [35]:
# Preview the first rows of the genre table; its single data column is named '0'.
genres_df.head()

Unnamed: 0,0
0,adult standards
1,merseybeat
2,british invasion
3,brill building pop
4,rockabilly


In [36]:
# Pull the two lookup columns out as plain NumPy arrays for iteration.
# The genre CSV's only data column is literally named '0'.
language_array = languages_df.loc[:, 'language_name'].to_numpy()
genres_array = genres_df.loc[:, '0'].to_numpy()

In [37]:
# Ask the user for a free-text description of the playlist they want,
# then split it into word and punctuation tokens with NLTK.
playlist_prompt = "What would you like your playlist to convey?"
sentence = input(playlist_prompt)
tokens = nltk.word_tokenize(sentence)

In [38]:
# Show the token list produced by nltk.word_tokenize for the entered sentence.
tokens

['I', 'am', 'so', 'excited', 'today', '.']

In [39]:
# For each token that WordNet recognises, print (at most) its first two synsets.
# Tokens without any synset, such as punctuation, are skipped.
for token in tokens:
    token_senses = wordnet.synsets(token)
    if token_senses:
        print(token_senses[:2])


[Synset('iodine.n.01'), Synset('one.n.01')]
[Synset('americium.n.01'), Synset('master_of_arts.n.01')]
[Synset('sol.n.03'), Synset('so.r.01')]
[Synset('excite.v.01'), Synset('stimulate.v.01')]
[Synset('today.n.01'), Synset('today.n.02')]


In [40]:
# Numeric attribute names that each token sense is compared against.
num_attributes = [
    "danceability", "energy", "loudness", "mode", "speechiness",
    "acousticness", "instrumentalness", "liveness", "valence", "tempo",
]
# Categorical attribute names; declared here but not used by this cell.
cat_attributes = ["genre", "lang", "key"]

# A token's WordNet senses do not depend on the attribute being scored, so
# collect them once (token order, then sense order) instead of querying
# WordNet again for every attribute.
sentence_synsets = [
    token_synset
    for token in tokens
    for token_synset in wordnet.synsets(token)
]

# attribute name -> list of (token sense, best path similarity to any sense
# of the attribute). An attribute with no WordNet synsets leaves every score at 0.
attribute_scores = {}
for attribute in num_attributes:
    attribute_synsets = wordnet.synsets(attribute)
    scored_senses = []
    for token_synset in sentence_synsets:
        max_score = 0
        for attr_synset in attribute_synsets:
            similarity_score = token_synset.path_similarity(attr_synset)
            # Falsy results (e.g. None when the senses cannot be compared) are ignored
            if similarity_score and similarity_score > max_score:
                max_score = similarity_score
        scored_senses.append((token_synset, max_score))
    attribute_scores[attribute] = scored_senses

attribute_scores


{'danceability': [(Synset('iodine.n.01'), 0),
  (Synset('one.n.01'), 0),
  (Synset('i.n.03'), 0),
  (Synset('one.s.01'), 0),
  (Synset('americium.n.01'), 0),
  (Synset('master_of_arts.n.01'), 0),
  (Synset('amplitude_modulation.n.01'), 0),
  (Synset('be.v.01'), 0),
  (Synset('be.v.02'), 0),
  (Synset('be.v.03'), 0),
  (Synset('exist.v.01'), 0),
  (Synset('be.v.05'), 0),
  (Synset('equal.v.01'), 0),
  (Synset('constitute.v.01'), 0),
  (Synset('be.v.08'), 0),
  (Synset('embody.v.02'), 0),
  (Synset('be.v.10'), 0),
  (Synset('be.v.11'), 0),
  (Synset('be.v.12'), 0),
  (Synset('cost.v.01'), 0),
  (Synset('sol.n.03'), 0),
  (Synset('so.r.01'), 0),
  (Synset('so.r.02'), 0),
  (Synset('so.r.03'), 0),
  (Synset('so.r.04'), 0),
  (Synset('so.r.05'), 0),
  (Synset('thus.r.02'), 0),
  (Synset('so.r.07'), 0),
  (Synset('then.r.01'), 0),
  (Synset('therefore.r.01'), 0),
  (Synset('indeed.r.01'), 0),
  (Synset('excite.v.01'), 0),
  (Synset('stimulate.v.01'), 0),
  (Synset('stimulate.v.06'), 0),
  (S

In [41]:
# Rank genres by how closely their WordNet senses match the words in the sentence.
# For every (token, genre) pair, the best path similarity over all sense pairs is
# added to the genre's running total. A pair with no comparable senses adds -1,
# so a genre with no WordNet synsets collects -1 for every token.
attribute_scores = defaultdict(int)

# A genre's synsets do not depend on the token being scored, so each genre is
# looked up in WordNet only the first time it is needed and reused afterwards
# instead of being re-queried once per token.
genre_synsets_cache = {}

for token in tokens:
    token_synsets = wordnet.synsets(token)

    for genre in genres_array:
        if genre not in genre_synsets_cache:
            genre_synsets_cache[genre] = wordnet.synsets(genre)
        genre_synsets = genre_synsets_cache[genre]

        max_score = -1  # Sentinel lower than any similarity score accepted below

        for token_synset in token_synsets:
            for genre_synset in genre_synsets:
                similarity_score = token_synset.path_similarity(genre_synset)
                # Falsy results (e.g. None when the senses cannot be compared) are ignored
                if similarity_score and similarity_score > max_score:
                    max_score = similarity_score

        attribute_scores[genre] += max_score

# Find the top 5 genres based on aggregate similarity scores
# (ties keep the order in which genres appear in genres_array, since sorted() is stable)
top_5_genres = sorted(attribute_scores, key=attribute_scores.get, reverse=True)[:5]
print("Top 5 genres:", top_5_genres)

Top 5 genres: ['metal', 'beats', 'experimental', 'classical', 'pop']


In [42]:
# Destination of the genre recommendation, relative to the working directory
output_file = 'data/nlp_genre.txt'

# Store the list as its Python string form, e.g. "['metal', 'beats', ...]"
with open(output_file, mode='w') as genre_file:
    genre_file.write(str(top_5_genres))

In [44]:
from langdetect import DetectorFactory, detect

# langdetect is randomised by default; fixing the seed makes the fallback
# detection give the same answer on every run.
DetectorFactory.seed = 0

# Languages the user named explicitly in the sentence (case-sensitive substring
# match on language_name). The columns are iterated directly rather than indexed
# by position, so the result does not depend on the frame's index labels, and
# non-string names (e.g. missing values) are skipped.
mentioned_languages = [
    (iso_code, name)
    for iso_code, name in zip(languages_df['language_iso'], languages_df['language_name'])
    if isinstance(name, str) and name in sentence
]

if mentioned_languages:
    for _, name in mentioned_languages:
        print(name)
    # Always define `language`, using the ISO code of the first language named,
    # so it has the same form as the codes detect() returns below.
    language = mentioned_languages[0][0]
else:
    # No language was named: fall back to detecting the language of the sentence itself.
    language = detect(sentence)
    print(language)

en


In [45]:
# Destination of the language choice, relative to the working directory
output_file = 'data/nlp_language.txt'

# Save the value of `language` as plain text
with open(output_file, mode='w') as language_file:
    language_file.write(language)