In [1]:
from bs4 import BeautifulSoup
import requests

In [2]:
jazz_musicians = ['Charlie_Parker',
                  'Dizzy_Gillespie',
                  'Art_Tatum',
                  'Clark_Terry',
                  'Dave_Brubeck',
                  'Thelonious_Monk',
                  'Charles_Mingus',
                  'Benny_Goodman',
                  'Count_Basie',
                  'John_Coltrane',
                  'Miles_Davis',
                  'Sun_Ra',
                  'Nina_Simone',
                  'Fats_Waller',
                  'Duke_Ellington',
                  'Louis_Armstrong']
documents = {}
for musician in jazz_musicians:
    r = requests.get('https://en.wikipedia.org/wiki/' + musician)
    soup = BeautifulSoup(r.content)
    main_div = soup.find('div', attrs={'class':'mw-parser-output'})
    paragraphs = main_div.find_all('p')
    paragraphs = [p.get_text() for p in paragraphs]
    documents[musician] = ''.join(paragraphs)

In [3]:
nina = documents['Nina_Simone'][0:200]
print(nina)


Eunice Kathleen Waymon (February 21, 1933 – April 21, 2003), known professionally as Nina Simone, was an American singer, songwriter, musician, arranger, and civil rights activist. Her music spanned 


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [5]:
nltk.download(['punkt', 'stopwords'])

[nltk_data] Downloading package punkt to /home/daniel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/daniel/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
nina_tokenized = word_tokenize(nina)
print(nina_tokenized)

['Eunice', 'Kathleen', 'Waymon', '(', 'February', '21', ',', '1933', '–', 'April', '21', ',', '2003', ')', ',', 'known', 'professionally', 'as', 'Nina', 'Simone', ',', 'was', 'an', 'American', 'singer', ',', 'songwriter', ',', 'musician', ',', 'arranger', ',', 'and', 'civil', 'rights', 'activist', '.', 'Her', 'music', 'spanned']


In [7]:
eng_stopwords = set(stopwords.words('english'))
nina_wo_stopwords = [t for t in nina_tokenized if t not in eng_stopwords]
print(nina_wo_stopwords)

['Eunice', 'Kathleen', 'Waymon', '(', 'February', '21', ',', '1933', '–', 'April', '21', ',', '2003', ')', ',', 'known', 'professionally', 'Nina', 'Simone', ',', 'American', 'singer', ',', 'songwriter', ',', 'musician', ',', 'arranger', ',', 'civil', 'rights', 'activist', '.', 'Her', 'music', 'spanned']


Removing tokens consisting of only ponctuation.

In [8]:
punctuation = set('&%$()"<>.,;:?!-[]{}–')
nina_wo_stopwords_and_punct = [t for t in nina_wo_stopwords if t not in punctuation]
print(nina_wo_stopwords_and_punct)

['Eunice', 'Kathleen', 'Waymon', 'February', '21', '1933', 'April', '21', '2003', 'known', 'professionally', 'Nina', 'Simone', 'American', 'singer', 'songwriter', 'musician', 'arranger', 'civil', 'rights', 'activist', 'Her', 'music', 'spanned']


Also applying Stemming.

In [9]:
from nltk.stem import SnowballStemmer

In [10]:
stemmer = SnowballStemmer('english')
nina_wo_stopwords_and_punct_stemmed = [stemmer.stem(t) for t in nina_wo_stopwords_and_punct]
print(nina_wo_stopwords_and_punct_stemmed)

['eunic', 'kathleen', 'waymon', 'februari', '21', '1933', 'april', '21', '2003', 'known', 'profession', 'nina', 'simon', 'american', 'singer', 'songwrit', 'musician', 'arrang', 'civil', 'right', 'activist', 'her', 'music', 'span']


Applying the whole process to the corpus.

In [11]:
processed_corpus = {}
eng_stopwords = set(stopwords.words('english'))
punctuation = set('&%#$()/"<>.,;:?!+-—[]{}–')
stemmer = SnowballStemmer('english')

def process_raw_text(d):
    result = word_tokenize(d)
    result = [t for t in result if t not in eng_stopwords]
    result = [t for t in result if t not in punctuation]
    result = [stemmer.stem(t) for t in result]
    return result

for k, v in documents.items():
    processed_corpus[k] = process_raw_text(v)


In [12]:
unique_words = set()
for d in processed_corpus.values():
    unique_words = unique_words.union(d)

len(unique_words)

7778

In [13]:
import math

def tf(t, d):
    return d.count(t)

def idf(t, D):
    docs_with_t = sum([1 for d in D.values() if t in d])
    return math.log(len(D) / docs_with_t)

def tfidf(t, d, D):
    return tf(t, d) * idf(t, D)

In [14]:
tfidf_dict = {}
D = processed_corpus
for k, d in processed_corpus.items():
    tfidf_dict[k] = {}
    for t in unique_words:
        tfidf_dict[k][t] = tfidf(t, d, D)

In [15]:
import pandas as pd

df = pd.DataFrame(tfidf_dict)
df_t = df.transpose()
df_t

Unnamed: 0,armi,warmed-ov,publish,lode,belong,hall,abrupt,whether,spare,class,...,revisit,grove,jerk,imprison,foul,karl,miss,jam,sorcer,ascent
Charlie_Parker,0.0,0.0,0.207639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.653357,3.489452,0.0,0.0
Dizzy_Gillespie,0.0,0.0,0.207639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.772589,0.0,0.826679,1.163151,0.0,0.0
Art_Tatum,0.0,0.0,0.622918,0.0,0.0,0.0,0.0,1.386294,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,5.545177,1.653357,0.0,0.0,0.0
Clark_Terry,0.0,0.0,0.207639,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Dave_Brubeck,5.021929,0.0,0.415279,2.772589,0.0,0.0,0.0,0.0,2.079442,1.386294,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Thelonious_Monk,0.0,0.0,0.0,0.0,2.772589,0.0,1.673976,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Charles_Mingus,1.673976,0.0,0.415279,0.0,0.0,0.0,0.0,2.772589,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Benny_Goodman,0.0,0.0,0.415279,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Count_Basie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
John_Coltrane,1.673976,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.158883,1.386294,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.163151,0.0,2.772589


In [16]:
df.Nina_Simone.sort_values(ascending=False).head(20)

simon            210.921031
nina              80.405073
goddam            19.408121
stroud            19.408121
lisa              19.408121
mississippi       16.635532
civil             15.065788
spell             14.556091
waymon            13.862944
her               13.862944
hansberri         13.862944
ian               13.862944
she               13.160102
sampl             12.476649
misunderstood     12.476649
curti             12.476649
me                11.631508
nijmegen          11.090355
footag            11.090355
nomin              9.704061
Name: Nina_Simone, dtype: float64

In [17]:
import numpy as np

def cosine_similarity(a, b):
    dot_product = np.linalg.multi_dot([a, b])
    norm_product = np.linalg.norm(a) * np.linalg.norm(a)
    return dot_product / norm_product

Who is the most similar musician to Nina Simone?

In [18]:
df.apply(lambda x: cosine_similarity(x, df.Nina_Simone)).sort_values(ascending=False)

Nina_Simone        1.000000
Dizzy_Gillespie    0.075212
Clark_Terry        0.073985
Charlie_Parker     0.073476
Miles_Davis        0.066546
Louis_Armstrong    0.057167
Duke_Ellington     0.055309
Fats_Waller        0.051782
Thelonious_Monk    0.046492
Charles_Mingus     0.040395
Count_Basie        0.037085
John_Coltrane      0.031663
Dave_Brubeck       0.027046
Art_Tatum          0.018622
Benny_Goodman      0.016499
Sun_Ra             0.007472
dtype: float64

Similarity could be used to create a basic search engine.
We could process a query, e.g. "Famous jazz saxophonist born in Kansas who played bebop", the same way we processed the raw data and then use cosine similarity to find the best match.

In [19]:
query = 'The most influential trumpeter in jazz history'
query_after_preproc = process_raw_text(query)
query_vector = pd.Series({t:tfidf(t, query_after_preproc, D) for t in unique_words})

In [20]:
df.apply(lambda x: cosine_similarity(x, query_vector)).sort_values(ascending=False)

Dizzy_Gillespie    0.000148
Charlie_Parker     0.000107
Clark_Terry        0.000101
Louis_Armstrong    0.000090
Miles_Davis        0.000066
John_Coltrane      0.000043
Charles_Mingus     0.000031
Count_Basie        0.000031
Duke_Ellington     0.000029
Fats_Waller        0.000017
Art_Tatum          0.000008
Nina_Simone        0.000006
Benny_Goodman      0.000003
Sun_Ra             0.000002
Dave_Brubeck       0.000002
Thelonious_Monk    0.000000
dtype: float64