# import

In [42]:
import matplotlib.pyplot as plt
import numpy as np
import nltk
import pandas as pd
import re
import seaborn as sns

import fasttext

from gensim.models import FastText
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from scipy import stats
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

nltk.download('stopwords', quiet=True)

True

# Help Functions

In [2]:
def norm_text(text: str) -> str:
    text = text.lower()
    punctuation = '''!()-[]{};:'"\,<>./?@#$%^&*_~«»»"–+»+"-"”№'''
    
    text = re.sub('\s+', ' ', text)
    text = [i for i in text if i not in punctuation]
    text = ''.join(text).lower()
    
    return text

stop_words = stopwords.words('english')
stop_words = [norm_text(i) for i in stop_words]

def norm_string(string: str, stop_words = []) -> list[str]:
    stemmer = SnowballStemmer("english") 
    punctuation = '''!()-[]{};:'"\,<>./?@#$%^&*_~«»»"–+»+"-"”№'''
    
    string = re.sub('\s+', ' ', string)
    string = [i for i in string if i not in punctuation]
    string = ''.join(string).lower()

    string = nltk.word_tokenize(string)
    string = [i for i in string if i not in stop_words] 
    string = [stemmer.stem(w) for w in string]
    string = [i for i in string if len(i) > 2 and not re.search('^\d+$', i)]

    return string


def analyze_numerical(data: pd.Series) -> dict:
    
    sns.distplot(data)
    plt.show()
    
    stats.probplot(data, dist="norm", plot=plt)
    plt.show()
    
    sns.boxplot(data)
    plt.show()
    
    q1 = data.quantile(0.25)
    q3 = data.quantile(0.75)
    iqr = q3 - q1
    lower = q1 - (1.5 * iqr) 
    upper = q3 + (1.5 * iqr)
    
    outliers = data[(data < lower) | (data > upper)]
    print(data.describe())
    
    return {'outliers_number':  len(outliers), 'lower': lower, 'upper': upper}

# Data Loading

In [3]:
lyrics = pd.read_csv('data/lyrics-data.csv')
lyrics

Unnamed: 0,ALink,SName,SLink,Lyric,language
0,/ivete-sangalo/,Arerê,/ivete-sangalo/arere.html,"Tudo o que eu quero nessa vida,\nToda vida, é\...",pt
1,/ivete-sangalo/,Se Eu Não Te Amasse Tanto Assim,/ivete-sangalo/se-eu-nao-te-amasse-tanto-assim...,Meu coração\nSem direção\nVoando só por voar\n...,pt
2,/ivete-sangalo/,Céu da Boca,/ivete-sangalo/chupa-toda.html,É de babaixá!\nÉ de balacubaca!\nÉ de babaixá!...,pt
3,/ivete-sangalo/,Quando A Chuva Passar,/ivete-sangalo/quando-a-chuva-passar.html,Quando a chuva passar\n\nPra quê falar\nSe voc...,pt
4,/ivete-sangalo/,Sorte Grande,/ivete-sangalo/sorte-grande.html,A minha sorte grande foi você cair do céu\nMin...,pt
...,...,...,...,...,...
379926,/clegg-johnny/,The Waiting,/clegg-johnny/the-waiting.html,Chorus\nHere we stand waiting on the plain\nDa...,en
379927,/clegg-johnny/,Too Early For The Sky,/clegg-johnny/too-early-for-the-sky.html,I nearly disappeared into the mouth of a croco...,en
379928,/clegg-johnny/,Warsaw 1943 (I Never Betrayed The Revolution),/clegg-johnny/warsaw-1943-i-never-betrayed-the...,"Amambuka, amambuka azothengisa izwe lakithi, i...",en
379929,/clegg-johnny/,When The System Has Fallen,/clegg-johnny/when-the-system-has-fallen.html,Sweat in the heat for days on end\nwaiting for...,en


In [4]:
lyrics = lyrics.loc[lyrics['language'] == 'en'][['SName', 'Lyric']].reset_index(drop=True)
lyrics.columns = ['track_name', 'lyric']
lyrics['track_name'] = lyrics['track_name'].apply(lambda x: norm_text(str(x)))
lyrics

Unnamed: 0,track_name,lyric
0,careless whisper,I feel so unsure\nAs I take your hand and lead...
1,could you be loved citação musical do rap se ...,"Don't let them fool, ya\nOr even try to school..."
2,cruisin part saulo,"Baby, let's cruise, away from here\nDon't be c..."
3,easy,"Know it sounds funny\nBut, I just can't stand ..."
4,for your babies the voice cover,You've got that look again\nThe one I hoped I ...
...,...,...
191809,the waiting,Chorus\nHere we stand waiting on the plain\nDa...
191810,too early for the sky,I nearly disappeared into the mouth of a croco...
191811,warsaw 1943 i never betrayed the revolution,"Amambuka, amambuka azothengisa izwe lakithi, i..."
191812,when the system has fallen,Sweat in the heat for days on end\nwaiting for...


In [7]:
lyrics.loc[lyrics['track_name']=='hades']

Unnamed: 0,track_name,lyric
87218,hades,"""Beyond the mountains\nwhere the wind cries ou..."
120743,hades,NIGHT MARE saa otabe CANDY\nNIGHT MARE HETERO ...


In [8]:
track_name_list = lyrics['track_name'].tolist()

index_list = []
sub_list = []

for s, i in enumerate(track_name_list):
    if i not in sub_list:
        sub_list.append(i)
        index_list.append(s)

In [9]:
len(index_list)

133478

In [10]:
lyrics = lyrics.filter(items=index_list, axis=0).reset_index(drop=True)
lyrics

Unnamed: 0,track_name,lyric
0,careless whisper,I feel so unsure\nAs I take your hand and lead...
1,could you be loved citação musical do rap se ...,"Don't let them fool, ya\nOr even try to school..."
2,cruisin part saulo,"Baby, let's cruise, away from here\nDon't be c..."
3,easy,"Know it sounds funny\nBut, I just can't stand ..."
4,for your babies the voice cover,You've got that look again\nThe one I hoped I ...
...,...,...
133473,take my heart away,Chorus\nTake my heart away (repeat)\n\nTo be t...
133474,too early for the sky,I nearly disappeared into the mouth of a croco...
133475,warsaw 1943 i never betrayed the revolution,"Amambuka, amambuka azothengisa izwe lakithi, i..."
133476,when the system has fallen,Sweat in the heat for days on end\nwaiting for...


In [11]:
df = pd.read_csv('data/music_genre.csv')
df

Unnamed: 0,instance_id,artist_name,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,obtained_date,valence,music_genre
0,32894.0,Röyksopp,Röyksopp's Night Out,27.0,0.00468,0.652,-1.0,0.941,0.79200,A#,0.115,-5.201,Minor,0.0748,100.889,4-Apr,0.759,Electronic
1,46652.0,Thievery Corporation,The Shining Path,31.0,0.01270,0.622,218293.0,0.890,0.95000,D,0.124,-7.043,Minor,0.0300,115.00200000000001,4-Apr,0.531,Electronic
2,30097.0,Dillon Francis,Hurricane,28.0,0.00306,0.620,215613.0,0.755,0.01180,G#,0.534,-4.617,Major,0.0345,127.994,4-Apr,0.333,Electronic
3,62177.0,Dubloadz,Nitro,34.0,0.02540,0.774,166875.0,0.700,0.00253,C#,0.157,-4.498,Major,0.2390,128.014,4-Apr,0.270,Electronic
4,24907.0,What So Not,Divide & Conquer,32.0,0.00465,0.638,222369.0,0.587,0.90900,F#,0.157,-6.266,Major,0.0413,145.036,4-Apr,0.323,Electronic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50000,58878.0,BEXEY,GO GETTA,59.0,0.03340,0.913,-1.0,0.574,0.00000,C#,0.119,-7.022,Major,0.2980,98.02799999999999,4-Apr,0.330,Hip-Hop
50001,43557.0,Roy Woods,Drama (feat. Drake),72.0,0.15700,0.709,251860.0,0.362,0.00000,B,0.109,-9.814,Major,0.0550,122.04299999999999,4-Apr,0.113,Hip-Hop
50002,39767.0,Berner,Lovin' Me (feat. Smiggz),51.0,0.00597,0.693,189483.0,0.763,0.00000,D,0.143,-5.443,Major,0.1460,131.079,4-Apr,0.395,Hip-Hop
50003,57944.0,The-Dream,Shawty Is Da Shit,65.0,0.08310,0.782,262773.0,0.472,0.00000,G,0.106,-5.016,Minor,0.0441,75.88600000000001,4-Apr,0.354,Hip-Hop


In [12]:
'''Normalizing of track names'''
df['track_name'] = df['track_name'].apply(lambda x: norm_text(str(x)))

In [13]:
data = df.merge(lyrics, how='left', on='track_name')
data.shape

(50005, 19)

In [14]:
data.isna().sum()

instance_id             5
artist_name             5
track_name              0
popularity              5
acousticness            5
danceability            5
duration_ms             5
energy                  5
instrumentalness        5
key                     5
liveness                5
loudness                5
mode                    5
speechiness             5
tempo                   5
obtained_date           5
valence                 5
music_genre             5
lyric               30849
dtype: int64

In [15]:
data = data.dropna().reset_index(drop=True)
data.shape

(19151, 19)

In [16]:
len(data['track_name'].unique())

12607

In [17]:
track_name_list = data['track_name'].tolist()

index_list = []
sub_list = []

for s, i in enumerate(track_name_list):
    if i not in sub_list:
        sub_list.append(i)
        index_list.append(s)

In [18]:
len(index_list)

12607

In [19]:
data = data.filter(items=index_list, axis=0).reset_index(drop=True)
data

Unnamed: 0,instance_id,artist_name,track_name,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,obtained_date,valence,music_genre,lyric
0,46652.0,Thievery Corporation,the shining path,31.0,0.01270,0.622,218293.0,0.890,0.950000,D,0.1240,-7.043,Minor,0.0300,115.00200000000001,4-Apr,0.531,Electronic,Don't care if I choke....\nSacrifice me for my...
1,30097.0,Dillon Francis,hurricane,28.0,0.00306,0.620,215613.0,0.755,0.011800,G#,0.5340,-4.617,Major,0.0345,127.994,4-Apr,0.333,Electronic,Pistol shots ring out in the ballroom night\nE...
2,62177.0,Dubloadz,nitro,34.0,0.02540,0.774,166875.0,0.700,0.002530,C#,0.1570,-4.498,Major,0.2390,128.014,4-Apr,0.270,Electronic,"[LL Cool J]\nCheck this\nI excel, they fell, I..."
3,89064.0,Axel Boman,hello,47.0,0.00523,0.755,519468.0,0.731,0.854000,D,0.2160,-10.517,Minor,0.0412,?,4-Apr,0.614,Electronic,Oh!\nOooooh\nI love to see you walking into th...
4,84950.0,Kayzo,never alone,39.0,0.00299,0.509,292800.0,0.921,0.000276,F,0.1780,-3.175,Minor,0.2680,149.94799999999998,4-Apr,0.273,Electronic,May the angels protect you\nTrouble neglect yo...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12602,74247.0,Saba,grey,53.0,0.06050,0.633,240507.0,0.570,0.000000,F,0.1430,-8.387,Minor,0.1590,83.881,4-Apr,0.564,Hip-Hop,A loss of feeling\nEnding in contempt\nI feel ...
12603,63945.0,Nate Dogg,music me,58.0,0.10500,0.905,240627.0,0.414,0.000366,G#,0.0914,-8.112,Minor,0.0615,?,4-Apr,0.758,Hip-Hop,"[Nate Dogg]\nHey OG, could you tell how to fin..."
12604,66619.0,Aaliyah,at your best you are love,64.0,0.76100,0.626,291400.0,0.441,0.013800,A,0.0685,-10.637,Minor,0.0308,78.094,4-Apr,0.563,Hip-Hop,"Acapella:\nAh - ha, let me know, let me know\n..."
12605,44215.0,Lucidious,face the truth,58.0,0.03840,0.699,216094.0,0.755,0.000000,G#,0.2180,-7.177,Minor,0.2130,128.194,4-Apr,0.204,Hip-Hop,"narrator:\nThe moon, the sign of hope\nIt appe..."


In [21]:
data = data[['lyric', 'music_genre']]

In [23]:
#data.to_csv('data/data_short.csv', index=False)

# Data Preprocessing

In [19]:
data = pd.read_csv('data/data_short.csv')
data

Unnamed: 0,lyric,music_genre
0,Don't care if I choke....\nSacrifice me for my...,Electronic
1,Pistol shots ring out in the ballroom night\nE...,Electronic
2,"[LL Cool J]\nCheck this\nI excel, they fell, I...",Electronic
3,Oh!\nOooooh\nI love to see you walking into th...,Electronic
4,May the angels protect you\nTrouble neglect yo...,Electronic
...,...,...
12602,A loss of feeling\nEnding in contempt\nI feel ...,Hip-Hop
12603,"[Nate Dogg]\nHey OG, could you tell how to fin...",Hip-Hop
12604,"Acapella:\nAh - ha, let me know, let me know\n...",Hip-Hop
12605,"narrator:\nThe moon, the sign of hope\nIt appe...",Hip-Hop


In [20]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12607 entries, 0 to 12606
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   lyric        12607 non-null  object
 1   music_genre  12607 non-null  object
dtypes: object(2)
memory usage: 197.1+ KB


### music genre

In [21]:
data['music_genre'].value_counts()

music_genre
Alternative    2269
Country        1573
Rock           1549
Blues          1466
Electronic     1461
Rap            1428
Jazz           1326
Hip-Hop         718
Anime           630
Classical       187
Name: count, dtype: int64

In [22]:
'''Label Encoding'''
le = preprocessing.LabelEncoder()
data['music_genre'] = ['__label__' + str(i) for i in list(le.fit_transform(data['music_genre']))]
labels_dict = dict(zip(['__label__' + str(i) for i in le.transform(le.classes_)], le.classes_))
labels_dict

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


{'__label__0': 'Alternative',
 '__label__1': 'Anime',
 '__label__2': 'Blues',
 '__label__3': 'Classical',
 '__label__4': 'Country',
 '__label__5': 'Electronic',
 '__label__6': 'Hip-Hop',
 '__label__7': 'Jazz',
 '__label__8': 'Rap',
 '__label__9': 'Rock'}

### lyric

In [24]:
data['lyric'] = data['lyric'].apply(lambda x: norm_string(x, stop_words))
data['lyric']

0        [care, choke, sacrific, sin, belief, spit, die...
1        [pistol, shot, ring, ballroom, night, enter, p...
2        [cool, check, excel, fell, said, well, hell, y...
3        [oooooh, love, see, walk, room, bodi, shine, l...
4        [may, angel, protect, troubl, neglect, heaven,...
                               ...                        
12602    [loss, feel, end, contempt, feel, like, let, e...
12603    [nate, dogg, hey, could, tell, find, good, wee...
12604    [acapella, let, know, let, know, let, know, ve...
12605    [narrat, moon, sign, hope, appear, left, pain,...
12606    [beyond, mountain, wind, cri, pain, desert, va...
Name: lyric, Length: 12607, dtype: object

# FastText

In [26]:
X = data['lyric']
y = data['music_genre']

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [34]:
ft_emb_model = FastText(X_train, vector_size=100, window=5, min_count=5, workers=4, sg=1)

In [36]:
ft_emb_model.wv.most_similar('acapella')

[('ella', 0.894085705280304),
 ('rocafella', 0.8705956339836121),
 ('salmonella', 0.8655419945716858),
 ('isabella', 0.8535481095314026),
 ('goodfella', 0.8528931736946106),
 ('yella', 0.8459404110908508),
 ('margiela', 0.8421093821525574),
 ('hella', 0.8276847004890442),
 ('stella', 0.8271149396896362),
 ('cinderella', 0.8179842829704285)]

In [37]:
'''Получим словарь из обученной модели fasttext'''
words = set(ft_emb_model.wv.index_to_key)

In [38]:
'''Подготовим данные для fasttext - он работает с датафреймами'''
train_dict = {'text': X_train, 'class': y_train}
train_ft = pd.DataFrame(data=train_dict)

test_dict = {'text': X_test, 'class': y_test}
test_ft = pd.DataFrame(data=test_dict)

In [39]:
train_ft.to_csv('train_ft.csv', index=False, sep=' ', header=False, escapechar=" ")
test_ft.to_csv('test_ft.csv', index=False, sep=' ', header=False, escapechar=" ")

In [43]:
'''Обучим модель на наших данных'''
ft_model = fasttext.train_supervised('train_ft.csv', lr = 0.9)

Read 1M words
Number of words:  35134
Number of labels: 10
Progress: 100.0% words/sec/thread: 2888805 lr:  0.000000 avg.loss:  1.840514 ETA:   0h 0m 0s


In [44]:
'''Посмотрим метрики'''
ft_model.test('test_ft.csv')

(3783, 0.246365318530267, 0.246365318530267)

In [45]:
test_ft['true_class'] = test_ft['class'].apply(lambda x: labels_dict.get(x))

# Here

In [46]:
'''Не могу понять, почему в предсказаниях для всех примеров одни и те же классы, хотя X на входе разный'''
[[labels_dict.get(s) for s in ft_model.predict(i, k=1, threshold=0.3)[0][0]] for i in test_ft['text'].tolist()]

[['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'],
 ['Electronic'

# Word2Vec

In [8]:
import gensim.downloader as api
from sklearn.decomposition import PCA

wv = api.load('word2vec-google-news-300')

In [9]:
lyric_embeddings = []

for words in data['lyric'].tolist():
    embeddings = [wv[word] for word in words if word in wv]
    lyric_embeddings.append(embeddings)

In [10]:
pca = PCA()
X = [pca.fit_transform(lyric_embeddings[i]) if len(lyric_embeddings[i]) != 0 else [] for i in range(len(lyric_embeddings))]

  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ratio_ = explained_variance_ / total_var
  explained_variance_ = (S**2) / (n_samples - 1)
  explained_variance_ratio_ = explained_variance_ / total_var


In [11]:
X

[array([[-3.38535101e-01, -1.95560955e-01,  3.09208721e-02, ...,
          3.37615581e-20,  7.49728112e-20, -2.18457797e-19],
        [-2.11376887e-01,  7.33838649e-01,  5.64401909e-01, ...,
          3.37615581e-20,  7.49728112e-20, -2.18457797e-19],
        [-6.88048589e-02, -9.73918409e-02, -1.66754388e-01, ...,
          3.37615581e-20,  7.49728112e-20, -2.18457797e-19],
        ...,
        [-5.38819604e-01,  4.18080075e-01, -4.70883313e-01, ...,
         -1.49377803e-17,  1.45007047e-17,  8.11662979e-18],
        [-4.44858230e-01,  3.77881550e-02,  1.04122049e+00, ...,
          7.25441956e-17,  8.39099633e-17,  1.70025502e-17],
        [-6.16580455e-01, -1.57527686e-01,  9.32874255e-01, ...,
          4.31290384e-17, -1.35722039e-17, -3.84099638e-17]]),
 array([[ 1.50586173e-01,  1.12331028e+00,  2.22954394e-01, ...,
          3.42083900e-18, -9.13936610e-18, -1.36467629e-18],
        [-3.55868332e-01,  4.86850767e-01,  4.78647442e-01, ...,
          3.42083900e-18, -9.13936610e

In [12]:
y = data['music_genre']
y

0        5
1        5
2        5
3        5
4        5
        ..
12602    6
12603    6
12604    6
12605    6
12606    6
Name: music_genre, Length: 12607, dtype: int64

In [18]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=1)

In [25]:
len(X_train[5])

368

In [19]:
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression()
logreg.fit(X_train, y_train)

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (9455,) + inhomogeneous part.

In [None]:
from sklearn.linear_model import LogisticRegression

  logreg = LogisticRegression()
  logreg.fit(X_train, y_train)

In [57]:
wv.similarity('city','town')

0.67237395

In [51]:
wv.most_similar(positive=['weather'], topn=10)

[('wet_weather', 0.6928061842918396),
 ('Weather', 0.6667285561561584),
 ('inclement_weather', 0.6523535847663879),
 ('wintry_weather', 0.6318536400794983),
 ('wintery_weather', 0.6174376010894775),
 ('weatherwise', 0.6150212287902832),
 ('stormy_weather', 0.6111673712730408),
 ('Unusually_mild', 0.5986037850379944),
 ('Unseasonably_warm', 0.5984669327735901),
 ('wintry_conditions', 0.5980417132377625)]