# Лабораторная работа №4
# Что такое tidy-данные?

Хэдли Уикхем (Hadley Wickham) в работе «Tidy Data» (http://vita.had.co.nz/papers/tidy-data.pdf) предлагает три основных принципа, определяющих, является ли набор данных опрятным (tidy):

1. Каждая переменная формирует столбец.

2. Каждое наблюдение формирует строку.

3. Каждый тип наблюдения формирует таблицу.

Рассмотрим каждый из этих пунктов относительно данного датасета:

1.  **"Каждая переменная формирует столбец"** - в данном датасете каждый столбец, кроме, пожалуй, `index` и `name`, можно использовать в качестве признака для обучения классификаторов/сегментаторов/регрессоров. Есть как категориальные данные (`artist`, `release_date`, `key`, `time_signature`), так и числовые (остальные). Каждый описывает ту или иную характеристику песни. Столбцы не нуждаются в изменении, за исключением удаления повторяющегося столбца `danceability.1`.
2. **"Каждое наблюдение формирует строку"** - в данном датасете каждая строка отвечает за отдельную песню с представленными её характеристиками (столбцами таблицы). Столбец индекса также присутствует.
3. **"Каждый тип наблюдения формирует таблицу"** - поскольку таблица, в сущности, описывает один тип наблюдения (*описание набора характеристик песни*), смысла разделять таблицу, объединять её с чем-то и т.п. нет, т.е. pivot, melt и иже с ними использовать не нужно.

Сами данные в ячейках не требуют редактирования, кроме, быть может, замены номера тональности на название тональности.

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the dataset from a CSV file addressed relative to the notebook.
data = pd.read_csv("../data/history-of-rock-spotify.csv")
# A bare expression at the end of a cell makes the notebook render the frame.
data

Unnamed: 0,index,name,artist,release_date,length,popularity,danceability,acousticness,danceability.1,energy,instrumentalness,key,liveness,loudness,speechiness,tempo,time_signature,valence
0,0,Smells Like Teen Spirit,Nirvana,1991,5.032000,74,0.502,0.000025,0.502,0.912,0.000173,1,0.1060,-4.556,0.0564,116.761,4,0.720
1,1,Stairway to Heaven - Remaster,Led Zeppelin,1971,8.047167,78,0.338,0.580000,0.338,0.340,0.003200,9,0.1160,-12.049,0.0339,82.433,4,0.197
2,2,Bohemian Rhapsody - Remastered 2011,Queen,1975,5.905333,74,0.392,0.288000,0.392,0.402,0.000000,0,0.2430,-9.961,0.0536,143.883,4,0.228
3,3,Imagine - Remastered 2010,John Lennon,1971,3.131100,77,0.547,0.907000,0.547,0.257,0.183000,0,0.0935,-12.358,0.0252,75.752,4,0.169
4,4,(I Can't Get No) Satisfaction - Mono Version,The Rolling Stones,1965,3.713550,77,0.723,0.038300,0.723,0.863,0.031700,2,0.1280,-7.890,0.0338,136.302,4,0.931
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5479,5479,I'm In Your Mind,King Gizzard & The Lizard Wizard,2014,3.559833,47,0.296,0.005910,0.296,0.776,0.801000,6,0.5970,-5.630,0.0597,93.481,4,0.406
5480,5480,Cellophane,King Gizzard & The Lizard Wizard,2014,3.179750,44,0.432,0.002130,0.432,0.887,0.916000,7,0.1200,-6.175,0.1230,92.965,4,0.357
5481,5481,Hot Water,King Gizzard & The Lizard Wizard,2014,3.396450,40,0.627,0.860000,0.627,0.609,0.890000,9,0.1160,-9.387,0.0332,86.861,4,0.734
5482,5482,Vitamin C - 2004 Remastered Version,CAN,1972,3.567767,52,0.643,0.006690,0.643,0.644,0.673000,4,0.1620,-12.615,0.0462,117.225,4,0.853


In [3]:
# Remove the duplicated `danceability.1` column from `data` in place.
# pop() also returns the removed column, which is what the cell displays.
# NOTE(review): running this cell a second time raises KeyError because the
# column is already gone.
data.pop('danceability.1')

0       0.502
1       0.338
2       0.392
3       0.547
4       0.723
        ...  
5479    0.296
5480    0.432
5481    0.627
5482    0.643
5483    0.222
Name: danceability.1, Length: 5484, dtype: float64

In [4]:
# Show the first ten rows to check that the duplicated column is gone.
data.head(10)

Unnamed: 0,index,name,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,key,liveness,loudness,speechiness,tempo,time_signature,valence
0,0,Smells Like Teen Spirit,Nirvana,1991,5.032,74,0.502,2.5e-05,0.912,0.000173,1,0.106,-4.556,0.0564,116.761,4,0.72
1,1,Stairway to Heaven - Remaster,Led Zeppelin,1971,8.047167,78,0.338,0.58,0.34,0.0032,9,0.116,-12.049,0.0339,82.433,4,0.197
2,2,Bohemian Rhapsody - Remastered 2011,Queen,1975,5.905333,74,0.392,0.288,0.402,0.0,0,0.243,-9.961,0.0536,143.883,4,0.228
3,3,Imagine - Remastered 2010,John Lennon,1971,3.1311,77,0.547,0.907,0.257,0.183,0,0.0935,-12.358,0.0252,75.752,4,0.169
4,4,(I Can't Get No) Satisfaction - Mono Version,The Rolling Stones,1965,3.71355,77,0.723,0.0383,0.863,0.0317,2,0.128,-7.89,0.0338,136.302,4,0.931
5,5,Hotel California - 2013 Remaster,Eagles,1976,6.522933,83,0.579,0.00574,0.508,0.000494,2,0.0575,-9.484,0.027,147.125,4,0.609
6,6,Enter Sandman,Metallica,1991,5.526217,74,0.579,0.00206,0.824,0.00903,6,0.059,-8.71,0.03,123.331,4,0.635
7,7,Whole Lotta Love - 1990 Remaster,Led Zeppelin,1969,5.564883,77,0.412,0.0484,0.902,0.131,9,0.405,-11.6,0.405,89.74,4,0.422
8,8,Comfortably Numb,Pink Floyd,1979,6.3716,74,0.472,0.15,0.366,0.308,11,0.0837,-12.595,0.0286,127.167,4,0.171
9,9,One,U2,1991,4.6031,76,0.392,0.245,0.534,0.00104,0,0.155,-8.793,0.0369,181.305,4,0.325


Заменим номера тональностей на их названия.

In [5]:
# List the distinct numeric codes present in the `key` column.
pd.unique(data['key'])

array([ 1,  9,  0,  2,  6, 11,  4, 10,  7,  8,  3,  5], dtype=int64)

In [6]:
# Lookup table from the numeric `key` code (0-11) to the note name;
# enumerate() pairs each name with its position in the list.
key_num_to_name_map = dict(enumerate(
    ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
))
# Build a new frame whose `key` column holds note names; `data` is not modified.
data1 = data.assign(key=data['key'].replace(key_num_to_name_map))
# Display the distinct values to confirm every code was translated.
data1['key'].unique()

array(['C#', 'A', 'C', 'D', 'F#', 'B', 'E', 'A#', 'G', 'G#', 'D#', 'F'],
      dtype=object)

In [7]:
# Render the frame whose `key` column now holds note names.
data1

Unnamed: 0,index,name,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,key,liveness,loudness,speechiness,tempo,time_signature,valence
0,0,Smells Like Teen Spirit,Nirvana,1991,5.032000,74,0.502,0.000025,0.912,0.000173,C#,0.1060,-4.556,0.0564,116.761,4,0.720
1,1,Stairway to Heaven - Remaster,Led Zeppelin,1971,8.047167,78,0.338,0.580000,0.340,0.003200,A,0.1160,-12.049,0.0339,82.433,4,0.197
2,2,Bohemian Rhapsody - Remastered 2011,Queen,1975,5.905333,74,0.392,0.288000,0.402,0.000000,C,0.2430,-9.961,0.0536,143.883,4,0.228
3,3,Imagine - Remastered 2010,John Lennon,1971,3.131100,77,0.547,0.907000,0.257,0.183000,C,0.0935,-12.358,0.0252,75.752,4,0.169
4,4,(I Can't Get No) Satisfaction - Mono Version,The Rolling Stones,1965,3.713550,77,0.723,0.038300,0.863,0.031700,D,0.1280,-7.890,0.0338,136.302,4,0.931
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5479,5479,I'm In Your Mind,King Gizzard & The Lizard Wizard,2014,3.559833,47,0.296,0.005910,0.776,0.801000,F#,0.5970,-5.630,0.0597,93.481,4,0.406
5480,5480,Cellophane,King Gizzard & The Lizard Wizard,2014,3.179750,44,0.432,0.002130,0.887,0.916000,G,0.1200,-6.175,0.1230,92.965,4,0.357
5481,5481,Hot Water,King Gizzard & The Lizard Wizard,2014,3.396450,40,0.627,0.860000,0.609,0.890000,A,0.1160,-9.387,0.0332,86.861,4,0.734
5482,5482,Vitamin C - 2004 Remastered Version,CAN,1972,3.567767,52,0.643,0.006690,0.644,0.673000,E,0.1620,-12.615,0.0462,117.225,4,0.853


OFFTOP

In [8]:
# Show the first five rows to compare the `key` values with external sources.
data1.iloc[:5]

Unnamed: 0,index,name,artist,release_date,length,popularity,danceability,acousticness,energy,instrumentalness,key,liveness,loudness,speechiness,tempo,time_signature,valence
0,0,Smells Like Teen Spirit,Nirvana,1991,5.032,74,0.502,2.5e-05,0.912,0.000173,C#,0.106,-4.556,0.0564,116.761,4,0.72
1,1,Stairway to Heaven - Remaster,Led Zeppelin,1971,8.047167,78,0.338,0.58,0.34,0.0032,A,0.116,-12.049,0.0339,82.433,4,0.197
2,2,Bohemian Rhapsody - Remastered 2011,Queen,1975,5.905333,74,0.392,0.288,0.402,0.0,C,0.243,-9.961,0.0536,143.883,4,0.228
3,3,Imagine - Remastered 2010,John Lennon,1971,3.1311,77,0.547,0.907,0.257,0.183,C,0.0935,-12.358,0.0252,75.752,4,0.169
4,4,(I Can't Get No) Satisfaction - Mono Version,The Rolling Stones,1965,3.71355,77,0.723,0.0383,0.863,0.0317,D,0.128,-7.89,0.0338,136.302,4,0.931


Imagine Леннона написана в До мажоре (https://getsongkey.com/song/imagine/L9LxW), в датасете тоже так.

Bohemian Rhapsody, согласно таблице, тоже в тональности До, но на самом деле она написана в ля-диез мажоре (энгармонически — си-бемоль мажор) (https://getsongkey.com/song/bohemian-rhapsody/Y69Dp).

Smells Like Teen Spirit написана в фа миноре (https://getsongkey.com/song/smells-like-teen-spirit/rkjj4), а по таблице тоника — до-диез/ре-бемоль (лад в таблице не указан)...

Похоже, не очень-то и можно доверять качеству данных в этом столбце...