In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the last.fm listening records and preview the first rows.
data = pd.read_csv("lastfm.csv")
data.head(18)

Unnamed: 0,user,artist,sex,country
0,1,red hot chili peppers,f,Germany
1,1,the black dahlia murder,f,Germany
2,1,goldfrapp,f,Germany
3,1,dropkick murphys,f,Germany
4,1,le tigre,f,Germany
5,1,schandmaul,f,Germany
6,1,edguy,f,Germany
7,1,jack johnson,f,Germany
8,1,eluveitie,f,Germany
9,1,the killers,f,Germany


In [3]:
# Dimensions of the loaded table as (rows, columns).
data.shape

(289955, 4)

In [4]:
# Top 20 countries by number of rows.
# value_counts() already returns the counts in descending order, so the extra
# sort_values(ascending=False) was redundant; head(20) takes the first 20.
data["country"].value_counts().head(20)

United States         59558
United Kingdom        27638
Germany               24251
Poland                17111
Sweden                12379
Brazil                11922
Russian Federation    11676
Finland               10157
Netherlands            9673
Spain                  9322
Canada                 6928
Australia              6407
France                 5962
Italy                  5717
Turkey                 4762
Norway                 4641
Mexico                 3803
Czech Republic         3752
Belgium                3331
Portugal               2882
Name: country, dtype: int64

In [5]:
# Keep only the rows whose country is Mexico, then show the subset's
# dimensions and its first rows.
mx = data.loc[data["country"].eq("Mexico")]
print(mx.shape)
mx.head()

(3803, 4)


Unnamed: 0,user,artist,sex,country
573,41,dream theater,f,Mexico
574,41,kreator,f,Mexico
575,41,scorpions,f,Mexico
576,41,alice in chains,f,Mexico
577,41,iron maiden,f,Mexico


In [6]:
# Rows for Mexico and how often each artist occurs among them.
selected = data[data.country == 'Mexico']
print("Transactions: ", selected.shape[0])
# value_counts() already orders artists by descending frequency, so no
# additional sort_values() call is needed (only the order of ties may differ).
selected.artist.value_counts()

Transactions:  3803


radiohead              54
muse                   49
coldplay               45
nirvana                39
the beatles            39
                       ..
roxette                 1
bloodbath               1
parkway drive           1
dirty pretty things     1
sean paul               1
Name: artist, Length: 764, dtype: int64

### Подготовка данных для ассоциативного анализа

In [5]:
# Group the rows by user and define how the values of the artist column are
# merged: all artists of one user are joined into a single ';'-separated string.
groupped = (
    data
    .groupby('user')['artist']
    .apply(lambda artists: ';'.join(artists))
)
groupped

user
1        red hot chili peppers;the black dahlia murder;...
3        devendra banhart;boards of canada;cocorosie;ap...
4        tv on the radio;tool;kyuss;dj shadow;air;a tri...
5        dream theater;ac/dc;metallica;iron maiden;bob ...
6        lily allen;kanye west;sigur rós;pink floyd;ste...
                               ...                        
19713    armin van buuren;above & beyond;atb;ferry cors...
19714    misfits;type o negative;arch enemy;red hot chi...
19715    abba;james blunt;jason mraz;amy winehouse;quee...
19717    marilyn manson;beyoncé;madonna;t.a.t.u.;katy p...
19718    beirut;of montreal;black flag;the new pornogra...
Name: artist, Length: 15000, dtype: object

### Бинарное кодирование

In [8]:
# get_dummies is the function used for binary (one-hot) encoding;
# this small series of labels is the toy input for demonstrating it.
simpleSerie = pd.Series(list('abadbcc'))
simpleSerie

0    a
1    b
2    a
3    d
4    b
5    c
6    c
dtype: object

In [9]:
# One indicator column per distinct value of the series.
# dtype=int keeps the 0/1 table shown below: since pandas 2.0 get_dummies
# returns boolean (True/False) columns unless a dtype is given.
pd.get_dummies(simpleSerie, dtype=int)

Unnamed: 0,a,b,c,d
0,1,0,0,0
1,0,1,0,0
2,1,0,0,0
3,0,0,0,1
4,0,1,0,0
5,0,0,1,0
6,0,0,1,0


In [6]:
# In our case every value must additionally be split on the given separator
# before it is turned into indicator columns.
binary_data = groupped.str.get_dummies(sep=";")
binary_data.iloc[:10, 100:110]

Unnamed: 0_level_0,beastie boys,beatsteaks,beck,bee gees,behemoth,beirut,belle and sebastian,ben folds,ben folds five,ben harper
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0
12,0,0,0,1,0,0,0,0,0,0
13,0,0,0,0,0,0,0,0,0,0
14,0,0,0,0,0,0,0,0,0,0


In [11]:
# Summary statistics of the number of favourite artists per user.
# DataFrame.sum(axis=1) adds up each row in a vectorized way; the previous
# apply(sum, axis=1) called Python's built-in sum once per row, which gives
# the same totals but is much slower on a wide table.
binary_data.sum(axis=1).describe()

count    15000.000000
mean        19.330200
std         10.500698
min          1.000000
25%         11.000000
50%         19.000000
75%         27.000000
max         76.000000
dtype: float64

In [9]:
# Another way to build the table for association analysis: a user x artist
# cross-tabulation restricted to Mexico.
# The earlier pivot_table call had no value column left to aggregate and passed
# the built-in any as aggfunc, so its result relied on pandas-version-specific
# behaviour. crosstab counts (user, artist) occurrences directly; gt(0) turns
# the counts into presence flags and astype('int') keeps the 0/1 encoding.
mexico_rows = data.loc[data.country == 'Mexico', ['user', 'artist']]
pvt = pd.crosstab(mexico_rows['user'], mexico_rows['artist']).gt(0).astype('int')
pvt.iloc[0:10, 0:5]


artist,2pac,3 doors down,30 seconds to mars,311,44
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
41,0,0,0,0,0
70,0,1,0,0,0
143,0,0,0,0,0
200,0,0,0,0,0
203,0,0,0,0,0
261,0,0,0,0,0
344,0,0,0,0,1
531,0,0,0,0,0
539,0,0,0,0,0
1106,0,0,0,0,0


### Пакет mlxtend с функциями ассоциативного анализа


In [7]:
# Before the first use, the package has to be installed with:
# pip install mlxtend
import mlxtend.frequent_patterns as ml

In [10]:
# Mine frequent itemsets from the 0/1 user x artist table with a minimum
# support of 0.05; use_colnames=True reports itemsets by column (artist) name
# rather than by column index.
freq_items = ml.apriori(
    pvt,
    min_support=0.05,
    use_colnames=True,
)
print("Найдено %d характерных комбинаций" % freq_items.shape[0])
freq_items

Найдено 205 характерных комбинаций


Unnamed: 0,support,itemsets
0,0.051282,(3 doors down)
1,0.056410,([unknown])
2,0.051282,(ac/dc)
3,0.087179,(aerosmith)
4,0.107692,(air)
...,...,...
200,0.051282,"(muse, coldplay, oasis)"
201,0.051282,"(muse, coldplay, radiohead)"
202,0.051282,"(muse, coldplay, the killers)"
203,0.051282,"(radiohead, coldplay, the killers)"


In [11]:
# Derive association rules from the frequent itemsets, filtering on the
# confidence metric with a threshold of 0.5, and preview the first 15 rules.
rules = ml.association_rules(
    freq_items,
    metric="confidence",
    min_threshold=0.5,
)
rules.head(15)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(aerosmith),(coldplay),0.087179,0.230769,0.061538,0.705882,3.058824,0.04142,2.615385
1,(air),(radiohead),0.107692,0.276923,0.05641,0.52381,1.891534,0.026588,1.518462
2,(röyksopp),(air),0.087179,0.107692,0.051282,0.588235,5.462185,0.041893,2.167033
3,(arctic monkeys),(radiohead),0.107692,0.276923,0.061538,0.571429,2.063492,0.031716,1.687179
4,(the strokes),(arctic monkeys),0.123077,0.107692,0.066667,0.541667,5.029762,0.053412,1.946853
5,(arctic monkeys),(the strokes),0.107692,0.123077,0.066667,0.619048,5.029762,0.053412,2.301923
6,(beck),(radiohead),0.05641,0.276923,0.051282,0.909091,3.282828,0.035661,7.953846
7,(björk),(radiohead),0.138462,0.276923,0.071795,0.518519,1.872428,0.033452,1.501775
8,(daft punk),(coldplay),0.164103,0.230769,0.082051,0.5,2.166667,0.044181,1.538462
9,(depeche mode),(coldplay),0.133333,0.230769,0.066667,0.5,2.166667,0.035897,1.538462
