# Last.fm homework <br>
https://www.kaggle.com/ravichaubey1506/lastfm <br>
1. Выберите данные по странам своей группы (совместно): <br>
    3530203_80101: Spain, Portugal, France, Italy, Belgium<br>
    3530203_80102: Germany, Netherlands <br>
    3530903_80301: Belarus, Ukraine, Poland, Russian Federation<br>
    3530903_80302: Sweden, Finland, Norway, Denmark, Iceland<br>
    
2. Попытайтесь найти полезные с точки зрения продвижения групп (или еще чего-нибудь) и нетривиальные правила, используя алгоритмы Apriori, FPGrowth, FPMax и всевозможные метрики. Найдите и объясните хотя бы 5 правил.
3. Выведите эти правила в отдельных ячейках. 
4. Подумайте, как можно было бы использовать полученные правила на практике.

In [30]:
import pandas as pd
from mlxtend.frequent_patterns import apriori, fpgrowth, association_rules, fpmax

In [3]:
# Read the Last.fm dataset from the local CSV file.
data = pd.read_csv(filepath_or_buffer="lastfm.csv")
# Preview the first seven rows.
data.head(n=7)

Unnamed: 0,user,artist,sex,country
0,1,red hot chili peppers,f,Germany
1,1,the black dahlia murder,f,Germany
2,1,goldfrapp,f,Germany
3,1,dropkick murphys,f,Germany
4,1,le tigre,f,Germany
5,1,schandmaul,f,Germany
6,1,edguy,f,Germany


In [4]:
# Restrict the data to the five countries assigned to group 3530203_80101.
data_SPFIB = data[data["country"].isin(["Spain", "Portugal", "France", "Italy", "Belgium"])]
data_SPFIB

Unnamed: 0,user,artist,sex,country
83,6,lily allen,m,Portugal
84,6,kanye west,m,Portugal
85,6,sigur rós,m,Portugal
86,6,pink floyd,m,Portugal
87,6,stevie wonder,m,Portugal
...,...,...,...,...
289758,19707,the decemberists,m,Spain
289759,19707,my bloody valentine,m,Spain
289760,19707,bloc party,m,Spain
289761,19707,new order,m,Spain


In [5]:
# Summarise the filtered frame: number of entries, columns, non-null counts and dtypes.
data_SPFIB.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 27214 entries, 83 to 289762
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   user     27214 non-null  int64 
 1   artist   27214 non-null  object
 2   sex      27214 non-null  object
 3   country  27214 non-null  object
dtypes: int64(1), object(3)
memory usage: 1.0+ MB


### Data preprocessing

In [6]:
# Dummy-encode artists: one row per user, one column per artist,
# with the aggregated flags cast to integers.
piv_artist = (
    data_SPFIB[['user', 'artist']]
    .pivot_table(
        index='user',
        columns='artist',
        aggfunc=any,
        fill_value=False,
    )
    .astype(int)
)
# Column totals, i.e. how many users are flagged for each artist.
piv_artist.sum()

artist
...and you will know us by the trail of dead     8
2pac                                            16
3 doors down                                    29
30 seconds to mars                              42
311                                              3
                                                ..
yeah yeah yeahs                                 26
yellowcard                                       8
yo la tengo                                     11
zero 7                                          13
Édith piaf                                      17
Length: 994, dtype: int64

In [7]:
# Dummy-encode countries: one row per user, one column per country,
# with the aggregated flags cast to integers.
piv_country = (
    data_SPFIB[['user', 'country']]
    .pivot_table(
        index='user',
        columns='country',
        aggfunc=any,
        fill_value=False,
    )
    .astype(int)
)
# Column totals, i.e. how many users are flagged for each country.
piv_country.sum()

country
Belgium     160
France      327
Italy       304
Portugal    122
Spain       506
dtype: int64

In [8]:
# Dummy-encode sex: one row per user, one column per value of `sex`,
# with the aggregated flags cast to integers.
piv_sex = (
    data_SPFIB[['user', 'sex']]
    .pivot_table(
        index='user',
        columns='sex',
        aggfunc=any,
        fill_value=False,
    )
    .astype(int)
)
# Column totals, i.e. how many users are flagged for each sex.
piv_sex.sum()

sex
f     312
m    1107
dtype: int64

In [9]:
# Put the artist, country and sex indicator columns side by side in one
# frame (column-wise concatenation aligned on the shared index).
piv_all = pd.concat(
    [piv_artist, piv_country, piv_sex],
    axis="columns",
)
piv_all.head()

Unnamed: 0_level_0,...and you will know us by the trail of dead,2pac,3 doors down,30 seconds to mars,311,36 crazyfists,44,50 cent,65daysofstatic,[unknown],...,yo la tengo,zero 7,Édith piaf,Belgium,France,Italy,Portugal,Spain,f,m
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
12,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
36,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
47,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,1,0,0,0,1
74,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1


# Apriori for artists

In [11]:
# Mine frequent artist itemsets with Apriori; an itemset is kept only if its
# support over the rows of piv_artist is at least 0.03.
apriori_artist = apriori(
    piv_artist,
    use_colnames=True,  # report artist names instead of column indices
    min_support=0.03,
)
apriori_artist

Unnamed: 0,support,itemsets
0,0.041579,([unknown])
1,0.043693,(a perfect circle)
2,0.073996,(ac/dc)
3,0.036646,(aerosmith)
4,0.092319,(air)
...,...,...
264,0.035941,"(u2, radiohead)"
265,0.035941,"(the beatles, the rolling stones)"
266,0.031712,"(the beatles, the strokes)"
267,0.039464,"(the strokes, the killers)"


### Apriori by confidence for artists

In [12]:
# Derive association rules from the Apriori artist itemsets, filtered by a
# confidence threshold of 0.4, and show the six most confident rules.
apriori_artist_rules = association_rules(
    apriori_artist,
    metric="confidence",
    min_threshold=0.4,
)
apriori_artist_rules.sort_values(by='confidence', ascending=False).head(n=6)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
19,(keane),(coldplay),0.038055,0.188161,0.033122,0.87037,4.625676,0.025961,6.26276
7,(beck),(radiohead),0.047216,0.237491,0.032417,0.686567,2.890916,0.021204,2.432766
14,(blur),(radiohead),0.047921,0.237491,0.030303,0.632353,2.662637,0.018922,2.074024
23,(snow patrol),(coldplay),0.05074,0.188161,0.031712,0.625,3.321629,0.022165,2.164905
40,(kaiser chiefs),(the killers),0.05074,0.109937,0.031008,0.611111,5.558761,0.02543,2.288735
32,(kaiser chiefs),(franz ferdinand),0.05074,0.095842,0.030303,0.597222,6.231311,0.02544,2.244806


### Apriori by support for artists

In [13]:
# Derive association rules from the same Apriori artist itemsets, this time
# filtered by a support threshold of 0.03, and show the six best-supported rules.
# NOTE: this rebinds `apriori_artist_rules`, replacing the confidence-filtered
# rules assigned to the same name in the previous cell.
apriori_artist_rules = association_rules(
    apriori_artist,
    metric="support",
    min_threshold=0.03,
)
apriori_artist_rules.sort_values(by='support', ascending=False).head(n=6)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
66,(coldplay),(radiohead),0.188161,0.237491,0.083157,0.441948,1.860901,0.038471,1.366375
67,(radiohead),(coldplay),0.237491,0.188161,0.083157,0.350148,1.860901,0.038471,1.249269
141,(radiohead),(muse),0.237491,0.174066,0.080338,0.338279,1.943392,0.038999,1.24816
140,(muse),(radiohead),0.174066,0.237491,0.080338,0.461538,1.943392,0.038999,1.416088
184,(the beatles),(radiohead),0.174066,0.237491,0.070472,0.404858,1.70473,0.029133,1.281222
185,(radiohead),(the beatles),0.237491,0.174066,0.070472,0.296736,1.70473,0.029133,1.174429


## Apriori for artists, sex and country


In [14]:
# Mine frequent itemsets with Apriori over the combined artist / country / sex
# indicator frame, using the same 0.03 minimum support as for artists alone.
apriori_all = apriori(
    piv_all,
    use_colnames=True,  # report column names instead of column indices
    min_support=0.03,
)
apriori_all

Unnamed: 0,support,itemsets
0,0.041579,([unknown])
1,0.043693,(a perfect circle)
2,0.073996,(ac/dc)
3,0.036646,(aerosmith)
4,0.092319,(air)
...,...,...
505,0.032417,"(the beatles, m, the rolling stones)"
506,0.038760,"(Italy, m, the beatles)"
507,0.042283,"(Spain, the beatles, m)"
508,0.031008,"(m, the strokes, the killers)"


In [50]:
# Distribution of itemset sizes (number of items per frequent itemset).
apriori_all['itemsets'].apply(len).describe()

count    510.000000
mean       1.790196
std        0.649242
min        1.000000
25%        1.000000
50%        2.000000
75%        2.000000
max        3.000000
Name: itemsets, dtype: float64

### Apriori by confidence for artists, sex and country

In [24]:
# Derive association rules from the combined itemsets, filtered by a
# confidence threshold of 0.1, and show the five most confident rules.
apriori_all_rules = association_rules(
    apriori_all,
    metric="confidence",
    min_threshold=0.1,
)
apriori_all_rules.sort_values(by='confidence', ascending=False).head(n=5)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
586,"(metallica, pink floyd)",(m),0.031712,0.780127,0.031008,0.977778,1.253357,0.006268,9.894292
581,"(metallica, nirvana)",(m),0.034531,0.780127,0.033122,0.959184,1.229523,0.006183,5.386892
18,(aphex twin),(m),0.034531,0.780127,0.033122,0.959184,1.229523,0.006183,5.386892
532,"(daft punk, France)",(m),0.035941,0.780127,0.033827,0.941176,1.20644,0.005788,3.737844
361,(slayer),(m),0.032417,0.780127,0.030303,0.934783,1.198244,0.005014,3.371388


In [26]:
# Keep only the rules whose consequent contains more than one item, then show
# the most confident of them (head() default: five rows).
(
    apriori_all_rules
    .loc[apriori_all_rules['consequents'].map(len) > 1]
    .sort_values(by='confidence', ascending=False)
    .head()
)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
559,(iron maiden),"(metallica, m)",0.073291,0.109232,0.036646,0.5,4.577419,0.02864,1.781536
680,(sigur rós),"(m, radiohead)",0.086681,0.193094,0.042283,0.487805,2.52626,0.025546,1.575388
553,(interpol),"(m, radiohead)",0.079634,0.193094,0.035941,0.451327,2.337349,0.020564,1.470652
428,(air),"(m, radiohead)",0.092319,0.193094,0.041579,0.450382,2.332451,0.023752,1.468121
726,(the rolling stones),"(the beatles, m)",0.072586,0.137421,0.032417,0.446602,3.249888,0.022442,1.558696


# FP-growth

**fp-growth** позволяет установить меньший порог и получить больше наборов, среди которых могут быть интересные правила

In [16]:
# Mine frequent artist itemsets with FP-growth at a much lower minimum
# support (0.005) than the Apriori runs, then report how many were found.
fpgrowth_artist = fpgrowth(
    piv_artist,
    use_colnames=True,  # report artist names instead of column indices
    min_support=0.005,
)
len(fpgrowth_artist)

19860

### FP-growth by confidence for artists


In [17]:
# Derive association rules from the FP-growth itemsets, filtered by a
# confidence threshold of 0.8, and show the ten rules with the highest support.
fpgrowth_artist_rules = association_rules(
    fpgrowth_artist,
    metric="confidence",
    min_threshold=0.8,
)
fpgrowth_artist_rules.sort_values(by='support', ascending=False).head(n=10)

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
11611,(keane),(coldplay),0.038055,0.188161,0.033122,0.87037,4.625676,0.025961,6.26276
1030,"(bloc party, muse, the killers)",(coldplay),0.023256,0.188161,0.019732,0.848485,4.509363,0.015356,5.35814
1682,"(interpol, arcade fire)",(the strokes),0.022551,0.087385,0.018323,0.8125,9.297883,0.016352,4.867277
5928,"(oasis, the killers, muse)",(coldplay),0.020437,0.188161,0.016913,0.827586,4.398295,0.013068,4.708668
10005,"(arctic monkeys, coldplay, the killers)",(muse),0.021142,0.174066,0.016913,0.8,4.595951,0.013233,4.129669
993,"(bloc party, muse, radiohead)",(coldplay),0.021142,0.188161,0.016913,0.8,4.251685,0.012935,4.059197
1898,"(death cab for cutie, the killers)",(coldplay),0.021142,0.188161,0.016913,0.8,4.251685,0.012935,4.059197
1232,"(arctic monkeys, bloc party, coldplay)",(muse),0.021142,0.174066,0.016913,0.8,4.595951,0.013233,4.129669
11498,"(massive attack, nirvana)",(radiohead),0.019732,0.237491,0.016209,0.821429,3.458775,0.011522,4.270049
10144,"(interpol, oasis)",(muse),0.019732,0.174066,0.016209,0.821429,4.719057,0.012774,4.625229


### FP-growth by lift for artists

In [27]:
# Summary statistics of the lift values across the FP-growth rules.
fpgrowth_artist_rules.loc[:, 'lift'].describe()

count    13197.000000
mean        12.055654
std          9.386494
min          3.368546
25%          5.314607
50%          9.122143
75%         14.696203
max        105.111111
Name: lift, dtype: float64

In [29]:
# Keep only the rules whose lift is above the 75th percentile of all lift values.
# The cut-off is computed from the rules themselves rather than hard-coded, so
# the cell stays correct if the data or the upstream support/confidence
# thresholds change.
lift_cutoff = fpgrowth_artist_rules['lift'].quantile(0.75)
fpgrowth_artist_rules_lift = fpgrowth_artist_rules[fpgrowth_artist_rules['lift'] > lift_cutoff]
# Number of rules that survive the filter, followed by a preview of the first ten.
print(len(fpgrowth_artist_rules_lift))
fpgrowth_artist_rules_lift.head(10)

3290


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
64,"(snow patrol, the kooks, radiohead)","(muse, the killers)",0.007752,0.049331,0.007047,0.909091,18.428571,0.006665,10.457364
75,"(snow patrol, the kooks, coldplay, radiohead)","(muse, the killers)",0.006342,0.049331,0.005638,0.888889,18.019048,0.005325,8.556025
118,"(snow patrol, bloc party, muse, radiohead)","(the kooks, coldplay)",0.007047,0.028189,0.005638,0.8,28.38,0.005439,4.859056
119,"(snow patrol, the kooks, coldplay, radiohead)","(bloc party, muse)",0.006342,0.03735,0.005638,0.888889,23.798742,0.005401,8.663848
134,"(snow patrol, bloc party, oasis)","(coldplay, the killers)",0.007752,0.055673,0.007047,0.909091,16.329114,0.006616,10.387597
138,"(snow patrol, bloc party, oasis)","(muse, the killers)",0.007752,0.049331,0.006342,0.818182,16.585714,0.00596,5.228682
144,"(snow patrol, bloc party, oasis, coldplay)","(muse, the killers)",0.007752,0.049331,0.006342,0.818182,16.585714,0.00596,5.228682
145,"(snow patrol, bloc party, oasis, muse)","(coldplay, the killers)",0.007047,0.055673,0.006342,0.9,16.165823,0.00595,9.44327
147,"(snow patrol, oasis, the killers, muse)","(bloc party, coldplay)",0.007752,0.03876,0.006342,0.818182,21.109091,0.006042,5.286822
148,"(snow patrol, bloc party, oasis)","(muse, coldplay, the killers)",0.007752,0.029598,0.006342,0.818182,27.642857,0.006113,5.337209


### FPMax by support for artists

In [42]:
# Mine itemsets with FPMax at a minimum support of 0.05.
fpmax_artists = fpmax(piv_artist, min_support=0.05, use_colnames=True)
# Summarise the result (entry count, columns, dtypes). The recorded output of
# this cell is a DataFrame.info() report, so the call is info() rather than len().
fpmax_artists.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 72 entries, 0 to 71
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   support   72 non-null     float64
 1   itemsets  72 non-null     object 
dtypes: float64(1), object(1)
memory usage: 1.2+ KB


In [52]:
# Derive rules from the FPMax itemsets, filtered by a support threshold of 0.05.
# With support_only=True only the rule support is filled in; the other metric
# columns are NaN in the resulting frame (presumably because FPMax does not
# return the supports of the itemsets' subsets -- confirm against mlxtend docs).
fpmax_artists_rules = association_rules(
    fpmax_artists,
    metric='support',
    min_threshold=0.05,
    support_only=True,
)
# Number of rules found, then all of them ordered by decreasing support.
print(len(fpmax_artists_rules))
fpmax_artists_rules.sort_values(by='support', ascending=False)

14


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
12,(coldplay),(radiohead),,,0.083157,,,,
13,(radiohead),(coldplay),,,0.083157,,,,
10,(muse),(radiohead),,,0.080338,,,,
11,(radiohead),(muse),,,0.080338,,,,
6,(the beatles),(radiohead),,,0.070472,,,,
7,(radiohead),(the beatles),,,0.070472,,,,
8,(muse),(coldplay),,,0.068358,,,,
9,(coldplay),(muse),,,0.068358,,,,
0,(coldplay),(the killers),,,0.055673,,,,
1,(the killers),(coldplay),,,0.055673,,,,
