# Association rules mining to find frequent keyword sets in clusters

### Import libraries

In [22]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth

import glob

### Make keyword dataset for each cluster
Each data set is list of lists. Keywords of one article is a list. 

In [None]:
# Example of the dataset
#dataset = [['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
#           ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
#           ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
#           ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
#           ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]

#### Paths to files for each cluster 

In [23]:
dname='/home/erin/JUDITA/mdpi_sustainability/PAPER_DATA/Web_of_Science_records/'
clust_dirs=glob.glob(dname+"*_clust/Keywords/")
print(clust_dirs)

['/home/erin/JUDITA/mdpi_sustainability/PAPER_DATA/Web_of_Science_records/C4_clust/Keywords/', '/home/erin/JUDITA/mdpi_sustainability/PAPER_DATA/Web_of_Science_records/C10_clust/Keywords/', '/home/erin/JUDITA/mdpi_sustainability/PAPER_DATA/Web_of_Science_records/C9_clust/Keywords/', '/home/erin/JUDITA/mdpi_sustainability/PAPER_DATA/Web_of_Science_records/C5_clust/Keywords/', '/home/erin/JUDITA/mdpi_sustainability/PAPER_DATA/Web_of_Science_records/C8_clust/Keywords/', '/home/erin/JUDITA/mdpi_sustainability/PAPER_DATA/Web_of_Science_records/C1_clust/Keywords/', '/home/erin/JUDITA/mdpi_sustainability/PAPER_DATA/Web_of_Science_records/C6_clust/Keywords/', '/home/erin/JUDITA/mdpi_sustainability/PAPER_DATA/Web_of_Science_records/C7_clust/Keywords/', '/home/erin/JUDITA/mdpi_sustainability/PAPER_DATA/Web_of_Science_records/C2_clust/Keywords/']


#### Prepare dataset for each cluster by reading keywords from each country
Create lists of keywords for each cluster

In [24]:
datasets={}
for ddir in clust_dirs:
    clust_name=ddir.split('/')[-3]
    #print(clust_name)
    clust_keywords=[]
    country_keyword_files=glob.glob(ddir+"*.txt")
    #print(files)
    for country_keyword_file in country_keyword_files:
        with open(country_keyword_file,"r") as f:
            lines=f.readlines()

        for line in lines:
            keywords=line.rstrip().split(' ')
            clust_keywords.append(keywords)

    datasets[clust_name]=clust_keywords

#### Compute frequent itemsets for each cluster

In [25]:
clust_names=datasets.keys()
print(clust_names)

dict_keys(['C4_clust', 'C10_clust', 'C9_clust', 'C5_clust', 'C8_clust', 'C1_clust', 'C6_clust', 'C7_clust', 'C2_clust'])


In [26]:
def get_frequent_itemsets(dataset, min_support=0.5):
    te = TransactionEncoder()
    te_ary = te.fit(dataset).transform(dataset)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = fpgrowth(df, min_support=min_support, use_colnames=True)
    return frequent_itemsets

##### Print frequent itemsets for each cluster. 
###### Support is different in each cluster. Lower support gives more itemsets. 

##### C2 cluster

In [47]:
dataset=datasets['C2_clust']
itemsets=get_frequent_itemsets(dataset,min_support=0.05)
print(itemsets)

     support                      itemsets
0   0.390102                     (suicide)
1   0.105531                        (risk)
2   0.094614                  (prevalence)
3   0.061863                (epidemiology)
4   0.096798                       (rates)
5   0.089520                    (behavior)
6   0.067686                      (health)
7   0.052402                   (attitudes)
8   0.126638                   (mortality)
9   0.053857                      (trends)
10  0.166667                  (depression)
11  0.069869                    (ideation)
12  0.098253                (risk-factors)
13  0.056041                   (disorders)
14  0.080058                  (prevention)
15  0.053857                 (netherlands)
16  0.054585                 (adolescents)
17  0.092431                  (euthanasia)
18  0.077147                       (death)
19  0.155022                       (japan)
20  0.061135  (physician-assisted_suicide)
21  0.058224                     (belgium)
22  0.06113

##### C4 cluster

In [60]:
dataset=datasets['C4_clust']
itemsets4=get_frequent_itemsets(dataset,min_support=0.13)
print(itemsets4)

   support                 itemsets
0   0.5000                (suicide)
1   0.5000                 (guyana)
2   0.2500               (behavior)
3   0.1875            (adolescents)
4   0.1875        (suicide_attempt)
5   0.1875                   (risk)
6   0.2500        (guyana, suicide)
7   0.1875  (risk, suicide_attempt)


##### C5 cluster

In [35]:
dataset=datasets['C5_clust']
itemsets=get_frequent_itemsets(dataset,min_support=0.05)
print(itemsets)

     support                          itemsets
0   0.081481                       (lithuania)
1   0.303704                       (mortality)
2   0.459259                         (suicide)
3   0.074074                            (risk)
4   0.051852                      (prevention)
5   0.103704                      (depression)
6   0.125926                     (adolescents)
7   0.088889                           (death)
8   0.074074                        (children)
9   0.118519                          (trends)
10  0.081481                    (risk-factors)
11  0.133333                           (rates)
12  0.051852                    (inequalities)
13  0.051852                     (former_ussr)
14  0.081481                        (behavior)
15  0.074074                      (prevalence)
16  0.066667                        (homicide)
17  0.111111                         (alcohol)
18  0.155556                          (russia)
19  0.074074                         (belarus)
20  0.051852 

##### C6 cluster

In [37]:
dataset=datasets['C6_clust']
itemsets=get_frequent_itemsets(dataset,min_support=0.05)
print(itemsets)

     support               itemsets
0   0.089109                (rates)
1   0.071287                (death)
2   0.068647               (trends)
3   0.386139              (suicide)
4   0.091749         (risk-factors)
5   0.090429         (epidemiology)
6   0.149175            (mortality)
7   0.058086               (health)
8   0.089109             (behavior)
9   0.052805        (united-states)
10  0.118812                 (risk)
11  0.058086            (disorders)
12  0.050165             (ideation)
13  0.074587           (prevention)
14  0.068647           (euthanasia)
15  0.069967           (prevalence)
16  0.122112           (depression)
17  0.080528              (finland)
18  0.077228   (suicide, mortality)
19  0.052805        (risk, suicide)
20  0.050825  (suicide, depression)


##### C7 cluster

In [38]:
dataset=datasets['C7_clust']
itemsets=get_frequent_itemsets(dataset,min_support=0.05)
print(itemsets)

     support                  itemsets
0   0.425575                 (suicide)
1   0.056012                   (death)
2   0.152187              (depression)
3   0.127756               (mortality)
4   0.078060              (prevention)
5   0.068764            (epidemiology)
6   0.052675                  (trends)
7   0.118937            (risk-factors)
8   0.094268                (behavior)
9   0.070671             (adolescents)
10  0.135264                    (risk)
11  0.078775                  (health)
12  0.052199                (children)
13  0.057562               (disorders)
14  0.085330                   (rates)
15  0.067930           (mental-health)
16  0.064951                (ideation)
17  0.106424              (prevalence)
18  0.213801           (united-states)
19  0.072220     (suicide, depression)
20  0.067453      (suicide, mortality)
21  0.065308           (risk, suicide)
22  0.083542  (suicide, united-states)


##### C8 cluster

In [39]:
dataset=datasets['C8_clust']
itemsets=get_frequent_itemsets(dataset,min_support=0.05)
print(itemsets)

     support                      itemsets
0   0.364764                     (suicide)
1   0.107010                   (mortality)
2   0.102667                       (rates)
3   0.058002                      (trends)
4   0.093362                  (euthanasia)
5   0.109801                        (risk)
6   0.069479                  (prevention)
7   0.092742                    (behavior)
8   0.059243                   (attitudes)
9   0.165943                  (depression)
10  0.132444                  (prevalence)
11  0.066687                (epidemiology)
12  0.142060                (risk-factors)
13  0.065136               (mental-health)
14  0.062035                       (death)
15  0.070409                 (adolescents)
16  0.093983                    (ideation)
17  0.073201                      (health)
18  0.053970                   (disorders)
19  0.052419                      (gender)
20  0.092742                 (netherlands)
21  0.055521  (physician-assisted_suicide)
22  0.10794

##### C9 cluster

In [40]:
dataset=datasets['C9_clust']
itemsets=get_frequent_itemsets(dataset,min_support=0.05)
print(itemsets)

     support                 itemsets
0   0.065476           (epidemiology)
1   0.418651                (suicide)
2   0.119048               (behavior)
3   0.115079           (risk-factors)
4   0.111111             (prevalence)
5   0.063492                  (rates)
6   0.061508                 (health)
7   0.128968             (depression)
8   0.103175            (adolescents)
9   0.097222                   (risk)
10  0.083333               (children)
11  0.057540               (violence)
12  0.101190               (ideation)
13  0.053571        (suicide_attempt)
14  0.061508                 (gender)
15  0.051587                  (death)
16  0.073413              (mortality)
17  0.148810                 (turkey)
18  0.057540                (autopsy)
19  0.055556      (suicide, behavior)
20  0.055556  (suicide, risk-factors)
21  0.059524    (suicide, depression)
22  0.067460        (suicide, turkey)


##### C10 cluster

In [42]:
dataset=datasets['C10_clust']
itemsets=get_frequent_itemsets(dataset,min_support=0.05)
print(itemsets)

     support                                  itemsets
0   0.146331                              (prevalence)
1   0.102389                            (risk-factors)
2   0.086604                                (behavior)
3   0.050341                                   (rates)
4   0.075512                             (adolescents)
5   0.072526                           (mental-health)
6   0.392918                                 (suicide)
7   0.124573                               (mortality)
8   0.139932                                    (iraq)
9   0.142491                                    (risk)
10  0.083618                                (veterans)
11  0.057594                                (military)
12  0.052474                                    (care)
13  0.125853           (posttraumatic-stress-disorder)
14  0.093430                                (ideation)
15  0.091297                                  (health)
16  0.169369                              (depression)
17  0.0665

In [None]:
### Create 

In [93]:
# list of dataframes
dataframes=[]

# create a dataframe for all clusters
for cluster in ['C2', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10']:
    cluster_key=cluster+'_clust'
    dataset=datasets[cluster_key]
    itemsets=get_frequent_itemsets(dataset,min_support=0.05)
    itemsets['cluster']=cluster
    dataframes.append(itemsets)
    

In [94]:
itemsets4['cluster']='C4'
itemsets4

Unnamed: 0,support,itemsets,cluster
0,0.5,(suicide),C4
1,0.5,(guyana),C4
2,0.25,(behavior),C4
3,0.1875,(adolescents),C4
4,0.1875,(suicide_attempt),C4
5,0.1875,(risk),C4
6,0.25,"(guyana, suicide)",C4
7,0.1875,"(risk, suicide_attempt)",C4


In [95]:
dataframes.append(itemsets4)

In [96]:
df=pd.concat(dataframes)
df

Unnamed: 0,support,itemsets,cluster
0,0.390102,(suicide),C2
1,0.105531,(risk),C2
2,0.094614,(prevalence),C2
3,0.061863,(epidemiology),C2
4,0.096798,(rates),C2
...,...,...,...
3,0.187500,(adolescents),C4
4,0.187500,(suicide_attempt),C4
5,0.187500,(risk),C4
6,0.250000,"(guyana, suicide)",C4


In [97]:
def convertTuple(tup):
    return ' | '.join([str(x) for x in tup])

itemstrings=[]
for item in df['itemsets']:
    itemstr=convertTuple(item)
    #print(itemstr)
    itemstrings.append(itemstr)

In [98]:
df['items']=itemstrings
df.drop(columns='itemsets', inplace=True)
df

Unnamed: 0,support,cluster,items
0,0.390102,C2,suicide
1,0.105531,C2,risk
2,0.094614,C2,prevalence
3,0.061863,C2,epidemiology
4,0.096798,C2,rates
...,...,...,...
3,0.187500,C4,adolescents
4,0.187500,C4,suicide_attempt
5,0.187500,C4,risk
6,0.250000,C4,guyana | suicide


In [99]:
node_items=pd.DataFrame({'node': pd.unique(df['items']) })
node_items['property']='keyword'

cluster_items=pd.DataFrame({'node':pd.unique(df['cluster']) })
cluster_items['property']='cluster'

df_node=pd.concat([cluster_items, node_items])
print(df_node)

                      node property
0                       C2  cluster
1                       C5  cluster
2                       C6  cluster
3                       C7  cluster
4                       C8  cluster
..                     ...      ...
80  epidemiology | suicide  keyword
81      afghanistan | iraq  keyword
82                  guyana  keyword
83        guyana | suicide  keyword
84  risk | suicide_attempt  keyword

[93 rows x 2 columns]


In [100]:
#save this dataframe and node table to visualize as network
df.to_csv("clusters_itemsets_net.csv",index=False)
df_node.to_csv("clusters_itemsets_nodes.csv",index=False)