# Association rules mining to find frequent keyword sets in clusters

### Import libraries

In [1]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth

import glob

### Make keyword dataset for each cluster
Each data set is list of lists. Keywords of one article is a list. 

#### Paths to files for each cluster 

In [21]:
dname='/home/erin/Desktop/SUIC/frequent_itemsets/'
clust_dirs=glob.glob(dname+"regions/*")
#clust_dirs_medium=glob.glob(dname+"Medium_*/*/")
#clust_dirs_low=glob.glob(dname+"Low_*/*/")
#clust_dirs=clust_dirs_high + clust_dirs_medium + clust_dirs_low
print(clust_dirs)

['/home/erin/Desktop/SUIC/frequent_itemsets/regions/North_America', '/home/erin/Desktop/SUIC/frequent_itemsets/regions/East_Asia_and_Pacific', '/home/erin/Desktop/SUIC/frequent_itemsets/regions/Europe_and_Central_Acia', '/home/erin/Desktop/SUIC/frequent_itemsets/regions/South_Asia', '/home/erin/Desktop/SUIC/frequent_itemsets/regions/Middle_East_and_North_Africa', '/home/erin/Desktop/SUIC/frequent_itemsets/regions/Sub_Saharan_Africa', '/home/erin/Desktop/SUIC/frequent_itemsets/regions/Latin_America_and_Caribbean']


#### Prepare dataset for each cluster by reading keywords from each country
Create lists of keywords for each cluster

In [25]:
datasets={}
for ddir in clust_dirs:
    #print(ddir)
    clust_name=ddir.split('/')[-1]
    #print(clust_name)
    clust_keywords=[]
    country_keyword_files=glob.glob(ddir+"/*/*.txt")
    #print(country_keyword_files)
    for country_keyword_file in country_keyword_files:
        with open(country_keyword_file,"r") as f:
            lines=f.readlines()

        for line in lines:
            keywords=line.rstrip().split(' ')
            clust_keywords.append(keywords)

    datasets[clust_name]=clust_keywords

In [26]:
print(datasets)



#### Compute frequent itemsets

In [27]:
clust_names=datasets.keys()
print(clust_names)

dict_keys(['North_America', 'East_Asia_and_Pacific', 'Europe_and_Central_Acia', 'South_Asia', 'Middle_East_and_North_Africa', 'Sub_Saharan_Africa', 'Latin_America_and_Caribbean'])


In [16]:
def get_frequent_itemsets(dataset, min_support=0.5):
    te = TransactionEncoder()
    te_ary = te.fit(dataset).transform(dataset)
    df = pd.DataFrame(te_ary, columns=te.columns_)
    frequent_itemsets = fpgrowth(df, min_support=min_support, use_colnames=True)
    return frequent_itemsets

##### Print frequent itemsets for each cluster. 
###### Support is different in each cluster. Lower support gives more itemsets. 

##### Process a cluster

In [32]:
cname='North_America'
dataset=datasets[cname]
itemsets=get_frequent_itemsets(dataset,min_support=0.01)
for item in itemsets:
    print(item)
print(itemsets)

support
itemsets
      support                   itemsets
0    0.280066            (united_states)
1    0.052338               (prevention)
2    0.026907              (association)
3    0.016407                   (deaths)
4    0.011157             (availability)
..        ...                        ...
135  0.016079   (behavior, risk_factors)
136  0.010008           (behavior, risk)
137  0.010664     (behavior, prevalence)
138  0.013126        (violence, suicide)
139  0.010664  (united_states, violence)

[140 rows x 2 columns]


### Create cluster itemsets

In [33]:
# list of dataframes
dataframes=[]

# create a dataframe for all clusters
for cluster in clust_names:
    dataset=datasets[cluster]
    itemsets=get_frequent_itemsets(dataset,min_support=0.01)
    itemsets['cluster']=cluster
    dataframes.append(itemsets)
    

In [34]:
df=pd.concat(dataframes)
df

Unnamed: 0,support,itemsets,cluster
0,0.280066,(united_states),North_America
1,0.052338,(prevention),North_America
2,0.026907,(association),North_America
3,0.016407,(deaths),North_America
4,0.011157,(availability),North_America
...,...,...,...
132,0.021459,"(suicide, mexico)",Latin_America_and_Caribbean
133,0.010014,"(depression, mexico)",Latin_America_and_Caribbean
134,0.011445,"(mexico, mortality)",Latin_America_and_Caribbean
135,0.012160,"(homicide, suicide)",Latin_America_and_Caribbean


In [42]:
#save this dataframe and node table to visualize as network
df.to_csv("clusters_itemsets_net.csv",index=False)
#df_node.to_csv("clusters_itemsets_nodes.csv",index=False)

In [38]:
def convertTuple(tup):
    return ' | '.join([str(x) for x in tup])

itemstrings=[]
for item in df['itemsets']:
    itemstr=convertTuple(item)
    #print(itemstr)
    itemstrings.append(itemstr)

KeyError: 'itemsets'

In [37]:
df['items']=itemstrings
df.drop(columns='itemsets', inplace=True)
df

KeyError: "['itemsets'] not found in axis"

In [35]:
node_items=pd.DataFrame({'node': pd.unique(df['items']) })
node_items['property']='keyword'

cluster_items=pd.DataFrame({'node':pd.unique(df['cluster']) })
cluster_items['property']='cluster'

df_node=pd.concat([cluster_items, node_items])
print(df_node)

                                                   node property
0                           Low_SMR_Europe_Central_Asia  cluster
1                 Medium_higher_SMR_Europe_Central_Asia  cluster
2                        Medium_SMR_Europe_Central_Asia  cluster
3                   High_medium_SMR_Europe_Central_Asia  cluster
4                     High_high_SMR_Europe_Central_Asia  cluster
...                                                 ...      ...
4260                                 behavior | smoking  keyword
4261                                mortality | smoking  keyword
4262                     mortality | behavior | smoking  keyword
4263      vital_statistics | classification | mortality  keyword
4264  vital_statistics | mortality | classification ...  keyword

[4271 rows x 2 columns]
