# Association rule mining

### Frequent itemsets for countries and clusters

In [2]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth


# Toy example: each inner list is one transaction (the items bought together).
dataset = [
    ['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
    ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
    ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
    ['Corn', 'Onion', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs'],
]

# Encode the transactions as a table with one column per distinct item.
te = TransactionEncoder()
te.fit(dataset)
te_ary = te.transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)

# Keep the itemsets whose support reaches the 0.6 threshold.
frequent_itemsets = fpgrowth(df, use_colnames=True, min_support=0.6)
### alternatively:
#frequent_itemsets = apriori(df, min_support=0.6, use_colnames=True)
#frequent_itemsets = fpmax(df, min_support=0.6, use_colnames=True)

frequent_itemsets

Unnamed: 0,support,itemsets
0,1.0,(Kidney Beans)
1,0.8,(Eggs)
2,0.6,(Yogurt)
3,0.6,(Onion)
4,0.6,(Milk)
5,0.8,"(Kidney Beans, Eggs)"
6,0.6,"(Kidney Beans, Yogurt)"
7,0.6,"(Onion, Eggs)"
8,0.6,"(Onion, Kidney Beans)"
9,0.6,"(Onion, Kidney Beans, Eggs)"


### Make datasets for countries and create frequent itemsets for each country

In [3]:
import glob
import pandas as pd

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth

# Keyword files for one example country.
# glob returns paths in arbitrary order, so sort them for reproducible runs.
fdir = '/home/erin/JUDITA/mdpi_sustainability/kwdsuic/'
country = 'kwd_per_countries/train/Lithuania/'
files = sorted(glob.glob(fdir + country + '*.txt'))

# Read the list of country names; the context manager closes the file handle
# as soon as the lines are loaded.
with open(fdir + 'countries_list', 'r') as f:
    countries = f.readlines()
# The first line is skipped (presumably a header line -- TODO confirm).
countries_list = [name.rstrip() for name in countries[1:]]
print(countries_list)

# Cluster list: directory holding the per-cluster keyword files and the
# cluster identifiers to process.
clust_dir = fdir + 'kwd_per_clusters/train/'
clusters = ['C10', 'C2', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9']
print(clusters)

['Afghanistan', 'Albania', 'Algeria', 'Argentina', 'Armenia', 'Australia', 'Austria', 'Azerbaijan', 'Bahamas', 'Bahrain', 'Bangladesh', 'Barbados', 'Belarus', 'Belgium', 'Benin', 'Bhutan', 'Bolivia', 'Bosnia_and_Herzegovina', 'Brazil', 'Brunei_Darussalam', 'Bulgaria', 'Burkina_Faso', 'Cambodia', 'Cameroon', 'Canada', 'Central_African_Republic', 'Chad', 'Chile', 'China', 'Colombia', 'Congo', 'Congo_Democratic_Republic', 'Costa_Rica', 'Croatia', 'Cuba', 'Cyprus', 'Czechia', 'Denmark', 'Dominican_Republic', 'Ecuador', 'Egypt', 'El_Salvador', 'Eritrea', 'Estonia', 'Ethiopia', 'Fiji', 'Finland', 'France', 'Gambia', 'Georgia', 'Germany', 'Ghana', 'Greece', 'Guatemala', 'Guinea', 'Guinea-Bissau', 'Guyana', 'Haiti', 'Honduras', 'Hungary', 'Iceland', 'India', 'Indonesia', 'Iran', 'Iraq', 'Ireland', 'Israel', 'Italy', 'Jamaica', 'Japan', 'Jordan', 'Kazakhstan', 'Kenya', 'Kiribati', 'Kuwait', 'Latvia', 'Lebanon', 'Liberia', 'Libya', 'Lithuania', 'Luxembourg', 'Madagascar', 'Malawi', 'Malaysia', '

### Make datasets for each country

In [8]:
# Number of frequent itemsets (rows) in the current `frequent_itemsets` frame.
# NOTE(review): the value depends on which cell last assigned
# `frequent_itemsets`; re-run the intended mining cell first.
len(frequent_itemsets)

35

In [22]:
import glob
import os
from pathlib import Path
import pandas as pd

from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth

# Input locations: project base directory and the per-country keyword folders.
fdir = '/home/erin/JUDITA/mdpi_sustainability/kwdsuic/'
countries_dir = fdir + 'kwd_per_countries/train/'

# Save dir: one CSV of frequent itemsets is written here per country.
save_dir = fdir + 'kwd_per_countries/freq_itemsets/'
Path(save_dir).mkdir(parents=True, exist_ok=True)


# Read the countries list; the context manager closes the file handle as soon
# as the lines are loaded.
with open(fdir + 'countries_list', 'r') as f:
    countries = f.readlines()
# The first line is skipped (presumably a header line -- TODO confirm).
countries_list = [name.rstrip() for name in countries[1:]]
print(len(countries_list))


for country in countries_list:
    # glob returns paths in arbitrary order; sorting keeps the transaction
    # order (and therefore the order of equal-support rows) reproducible.
    files = sorted(glob.glob(countries_dir + country + '/*.txt'))

    # Each file contributes one transaction: its content split on single spaces.
    # NOTE(review): split(" ") keeps empty tokens for consecutive spaces and
    # does not split on newlines -- confirm the files are single-line and
    # single-space separated.
    dset = []
    for file in files:
        with open(file, 'r') as f:
            dset.append(f.read().rstrip().split(" "))
    print(country + " =  " + str(len(dset)))

    # Encode the transactions as a table with one column per distinct token.
    te = TransactionEncoder()
    te_ary = te.fit(dset).transform(dset)
    df = pd.DataFrame(te_ary, columns=te.columns_)

    # Mine itemsets with support >= 0.02, tag every row with the country and
    # its number of transactions (kept as a string, as before), and order the
    # rows by decreasing support.
    frequent_itemsets = (
        fpgrowth(df, min_support=0.02, use_colnames=True)
        .assign(country=country, n_articles=str(len(dset)))
        .sort_values(by='support', ascending=False)
    )
    frequent_itemsets.to_csv(save_dir + country + "_freq_itemset.csv")
    print(frequent_itemsets)
    

156
Afghanistan =  449
      support                               itemsets      country n_articles
0    0.273942        (posttraumatic_stress_disorder)  Afghanistan        449
8    0.269488                          (afghanistan)  Afghanistan        449
1    0.236080                                 (iraq)  Afghanistan        449
9    0.233853                              (suicide)  Afghanistan        449
16   0.178174                             (veterans)  Afghanistan        449
..        ...                                    ...          ...        ...
71   0.020045                           (care, iraq)  Afghanistan        449
72   0.020045  (care, posttraumatic_stress_disorder)  Afghanistan        449
74   0.020045                           (care, risk)  Afghanistan        449
141  0.020045                (afghanistan, soldiers)  Afghanistan        449
155  0.020045               (personnel, afghanistan)  Afghanistan        449

[172 rows x 4 columns]
Albania =  15
       support 

In [17]:
# Build the transaction list: one entry per path in `files`, holding that
# file's content split on single spaces.
dset = []
for file in files:
    with open(file, 'r') as f:
        tokens = f.read().rstrip().split(" ")
    dset.append(tokens)

print(len(dset))

99


In [31]:
# try to find frequent itemset
te = TransactionEncoder()
te_ary = te.fit(dset).transform(dset)
df = pd.DataFrame(te_ary, columns=te.columns_)

#print(df)

In [21]:
# Mine the itemsets of `df` whose support reaches the 0.3 threshold and
# display them as the cell result.
frequent_itemsets = fpgrowth(
    df,
    use_colnames=True,
    min_support=0.3,
)

frequent_itemsets

Unnamed: 0,support,itemsets


### Make datasets for clusters where one item is the country

In [21]:
# Cluster list: folder with the per-cluster keyword files and the cluster
# identifiers to process.
clust_dir = '/home/erin/JUDITA/mdpi_sustainability/kwdsuic/kwd_per_cluster_organized_by_paper/train/'
clusters = ['C10', 'C2', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9']
print(clusters)

# Save dir: one CSV of frequent itemsets is written here per cluster.
save_dir = '/home/erin/JUDITA/mdpi_sustainability/kwdsuic/kwd_per_cluster_organized_by_paper/freq_itemsets/'
Path(save_dir).mkdir(parents=True, exist_ok=True)


for cluster in clusters:
    # glob returns paths in arbitrary order; sorting keeps the transaction
    # order (and therefore the order of equal-support rows) reproducible.
    files = sorted(glob.glob(clust_dir + cluster + '/*.txt'))

    # Each file contributes one transaction: its content split on single spaces.
    # NOTE(review): split(" ") keeps empty tokens for consecutive spaces and
    # does not split on newlines -- confirm the files are single-line and
    # single-space separated.
    dset = []
    for file in files:
        with open(file, 'r') as f:
            dset.append(f.read().rstrip().split(" "))
    print(cluster + " = " + str(len(dset)))

    # Encode the transactions as a table with one column per distinct token.
    te = TransactionEncoder()
    te_ary = te.fit(dset).transform(dset)
    df = pd.DataFrame(te_ary, columns=te.columns_)

    # Mine itemsets with support >= 0.005, tag every row with the cluster and
    # its number of transactions (kept as a string, as before), and order the
    # rows by decreasing support.
    frequent_itemsets = (
        fpgrowth(df, min_support=0.005, use_colnames=True)
        .assign(cluster=cluster, cluster_size=str(len(dset)))
        .sort_values(by='support', ascending=False)
    )
    frequent_itemsets.to_csv(save_dir + cluster + '_freq_itemset.csv')
    print(frequent_itemsets)




['C10', 'C2', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9']
C10 = 3859
      support                         itemsets cluster cluster_size
19   0.246178                        (suicide)     C10         3859
50   0.111169                     (depression)     C10         3859
7    0.090438                     (prevalence)     C10         3859
0    0.088624  (posttraumatic_stress_disorder)     C10         3859
51   0.086810                           (risk)     C10         3859
..        ...                              ...     ...          ...
294  0.005183         (resilience, deployment)     C10         3859
158  0.005183                      (childhood)     C10         3859
126  0.005183                      (migration)     C10         3859
160  0.005183                    (parasuicide)     C10         3859
229  0.005183                (mortality, iraq)     C10         3859

[353 rows x 4 columns]
C2 = 2293
      support                            itemsets cluster cluster_size
3    0.242913     