-
Notifications
You must be signed in to change notification settings - Fork 0
/
mining.py
144 lines (114 loc) · 5.69 KB
/
mining.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
from fb.preprocessing import text_to_list, clean_twitter_text
from pymining import itemmining
import csv
import sys
import copy
def mine_frequent(transactions, min_sup):
    """
    Mine frequent itemsets from the transactions using pymining's relim algorithm.

    :param transactions: iterable of transactions, each an iterable of items
    :param min_sup: minimum support as a fraction of len(transactions)
    :return: mapping of itemset -> absolute frequency, as produced by
             itemmining.relim (callers iterate it with .items())
    """
    # Renamed from `input` to avoid shadowing the builtin of the same name.
    relim_input = itemmining.get_relim_input(transactions)
    # Convert the fractional support threshold into an absolute count.
    min_sup_count = int(round(min_sup * len(transactions)))
    return itemmining.relim(relim_input, min_support=min_sup_count)
def mine_interesting_and_frequent(transactions, contrasting_transactions, min_sup, max_sup):
    """
    Mine itemsets that are frequent in `transactions` yet rare in
    `contrasting_transactions`.

    :param transactions: transactions to mine frequent itemsets from
    :param contrasting_transactions: contrast set used to filter out patterns
    :param min_sup: minimum fractional support in `transactions`
    :param max_sup: maximum fractional support allowed in the contrast set
    :return: dict of itemset -> frequency, keeping only itemsets whose
             occurrence count in the contrast set is strictly below the ceiling
    """
    frequent = mine_frequent(transactions, min_sup)
    # Absolute ceiling on how often a pattern may appear in the contrast set.
    ceiling = int(round(max_sup * len(contrasting_transactions)))

    def occurrences_in_contrast(itemset):
        # Count contrast-set rows containing the itemset without building a list.
        return sum(1 for row in contrasting_transactions if contains(row, itemset))

    return {itemset: freq
            for itemset, freq in frequent.items()
            if occurrences_in_contrast(itemset) < ceiling}
def contains(transaction, itemset):
    """
    Return True if the transaction contains every item of the itemset.

    :param transaction: collection of items (membership tested with `in`)
    :param itemset: iterable of items that must all be present
    :return: bool; True for an empty itemset (vacuous truth)
    """
    # Generator form: no throwaway list, and all() short-circuits on the
    # first missing item.
    return all(item in transaction for item in itemset)
def uniquify(items):
    """
    Return a copy of `items` in which repeated values are disambiguated with
    a numeric suffix.

    The first occurrence of a value is kept unchanged; the n-th repeat becomes
    '<value>_<n>', e.g. ['a', 'a', 'a'] -> ['a', 'a_1', 'a_2'].

    :param items: list of strings
    :return: new list; the input list is not modified
    """
    # Build a fresh list instead of deep-copying and mutating in place,
    # and iterate items directly rather than indexing via range(len(...)).
    counts = {}
    result = []
    for item in items:
        seen = counts.get(item, 0)
        counts[item] = seen + 1
        # Every occurrence after the first gets its repeat index as a suffix.
        result.append(item if seen == 0 else item + '_' + str(seen))
    return result
# Command-line entry point.
# Usage: python3 mining.py <dataset> <effectiveness-threshold> <min-sup-a> <max-sup-b>
# Reads a CSV of posts, labels each row effective/ineffective (directly from an
# 'effective' column or by thresholding an 'effectiveness' score), then prints
# word patterns that discriminate effective posts from ineffective ones.
if __name__ == "__main__":
    try:
        data_file = sys.argv[1]
        param = sys.argv[2]
        effectiveness_threshold = float(param)
        min_sup = float(sys.argv[3])      # frequency floor within a class
        max_sup = float(sys.argv[4])      # frequency ceiling in the contrast class
        if max_sup > min_sup:
            # The contrast-set ceiling must not exceed the in-class floor.
            print('<min-sup-b> must be less than or equal to <min-sup-a>')
        else:
            print('Finding patterns in dataset ' + data_file)
            print('effectiveness_threshold=' + str(effectiveness_threshold))
            print('min_sup_a=' + str(min_sup))
            print('min_sup_b=' + str(max_sup))
            print()
            # escapechar='\\' lets the CSV contain escaped delimiters.
            with open(data_file) as file:
                reader = csv.reader(file, delimiter=',', escapechar='\\')
                data = [row for row in reader]
            header = data[0]
            data = data[1:]
            # Handles two cases:
            # 1. 'effective' attribute is given
            # 2. 'effectiveness' attribute is given and must be compared to
            #    threshold to label as effective or ineffective
            if 'effective' not in header:
                header.append('effective')
                effectiveness_col = header.index('effectiveness')
                # Label each data point as effective or ineffective
                for row in data:
                    if row[effectiveness_col] == '':
                        # Missing score -> unlabelled row.
                        effective = None
                    else:
                        effective = float(row[effectiveness_col]) >= effectiveness_threshold
                    row.append(effective)
            else:
                effective_col = header.index('effective')
                for row in data:
                    # Normalise textual labels to booleans; anything else -> None.
                    if row[effective_col] == 'True' or row[effective_col] == 'T':
                        row[effective_col] = True
                    elif row[effective_col] == 'False' or row[effective_col] == 'F':
                        row[effective_col] = False
                    else:
                        row[effective_col] = None
            # Column indices
            # author_col = header.index('author')
            # topic_col = header.index('topic')
            text_col = header.index('text')
            effective_col = header.index('effective')
            # Split effective posts and ineffective posts.
            # NOTE(review): rows labelled None (unlabelled) are falsy and fall
            # into the ineffective set here — confirm that is intended.
            effective_data = [row for row in data if row[effective_col]]
            ineffective_data = [row for row in data if not row[effective_col]]
            if 'twitter' in data_file:
                # Convert text to a list of words, with twitter-specific
                # cleaning applied first; uniquify keeps repeated words
                # distinct so itemset mining sees word multiplicity.
                effective_text = [uniquify(text_to_list(clean_twitter_text(row[text_col]), True)) for row in
                                  effective_data]
                ineffective_text = [uniquify(text_to_list(clean_twitter_text(row[text_col]), True)) for row in
                                    ineffective_data]
            else:
                # Convert text to a list of words
                effective_text = [uniquify(text_to_list(row[text_col], True)) for row in effective_data]
                ineffective_text = [uniquify(text_to_list(row[text_col], True)) for row in ineffective_data]
            # Find frequent itemsets in both sets.
            # NOTE(review): these two results are never used below — either
            # print them or drop the calls.
            frequent_effective = mine_frequent(effective_text, min_sup)
            frequent_ineffective = mine_frequent(ineffective_text, min_sup)
            # Find interesting frequent itemsets (frequent in one class,
            # rare in the other).
            interesting_effective = mine_interesting_and_frequent(effective_text, ineffective_text, min_sup, max_sup)
            interesting_ineffective = mine_interesting_and_frequent(ineffective_text, effective_text, min_sup, max_sup)
            print('Discriminating itemsets for effective posts:')
            for k, v in interesting_effective.items():
                print(str(set(k)) + ': frequency=' + str(v) + ', support=' + str(round(v / len(effective_data), 4)))
            print()
            print('Discriminating itemsets for ineffective posts:')
            for k, v in interesting_ineffective.items():
                print(str(set(k)) + ': frequency=' + str(v) + ', support=' + str(round(v / len(ineffective_data), 4)))
    except (ValueError, IndexError):
        # Missing or non-numeric CLI arguments land here.
        print('usage:')
        print('\tpython3 mining.py <dataset> <effectiveness-threshold> <min-sup-a> <max-sup-b>')