In [1]:
import csv
import datetime
import pickle
import sys
import timeit

import pandas as pd
from mlxtend.frequent_patterns import association_rules
from mlxtend.frequent_patterns import apriori

In [2]:
X_TRAIN_FILE = '../X_train_without_biochem.pkl'
X_TEST_FILE = '../X_test_without_biochem.pkl'
Y_TRAIN_FILE = '../y_train_without_biochem.pkl'
Y_TEST_FILE = '../y_test_without_biochem.pkl'

MIN_SUPPORT = 0.025
MIN_THRESHOLD = 1.0

APRIORI_FREQUENT_ITEMSETS_FILE_CSV = 'frequent_itemsets_without_biochem.csv'
APRIORI_FREQUENT_ITEMSETS_FILE_PICKLE = 'frequent_itemsets_without_biochem.pkl'
APRIORI_RULES_FILE_CSV = 'rules_without_biochem.csv'
APRIORI_RULES_FILE_PICKLE = 'rules_without_biochem.pkl'
APRIORI_OUTPUT_FILE_CSV = 'output_without_biochem.csv'
APRIORI_OUTPUT_FILE_PICKLE = 'output_without_biochem.pkl'

In [3]:
script_start_time = datetime.datetime.now()
print('{} started at {}'.format(sys.argv[0], script_start_time))

/Users/dan/.local/share/virtualenvs/machine-learning-for-quality-improvement-NjV9ptfu/lib/python3.6/site-packages/ipykernel_launcher.py started at 2018-06-07 00:46:26.968330


In [4]:
# Generate the rubric dictionary
print('Generating rubric dictionary...', end='')
start_time = timeit.default_timer()
in_read_file = open('../input/large_anon_test_records_for_sharing.csv', mode='r')
out_read_file = open('../input/large_anon_test_records_for_sharing.csv', mode='r')
in_read_csv = csv.reader(in_read_file)
out_read_csv = csv.reader(out_read_file)
in_rubrics = {row[4]: row[3] for row in in_read_csv}
out_rubrics = {row[4]: row[3] for row in out_read_csv}
rubrics = {**in_rubrics, **out_rubrics}
in_read_file.close()
out_read_file.close()

LABELS = ['ENTERING_INDICATOR', 'LEAVING_INDICATOR']

def get_rubric(read_code):
    if read_code in LABELS: 
        return read_code
    
    return rubrics.get(read_code, 'unknown')

def is_entering_leaving(itemset):
    return any(s in LABELS for s in itemset)
print(' done in {:.2f}s'.format(timeit.default_timer() - start_time), flush=True)

Generating rubric dictionary... done in 4.48s


In [5]:
# Fetch all records first
print('Reading data...', end='')
start_time = timeit.default_timer()
X_train, X_test = pd.read_pickle(X_TRAIN_FILE), pd.read_pickle(X_TEST_FILE)
y_train, y_test = pd.read_pickle(Y_TRAIN_FILE), pd.read_pickle(Y_TEST_FILE)
print(' done in {:.2f}s'.format(timeit.default_timer() - start_time), flush=True)

Reading data... done in 0.03s


In [6]:
# Fetch all records first
print('Preprocessing...', end='')
start_time = timeit.default_timer()
train_df = pd.concat([X_train, y_train.rename('label')], axis=1)
test_df = pd.concat([X_test, y_test.rename('label')], axis=1)
merged_df = pd.concat([train_df, test_df], axis=0)
merged_df['ENTERING_INDICATOR'] = merged_df.label.apply(lambda x: x)
merged_df['LEAVING_INDICATOR'] = merged_df.label.apply(lambda x: not x)
merged_df.drop('label', axis=1, inplace=True)
print(' done in {:.2f}s'.format(timeit.default_timer() - start_time), flush=True)

Preprocessing... done in 0.04s


In [7]:
print('Calculating frequent itemsets...', end='')
start_time = timeit.default_timer()
frequent_itemsets = apriori(merged_df, min_support=MIN_SUPPORT, use_colnames=True)
print(' done in {:.2f}s'.format(timeit.default_timer() - start_time), flush=True)

# Log to pickle and CSV
with open(APRIORI_FREQUENT_ITEMSETS_FILE_PICKLE, mode='wb') as file:
    pickle.dump(frequent_itemsets, file)
frequent_itemsets.to_csv(APRIORI_FREQUENT_ITEMSETS_FILE_CSV)
print('Frequent itemsets saved at {} and {}'.format(APRIORI_FREQUENT_ITEMSETS_FILE_PICKLE, APRIORI_FREQUENT_ITEMSETS_FILE_CSV))

Calculating frequent itemsets... done in 0.91s
Frequent itemsets saved at frequent_itemsets_without_biochem.pkl and frequent_itemsets_without_biochem.csv


In [8]:
print('Generating rules...', end='')
start_time = timeit.default_timer()
rules = association_rules(frequent_itemsets, metric='lift', min_threshold=MIN_THRESHOLD)
print(' done in {:.2f}s'.format(timeit.default_timer() - start_time), flush=True)

# Calculate length of each itemset in the antecedents and the consequents
rules['antecedant length'] = rules['antecedants'].apply(lambda x: len(x))
rules['consequent length'] = rules['consequents'].apply(lambda x: len(x))

# Filter itemsets to only include those with 2 or more items
rules = rules.loc[(rules['antecedant length'] >= 2) | (rules['consequent length'] >= 2), :]

rules.drop(['antecedant length', 'consequent length'], axis='columns', inplace=True)

# Add rubric for easier interpretation
print('Converting Read codes to rubric...')
start_time = timeit.default_timer()
rules.drop(['antecedent support', 'consequent support', 'leverage', 'conviction'], axis='columns', inplace=True)
rules['antecedants rubric'] = rules['antecedants'].apply(lambda itemset: [get_rubric(item) for item in itemset])
rules['consequents rubric'] = rules['consequents'].apply(lambda itemset: [get_rubric(item) for item in itemset])
print(' done in {:.2f}s'.format(timeit.default_timer() - start_time), flush=True)

# Sort by lift
rules.sort_values('lift', ascending=False, inplace=True)

# Log to pickle and CSV
with open(APRIORI_RULES_FILE_PICKLE, mode='wb') as file:
    pickle.dump(rules, file)
rules.to_csv(APRIORI_RULES_FILE_CSV)
print('Rules saved at {} and {}'.format(APRIORI_RULES_FILE_PICKLE, APRIORI_RULES_FILE_CSV))

# Filter to those rules containing a label in LABELS
is_antecedent_entering_leaving = rules.antecedants.apply(is_entering_leaving)
is_consequent_entering_leaving = rules.consequents.apply(is_entering_leaving)

output_df = rules.loc[(is_antecedent_entering_leaving) | (is_consequent_entering_leaving), :]

# Drop the antecedants and consequents columns
output_df.drop(['antecedants', 'consequents'], axis='columns', inplace=True)

with open(APRIORI_OUTPUT_FILE_PICKLE, mode='wb') as file:
    pickle.dump(output_df, file)
output_df.to_csv(APRIORI_OUTPUT_FILE_CSV)
print('Output saved at {} and {}'.format(APRIORI_OUTPUT_FILE_PICKLE, APRIORI_OUTPUT_FILE_CSV))

Generating rules... done in 0.05s
Converting Read codes to rubric...
 done in 0.01s
Rules saved at rules_without_biochem.pkl and rules_without_biochem.csv
Output saved at output_without_biochem.pkl and output_without_biochem.csv


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [9]:
script_end_time = datetime.datetime.now()
print('{} completed at {}'.format(
    sys.argv[0], 
    script_end_time)
)
print('Total time: {}'.format(script_end_time - script_start_time))

/Users/dan/.local/share/virtualenvs/machine-learning-for-quality-improvement-NjV9ptfu/lib/python3.6/site-packages/ipykernel_launcher.py completed at 2018-06-07 00:46:32.882789
Total time: 0:00:05.914459
