In [1]:
# Exercise 5

In [2]:
import math
import numpy as np
import pandas as pd

import lpatid as lpi

In [3]:
pres_data = pd.read_csv("datasets/USPresidency.csv")
pres_data.head()

Unnamed: 0,Year,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12,Target
0,1864,0,0,0,0,1,0,0,1,1,0,0,0,1
1,1868,1,1,0,0,0,0,1,1,1,0,1,0,1
2,1872,1,1,0,0,1,0,1,0,0,0,1,0,1
3,1880,1,0,0,1,0,0,1,1,0,0,0,0,1
4,1888,0,0,0,0,1,0,0,0,0,0,0,0,1


In [4]:
class_labels = pres_data.iloc[:, 13]
years = pres_data.iloc[:, 0]
pres_data = pres_data.iloc[:, 1:13]
pres_data.head()

Unnamed: 0,Q1,Q2,Q3,Q4,Q5,Q6,Q7,Q8,Q9,Q10,Q11,Q12
0,0,0,0,0,1,0,0,1,1,0,0,0
1,1,1,0,0,0,0,1,1,1,0,1,0
2,1,1,0,0,1,0,1,0,0,0,1,0
3,1,0,0,1,0,0,1,1,0,0,0,0
4,0,0,0,0,1,0,0,0,0,0,0,0


In [5]:
# Convert instances into strings and generate alphabet
inst_str = []
alphabet = set()
for i, instance in enumerate(pres_data.iterrows()):
    string = ""
    for item in instance[1].values:
        string += str(item)
        alphabet.add(str(item))
        
    inst_str.append(string)
    
print(alphabet)

{'0', '1'}


In [6]:
# Generate two disjoint sets, one for each target class
set_incumbent = set()
set_challenger = set()
for i, string in zip(class_labels, inst_str):
    if (i == 1):  # Target = incumbent victory
        set_incumbent.add(string)
    else:  # Target = challenger victory
        set_challenger.add(string)

In [7]:
# Generate patterns
l = 7
patterns = lpi.lpattern(set_incumbent, set_challenger, l)
print(patterns)

Successfully found small enough set of patterns
(['***0*******0', '100100110000', '010010100001'], ['***1***0****', '***10**110*0', '***0**0**001'])


In [8]:
# Double-check patterns describe the data set
matches = set()
for pattern in patterns[0]:
    for string in set_incumbent:
        if lpi.compatible([string], [pattern]):
            matches.add(string)
            
        if lpi.compatible(set_challenger, [pattern], mode='any'):
            print("Error: matches opposite class", pattern)

if matches == set_incumbent:
    print("success")
else:
    print("failure")
    
# Double-check patterns describe the data set
matches = set()
for pattern in patterns[1]:
    for string in set_challenger:
        if lpi.compatible([string], [pattern]):
            matches.add(string)
            
        if lpi.compatible(set_incumbent, [pattern], mode='any'):
            print("Error: matches opposite class", pattern)

if matches == set_challenger:
    print("success")
else:
    print("failure")

success
success


In [9]:
lpi.reduce_patterns(patterns)
print(patterns)

(['***0*******0', '100100110000', '010010100001'], ['***1***0****', '***10**110*0', '***0**0**001'])


In [22]:
# The local search heuristic can also be used to find a set of patterns, not necessarily fulfilling |P|<=l"
patterns = (list(set_incumbent), list(set_challenger))
lpi.reduce_patterns(patterns)
print(patterns)

(['*00**0*10000', '*1001*1*00**', '*1*0***1***0', '***0*0***0*0'], ['1**10**11000', '00*1*1******', '110*01000*0*', '******0***01', '1**1*0*0***0'])


In [11]:
# Test reduce_patterns function as results for presidency dataset aren't being reduced
test_patterns = (["0111", "0011", "1100", "0000"], ["0101", "0100"])
lpi.reduce_patterns(test_patterns)
print(test_patterns)

(['1100', '0000', '0*11'], ['010*'])


In [12]:
# Voting dataset
vote_data = pd.read_csv("datasets/house-votes-84.csv", header=None)
vote_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,republican,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?
2,democrat,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y


In [13]:
v_class_labels = vote_data.iloc[:, 0]
vote_data = vote_data.iloc[:, 1:]
vote_data.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y
1,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?
2,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n
3,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y
4,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y


In [14]:
# Convert instances into strings and generate alphabet
v_inst_str = []
v_alphabet = set()
for i, instance in enumerate(vote_data.iterrows()):
    string = ""
    for item in instance[1].values:
        string += str(item)
        v_alphabet.add(str(item))
        
    v_inst_str.append(string)
    
print(v_alphabet)

{'?', 'y', 'n'}


In [15]:
# Generate two disjoint sets, one for each target class
set_democrats = set()
set_republicans = set()
j = 0
for i, string in zip(v_class_labels, v_inst_str):
    # Only find patterns for a subset, due to runtime of lpatterns algorithm
    # Dataset appears to be randomly ordered, taking first 90 for consistent results
    if j > 90:
        break
        
    j += 1
    
    if (i == 'democrat'):
        set_democrats.add(string)
    else:
        set_republicans.add(string)

In [23]:
print(len(set_democrats), len(set_republicans))

45 31


In [16]:
# Generate patterns
l = 13
v_patterns = lpi.lpattern(set_democrats, set_republicans, l)
print(v_patterns)

Successfully found small enough set of patterns
(['**yn************', 'nyyyyyn**yy*yyn*', 'nynyyynnnnnn?yyy', '?yy?yynnnnynyynn'], ['**nyy*********n*', 'nynyyynnnn**yy?*', 'y*yy*ny*yy*n*yny'])


In [17]:
lpi.reduce_patterns(v_patterns)
print(v_patterns)

(['**yn************', 'nynyyynnnnnn?yyy', '*yy*yyn***y*yyn*'], ['**nyy*********n*', 'nynyyynnnn**yy?*', 'y*yy*ny*yy*n*yny'])


In [18]:
# The local search heuristic can also be used to find a set of patterns, not necessarily fulfilling |P|<=l"
patterns = (list(set_democrats), list(set_republicans))
lpi.reduce_patterns(patterns)
print(patterns)

(['nynyyynnnnnn?yyy', 'nyy**ynnn*yny*n*', '*yy**y**nny**yn*', 'nyy****yy***y***', '**yn************', '**yn************', '**yn************'], ['y**y*****y***yny', '**nyy**nn*****n*', '***yy**n**n*yyn*', '**nyy*******yy**'])
