In [1]:
import pandas as pd
import numpy as np

## Data Sets

3 datasets that contain categorical attributes only. 

- **Congressional Voting Records Data Set**: 435 instances (Small). Contains missing values.
- **Tic-Tac-Toe Endgame Data Set**: 958 instances(Medium). Doesn't contain missing values.
- **Mushroom Data Set**: 8124 instances(Large).  Contains missing values.

In [2]:
path = "/Users/clararivadulla/Repositories/MAI-SEL-2022-23/PW1-PRISM/"

### Congressional Voting Records Data Set

In [3]:
congressional_voting_records = pd.read_csv(f"{path}/data/house-votes-84.csv")
congressional_voting_records.rename({'Class Name': 'class'}, axis=1, inplace=True)
congressional_voting_records.columns = congressional_voting_records.columns.str.replace(' ', '')
congressional_voting_records.head()

Unnamed: 0,class,handicapped-infants,water-project-cost-sharing,adoption-of-the-budget-resolution,physician-fee-freeze,el-salvador-aid,religious-groups-in-schools,anti-satellite-test-ban,aid-to-nicaraguan-contras,mx-missile,immigration,synfuels-corporation-cutback,education-spending,superfund-right-to-sue,crime,duty-free-exports,export-administration-act-south-africa
0,republican,n,y,n,y,y,y,n,n,n,y,?,y,y,y,n,y
1,republican,n,y,n,y,y,y,n,n,n,n,n,y,y,y,n,?
2,democrat,?,y,y,?,y,y,n,n,n,n,y,n,y,y,n,n
3,democrat,n,y,y,n,?,y,n,n,n,n,y,n,y,n,n,y
4,democrat,y,y,y,n,y,y,n,n,n,n,y,?,y,y,y,y


In [4]:
congressional_voting_records = congressional_voting_records.replace('?', np.nan)
missing_values_count = congressional_voting_records.isnull().sum(axis=1)
num_rows_missing_values = (missing_values_count > 0).sum()
print(f'# Missing Values: {num_rows_missing_values}')

# Missing Values: 203


In [5]:
congressional_voting_records = congressional_voting_records.dropna().reset_index()

### Tic-Tac-Toe Endgame Data Set

In [6]:
tic_tac_toe_endgame = pd.read_csv(f"{path}/data/tic-tac-toe-endgame.csv")
tic_tac_toe_endgame.rename({'V10': 'class'}, axis=1, inplace=True)
tic_tac_toe_endgame.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,class
0,x,x,x,x,o,o,x,o,o,positive
1,x,x,x,x,o,o,o,x,o,positive
2,x,x,x,x,o,o,o,o,x,positive
3,x,x,x,x,o,o,o,b,b,positive
4,x,x,x,x,o,o,b,o,b,positive


### Mushroom Data Set

**Missing Values**
2480, all for attribute number 11 (`'stalk-root'`, column 10).

**Classes**
- *e*: edible
- *p*: poisonous

In [7]:
mushroom = pd.read_csv(f"{path}/data/mushroom.csv")
mushroom.rename({'bruises%3F': 'bruises'}, axis=1, inplace=True)
mushroom.head()

Unnamed: 0,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,...,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat,class
0,x,s,n,t,p,f,c,n,k,e,...,w,w,p,w,o,p,k,s,u,p
1,x,s,y,t,a,f,c,b,k,e,...,w,w,p,w,o,p,n,n,g,e
2,b,s,w,t,l,f,c,b,n,e,...,w,w,p,w,o,p,n,n,m,e
3,x,y,w,t,p,f,c,n,n,e,...,w,w,p,w,o,p,k,s,u,p
4,x,s,g,f,n,f,w,b,k,t,...,w,w,p,w,o,e,n,a,g,e


In [8]:
num_mv = mushroom['stalk-root'].isnull().sum()
print(f'# Missing Values: {num_mv}')

# Missing Values: 2480


In [9]:
mushroom = mushroom.dropna().reset_index()

## PRISM Algorithm Implementation

In [30]:
def fit(x):
    
    P = {} # Prism ← ∅ 
    C = df['class'].unique() # unique classes
    
    for C_i in C: # for each class Ci do
        
        E = df[df['class'] == C_i]
        
        P[C_i] = {}
        
        i = 0
        while not E.empty: # while E ≠ ∅
            
            perfect = False
            
            available_attr = list(E.columns)
            available_attr.remove('class')
            if 'index' in available_attr:
                available_attr.remove('index')
            
            rule_attr_vals = []
            
            while not perfect and available_attr:
                
                max_val_tmp = 0
                max_attr_vals = (None, None)
                
                for attr in available_attr:
                    
                    values = list(E[attr].unique())
                    
                    for val in values:
                        
                        temp_rule_attr_vals = rule_attr_vals.copy()
                        temp_rule_attr_vals.append((attr, val))
                        
                        aux_E = E.copy()
                        for attr, val in temp_rule_attr_vals:
                            aux_E = aux_E.loc[aux_E[attr] == val]
                        positive = len(aux_E) 
                        
                        aux_df = df.copy()
                        for attr, val in temp_rule_attr_vals:
                            aux_df = aux_df.loc[aux_df[attr] == val]
                        total = len(aux_df)
                        
    
                        if total > 0:
                            p_t = positive/total
                        else: 
                            p_t = 0
                            
                        if p_t > max_val_tmp:
                            
                            max_val_tmp = p_t
                            max_attr_vals = (attr, val)
                            
                rule_attr_vals.append(max_attr_vals)
                available_attr.remove(max_attr_vals[0])
                
                if(max_val_tmp) == 1:
                    perfect = True
            
            aux_E = E.copy()
            for attr, val in rule_attr_vals:
                aux_E = aux_E.loc[aux_E[attr] == val]
                
            E = pd.merge(E, aux_E, how='outer', indicator=True)
            E = E[E['_merge'] == 'left_only'].drop('_merge', axis=1)

            P[C_i][i] = rule_attr_vals
            
            i += 1
            
            
    return P

In [31]:
prism(congressional_voting_records)

{'democrat': {0: [('physician-fee-freeze', 'n'), ('handicapped-infants', 'n')],
  1: [('handicapped-infants', 'y'),
   ('physician-fee-freeze', 'n'),
   ('water-project-cost-sharing', 'y')],
  2: [('handicapped-infants', 'y'),
   ('water-project-cost-sharing', 'n'),
   ('crime', 'n')],
  3: [('synfuels-corporation-cutback', 'y'),
   ('crime', 'y'),
   ('religious-groups-in-schools', 'n'),
   ('water-project-cost-sharing', 'n')],
  4: [('synfuels-corporation-cutback', 'y'),
   ('physician-fee-freeze', 'y'),
   ('mx-missile', 'y'),
   ('handicapped-infants', 'n')],
  5: [('synfuels-corporation-cutback', 'y'),
   ('export-administration-act-south-africa', 'n'),
   ('adoption-of-the-budget-resolution', 'y')],
  6: [('handicapped-infants', 'y'),
   ('crime', 'y'),
   ('physician-fee-freeze', 'n'),
   ('water-project-cost-sharing', 'n'),
   ('el-salvador-aid', 'y')],
  7: [('synfuels-corporation-cutback', 'y'),
   ('physician-fee-freeze', 'y'),
   ('el-salvador-aid', 'n'),
   ('adoption-of-t