import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from collections import Counter, OrderedDict

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
from collections import OrderedDict
import numpy as np

In [12]:
cols = ['age', 'workclass', 'fnlwgt', 'education', 'education.num', 'marital', 'occupation','relationship', 'race', 'gender', 'capgain', 'caploss', 'hours', 'country', 'income']

In [13]:
def process_adult(df):
    # replace missing values (?) to nan and then drop the columns
    df['country'] = df['country'].replace(' ?',np.nan)
    df['workclass'] = df['workclass'].replace(' ?',np.nan)
    df['occupation'] = df['occupation'].replace(' ?',np.nan)
    # dropping the NaN rows now
    df.dropna(how='any',inplace=True)
    df['income'] = df['income'].map({' <=50K': 0, ' >50K': 1}).astype(int)
    df['age'] = df['age'].apply(lambda x : 1 if x >= 45 else 0) # 1 if old, 0 if young
    df['workclass'] = df['workclass'].map({' Never-worked': 0, ' Without-pay': 1, ' State-gov': 2, ' Local-gov': 3, ' Federal-gov': 4, ' Self-emp-inc': 5, ' Self-emp-not-inc': 6, ' Private': 7}).astype(int)
    df['education'] = df['education'].map({' Preschool': 0, ' 1st-4th': 1, ' 5th-6th': 2, ' 7th-8th': 3, ' 9th': 4, ' 10th': 5, ' 11th': 6, ' 12th': 7, ' HS-grad':8, ' Some-college': 9, ' Bachelors': 10, ' Prof-school': 11, ' Assoc-acdm': 12, ' Assoc-voc': 13, ' Masters': 14, ' Doctorate': 15}).astype(int)
    df['marital'] = df['marital'].map({' Married-civ-spouse': 2, ' Divorced': 1, ' Never-married': 0, ' Separated': 1, ' Widowed': 1, ' Married-spouse-absent': 2, ' Married-AF-spouse': 2}).astype(int)
    df['relationship'] = df['relationship'].map({' Wife': 1 , ' Own-child': 0 , ' Husband': 1, ' Not-in-family': 0, ' Other-relative': 0, ' Unmarried': 0}).astype(int)
    df['race'] = df['race'].map({' White': 1, ' Asian-Pac-Islander': 0, ' Amer-Indian-Eskimo': 0, ' Other': 0, ' Black': 0}).astype(int)
    df['gender'] = df['gender'].map({' Male': 1, ' Female': 0}).astype(int)
    # process hours
    df.loc[(df['hours'] <= 40), 'hours'] = 0
    df.loc[(df['hours'] > 40), 'hours'] = 1
    df = df.drop(columns=['fnlwgt', 'education.num', 'occupation', 'country', 'capgain', 'caploss'])
    df = df.reset_index(drop=True)
    return df

In [14]:
df_train = pd.read_csv('adult.data', names = cols, sep=",")

In [15]:
df_train = process_adult(df_train)

In [16]:
df_train.head()

Unnamed: 0,age,workclass,education,marital,relationship,race,gender,hours,income
0,0,2,10,0,0,1,1,0,0
1,1,6,10,2,1,1,1,0,0
2,0,7,8,1,0,1,1,0,0
3,1,7,6,2,1,0,1,0,0
4,0,7,10,2,1,0,0,0,0


In [18]:
df = df_train.drop(columns='income')

In [19]:
df.head()

Unnamed: 0,age,workclass,education,marital,relationship,race,gender,hours
0,0,2,10,0,0,1,1,0
1,1,6,10,2,1,1,1,0
2,0,7,8,1,0,1,1,0
3,1,7,6,2,1,0,1,0
4,0,7,10,2,1,0,0,0


In [20]:
for column in df.columns:
    unique_values = df[column].unique()
    print(f'Unique values in column "{column}": {unique_values}')

Unique values in column "age": [0 1]
Unique values in column "workclass": [2 6 7 4 3 5 1]
Unique values in column "education": [10  8  6 14  4  9 12  3 15 13 11  2  5  0  7  1]
Unique values in column "marital": [0 2 1]
Unique values in column "relationship": [0 1]
Unique values in column "race": [1 0]
Unique values in column "gender": [1 0]
Unique values in column "hours": [0 1]


In [21]:
def match_pattern(string1, string2):
    if len(string1) == len(string2):
        for i in range(len(string1)):
            if string1[i].isdigit() and string2[i].isdigit():
                if string1[i] != string2[i]:
                    return 0
        else:
            return 1
        return 0

In [25]:

#Return the inverted index matrix for the dataset
def preprocessing(dataset):
    
    #Get cardinalities out of the dataset
    cardinalities = []
#     cardinalities = [[0,1],[0,1],[0,1]]
    for col in dataset.columns:
        cardinalities.append(dataset[col].unique().tolist())

    for cardinality in cardinalities:
        cardinality.sort()
        
    num = 1
    for cardinality in cardinalities:
        num *= len(cardinality)+1
        
    print('Total number of patterns in the dataset: ',num)


    #Get unique value combinations count
    dataset_string = []
#     dataset_string = ['010','001','000','011','001']
    for i in range(len(dataset)):
        dataset_string.append("".join(str(x) for x in dataset.iloc[i].values.tolist()))
        
    counts = OrderedDict()

    for item in dataset_string:
        counts[item] = counts.get(item, 0) + 1

    data_unique_values = list(counts.keys())
    data_value_counts = list(counts.values())
    
    inverted_ind = []
    
    #create inverted index matrix
    for i,cardinality in enumerate(cardinalities):
        for cardinality_val in cardinality:
            new_row = []
            for val in data_unique_values:
                if cardinality_val == int(val[i]):
                    new_row.append(1)
                else:
                    new_row.append(0) 
            inverted_ind.append(new_row)
            
    return inverted_ind, data_value_counts, cardinalities
        

def cov(pattern, dataset):  #tested_OK
    """
    Returns the number of instances in the dataset covered by the given pattern.
    """
    count = 0

    for i in range(len(dataset)):
        res = match_pattern(pattern, "".join(str(x) for x in df.iloc[i].values.tolist()))
        if res == 1:
            count += 1
#     print(count)
    return count


def coverage_optimized(pattern, inverted_ind, data_value_counts, cardinalities):  #invertedindices  #tested_OK for the example given in the research paper

    result_and = [1] * len(inverted_ind[0])
    row_index = 0
    for i,x in enumerate(pattern):
        if x!= 'X':
            
            #find the value x in cardinalities[i]
            index = cardinalities[i].index(int(x))
            row_index += index
            inverted_ind_row = inverted_ind[row_index]
            for j in range(len(inverted_ind_row)):
                result_and[j] = int(inverted_ind_row[j] and result_and[j])
            row_index -= index
            row_index += len(cardinalities[i])
        else:
            row_index += len(cardinalities[i])      

    # DOT Product between the above result and the count array for the datapoints
    coverage = sum([x*y for x, y in zip(data_value_counts, result_and)])
    return coverage
    

def generate_parent_nodes(pattern):  #tested_OK
    """
    Generates all parent nodes of the given pattern by replacing one deterministic
    cell with a wildcard character.
    """
    parents = []
    for i in range(len(pattern)):
        new_string = pattern[:i] + "X" + pattern[i+1:]
        if new_string != pattern:
            parents.append(new_string)
    return parents


def generate_nodes(pattern, cardinalities): #tested_OK
    """
    Generates all nodes on the given pattern and cardinalities based on Rule 1.
    """
    
    """
    TODO: Make cardinalities 2D vector, so that it can be traversed to find the children of a pattern
    """

    # Find the index of the right-most deterministic element in the pattern
    index = len(pattern) - 1
    rm_deter = -1
    while index >= 0:
        if pattern[index] != 'X':
            rm_deter = index
            break
        index -= 1

    candidate_nodes = []
    rm_deter += 1
    if rm_deter >= 0:
        while rm_deter < len(pattern):
            index = rm_deter
            for value in cardinalities[index]:
                candidate_node = pattern
                candidate_node = pattern[:index] + str(value) + pattern[index+1:] 
                candidate_nodes.append(candidate_node)
            rm_deter += 1

    return candidate_nodes


def dominance(pattern, mups):
    #iterate through the mups, find if any of the mups is an ancestor for the pattern p return 0
    # if pattern p is an ancestor for any of the mups return 1
    # else return -1
    
    for m in mups:
        for i,x in enumerate(m):
            if m[i] != pattern[i]:
                if m[i] == 'X' and pattern[i] != 'X':
                    return 0
                elif m[i] != 'X' and pattern[i] == 'X':
                    return 1
    
    return -1

    
def deepdiver(dataset, threshold):
    """
    Finds the maximal uncovered patterns in the dataset.
    """ 
    inverted_index, unique_value_counts, cardinalities = preprocessing(dataset);

    stack = ['XXXXXXXX']
    maximal_uncovered = []
    while stack:
        pattern = stack.pop()
        uncovered_flag = False
        if dominance(pattern, maximal_uncovered) == 0:
            continue
        elif dominance(pattern, maximal_uncovered) == 1:
            uncovered_flag = False
        else: 
            count = coverage_optimized(pattern, inverted_index, unique_value_counts, cardinalities)
            if count < threshold:
                uncovered_flag = True
        if uncovered_flag:
            stack0 = []
            stack0.append(pattern)
            while stack0:
                pattern0 = stack0.pop()
                parent_nodes = generate_parent_nodes(pattern0)
                for p in parent_nodes:
                    count0 = coverage_optimized(p, inverted_index, unique_value_counts, cardinalities )
                    if count0 < threshold:
                        stack0.append(p)
                        break                   
                maximal_uncovered.append(pattern)
        else:
            stack.extend(generate_nodes(pattern, cardinalities))
            
    print('MUPs are: ', maximal_uncovered)
    return maximal_uncovered


In [26]:
inv, d, cardinalities = preprocessing(df)
print(coverage_optimized('XXXXXXXX',inv,d,cardinalities))


Total number of patterns in the dataset:  132192
30162


In [28]:
deepdiver(df, 20)

Total number of patterns in the dataset:  132192
MUPs are:  ['XXX20101', 'XXX11111', 'XXX11111', 'XXX11111', 'XXX11110', 'XXX11110', 'XXX11110', 'XXX11101', 'XXX11101', 'XXX11101', 'XXX11100', 'XXX11100', 'XXX11100', 'XXX11001', 'XXX11001', 'XXX11001', 'XXX11000', 'XXX11000', 'XXX11000', 'XXX01111', 'XXX01111', 'XXX01111', 'XXX01110', 'XXX01110', 'XXX01110', 'XXX01101', 'XXX01101', 'XXX01101', 'XXX01100', 'XXX01100', 'XXX01100']


['XXX20101',
 'XXX11111',
 'XXX11111',
 'XXX11111',
 'XXX11110',
 'XXX11110',
 'XXX11110',
 'XXX11101',
 'XXX11101',
 'XXX11101',
 'XXX11100',
 'XXX11100',
 'XXX11100',
 'XXX11001',
 'XXX11001',
 'XXX11001',
 'XXX11000',
 'XXX11000',
 'XXX11000',
 'XXX01111',
 'XXX01111',
 'XXX01111',
 'XXX01110',
 'XXX01110',
 'XXX01110',
 'XXX01101',
 'XXX01101',
 'XXX01101',
 'XXX01100',
 'XXX01100',
 'XXX01100']

In [None]:
def getStringsMatchingWithPattern(pattern, df):
    
    
    #Get unique value combinations count
    dataset_string = []
    for i in range(len(df)):
        dataset_string.append("".join(str(x) for x in df.iloc[i].values.tolist()))
        
    
    strings = []
    flag = True
    for s in dataset_string:
        flag = True
        for i,x in enumerate(s):
            if x != pattern[i]:
                if pattern[i] != 'X':
#                     print('string:',s[i])
#                     print('string:',pattern[i])
                    flag = False
                    
        if flag:
            strings.append(s)
            
    print(len(strings))       
    return strings
        
            