# Pandas for association rule mining

I previously created this algorithm at https://github.com/FlightDataServices/carmine, which has since been discontinued. I want to re-create it here.

In [48]:
import pandas as pd
import seaborn as sns
from itertools import combinations

In [28]:
# Load the "titanic" example dataset through seaborn.
df = sns.load_dataset("titanic")

In [29]:
df.head()
# Remove the "alive" column before mining.
# NOTE(review): presumably dropped because it duplicates the `survived`
# target - confirm against the dataset.
df = df.drop(columns="alive")

In [30]:
# Bin the continuous columns into 10 quantile buckets and keep the interval
# labels as plain strings so they can be concatenated into itemsets later.
for x in ("age", "fare"):
    deciles = pd.qcut(df[x], q=10)
    df[x] = deciles.astype(str)

In [31]:
# Target column of the rules; every other column is used as a feature.
y_col = "survived"
feature_cols = [name for name in df.columns if name != y_col]

In [32]:
# Rewrite every feature value as the string "(column = value)" so that the
# values of several columns can be concatenated into one readable itemset.
for col in feature_cols:
    values = df[col].astype(str)
    df[col] = f"({col} = " + values + ")"

In [39]:
# Preview the first rows; feature values now read "(column = value)".
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alone
0,0,(pclass = 3),(sex = male),"(age = (19.0, 22.0])",(sibsp = 1),(parch = 0),"(fare = (-0.001, 7.55])",(embarked = S),(class = Third),(who = man),(adult_male = True),(deck = nan),(embark_town = Southampton),(alone = False)
1,1,(pclass = 1),(sex = female),"(age = (36.0, 41.0])",(sibsp = 1),(parch = 0),"(fare = (39.688, 77.958])",(embarked = C),(class = First),(who = woman),(adult_male = False),(deck = C),(embark_town = Cherbourg),(alone = False)
2,1,(pclass = 3),(sex = female),"(age = (25.0, 28.0])",(sibsp = 0),(parch = 0),"(fare = (7.854, 8.05])",(embarked = S),(class = Third),(who = woman),(adult_male = False),(deck = nan),(embark_town = Southampton),(alone = True)
3,1,(pclass = 1),(sex = female),"(age = (31.8, 36.0])",(sibsp = 1),(parch = 0),"(fare = (39.688, 77.958])",(embarked = S),(class = First),(who = woman),(adult_male = False),(deck = C),(embark_town = Southampton),(alone = False)
4,0,(pclass = 3),(sex = male),"(age = (31.8, 36.0])",(sibsp = 0),(parch = 0),"(fare = (7.854, 8.05])",(embarked = S),(class = Third),(who = man),(adult_male = True),(deck = nan),(embark_town = Southampton),(alone = True)


In [83]:
# Number of rows, and the mean of the target column (used as the baseline
# support of Y when computing lift).
N = len(df)
supp_y = df[y_col].mean()

In [102]:
def create_column_combinations(columns, limit=2):
    """
    Build every combination of the given columns, from size 1 up to `limit`.

    The combinations are returned as tuples in one flat list, ordered by
    size (all singles first, then all pairs, and so on).
    """
    return [
        combo
        for size in range(1, limit + 1)
        for combo in combinations(columns, size)
    ]

# create_column_combinations(feature_cols,2)

In [108]:
def rule_miner(df, limit=2, target=None, features=None):
    """
    Mine association rules X -> Y with pandas groupby.

    For every combination of up to `limit` feature columns, the rows of `df`
    are grouped by the distinct value combinations (the itemsets X). Each
    group contributes one rule with its row count, its support, the mean of
    the target inside the group (confidence) and the lift over the overall
    target mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the target column and the feature columns. The itemset
        label is built by adding the grouped feature values together, so the
        feature values are expected to be strings such as "(sex = male)".
    limit : int, default 2
        Largest number of columns combined into a single itemset.
    target : str, optional
        Name of the target column Y. Defaults to the module-level `y_col`.
    features : list of str, optional
        Feature columns to combine. Defaults to the module-level
        `feature_cols`.

    Returns
    -------
    pandas.DataFrame
        One row per itemset with the columns `itemset`, `count(X)`,
        `support(X)`, `confidence(X,Y)` and `lift(X,Y)`. The frame is empty
        when there is nothing to combine (no features or `limit` < 1).
    """
    if target is None:
        target = y_col
    if features is None:
        features = feature_cols
    features = list(features)
    output_columns = [
        "itemset", "count(X)", "support(X)", "confidence(X,Y)", "lift(X,Y)",
    ]

    results = []
    for size in range(1, limit + 1):
        for combination in combinations(features, size):
            # groupby needs a list here: pandas treats a tuple as one single
            # key, which makes lookups of tuple-named columns fail.
            keys = list(combination)
            grouped = df.groupby(keys)[target].agg(["count", "mean"]).reset_index()
            # Concatenate the key columns by name instead of by position.
            itemset = grouped[keys[0]]
            for key in keys[1:]:
                itemset = itemset + grouped[key]
            grouped["itemset"] = itemset
            results.append(grouped[["count", "mean", "itemset"]])

    if not results:
        # pd.concat raises on an empty list; return an empty result instead.
        return pd.DataFrame(columns=output_columns)

    df_total = pd.concat(results, axis=0).reset_index(drop=True)
    df_total = df_total.rename(columns={"mean": "confidence(X,Y)", "count": "count(X)"})
    # Derive the totals from the frame that was passed in, not from globals,
    # so the metrics stay correct for any input frame.
    df_total["support(X)"] = df_total["count(X)"] / len(df)
    df_total["lift(X,Y)"] = df_total["confidence(X,Y)"] / df[target].mean()
    return df_total[output_columns]
    

In [109]:
# Mine rules for itemsets built from up to three feature columns.
test= rule_miner(df, limit=3)

  import sys


In [110]:
# Display the rules mined with limit=3.
test

Unnamed: 0,itemset,count(X),support(X),"confidence(X,Y)","lift(X,Y)"
0,(pclass = 1),216,0.242424,0.629630,1.640351
1,(pclass = 2),184,0.206510,0.472826,1.231836
2,(pclass = 3),491,0.551066,0.242363,0.631418
3,(sex = female),314,0.352413,0.742038,1.933205
4,(sex = male),577,0.647587,0.188908,0.492155
...,...,...,...,...,...
14776,(deck = nan)(embark_town = Cherbourg)(alone = ...,57,0.063973,0.315789,0.822715
14777,(deck = nan)(embark_town = Queenstown)(alone =...,18,0.020202,0.333333,0.868421
14778,(deck = nan)(embark_town = Queenstown)(alone =...,55,0.061728,0.400000,1.042105
14779,(deck = nan)(embark_town = Southampton)(alone ...,184,0.206510,0.375000,0.976974


In [77]:
# Step-by-step version of the mining loop for itemsets of up to two columns.
results = []
for combination in create_column_combinations(feature_cols,2):
    # groupby needs a list of keys: pandas treats a tuple as one single key,
    # so the tuple from the combination helper is converted first.
    _df = df.groupby(list(combination))[y_col].agg(["count","mean"]).reset_index()
    # Every column except the trailing "count"/"mean" is a grouping key;
    # summing the string columns row-wise concatenates them into one label.
    _df["itemset"] = _df.iloc[:,:-2].sum(axis=1)
    # Keep only count, mean and the freshly built itemset label.
    _df = _df.iloc[:,-3:]

    results.append(_df)
#     _df.columns = ["itemset", "count", "confidence"]
    
# results
# _df.columns = ["itemset", "count", "confidence"]
# _df["support"] = _df["count"] / N # this can be done at the end
# _df

  This is separate from the ipykernel package so we can avoid doing imports until


In [95]:
# Stack the per-combination frames into one table and renumber its index.
df_total = pd.concat(results,axis=0).reset_index(drop=True)
# _df["itemset"] = _df.iloc[:,:-2].sum(axis=1)
# _df

In [96]:
# Give the aggregate columns their rule-mining names, derive support and lift
# from them, and put the columns into their final order.
df_total = df_total.rename(columns={"mean": "confidence(X,Y)", "count": "count(X)"})
df_total = df_total.assign(
    **{
        "support(X)": df_total["count(X)"] / N,
        "lift(X,Y)": df_total["confidence(X,Y)"] / supp_y,
    }
)
df_total = df_total[["itemset", "count(X)", "support(X)", "confidence(X,Y)", "lift(X,Y)"]]

In [97]:
# Display the rules mined from itemsets of up to two columns.
df_total

Unnamed: 0,itemset,count(X),support(X),"confidence(X,Y)","lift(X,Y)"
0,(pclass = 1),216,0.242424,0.629630,1.640351
1,(pclass = 2),184,0.206510,0.472826,1.231836
2,(pclass = 3),491,0.551066,0.242363,0.631418
3,(sex = female),314,0.352413,0.742038,1.933205
4,(sex = male),577,0.647587,0.188908,0.492155
...,...,...,...,...,...
1476,(embark_town = Queenstown)(alone = False),20,0.022447,0.350000,0.911842
1477,(embark_town = Queenstown)(alone = True),57,0.063973,0.403509,1.051247
1478,(embark_town = Southampton)(alone = False),251,0.281706,0.462151,1.204026
1479,(embark_town = Southampton)(alone = True),393,0.441077,0.256997,0.669546


In [101]:
# Show only itemsets covering more than 200 rows, highest lift first.
df_total.loc[df_total["count(X)"] > 200].sort_values(by="lift(X,Y)", ascending=False)

Unnamed: 0,itemset,count(X),support(X),"confidence(X,Y)","lift(X,Y)"
297,(sex = female)(who = woman),271,0.304153,0.756458,1.970771
1374,(who = woman)(adult_male = False),271,0.304153,0.756458,1.970771
49,(who = woman),271,0.304153,0.756458,1.970771
3,(sex = female),314,0.352413,0.742038,1.933205
300,(sex = female)(adult_male = False),314,0.352413,0.742038,1.933205
...,...,...,...,...,...
1334,(class = Third)(who = man),319,0.358025,0.119122,0.310345
183,(pclass = 3)(adult_male = True),319,0.358025,0.119122,0.310345
176,(pclass = 3)(who = man),319,0.358025,0.119122,0.310345
1388,(who = man)(deck = nan),438,0.491582,0.114155,0.297404
