In [70]:
import pandas as pd
import numpy as np
from itertools import permutations
import copy

# 데이터 셋 생성

In [5]:
def arrayToDf(phase_name: str, array, col_name: str) -> pd.DataFrame: 
    
    """
    array를 받아서 원하는 범주에 할당하여 데이터프레임을 생성합니다.
    """
    
    dfFromArray = pd.DataFrame(data=array, index=[phase_name]*len(np_array), columns=[col_name]).reset_index()
    dfFromArray.columns = ['category', col_name]
    
    return dfFromArray
    

In [6]:
user_a = np.arange(100)
user_b = np.arange(70)
user_c = np.arange(50)
user_d = np.arange(30)

In [7]:
user_df = pd.DataFrame()

for np_array, category in zip([user_a, user_b, user_c, user_d], ['phase1', 'phase2', 'phase3', 'phase4']):
    user_df = pd.concat([user_df, arrayToDf(category, np_array, 'userId')])

In [37]:
user_df.loc[(user_df.category.isin(['phase1']))]

Unnamed: 0,category,userId
0,phase1,0
1,phase1,1
2,phase1,2
3,phase1,3
4,phase1,4
...,...,...
95,phase1,95
96,phase1,96
97,phase1,97
98,phase1,98


In [8]:
user_df.head()

Unnamed: 0,category,userId
0,phase1,0
1,phase1,1
2,phase1,2
3,phase1,3
4,phase1,4


# 퍼널 1(순서대로 진행 되는 경우)

In [33]:
def make_funnel(df: pd.DataFrame, category: str, userid: str, phase_list: list) -> pd.DataFrame:
    
    """
    퍼널 데이터 형식의 데이터프레임을 출력해줍니다.
    
    -------------param--------------
    df: 데이터프레임
    category: 범주형변수 이름
    userid : 유저아이디
    phase_list: 퍼널 구성 단계
    """
    
    empty_list = []
    
    intersectArray = np.array(df.loc[(df[category] == phase_list[0]), userid]) #array 초기화
    initialCnt = len(intersectArray) #모수
    
    phase_name = phase_list[0] #시작 퍼널
    
    for phase in phase_list:
        
        phaseArray = np.array(df.loc[(df[category] == phase), userid])
        intersectArray = np.intersect1d(intersectArray, phaseArray) #퍼널 통과 유저 array 
        intersectCnt = len(intersectArray)
        
        phase_name = str(phase_list.index(phase) + 1) +'.' + phase
        
        empty_list.append(["->".join(phase_list), phase_name, intersectCnt, intersectCnt/initialCnt])
            
    df = pd.DataFrame(data=empty_list, columns = ['퍼널이름', '퍼널단계', '유저수', '전환율'])
    
    return df
        

In [29]:
testDf = make_funnel(user_df, 'category', 'userId', ['phase1', 'phase2', 'phase3', 'phase4'])

In [16]:
np.array(user_df.loc[(user_df.category == 'phase4'), 'userId'])

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29])

In [32]:
#한 번에 여러 스타일을 적용해야함.
testDf.style.format({'전환율':'{:,.1%}'.format})\
        .set_properties(**{'text-align': 'left'})\
        .bar(subset=['전환율'], width=100, align='left', vmin=0, vmax=1)\
        .set_table_styles(
                [{"selector": "", "props": [("border", "1px solid grey")]},
                 {"selector": "tbody td", "props": [("border", "1px solid grey")]},
                 {"selector": "th", "props": [("border", "1px solid grey")]}])

Unnamed: 0,퍼널이름,퍼널단계,유저수,전환율
0,phase1->phase2->phase3->phase4,1.phase1,100,100.0%
1,phase1->phase2->phase3->phase4,2.phase2,70,70.0%
2,phase1->phase2->phase3->phase4,3.phase3,52,52.0%
3,phase1->phase2->phase3->phase4,4.phase4,36,36.0%


# 퍼널 2(건너뛸 수 있는 경우)

In [34]:
def funnelNameList(cntPhase: int, check: list = []) -> list:

    """
    퍼널 리스트를 인덱스 형식으로 구성합니다.
    
    1. permutation으로 인덱스 개수별 순열 리스트를 만듭니다.
    2. sorted와 set을 활용하여 [1, 3, 4] / [1, 4, 3] 다음과 같은 리스트를 [1, 3, 4]하나의 인덱스 리스트로 만들어 줍니다.
    3. 다시, str을 int로 변환하여 인덱스 리스트를 만들어 줍니다.
    4. 이때, 반드시 통과해야할 퍼널 단계가 있는지 체크해줍니다.
    5. 최초 인덱스 [0] 는 제일 마지막 앞자리에 붙여줍니다. 
    
    생각보다 복잡합니다. 단순한 방법이 있을까요?
    
    -------------param--------------
    cntphase: 퍼널 구성 요소 수
    check: 반드시 통과해야할 퍼널 단계를 리스트로 입력해줍니다.
    """
    
    empty_list = []
    
    array = [i for i in range(1, cntPhase)]
    
    for i in range(1, len(array)+1):
        empty_list.extend(permutations(array, i))
        
    sorted_list = list(set(["".join(sorted([str(k) for k in element])) for element in empty_list]))
    final_list = [[0] + [int(k) for k in list_a] for list_a in sorted_list if set(check).issubset([int(k) for k in list_a]) == True]
    
    return final_list

In [19]:
user_a = np.arange(100)
user_b = np.random.choice(range(10, 90), size=70, replace=False)
user_c = np.random.choice(range(10, 90), size=60, replace=False)
user_d = np.random.choice(range(10, 90), size=50, replace=False)

In [20]:
user_df = pd.DataFrame()

for np_array, category in zip([user_a, user_b, user_c, user_d], ['phase1', 'phase2', 'phase3', 'phase4']):
    user_df = pd.concat([user_df, arrayToDf(category, np_array, 'userId')])

In [21]:
funnelNameList(4, [1])

[[0, 1, 3], [0, 1, 2, 3], [0, 1], [0, 1, 2]]

In [22]:
phase_list = ['phase1', 'phase2', 'phase3', 'phase4']
testDF2 = pd.DataFrame()

for funnel in funnelNameList(4, [1]):
    
    df = make_funnel(user_df, 'category', 'userId', [phase_list[i] for i in funnel])
    
    testDF2 = pd.concat([testDF2, df])


In [23]:
testDF2['idx1'] = testDF2['퍼널이름'].map(lambda x : len(x))

In [24]:
testDF2 = testDF2.sort_values(by=['idx1'], ascending=[False]).iloc[:, [0, 1, 2, 3]]

In [25]:
testDF2.reset_index(drop=True).style.format({'전환율':'{:,.1%}'.format})\
        .set_properties(**{'text-align': 'left'})\
        .bar(subset=['전환율'], width=100, align='left', vmin=0, vmax=1)\
        .set_table_styles(
                [{"selector": "", "props": [("border", "1px solid grey")]},
                 {"selector": "tbody td", "props": [("border", "1px solid grey")]},
                 {"selector": "th", "props": [("border", "1px solid grey")]}])

Unnamed: 0,퍼널이름,퍼널단계,유저수,전환율
0,phase1->phase2->phase3->phase4,1.phase1,100,100.0%
1,phase1->phase2->phase3->phase4,2.phase2,70,70.0%
2,phase1->phase2->phase3->phase4,3.phase3,52,52.0%
3,phase1->phase2->phase3->phase4,4.phase4,36,36.0%
4,phase1->phase2->phase4,1.phase1,100,100.0%
5,phase1->phase2->phase4,2.phase2,70,70.0%
6,phase1->phase2->phase4,3.phase4,47,47.0%
7,phase1->phase2->phase3,1.phase1,100,100.0%
8,phase1->phase2->phase3,2.phase2,70,70.0%
9,phase1->phase2->phase3,3.phase3,52,52.0%


뭔가 허전 end_point가 필요하다.

In [40]:
list(set([1, 2, 3, 4]) - set([1, 2, 3]))

[4]

In [111]:
def make_funnel2(df: pd.DataFrame, category: str, userid: str, phase_funnel: list, allPhase: list) -> pd.DataFrame:
    
    """
    퍼널 데이터 형식의 데이터프레임을 출력해줍니다.
    
    -------------param--------------
    df: 데이터프레임
    category: 범주형변수 이름
    userid : 유저아이디
    phase_funnel: 퍼널 구성 단계
    """
    
    empty_list = []
    
    intersectArray = np.array(df.loc[(df[category] == phase_funnel[0]), userid]) #array 초기화
    initialCnt = len(intersectArray) #모수
    
    phase_name = phase_funnel[0] #시작 퍼널
    
    if phase_funnel[-1] == allPhase[-1]:

        for phase in phase_funnel:
            
            phaseArray = np.array(df.loc[(df[category] == phase), userid])
            intersectArray = np.intersect1d(intersectArray, phaseArray) #퍼널 통과 유저 array 
            intersectCnt = len(intersectArray)

            phase_name = str(phase_list.index(phase) + 1) +'.' + phase

            empty_list.append(["->".join(phase_funnel), phase_name, intersectCnt, intersectCnt/initialCnt])
            
    else:
        
        funnel = copy.deepcopy(phase_funnel)
        funnel.append(list(set(allPhase) - set(phase_funnel)))
        
        for phase in funnel:
            
            if phase != funnel[-1]:
                
                phaseArray = np.array(df.loc[(df[category] == phase), userid])
                phase_name = str(funnel.index(phase) + 1) +'.' + phase

                intersectArray = np.intersect1d(intersectArray, phaseArray) #퍼널 통과 유저 array 
                intersectCnt = len(intersectArray)                
                
            else:
                
                phaseArray = np.array(df.loc[(df[category].isin(phase)), userid])
                phase_name = str(funnel.index(phase) + 1) +'.' + '이탈'
                
                intersectArray = np.setdiff1d(intersectArray, phaseArray) #퍼널 통과 유저 array 
                intersectCnt = len(intersectArray)

            empty_list.append(["->".join(phase_funnel) + "->이탈", phase_name, intersectCnt, intersectCnt/initialCnt])        
    
    df = pd.DataFrame(data=empty_list, columns = ['퍼널이름', '퍼널단계', '유저수', '전환율'])
    
    return df

In [112]:
phase_list = ['phase1', 'phase2', 'phase3', 'phase4']
testDF2 = pd.DataFrame()

for funnel in funnelNameList(4):
    
    df = make_funnel2(user_df, 'category', 'userId', [phase_list[i] for i in funnel], phase_list)
    
    testDF2 = pd.concat([testDF2, df])


In [113]:
testDF2['idx1'] = testDF2['퍼널이름'].map(lambda x : len(x))
testDF2 = testDF2.sort_values(by=['idx1', '퍼널이름', '퍼널단계'], ascending=[False, True, True]).iloc[:, [0, 1, 2, 3]]

In [115]:
testDF2.reset_index(drop=True).style.format({'전환율':'{:,.1%}'.format})\
        .set_properties(**{'text-align': 'left'})\
        .bar(subset=['전환율'], width=100, align='left', vmin=0, vmax=1)\
        .set_table_styles(
                [{"selector": "", "props": [("border", "1px solid grey")]},
                 {"selector": "tbody td", "props": [("border", "1px solid grey")]},
                 {"selector": "th", "props": [("border", "1px solid grey")]}])

Unnamed: 0,퍼널이름,퍼널단계,유저수,전환율
0,phase1->phase2->phase3->phase4,1.phase1,100,100.0%
1,phase1->phase2->phase3->phase4,2.phase2,70,70.0%
2,phase1->phase2->phase3->phase4,3.phase3,52,52.0%
3,phase1->phase2->phase3->phase4,4.phase4,36,36.0%
4,phase1->phase2->phase3->이탈,1.phase1,100,100.0%
5,phase1->phase2->phase3->이탈,2.phase2,70,70.0%
6,phase1->phase2->phase3->이탈,3.phase3,52,52.0%
7,phase1->phase2->phase3->이탈,4.이탈,16,16.0%
8,phase1->phase2->phase4,1.phase1,100,100.0%
9,phase1->phase2->phase4,2.phase2,70,70.0%
