In [1]:
!pip install pm4py

import pandas as pd
import pm4py
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.util import dataframe_utils
from pm4py.visualization.process_tree import visualizer as pt_visualizer
from pm4py.algo.discovery.alpha import algorithm as alpha_miner
from pm4py.visualization.petri_net import visualizer as pn_visualizer
from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner
from pm4py.visualization.heuristics_net import visualizer as hn_visualizer

Collecting pm4py
  Downloading pm4py-2.7.15-py3-none-any.whl.metadata (4.8 kB)
Collecting deprecation (from pm4py)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl.metadata (4.6 kB)
Collecting intervaltree (from pm4py)
  Downloading intervaltree-3.1.0.tar.gz (32 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading pm4py-2.7.15-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m54.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading deprecation-2.1.0-py2.py3-none-any.whl (11 kB)
Building wheels for collected packages: intervaltree
  Building wheel for intervaltree (setup.py) ... [?25l[?25hdone
  Created wheel for intervaltree: filename=intervaltree-3.1.0-py2.py3-none-any.whl size=26098 sha256=c4428ef9165d4bea2b5597ce14308733d8934e64ab13736cc0b6a5be4b3fbd83
  Stored in directory: /root/.cache/pip/wheels/31/d7/d9/eec6891f78cac19a693bd40ecb8365d2f4613318c145ec9816
Successfully built intervaltree
Installing collecte

In [3]:
df = pd.read_csv('/content/drive/MyDrive/DEV/PROJECT/201608.csv')

In [4]:
df['date'].unique()

array([20160820, 20160827, 20160816, 20160824, 20160807, 20160829,
       20160823, 20160801, 20160821, 20160828, 20160818, 20160826,
       20160805, 20160814, 20160819, 20160809, 20160825, 20160811,
       20160804, 20160830, 20160810, 20160803, 20160808, 20160806,
       20160831, 20160822, 20160812, 20160802, 20160813, 20160817,
       20160815])

In [5]:
w1_df = df[(df['date'] >= 20160801) & (df['date'] <= 20160807)]

In [6]:
w1_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 792487 entries, 33 to 4253333
Data columns (total 75 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   fullVisitorId             792487 non-null  uint64 
 1   visitId                   792487 non-null  int64  
 2   channelGrouping           792487 non-null  object 
 3   visitNumber               792487 non-null  int64  
 4   visitStartTime            792487 non-null  int64  
 5   date                      792487 non-null  int64  
 6   hits                      792487 non-null  int64  
 7   pageviews                 792484 non-null  float64
 8   timeOnSite                756932 non-null  float64
 9   newVisits                 530742 non-null  float64
 10  transactionRevenue        37540 non-null   float64
 11  transactions              37540 non-null   float64
 12  bounces                   35376 non-null   float64
 13  totalTransactionRevenue   37540 non-null   floa

In [7]:
# 세션 ID를 fullVisitorId와 visitId를 합쳐서 만듦
w1_df['session_id'] = w1_df['fullVisitorId'].astype(str) + '_' + w1_df['visitId'].astype(str)

# 프로세스 마이닝용 컬럼으로 이름 변경
w1_df = w1_df.rename(columns={
    'session_id': 'case:concept:name',   # 프로세스의 케이스 ID # 하나의 세션/사례(case)를 구분하는 ID로 설정
    'pagePath': 'concept:name',          # 이벤트 이름 # 이벤트(Activity) 이름. 페이지 경로가 곧 이벤트 이름
    'time': 'time:timestamp'        # 타임스탬프  # 이벤트가 발생한 시간. 프로세스 순서를 위한 기준
})

# 타임스탬프 컬럼을 datetime 형식으로 변환 (ms 단위)
# hits.time은 millisecond 단위의 숫자로 사람이 이해할 수 있는 날짜/시간 포맷을 변경
w1_df['time:timestamp'] = pd.to_datetime(w1_df['time:timestamp'], unit='ms', errors='coerce')

#필요컬럼만 선택
df_log_ready = w1_df[['case:concept:name', 'concept:name', 'time:timestamp']].dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  w1_df['session_id'] = w1_df['fullVisitorId'].astype(str) + '_' + w1_df['visitId'].astype(str)


In [8]:
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.util import dataframe_utils

# 1. 필요한 컬럼만 정제
df_log_ready = w1_df[['case:concept:name', 'concept:name', 'time:timestamp']].copy()

# 2. time:timestamp 컬럼이 datetime인지 확인하고 변환 (ms 단위라면)
#datetime 인지 확인 하는 과정
if not pd.api.types.is_datetime64_any_dtype(df_log_ready['time:timestamp']):
    df_log_ready['time:timestamp'] = pd.to_datetime(df_log_ready['time:timestamp'], unit='ms', errors='coerce')

# 3. null 제거 (시간이 없는 경우 제거)
df_log_ready = df_log_ready.dropna(subset=['time:timestamp'])

In [9]:
from pm4py.objects.conversion.log import converter as log_converter

#이벤트 로그 형태로 변경
# pandas DataFrame → Event Log
event_log = log_converter.apply(df_log_ready, variant=log_converter.Variants.TO_EVENT_LOG)

In [10]:
from pm4py.algo.discovery.dfg import algorithm as dfg_discovery
from pm4py.visualization.dfg import visualizer as dfg_visualization

# DFG 생성
dfg = dfg_discovery.apply(event_log)

# 시각화
gviz = dfg_visualization.apply(dfg, variant=dfg_visualization.Variants.FREQUENCY)
dfg_visualization.view(gviz)

KeyboardInterrupt: 

In [11]:
from pm4py.algo.discovery.heuristics import algorithm as heuristics_miner
from pm4py.visualization.heuristics_net import visualizer as hn_visualizer

heu_net = heuristics_miner.apply_heu(event_log)
gviz = hn_visualizer.apply(heu_net)
hn_visualizer.view(gviz)

KeyboardInterrupt: 