In [1]:
import os
import io
import pandas as pd
from google.cloud import storage
import ast
import matplotlib.pyplot as plt 
import seaborn as sns
import networkx as nx
import warnings
import dask.dataframe as dd
pd.set_option('display.max_columns', None)
warnings.filterwarnings('ignore')

key1 = os.getenv('GCP_KEY_1')
key2 = os.getenv('GCP_KEY_2')


#### 친구

In [2]:
accounts_user = dd.read_parquet('gs://sprint_final_project/final_project/processed/processed_accounts_user_processed.parquet', engine='pyarrow', storage_options={'token': key2})

In [3]:
accounts_friendrequest = dd.read_parquet('gs://sprintda05_final_project/votes/accounts_friendrequest.parquet', engine='pyarrow',storage_options={'token': key1})

In [4]:
accounts_user_contacts = dd.read_parquet('gs://sprintda05_final_project/votes/accounts_user_contacts.parquet', engine='pyarrow',storage_options={'token': key1})

In [None]:
accounts_user = accounts_user[['id','friend_count']]

In [9]:
accounts_friendrequest = accounts_friendrequest[['send_user_id','receive_user_id','status']]

In [None]:
accounts_user_contacts = accounts_user_contacts[['user_id','contacts_count']]

In [11]:
accounts_friendrequest = accounts_friendrequest.compute()

In [17]:
accounts_friendrequest

Unnamed: 0,send_user_id,receive_user_id,status
0,837521,831962,P
1,837521,832151,A
2,837521,832340,A
3,837521,833041,A
4,837521,834415,P
...,...,...,...
17147170,1583731,1583730,P
17147171,1583673,1583731,A
17147172,1575671,1299129,P
17147173,1392729,1304403,R


In [None]:
# series가 concat될 때는 index를 기준으로 됨

In [12]:
# 1. friend_requests_sent: 유저가 보낸 요청 수
sent = accounts_friendrequest.groupby('send_user_id').size().rename('friend_requests_sent')

# 2. friend_requests_received: 유저가 받은 요청 수
received = accounts_friendrequest.groupby('receive_user_id').size().rename('friend_requests_received')

# 3. friend_requests_accepted: 유저가 보낸 요청 중 수락된 횟수
accepted = accounts_friendrequest[accounts_friendrequest['status'] == 'A'].groupby('send_user_id').size().rename('friend_requests_accepted')

# 4. friend_request_success_rate: 수락률 = 수락된 요청 수 / 보낸 요청 수
# 먼저 join
summary = pd.concat([sent, received, accepted], axis=1).fillna(0)

# 수락률 계산
summary['friend_request_success_rate'] = summary['friend_requests_accepted'] / summary['friend_requests_sent']
summary['friend_request_success_rate'] = summary['friend_request_success_rate'].fillna(0)

# 결과 확인
summary = summary.reset_index().rename(columns={'send_user_id': 'user_id'})


In [24]:
summary

Unnamed: 0,user_id,friend_requests_sent,friend_requests_received,friend_requests_accepted,friend_request_success_rate
0,831962,1.0,26.0,1.0,1.000000
1,832151,10.0,35.0,2.0,0.200000
2,832340,26.0,25.0,7.0,0.269231
3,833024,2.0,35.0,1.0,0.500000
4,833041,35.0,26.0,8.0,0.228571
...,...,...,...,...,...
673700,1583381,0.0,1.0,0.0,0.000000
673701,1583392,0.0,1.0,0.0,0.000000
673702,1583645,0.0,1.0,0.0,0.000000
673703,1583660,0.0,1.0,0.0,0.000000


In [21]:
summary = summary.rename(columns={'index':'user_id'})

In [14]:
accounts_user = accounts_user.rename(columns={'id':'user_id'})

In [16]:
accounts_user = accounts_user.compute()
accounts_user_contacts = accounts_user_contacts.compute()

In [19]:
vote_df = pd.merge(accounts_user,accounts_user_contacts,how='inner',on='user_id').drop(columns=['friend_id_list','invite_user_id_list'])

In [22]:
vote_df = pd.merge(vote_df,summary,how='inner',on='user_id')

In [23]:
vote_df

Unnamed: 0,user_id,friend_count,contacts_count,friend_requests_sent,friend_requests_received,friend_requests_accepted,friend_request_success_rate
0,847375,165,67,31.0,130.0,23.0,0.741935
1,849436,98,31,26.0,71.0,20.0,0.769231
2,849438,131,37,53.0,78.0,42.0,0.792453
3,849439,104,50,16.0,86.0,12.0,0.750000
4,849441,113,47,26.0,82.0,21.0,0.807692
...,...,...,...,...,...,...,...
5043,1582145,16,0,16.0,0.0,0.0,0.000000
5044,1582558,30,22,30.0,0.0,0.0,0.000000
5045,1582765,30,6,30.0,0.0,0.0,0.000000
5046,1582865,0,19,1.0,0.0,0.0,0.000000


#### 투표 대상 경험 효과

In [2]:
accounts_userquestionrecord = dd.read_parquet('gs://sprint_final_project/final_project/processed/processed_accounts_userquestionrecord_processed.parquet', engine='pyarrow', storage_options={'token': key2})

In [3]:
polls_usercandidate = dd.read_parquet('gs://sprint_final_project/final_project/votes/polls_usercandidate.parquet', engine='pyarrow', storage_options={'token': key2})

In [15]:
accounts_user = dd.read_parquet('gs://sprint_final_project/final_project/votes/accounts_user.parquet', engine='pyarrow', storage_options={'token': key2})

In [6]:
accounts_userquestionrecord

Unnamed: 0_level_0,id,status,created_at,chosen_user_id,question_id,user_id,question_piece_id,has_read,answer_status,answer_updated_at,report_count,opened_times,is_self_love
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
,int64,string,datetime64[ns],int64,int64,int64,int64,int64,string,datetime64[ns],int64,int64,bool
,...,...,...,...,...,...,...,...,...,...,...,...,...


In [4]:
accounts_userquestionrecord = accounts_userquestionrecord[['chosen_user_id','question_id','opened_times']]

In [7]:
polls_usercandidate

Unnamed: 0_level_0,id,created_at,question_piece_id,user_id
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,int64,datetime64[ns],int64,int64
,...,...,...,...


In [5]:
polls_usercandidate = polls_usercandidate[['user_id','question_piece_id']]

In [6]:
accounts_userquestionrecord = accounts_userquestionrecord.compute()
polls_usercandidate = polls_usercandidate.compute()

In [13]:
accounts_userquestionrecord

Unnamed: 0,chosen_user_id,question_id,opened_times
0,849469,252,0
1,849446,244,0
2,849454,183,0
3,847375,101,0
4,849477,209,0
...,...,...,...
1217553,945560,2614,0
1217554,850774,1679,0
1217555,855509,2605,0
1217556,855526,3801,0


In [14]:
polls_usercandidate

Unnamed: 0,user_id,question_piece_id
0,849444,998458
1,849454,998458
2,849460,998458
3,849469,998458
4,849446,998459
...,...,...
4769604,857296,200139933
4769605,850774,200139934
4769606,856446,200139934
4769607,857101,200139934


In [7]:
import pandas as pd


# 1. 선택된 횟수 및 평균 opened_times
chosen_agg = (
    accounts_userquestionrecord
    .groupby('chosen_user_id')
    .agg(times_chosen=('question_id', 'count'),
         average_opened_times=('opened_times', 'mean'))
)

# 2. 후보로 등장한 횟수
candidate_agg = (
    polls_usercandidate
    .groupby('user_id')
    .agg(times_as_candidate=('question_piece_id', 'count'))
)

# 3. 병합 (user_id 기준)
summary = pd.merge(
    chosen_agg, 
    candidate_agg, 
    left_index=True, 
    right_index=True, 
    how='outer'
).fillna(0)

# 4. popularity_score 계산
summary['popularity_score'] = summary['times_chosen'] / summary['times_as_candidate']
summary['popularity_score'] = summary['popularity_score'].fillna(0)

# 인덱스를 컬럼으로
summary = summary.reset_index().rename(columns={'chosen_user_id': 'user_id'})




In [None]:
summary = summary.rename(columns={'index' : 'user_id'})

In [10]:
summary

Unnamed: 0,user_id,times_chosen,average_opened_times,times_as_candidate,popularity_score
0,833112,2.0,0.00,5,0.400000
1,833113,20.0,0.05,69,0.289855
2,833154,0.0,0.00,19,0.000000
3,833202,0.0,0.00,2,0.000000
4,833203,116.0,0.00,256,0.453125
...,...,...,...,...,...
19989,1579418,0.0,0.00,1,0.000000
19990,1579422,4.0,0.00,6,0.666667
19991,1579659,0.0,0.00,3,0.000000
19992,1580578,0.0,0.00,3,0.000000


In [20]:
accounts_user = accounts_user[['id']].compute()
accounts_user = accounts_user.rename(columns={'id':'user_id'})

In [22]:
summary = pd.merge(summary,accounts_user,how='outer',on='user_id')

In [None]:
summary.to_parquet('gs://sprintda05_final_project/machine_learning/votes/vote_for.parquet',engine='pyarrow', storage_options={'token': key1})

#### reject

In [27]:
accounts_user = dd.read_parquet('gs://sprint_final_project/final_project/processed/processed_accounts_user_processed.parquet', engine='pyarrow', storage_options={'token': key2})

In [28]:
hackle_properties = dd.read_parquet('gs://sprintda05_final_project/hackle/hackle_properties.parquet', engine='pyarrow', storage_options={'token': key1})

In [29]:
hackle_events = dd.read_parquet('gs://sprintda05_final_project/hackle/hackle_events.parquet', engine='pyarrow', storage_options={'token': key1})

In [30]:
accounts_user_id = accounts_user[['id']].compute()

In [31]:
accounts_user_id = accounts_user_id.rename(columns={'id':'user_id'})

In [32]:
accounts_user_id['user_id'] = accounts_user_id['user_id'].astype(str)

In [33]:
hackle_properties = hackle_properties[['session_id','user_id']].compute()

In [34]:
unique_id = pd.merge(hackle_properties,accounts_user_id,how='inner',on='user_id').drop_duplicates()

In [35]:
hackle_events = hackle_events[['event_datetime','event_key','session_id']].compute()

In [36]:
events = pd.merge(hackle_events,unique_id,on='session_id',how='inner')

In [37]:
df = events.drop_duplicates()

In [2]:
hackle = dd.read_parquet('gs://sprintda05_final_project/final/FINAL_COMPLETE_EVENTS_MAPPED.parquet', engine='pyarrow', storage_options={'token': key1})

In [3]:
df = hackle

In [4]:
df= df.loc[(df['event_key'] == 'click_appbar_friend_plus') | (df['event_key'] == 'click_friend_invite')].sort_values(by=['event_datetime']).reset_index(drop=True)

In [5]:
df = df.compute()

In [6]:
df.to_parquet('gs://sprintda05_final_project/machine_learning/hackle/reject.parquet',engine='pyarrow', storage_options={'token': key1})

#### 친구추가

In [None]:
# 친구 관련 이벤트:
# - click_appbar_friend_plus: 친구 추가 버튼 클릭
# - click_autoadd_contact: 연락처 자동 친구 추가
# - click_friend_invite: 친구 초대 클릭
# - click_invite_friend: ask에서 친구 초대
# - click_copy_profile_link_ask, click_copy_profile_link_profile: 프로필 링크 공유
# - view_friendplus_tap: 친구 추천 화면 진입

# 파생 변수:
# - total_friend_actions: 모든 친구 관련 액션 합계
# - friend_acquisition_effort: 친구 추가 시도 총합

In [30]:
accounts_user = dd.read_parquet('gs://sprint_final_project/final_project/processed/processed_accounts_user_processed.parquet', engine='pyarrow', storage_options={'token': key2})

In [31]:
hackle_properties = dd.read_parquet('gs://sprintda05_final_project/hackle/hackle_properties.parquet', engine='pyarrow', storage_options={'token': key1})

In [32]:
hackle_events = dd.read_parquet('gs://sprintda05_final_project/hackle/hackle_events.parquet', engine='pyarrow', storage_options={'token': key1})

In [33]:
accounts_user_id = accounts_user[['id']].compute()

In [34]:
accounts_user_id = accounts_user_id.rename(columns={'id':'user_id'})

In [35]:
accounts_user_id['user_id'] = accounts_user_id['user_id'].astype(str)

In [36]:
hackle_properties = hackle_properties[['session_id','user_id']].compute()

In [37]:
unique_id = pd.merge(hackle_properties,accounts_user_id,how='inner',on='user_id').drop_duplicates()

In [38]:
hackle_events = hackle_events[['event_datetime','event_key','session_id']].compute()

In [39]:
events = pd.merge(hackle_events,unique_id,on='session_id',how='inner')

KeyboardInterrupt: 

In [None]:
df = events.drop_duplicates()

In [None]:
event=['click_appbar_friend_plus','click_autoadd_contact','click_friend_invite','click_invite_friend','click_copy_profile_link_ask',\
    'click_copy_profile_link_profile','view_friendplus_tap']

In [None]:
friends_relation = df[df['event_key'].isin(event)]

In [None]:
add_try = friends_relation[friends_relation['event_key']=='click_appbar_friend_plus']
add = (add_try.groupby('user_id').agg(friend_acquisition_effort = ('session_id','count')))
total = (friends_relation.groupby('user_id').agg(total_friend_actions =('session_id','count')))

In [None]:
summary = pd.merge(
    add, 
    total, 
    left_index=True, 
    right_index=True, 
    how='outer'
).fillna(0)

In [None]:
summary

Unnamed: 0_level_0,friend_acquisition_effort,total_friend_actions
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1000013,3.0,3
1000072,1.0,1
1000145,1.0,1
1000175,6.0,6
1000267,1.0,1
...,...,...
999892,1.0,1
999909,1.0,1
999926,2.0,3
999942,1.0,1


In [None]:
summary.to_parquet('gs://sprintda05_final_project/machine_learning/hackle/friends_relationship.parquet',engine='pyarrow', storage_options={'token': key1})

##### 2

In [2]:
hackle = dd.read_parquet('gs://sprintda05_final_project/final/FINAL_COMPLETE_EVENTS_MAPPED.parquet', engine='pyarrow', storage_options={'token': key1})

In [None]:
# 친구 관련 이벤트:
# - click_appbar_friend_plus: 친구 추가 버튼 클릭
# - click_autoadd_contact: 연락처 자동 친구 추가
# - click_friend_invite: 친구 초대 클릭
# - click_invite_friend: ask에서 친구 초대
# - click_copy_profile_link_ask, click_copy_profile_link_profile: 프로필 링크 공유
# - view_friendplus_tap: 친구 추천 화면 진입

# 파생 변수:
# - total_friend_actions: 모든 친구 관련 액션 합계
# - friend_acquisition_effort: 친구 추가 시도 총합

In [3]:
hackle[['user_id','session_id','signup_date','event_datetime','event_key']].compute()

Unnamed: 0,user_id,session_id,signup_date,event_datetime,event_key
0,833202.0,d3PU5u3G3JM0v7wnCwbKgdALdMu1,2023-03-31 15:58:12.136011,2023-04-01 00:58:05.838,$session_start
1,833203.0,Ha3pmVTQw3XhVC9cKwKm6VGTN6s1,2023-03-31 15:58:14.619962,2023-04-01 00:58:08.816,$session_start
2,833202.0,d3PU5u3G3JM0v7wnCwbKgdALdMu1,2023-03-31 15:58:12.136011,2023-04-01 00:58:09.797,button_다음
3,833202.0,d3PU5u3G3JM0v7wnCwbKgdALdMu1,2023-03-31 15:58:12.136011,2023-04-01 00:58:11.050,button_여자
4,833203.0,Ha3pmVTQw3XhVC9cKwKm6VGTN6s1,2023-03-31 15:58:14.619962,2023-04-01 00:58:12.519,button_다음
...,...,...,...,...,...
339735,1352161.0,XDbQMrsuMDcMtOgZIRPhJprlVxo2,2023-05-17 11:53:44.094835,2023-09-21 23:59:58.937,view_friend_plus_tap
339736,1153206.0,KNXixwjInnXAY9wEMix6srY1zfp2,2023-05-11 23:39:17.509678,2023-09-21 23:59:59.250,view_home_tap
339737,1490721.0,86FC8620-D10B-44AB-9ACC-4E6160B5DC49,2023-05-24 10:04:14.846109,2023-09-21 23:59:59.676,skip_question_question
339738,1537169.0,781b1c83-b35e-471d-b702-e850b49bceff,2023-05-28 12:07:02.791703,2023-09-21 23:59:59.942,$session_start


In [None]:
# - `button_내 학교 찾아보기`
# - `button_친구 불러오기`
# - `button_연락처 불러오기`
# - `button_사진첩에서 불러오기`
# - `button_사진첩에서 추가`
# - `button_직접 촬영`
# - `button_직접 촬영하기`
# - `button_친구 초대하고 바로 받기`
# - `button_초대 링크 복사`
# - `button_초대링크 복사하기!`
# - `button_친구들에게 알리기`

In [4]:
df = hackle[['user_id','session_id','signup_date','event_datetime','event_key']]

In [5]:
event=['click_appbar_friend_plus','click_autoadd_contact','click_friend_invite','click_invite_friend','click_copy_profile_link_ask',\
    'click_copy_profile_link_profile','view_friendplus_tap','button_친구 불러오기','button_친구들에게 알리기','button_초대 링크 복사',\
    'button_초대링크 복사하기!']

In [6]:
add_event=['click_appbar_friend_plus','click_autoadd_contact','click_friend_invite','click_invite_friend','button_친구 불러오기','button_초대 링크 복사','button_초대링크 복사하기!']

In [7]:
friends_relation = df[df['event_key'].isin(event)]

In [8]:
add_try = friends_relation[friends_relation['event_key'].isin(add_event)]
add = (add_try.groupby('user_id').agg(friend_acquisition_effort = ('session_id','count')))
total = (friends_relation.groupby('user_id').agg(total_friend_actions =('session_id','count')))

In [9]:
add = add.compute()

In [11]:
add

Unnamed: 0_level_0,friend_acquisition_effort
user_id,Unnamed: 1_level_1
833202.0,4
833303.0,2
847177.0,2
835057.0,1
832340.0,3
...,...
1291928.0,1
1198957.0,1
1084797.0,1
1219091.0,1


In [10]:
total = total.compute()

In [12]:
total

Unnamed: 0_level_0,total_friend_actions
user_id,Unnamed: 1_level_1
833202.0,13
833303.0,4
847177.0,2
835057.0,1
832340.0,5
...,...
1270157.0,1
1084797.0,1
1219091.0,1
1059551.0,1


In [13]:
summary = pd.merge(
    add, 
    total, 
    left_index=True, 
    right_index=True, 
    how='outer'
).fillna(0)

In [14]:
summary

Unnamed: 0_level_0,friend_acquisition_effort,total_friend_actions
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
832340.0,3.0,5
833041.0,3.0,6
833202.0,4.0,13
833203.0,25.0,32
833303.0,2.0,4
...,...,...
1579924.0,1.0,1
1579925.0,7.0,7
1579927.0,1.0,1
1579929.0,1.0,1


In [15]:
summary.to_parquet('gs://sprintda05_final_project/machine_learning/hackle/friends_relationship.parquet',engine='pyarrow', storage_options={'token': key1})

#### 타임라인 공개 답변 효과

In [3]:
df = dd.read_parquet('gs://sprintda05_final_project/final/FINAL_COMPLETE_EVENTS_MAPPED.parquet', engine='pyarrow', storage_options={'token': key1})

In [None]:
event = ['complete_question','skip_question','click_question_open','click_question_share','view_questions_tap']

In [4]:
df = hackle[['user_id','session_id','signup_date','event_datetime','event_key']]

In [21]:
for key in df['event_key'].unique():
    print(key)

$session_start
button_다음
button_여자
button_남자
button_칭찬 알림 받기
button_확인
button_친구 불러오기
button_설정으로 이동하기
button_about
button_홈
button_votes
button_친구들에게 알리기
button_title
$session_end
button_로그아웃
button_vote start
button_건너뛰기
button_앱바 닫기
button_계속하기
button_홈으로
button_앱바 뒤로가기
button_초기화하고 첫 화면으로
button_내 학교 찾아보기
button_학교 선택
button_자동으로 친구추가
button_동의하고 계속
button_초대링크 복사하기!
button_취소
button_vote 선택
button_닫기
button_초성 힌트 받기
button_인스타그램
button_개인정보 처리방침
button_라이선스
button_이름 셔플!
button_친구선택
button_다음 질문
button_완료
button_이미 가입했다면 로그인!
button_전체 이름 확인하기
button_첫 화면 으로
button_vote 시작
button_오픈하면 알림 받기
button_정말 떠날거니?
button_기타 이유
button_계정 삭제하기
button_프로필 사진 수정
button_사진첩에서 추가
button_이름
button_문의하기
button_친구 초대하고 바로 받기
button_내 실명은 ‘송동건' 맞아!
button_내 실명은 ‘조민서' 맞아!
button_프로필
button_내 실명은 ‘채린' 맞아!
button_내 실명은 ‘구민정' 맞아!
button_내 실명은 ‘박지율' 맞아!
button_사진첩에서 불러오기
button_친구 toggle
button_내 실명은 ‘장정우' 맞아!
button_친구+
button_내 실명은 ‘이두영' 맞아!
button_다시 선택
button_직접 촬영하기
button_내 실명은 ‘박은규' 맞아!
button_내 

In [None]:
event = ['complete_question','skip_question','click_question_open','click_question_share','view_questions_tap']

Unnamed: 0,user_id,session_id,signup_date,event_datetime,event_key
2563450,1534670.0,9d7167b3-6516-4728-9c7a-670315175607,2023-05-28 07:21:48.127030,2023-06-20 23:13:26.919,view_questions_tap
2563456,1534670.0,9d7167b3-6516-4728-9c7a-670315175607,2023-05-28 07:21:48.127030,2023-06-20 23:13:30.160,view_questions_tap
2563488,1576258.0,d76e0335-fe7c-48a7-af5e-e9a139e39192,2023-06-17 09:36:04.590192,2023-06-20 23:13:37.461,view_questions_tap
2563492,1265819.0,bdcoO3CrJnQ6nS0yeUZnK4TzVr22,2023-05-14 13:45:03.890600,2023-06-20 23:13:38.102,view_questions_tap
2563494,1576258.0,d76e0335-fe7c-48a7-af5e-e9a139e39192,2023-06-17 09:36:04.590192,2023-06-20 23:13:39.236,click_question_open
...,...,...,...,...,...
339659,1352161.0,XDbQMrsuMDcMtOgZIRPhJprlVxo2,2023-05-17 11:53:44.094835,2023-09-21 23:59:34.647,view_questions_tap
339661,1486875.0,GZZuPSowauaXYbmDh1WMgUeLqmm2,2023-05-24 06:38:45.801156,2023-09-21 23:59:35.331,view_questions_tap
339674,1385702.0,rHo7BvvQ2vcHdITxeUxyWF8h4403,2023-05-19 12:13:51.080124,2023-09-21 23:59:42.978,view_questions_tap
339682,1394687.0,7UQfXGvCjTX8fhLiI0CuK4ToU5x2,2023-05-20 03:16:52.530339,2023-09-21 23:59:46.191,view_questions_tap


In [4]:
import dask.dataframe as dd

# event_key별 집계 함수: user_id, event_key 그룹별 count
event_counts = df.groupby(['user_id', 'event_key']).size().reset_index()
event_counts = event_counts.rename(columns={0: 'count'})

# 이벤트별 집계 값을 분리해서 각각 user_id 기준으로 만들기
def extract_event_count(df, event_name):
    filtered = df[df['event_key'] == event_name][['user_id', 'count']]
    filtered = filtered.rename(columns={'count': event_name})
    return filtered

complete_df = extract_event_count(event_counts, 'complete_question')
skip_df = extract_event_count(event_counts, 'skip_question')
open_df = extract_event_count(event_counts, 'click_question_open')
share_df = extract_event_count(event_counts, 'click_question_share')

# user_id 기준으로 병합 (left join)
merged = complete_df.merge(skip_df, on='user_id', how='outer')
merged = merged.merge(open_df, on='user_id', how='outer')
merged = merged.merge(share_df, on='user_id', how='outer')

# 결측치는 0으로 채우기
merged = merged.fillna(0)

# 파생 변수 컬럼 추가
merged['question_completion_rate'] = merged['complete_question'] / (merged['complete_question'] + merged['skip_question'] + 1e-9)
merged['question_engagement'] = merged['click_question_open'] + merged['click_question_share']




In [9]:
merged = merged.compute()

In [10]:
merged.to_parquet('gs://sprintda05_final_project/machine_learning/hackle/public_timeline.parquet',engine='pyarrow', storage_options={'token': key1})