In [9]:
import pandas as pd

In [10]:
# Synthetic retention data for the 5/8~5/14 signup cohort.
base_users = 1000  # number of users in the cohort at Week 0
# Share of the Week 0 users that is kept for each week.
retention_rates = {
    "Week 0": 1.00,
    "Week 1": 0.55,
    "Week 2": 0.39,
    "Week 3": 0.30,
    "Week 4": 0.23,
    "Week 5": 0.19,
    "Week 6": 0.16,
}

# Zero-padded IDs for the whole cohort: user_B0001 .. user_B1000.
user_ids = [f"user_B{n:04d}" for n in range(1, base_users + 1)]

# One row per (week, retained user). Each week keeps the first N IDs, so the
# same user_id shows up in every week it is retained for.
cohort_rows = [
    {"user_id": uid, "signup_cohort": "5/8~5/14", "week": week}
    for week, rate in retention_rates.items()
    # round() instead of int(): a float product can fall just short of the
    # intended whole number (100 * 0.29 == 28.999999999999996) and int()
    # would then drop one user.
    for uid in user_ids[:round(base_users * rate)]
]

df_5_8_to_5_14 = pd.DataFrame(cohort_rows)


In [11]:
# Preview the sequentially-sliced cohort frame (pandas elides the middle rows).
df_5_8_to_5_14

Unnamed: 0,user_id,signup_cohort,week
0,user_B0001,5/8~5/14,Week 0
1,user_B0002,5/8~5/14,Week 0
2,user_B0003,5/8~5/14,Week 0
3,user_B0004,5/8~5/14,Week 0
4,user_B0005,5/8~5/14,Week 0
...,...,...,...
2815,user_B0156,5/8~5/14,Week 6
2816,user_B0157,5/8~5/14,Week 6
2817,user_B0158,5/8~5/14,Week 6
2818,user_B0159,5/8~5/14,Week 6


In [12]:
import random

# Dedicated generator with a fixed seed for reproducible draws. It produces the
# same sequence as seeding the module-level RNG with 42, but leaves the shared
# global random state untouched.
rng = random.Random(42)

# One independent random draw per week, so a user picked for a later week is
# not necessarily among the users picked for the earlier weeks.
cohort_rows_randomized = [
    {"user_id": uid, "signup_cohort": "5/8~5/14", "week": week}
    for week, rate in retention_rates.items()
    # round() instead of int(): a float product can fall just short of the
    # intended whole number (100 * 0.29 == 28.999999999999996) and int()
    # would then drop one user.
    for uid in rng.sample(user_ids, round(base_users * rate))
]

# Rebinds the name used for this cohort's sequentially-sliced frame.
df_5_8_to_5_14 = pd.DataFrame(cohort_rows_randomized)

In [13]:
# Preview the cohort frame after the randomized rebuild (pandas elides the middle rows).
df_5_8_to_5_14

Unnamed: 0,user_id,signup_cohort,week
0,user_B0655,5/8~5/14,Week 0
1,user_B0115,5/8~5/14,Week 0
2,user_B0026,5/8~5/14,Week 0
3,user_B0760,5/8~5/14,Week 0
4,user_B0282,5/8~5/14,Week 0
...,...,...,...
2815,user_B0618,5/8~5/14,Week 6
2816,user_B0110,5/8~5/14,Week 6
2817,user_B0344,5/8~5/14,Week 6
2818,user_B0292,5/8~5/14,Week 6


In [14]:
# Synthetic retention data for the 5/15~5/21 signup cohort.
base_users = 1000  # number of users in the cohort at Week 0
# Share of the Week 0 users that is kept for each week.
retention_rates = {
    "Week 0": 1.00,
    "Week 1": 0.52,
    "Week 2": 0.37,
    "Week 3": 0.29,
    "Week 4": 0.21,
    "Week 5": 0.18,
}

# Zero-padded IDs for the whole cohort: user_C0001 .. user_C1000.
user_ids = [f"user_C{n:04d}" for n in range(1, base_users + 1)]

# One row per (week, retained user). Each week keeps the first N IDs, so the
# same user_id shows up in every week it is retained for.
cohort_rows = [
    {"user_id": uid, "signup_cohort": "5/15~5/21", "week": week}
    for week, rate in retention_rates.items()
    # round() instead of int(): a float product can fall just short of the
    # intended whole number (100 * 0.29 == 28.999999999999996) and int()
    # would then drop one user.
    for uid in user_ids[:round(base_users * rate)]
]

df_5_15_to_5_21 = pd.DataFrame(cohort_rows)

In [15]:
import random

# Dedicated generator with a fixed seed for reproducible draws. It produces the
# same sequence as seeding the module-level RNG with 42, but leaves the shared
# global random state untouched.
rng = random.Random(42)

# One independent random draw per week, so a user picked for a later week is
# not necessarily among the users picked for the earlier weeks.
cohort_rows_randomized = [
    {"user_id": uid, "signup_cohort": "5/15~5/21", "week": week}
    for week, rate in retention_rates.items()
    # round() instead of int(): a float product can fall just short of the
    # intended whole number (100 * 0.29 == 28.999999999999996) and int()
    # would then drop one user.
    for uid in rng.sample(user_ids, round(base_users * rate))
]

# Rebinds the name used for this cohort's sequentially-sliced frame.
df_5_15_to_5_21 = pd.DataFrame(cohort_rows_randomized)

In [16]:
# Synthetic retention data for the 5/22~5/28 signup cohort.
base_users = 1000  # number of users in the cohort at Week 0
# Share of the Week 0 users that is kept for each week.
retention_rates = {
    "Week 0": 1.00,
    "Week 1": 0.50,
    "Week 2": 0.35,
    "Week 3": 0.27,
    "Week 4": 0.20,
}

# Zero-padded IDs for the whole cohort: user_D0001 .. user_D1000.
user_ids = [f"user_D{n:04d}" for n in range(1, base_users + 1)]

# One row per (week, retained user). Each week keeps the first N IDs, so the
# same user_id shows up in every week it is retained for.
cohort_rows = [
    {"user_id": uid, "signup_cohort": "5/22~5/28", "week": week}
    for week, rate in retention_rates.items()
    # round() instead of int(): a float product can fall just short of the
    # intended whole number (100 * 0.29 == 28.999999999999996) and int()
    # would then drop one user.
    for uid in user_ids[:round(base_users * rate)]
]

df_5_22_to_5_28 = pd.DataFrame(cohort_rows)

In [17]:
import random

# Dedicated generator with a fixed seed for reproducible draws. It produces the
# same sequence as seeding the module-level RNG with 42, but leaves the shared
# global random state untouched.
rng = random.Random(42)

# One independent random draw per week, so a user picked for a later week is
# not necessarily among the users picked for the earlier weeks.
cohort_rows_randomized = [
    {"user_id": uid, "signup_cohort": "5/22~5/28", "week": week}
    for week, rate in retention_rates.items()
    # round() instead of int(): a float product can fall just short of the
    # intended whole number (100 * 0.29 == 28.999999999999996) and int()
    # would then drop one user.
    for uid in rng.sample(user_ids, round(base_users * rate))
]

# Rebinds the name used for this cohort's sequentially-sliced frame.
df_5_22_to_5_28 = pd.DataFrame(cohort_rows_randomized)

In [18]:
# Synthetic retention data for the 5/29~6/4 signup cohort.
base_users = 1000  # number of users in the cohort at Week 0
# Share of the Week 0 users that is kept for each week.
retention_rates = {
    "Week 0": 1.00,
    "Week 1": 0.47,
    "Week 2": 0.32,
    "Week 3": 0.25,
}

# Zero-padded IDs for the whole cohort: user_E0001 .. user_E1000.
user_ids = [f"user_E{n:04d}" for n in range(1, base_users + 1)]

# One row per (week, retained user). Each week keeps the first N IDs, so the
# same user_id shows up in every week it is retained for.
cohort_rows = [
    {"user_id": uid, "signup_cohort": "5/29~6/4", "week": week}
    for week, rate in retention_rates.items()
    # round() instead of int(): a float product can fall just short of the
    # intended whole number (100 * 0.29 == 28.999999999999996) and int()
    # would then drop one user.
    for uid in user_ids[:round(base_users * rate)]
]

df_5_29_to_6_4 = pd.DataFrame(cohort_rows)

In [19]:
import random

# Dedicated generator with a fixed seed for reproducible draws. It produces the
# same sequence as seeding the module-level RNG with 42, but leaves the shared
# global random state untouched.
rng = random.Random(42)

# One independent random draw per week, so a user picked for a later week is
# not necessarily among the users picked for the earlier weeks.
cohort_rows_randomized = [
    {"user_id": uid, "signup_cohort": "5/29~6/4", "week": week}
    for week, rate in retention_rates.items()
    # round() instead of int(): a float product can fall just short of the
    # intended whole number (100 * 0.29 == 28.999999999999996) and int()
    # would then drop one user.
    for uid in rng.sample(user_ids, round(base_users * rate))
]

# Rebinds the name used for this cohort's sequentially-sliced frame.
df_5_29_to_6_4 = pd.DataFrame(cohort_rows_randomized)

In [20]:
# Synthetic retention data for the 6/5~6/11 signup cohort.
base_users = 1000  # number of users in the cohort at Week 0
# Share of the Week 0 users that is kept for each week.
retention_rates = {
    "Week 0": 1.00,
    "Week 1": 0.45,
    "Week 2": 0.30,
}

# Zero-padded IDs for the whole cohort: user_F0001 .. user_F1000.
user_ids = [f"user_F{n:04d}" for n in range(1, base_users + 1)]

# One row per (week, retained user). Each week keeps the first N IDs, so the
# same user_id shows up in every week it is retained for.
cohort_rows = [
    {"user_id": uid, "signup_cohort": "6/5~6/11", "week": week}
    for week, rate in retention_rates.items()
    # round() instead of int(): a float product can fall just short of the
    # intended whole number (100 * 0.29 == 28.999999999999996) and int()
    # would then drop one user.
    for uid in user_ids[:round(base_users * rate)]
]

df_6_5_to_6_11 = pd.DataFrame(cohort_rows)

In [21]:
import random

# Dedicated generator with a fixed seed for reproducible draws. It produces the
# same sequence as seeding the module-level RNG with 42, but leaves the shared
# global random state untouched.
rng = random.Random(42)

# One independent random draw per week, so a user picked for a later week is
# not necessarily among the users picked for the earlier weeks.
cohort_rows_randomized = [
    {"user_id": uid, "signup_cohort": "6/5~6/11", "week": week}
    for week, rate in retention_rates.items()
    # round() instead of int(): a float product can fall just short of the
    # intended whole number (100 * 0.29 == 28.999999999999996) and int()
    # would then drop one user.
    for uid in rng.sample(user_ids, round(base_users * rate))
]

# Rebinds the name used for this cohort's sequentially-sliced frame.
df_6_5_to_6_11 = pd.DataFrame(cohort_rows_randomized)

In [22]:
# Synthetic retention data for the 6/12~6/18 signup cohort.
base_users = 1000  # number of users in the cohort at Week 0
# Share of the Week 0 users that is kept for each week.
retention_rates = {
    "Week 0": 1.00,
    "Week 1": 0.43,
}

# Zero-padded IDs for the whole cohort: user_G0001 .. user_G1000.
user_ids = [f"user_G{n:04d}" for n in range(1, base_users + 1)]

# One row per (week, retained user). Each week keeps the first N IDs, so the
# same user_id shows up in every week it is retained for.
cohort_rows = [
    {"user_id": uid, "signup_cohort": "6/12~6/18", "week": week}
    for week, rate in retention_rates.items()
    # round() instead of int(): a float product can fall just short of the
    # intended whole number (100 * 0.29 == 28.999999999999996) and int()
    # would then drop one user.
    for uid in user_ids[:round(base_users * rate)]
]

df_6_12_to_6_18 = pd.DataFrame(cohort_rows)

In [23]:
import random

# Dedicated generator with a fixed seed for reproducible draws. It produces the
# same sequence as seeding the module-level RNG with 42, but leaves the shared
# global random state untouched.
rng = random.Random(42)

# One independent random draw per week, so a user picked for a later week is
# not necessarily among the users picked for the earlier weeks.
cohort_rows_randomized = [
    {"user_id": uid, "signup_cohort": "6/12~6/18", "week": week}
    for week, rate in retention_rates.items()
    # round() instead of int(): a float product can fall just short of the
    # intended whole number (100 * 0.29 == 28.999999999999996) and int()
    # would then drop one user.
    for uid in rng.sample(user_ids, round(base_users * rate))
]

# Rebinds the name used for this cohort's sequentially-sliced frame.
df_6_12_to_6_18 = pd.DataFrame(cohort_rows_randomized)

In [24]:
# Synthetic retention data for the 6/19~6/25 signup cohort.
base_users = 1000  # number of users in the cohort at Week 0
# Share of the Week 0 users that is kept for each week (only Week 0 is listed).
retention_rates = {
    "Week 0": 1.00,
}

# Zero-padded IDs for the whole cohort: user_H0001 .. user_H1000.
user_ids = [f"user_H{n:04d}" for n in range(1, base_users + 1)]

# One row per (week, retained user). Each week keeps the first N IDs.
cohort_rows = [
    {"user_id": uid, "signup_cohort": "6/19~6/25", "week": week}
    for week, rate in retention_rates.items()
    # round() instead of int(): a float product can fall just short of the
    # intended whole number (100 * 0.29 == 28.999999999999996) and int()
    # would then drop one user.
    for uid in user_ids[:round(base_users * rate)]
]

df_6_19_to_6_25 = pd.DataFrame(cohort_rows)

In [None]:
import random

# Dedicated generator with a fixed seed for reproducible draws. It produces the
# same sequence as seeding the module-level RNG with 42, but leaves the shared
# global random state untouched.
rng = random.Random(42)

# One independent random draw per week listed in retention_rates.
cohort_rows_randomized = [
    {"user_id": uid, "signup_cohort": "6/19~6/25", "week": week}
    for week, rate in retention_rates.items()
    # round() instead of int(): a float product can fall just short of the
    # intended whole number (100 * 0.29 == 28.999999999999996) and int()
    # would then drop one user.
    for uid in rng.sample(user_ids, round(base_users * rate))
]

# Rebinds the name used for this cohort's sequentially-sliced frame.
df_6_19_to_6_25 = pd.DataFrame(cohort_rows_randomized)

In [27]:
# Stack the seven weekly cohort frames, oldest signup week first, into a
# single frame with a fresh 0..n-1 index.
cohort_frames = [
    df_5_8_to_5_14,
    df_5_15_to_5_21,
    df_5_22_to_5_28,
    df_5_29_to_6_4,
    df_6_5_to_6_11,
    df_6_12_to_6_18,
    df_6_19_to_6_25,
]
combined_df = pd.concat(cohort_frames, ignore_index=True)

In [29]:
# Load a previously saved cohort file. The file name suggests it holds the
# 5/1~5/7 cohort -- TODO confirm its columns match combined_df.
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this file exists.
prior_cohort_csv = "D:/김동영/94_sqldata/Randomized_5_1_to_5_7_Cohort_Users_with_A.csv"
df = pd.read_csv(prior_cohort_csv)


In [32]:
# Append the rows loaded from CSV after the generated cohorts, renumber the
# index, and write the result without the index column.
# NOTE(review): absolute, machine-specific output path.
output_csv = "D:/김동영/94_sqldata/Cohort_Users.csv"
all_frames = [combined_df, df]
merged_df = pd.concat(all_frames, ignore_index=True)
merged_df.to_csv(output_csv, index=False)