<a href="https://colab.research.google.com/github/eunShim/BigDataCapstone/blob/main/bigdatacapstone.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import random
import zipfile

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# =======================================
# 1. Load KT4 sample data (up to 500 users)
# =======================================
# Reads per-user CSV members straight out of the zip archive, computes a small
# set of per-user statistics, and plots their distributions.
zip_path = "/content/EdNet-KT4.zip"
sample_size = 500

user_stats = []

# Open the archive in a context manager so the handle is released even if a
# member fails to parse part-way through the loop.
with zipfile.ZipFile(zip_path) as zf:
    # Each CSV member is treated as one user; its file stem is the user id.
    file_list = [f for f in zf.namelist() if f.endswith(".csv")]
    print("총 사용자 수:", len(file_list))

    # --------------------------
    # 2. Randomly sample up to `sample_size` users
    # --------------------------
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the request at the number of CSVs actually present.
    sample_files = random.sample(file_list, min(sample_size, len(file_list)))

    for f in sample_files:
        try:
            with zf.open(f) as fp:
                df = pd.read_csv(fp)
        except pd.errors.EmptyDataError:
            # Zero-byte member: read_csv raises instead of returning an
            # empty frame, so skip it here.
            continue

        # Header-only file: nothing to summarise.
        if df.empty:
            continue

        # User id is the member's file name without directory or extension.
        user_id = f.split("/")[-1].replace(".csv", "")

        has_action = "action_type" in df.columns

        # Sort by timestamp and take last - first as the elapsed time.
        if "timestamp" in df.columns:
            df = df.sort_values("timestamp")
            duration = df["timestamp"].iloc[-1] - df["timestamp"].iloc[0]
        else:
            duration = np.nan

        # Number of "quit" actions (NaN when the column is missing).
        quit_count = (df["action_type"] == "quit").sum() if has_action else np.nan

        # Share of actions performed on each platform.
        if "platform" in df.columns:
            total_actions = len(df)
            mobile_ratio = (df["platform"] == "mobile").sum() / total_actions
            web_ratio = (df["platform"] == "web").sum() / total_actions
        else:
            mobile_ratio = np.nan
            web_ratio = np.nan

        # Number of "submit" actions per distinct question item
        # (item ids starting with "q").
        if has_action and "item_id" in df.columns:
            submit_count = (df["action_type"] == "submit").sum()
            is_question = df["item_id"].astype(str).str.startswith("q")
            unique_questions = df.loc[is_question, "item_id"].nunique()
            submit_ratio = submit_count / unique_questions if unique_questions > 0 else np.nan
        else:
            # Assign all three so no value leaks in from the previous user.
            submit_count = np.nan
            unique_questions = np.nan
            submit_ratio = np.nan

        # Store the per-user statistics.
        user_stats.append({
            "user_id": user_id,
            "quit_count": quit_count,
            "duration_ms": duration,
            "mobile_ratio": mobile_ratio,
            "web_ratio": web_ratio,
            "submit_count": submit_count,
            "unique_questions": unique_questions,
            "submit_ratio": submit_ratio,
        })

# --------------------------
# 3. Summarise the results
# --------------------------
stats_df = pd.DataFrame(user_stats)
print("\n✅ 사용자별 통계 미리보기:")
print(stats_df.head())

# Summary statistics over all sampled users.
print("\n📈 전체 사용자 통계 요약:")
print(stats_df.describe())

# --------------------------
# 4. Extra visualisation (optional)
# --------------------------
# Already imported at the top of the cell; kept so this section can be
# re-run on its own.
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 4))
stats_df["quit_count"].hist(bins=30)
plt.title("Quit 행동 횟수 분포")
plt.xlabel("quit_count")
plt.ylabel("사용자 수")
plt.show()

plt.figure(figsize=(8, 4))
stats_df["submit_ratio"].hist(bins=30)
plt.title("문항 수 대비 Submit 비율 분포")
plt.xlabel("submit_ratio")
plt.ylabel("사용자 수")
plt.show()

BadZipFile: File is not a zip file

In [None]:
# Install the 7-Zip command-line tool, create the output directory, and
# extract the KT4 archive into it with `7z x`.
# NOTE(review): presumably 7z is used here because Python's zipfile rejected
# this archive (BadZipFile) — confirm the upload itself is complete.
!sudo apt-get install -y p7zip-full
!mkdir -p /content/KT4_extracted
!7z x /content/EdNet-KT4.zip -o/content/KT4_extracted



Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
p7zip-full is already the newest version (16.02+dfsg-8).
0 upgraded, 0 newly installed, 0 to remove and 38 not upgraded.

7-Zip [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,2 CPUs Intel(R) Xeon(R) CPU @ 2.20GHz (406F0),ASM,AES-NI)

Scanning the drive for archives:
  0M Scan /content/                   1 file, 175112192 bytes (167 MiB)

Extracting archive: /content/EdNet-KT4.zip
  0% 256 Open              45% 19968 Open                86% 38400 Open               
ERRORS:
Unexpected end of archive

--
Path = /content/EdNet-KT4.zip
Type = zip
ERRORS:
Unexpected end of archive
Physical Size = 176160768

 18%      1% 669 - KT4/u362904.csv                

In [None]:
# ===============================================
# ✅ EdNet KT4 전체 자동 분석 코드
# ===============================================
import os
import zipfile
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

# -----------------------------------------------
# 1. Extract the archive (using 7z)
# -----------------------------------------------
# Standard output of the apt-get and 7z commands is redirected to /dev/null
# to keep the cell output short.
# NOTE(review): an earlier 7z run on this archive reported "Unexpected end of
# archive"; a truncated upload would yield only a partial extraction —
# confirm the file is complete before trusting the results.
!sudo apt-get install -y p7zip-full > /dev/null
!mkdir -p /content/KT4_extracted
!7z x /content/EdNet-KT4.zip -o/content/KT4_extracted > /dev/null

# -----------------------------------------------
# 2. Collect the CSV file list
# -----------------------------------------------
folder = "/content/KT4_extracted"
sample_size = 500

# Walk the whole tree instead of listing only the top level: the archive
# members carry a directory prefix (e.g. "KT4/u362904.csv"), so the CSVs end
# up in a subfolder of the extraction directory and a flat os.listdir()
# would find none of them.
all_files = []
for root, _dirs, names in os.walk(folder):
    all_files.extend(
        os.path.join(root, name) for name in names if name.endswith(".csv")
    )

print(f"✅ 추출 완료! 총 {len(all_files)}개 CSV 파일 발견")

# Fail with a clear message rather than an opaque error further down.
if not all_files:
    raise FileNotFoundError(
        f"No CSV files found under {folder}; check the extraction step."
    )

# -----------------------------------------------
# 3. Randomly sample up to `sample_size` users
# -----------------------------------------------
# random.sample raises ValueError when asked for more items than exist, so
# cap the request at the number of files actually extracted.
sample_files = random.sample(all_files, min(sample_size, len(all_files)))

user_stats = []

for f in sample_files:
    # Best-effort batch: a failure on one file is reported and the loop moves
    # on to the next file.
    try:
        df = pd.read_csv(f)
        if df.empty:  # skip files with no rows
            continue

        # User id is the file name without its extension.
        user_id = os.path.basename(f).replace(".csv", "")

        has_action = "action_type" in df.columns

        # Elapsed time: last timestamp minus first, after sorting.
        if "timestamp" in df.columns:
            df = df.sort_values("timestamp")
            duration = df["timestamp"].iloc[-1] - df["timestamp"].iloc[0]
        else:
            duration = np.nan

        # Number of "quit" actions (NaN when the column is missing).
        quit_count = (df["action_type"] == "quit").sum() if has_action else np.nan

        # Share of actions performed on each platform.
        if "platform" in df.columns:
            total_actions = len(df)
            mobile_ratio = (df["platform"] == "mobile").sum() / total_actions
            web_ratio = (df["platform"] == "web").sum() / total_actions
        else:
            mobile_ratio = np.nan
            web_ratio = np.nan

        # Number of "submit" actions per distinct question item
        # (item ids starting with "q").
        if has_action and "item_id" in df.columns:
            submit_count = (df["action_type"] == "submit").sum()
            is_question = df["item_id"].astype(str).str.startswith("q")
            unique_questions = df.loc[is_question, "item_id"].nunique()
            submit_ratio = submit_count / unique_questions if unique_questions > 0 else np.nan
        else:
            submit_count = np.nan
            unique_questions = np.nan
            submit_ratio = np.nan

        user_stats.append({
            "user_id": user_id,
            "quit_count": quit_count,
            "duration_ms": duration,
            "mobile_ratio": mobile_ratio,
            "web_ratio": web_ratio,
            "submit_count": submit_count,
            "unique_questions": unique_questions,
            "submit_ratio": submit_ratio,
        })
    except Exception as e:
        print(f"⚠️ {f} 처리 중 오류:", e)

# -----------------------------------------------
# 4. Per-user statistics summary
# -----------------------------------------------
stats_df = pd.DataFrame(user_stats)
print("\n✅ 사용자별 통계 미리보기:")
print(stats_df.head())

print("\n📈 전체 통계 요약:")
print(stats_df.describe())

# -----------------------------------------------
# 5. Basic visualisation
# -----------------------------------------------
plt.figure(figsize=(8, 4))
stats_df["quit_count"].hist(bins=30)
plt.title("Quit 행동 횟수 분포")
plt.xlabel("quit_count")
plt.ylabel("사용자 수")
plt.show()

plt.figure(figsize=(8, 4))
stats_df["submit_ratio"].hist(bins=30)
plt.title("문항 수 대비 Submit 비율 분포")
plt.xlabel("submit_ratio")
plt.ylabel("사용자 수")
plt.show()

# Mean platform share across the sampled users, as a two-bar chart.
plt.figure(figsize=(6, 4))
stats_df[["mobile_ratio", "web_ratio"]].mean().plot(kind='bar', color=['orange', 'skyblue'])
plt.title("평균 플랫폼 비율 (모바일 vs 웹)")
plt.ylabel("비율")
plt.show()

print("\n✅ 분석 완료!")