# 파일 불러오기

In [2]:
import pandas as pd

# Parsing options shared by all three '::'-delimited .dat files.
# Column names are supplied explicitly through `names` in each call.
_dat_opts = {'sep': '::', 'engine': 'python'}

ratings = pd.read_csv(
    'ratings.dat',
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    **_dat_opts,
)

users = pd.read_csv(
    'users.dat',
    names=['user_id', 'gender', 'age', 'occupation', 'zip_code'],
    **_dat_opts,
)

movies = pd.read_csv(
    'movies.dat',
    names=['movie_id', 'title', 'genres'],
    # Read as latin-1; per the original note, adding this option was
    # all that was needed to make this file load.
    encoding='latin-1',
    **_dat_opts,
)


# 불러온 파일 체크


In [3]:
import os

# Show the entries of the current working directory.
print(os.listdir('.'))


['.ipynb_checkpoints', '.venv', 'entertainment.m3u', 'index.category.m3u', 'index.m3u', 'iptv_channels_categories.csv', 'm3u_csv_chang.py', 'movies.dat', 'ratings.dat', 'README', 'requirements.txt', 'streamlit_app.py', 'streamlit_app_02.py', 'test.ipynb', 'users.dat', '연령대별_장르_선호_및_평점.csv', '연령대별_장르_선호_및_평점_한글.csv', '연령대별_장르_채널_매핑.csv', '장르_기반_IPTV_추천_데이터.csv']


In [4]:
# Print the first rows of each loaded table under its own heading.
for label, frame in (
    ("🎬 ratings:\n", ratings),
    ("\n👤 users:\n", users),
    ("\n🎞️ movies:\n", movies),
):
    print(label, frame.head())

🎬 ratings:
    user_id  movie_id  rating  timestamp
0        1      1193       5  978300760
1        1       661       3  978302109
2        1       914       3  978301968
3        1      3408       4  978300275
4        1      2355       5  978824291

👤 users:
    user_id gender  age  occupation zip_code
0        1      F    1          10    48067
1        2      M   56          16    70072
2        3      M   25          15    55117
3        4      M   45           7    02460
4        5      M   25          20    55455

🎞️ movies:
    movie_id                               title                        genres
0         1                    Toy Story (1995)   Animation|Children's|Comedy
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
2         3             Grumpier Old Men (1995)                Comedy|Romance
3         4            Waiting to Exhale (1995)                  Comedy|Drama
4         5  Father of the Bride Part II (1995)                        Comedy

# 파일 합치기

In [5]:
# Attach each rater's user columns to their ratings (inner join on user_id).
df = ratings.merge(users, how='inner', on='user_id')


In [6]:
# Add title and genres for every rated movie (inner join on movie_id).
df = df.merge(movies, how='inner', on='movie_id')


In [7]:
# Preview the first five rows of the merged frame.
print(df.head(5))


   user_id  movie_id  rating  timestamp gender  age  occupation zip_code  \
0        1      1193       5  978300760      F    1          10    48067   
1        1       661       3  978302109      F    1          10    48067   
2        1       914       3  978301968      F    1          10    48067   
3        1      3408       4  978300275      F    1          10    48067   
4        1      2355       5  978824291      F    1          10    48067   

                                    title                        genres  
0  One Flew Over the Cuckoo's Nest (1975)                         Drama  
1        James and the Giant Peach (1996)  Animation|Children's|Musical  
2                     My Fair Lady (1964)               Musical|Romance  
3                  Erin Brockovich (2000)                         Drama  
4                    Bug's Life, A (1998)   Animation|Children's|Comedy  


In [8]:
# Remove rows that are exact duplicates across all columns, keeping the
# first occurrence; df itself is modified in place.
df.drop_duplicates(subset=None, keep='first', inplace=True)


In [9]:
# Mean rating and number of ratings for each (user_id, movie_id) pair,
# with the group keys turned back into ordinary columns.
ratings_per_pair = df.groupby(['user_id', 'movie_id'])['rating']
summary = ratings_per_pair.agg(['mean', 'count']).reset_index()


# 연령대별로 분류

In [10]:
# Maps the numeric codes stored in the `age` column to readable
# age-bracket labels.
age_codes = (1, 18, 25, 35, 45, 50, 56)
age_labels = ('Under 18', '18-24', '25-34', '35-44', '45-49', '50-55', '56+')
age_map = dict(zip(age_codes, age_labels))

# Codes that are not keys of age_map become NaN in the new column.
df['age_group'] = df['age'].map(age_map)


In [11]:
# Count rows per (age_group, genres value), pivot the genres values into
# columns, and fill pairs that never occur with 0.
# NOTE: `genres` holds pipe-joined strings such as
# "Animation|Children's|Comedy", so each resulting column is a genre
# combination rather than a single genre.
genre_counts = df.groupby('age_group')['genres'].value_counts()
genre_by_age = genre_counts.unstack().fillna(0)


In [12]:
# Average rating per age-group label. sort_index() orders the labels as
# strings, which is why 'Under 18' is listed after '56+'.
rating_by_group = df.groupby('age_group')['rating']
avg_rating_by_age = rating_by_group.mean().sort_index()
print(avg_rating_by_age)


age_group
18-24       3.507573
25-34       3.545235
35-44       3.618162
45-49       3.638062
50-55       3.714512
56+         3.766632
Under 18    3.549520
Name: rating, dtype: float64


In [13]:
# Keep only the rows whose age code is 50 or higher.
is_senior = df['age'].ge(50)
senior_df = df[is_senior]


In [14]:
# Display the first five rows of df as the cell result.
df.head(5)

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip_code,title,genres,age_group
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama,Under 18
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),Animation|Children's|Musical,Under 18
2,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),Musical|Romance,Under 18
3,1,3408,4,978300275,F,1,10,48067,Erin Brockovich (2000),Drama,Under 18
4,1,2355,5,978824291,F,1,10,48067,"Bug's Life, A (1998)",Animation|Children's|Comedy,Under 18


# CSV 파일로 내보내기

In [15]:
import pandas as pd

# Korean age-bracket label for each numeric code in the `age` column.
age_map = dict(zip(
    (1, 18, 25, 35, 45, 50, 56),
    ('18세 미만', '18-24세', '25-34세', '35-44세', '45-49세', '50-55세', '56세 이상'),
))

# English genre name -> Korean genre name.
genre_map = {
    'Action': '액션',
    'Adventure': '모험',
    'Animation': '애니메이션',
    "Children's": '아동',
    'Comedy': '코미디',
    'Crime': '범죄',
    'Documentary': '다큐멘터리',
    'Drama': '드라마',
    'Fantasy': '판타지',
    'Film-Noir': '느와르',
    'Horror': '공포',
    'Musical': '뮤지컬',
    'Mystery': '미스터리',
    'Romance': '로맨스',
    'Sci-Fi': 'SF',
    'Thriller': '스릴러',
    'War': '전쟁',
    'Western': '서부',
}

# Add the Korean age-bracket column to the main frame.
df['연령대'] = df['age'].map(age_map)

# On a copy of df, split the pipe-joined `genres` string into a list and
# produce one row per individual genre.
df_exploded = df.assign(장르=df['genres'].str.split('|')).explode('장르')

# Translate each genre to Korean, then discard rows whose genre has no
# entry in genre_map.
df_exploded['장르'] = df_exploded['장르'].map(genre_map)
df_exploded = df_exploded.dropna(subset=['장르'])

# Per (age bracket, genre): number of ratings and mean rating.
genre_summary = (
    df_exploded
    .groupby(['연령대', '장르'])
    .agg(**{
        '시청수': pd.NamedAgg(column='rating', aggfunc='count'),
        '평균평점': pd.NamedAgg(column='rating', aggfunc='mean'),
    })
    .reset_index()
)

# Write the summary to CSV without the row index.
genre_summary.to_csv("연령대별_장르_선호_및_평점_한글.csv", index=False)


# 파일 병합

In [21]:
import pandas as pd

# Load the IPTV channel data.
iptv_df = pd.read_csv("iptv_channels_categories.csv")

# English IPTV category -> Korean genre label.
supported_categories = {
    "Animation": "애니메이션",
    "Comedy": "코미디",
    "Documentary": "다큐멘터리",
    "Kids": "아동",
    "Music": "뮤지컬",
}
# Categories deliberately mapped to None: per the original note they do not
# exist in the movie data, so their rows are removed by dropna() below.
unsupported_categories = (
    "News", "Sports", "Entertainment", "Series",
    "Movies", "General", "Lifestyle", "Undefined",
)
genre_mapping = {**supported_categories, **dict.fromkeys(unsupported_categories)}

# Add the Korean genre column, keep only the channel name and genre, and
# drop every row that has a missing value in either column.
iptv_df = (
    iptv_df
    .assign(장르=iptv_df["category"].map(genre_mapping))
    [["channel_name", "장르"]]
    .dropna()
)

# Save the result (more columns could be kept here if needed).
iptv_df.to_csv("iptv_channels_categories_한글매핑.csv", index=False)


In [20]:
# Show which columns iptv_df currently has.
iptv_columns = iptv_df.columns
print(iptv_columns)


Index(['channel_name', 'category'], dtype='object')
