# 파일 불러오기

In [4]:
import pandas as pd

# The three .dat files use '::' as the field separator. A separator longer
# than one character is handled by the python parser engine, and because the
# column names are supplied explicitly the first line is read as data.
ratings = pd.read_csv(
    'ratings.dat',
    names=['user_id', 'movie_id', 'rating', 'timestamp'],
    header=None,
    sep='::',
    engine='python',
)

users = pd.read_csv(
    'users.dat',
    names=['user_id', 'gender', 'age', 'occupation', 'zip_code'],
    header=None,
    sep='::',
    engine='python',
)

# The original note says that adding the latin-1 encoding alone fixed the
# load of this file (presumably a UTF-8 decode error in the titles - confirm).
movies = pd.read_csv(
    'movies.dat',
    names=['movie_id', 'title', 'genres'],
    header=None,
    sep='::',
    engine='python',
    encoding='latin-1',
)


# 불러온 파일 체크


In [5]:
import os
# Show the entries of the current working directory.
entries = os.listdir('.')
print(entries)


['.ipynb_checkpoints', '.venv', 'movies.dat', 'ratings.dat', 'README', 'requirements.txt', 'test.ipynb', 'users.dat', '연령대별_장르_선호_및_평점.csv']


In [6]:
# Print the first five rows of each loaded frame under its own banner.
for banner, frame in (
    ("🎬 ratings:\n", ratings),
    ("\n👤 users:\n", users),
    ("\n🎞️ movies:\n", movies),
):
    print(banner, frame.head())

🎬 ratings:
    user_id  movie_id  rating  timestamp
0        1      1193       5  978300760
1        1       661       3  978302109
2        1       914       3  978301968
3        1      3408       4  978300275
4        1      2355       5  978824291

👤 users:
    user_id gender  age  occupation zip_code
0        1      F    1          10    48067
1        2      M   56          16    70072
2        3      M   25          15    55117
3        4      M   45           7    02460
4        5      M   25          20    55455

🎞️ movies:
    movie_id                               title                        genres
0         1                    Toy Story (1995)   Animation|Children's|Comedy
1         2                      Jumanji (1995)  Adventure|Children's|Fantasy
2         3             Grumpier Old Men (1995)                Comedy|Romance
3         4            Waiting to Exhale (1995)                  Comedy|Drama
4         5  Father of the Bride Part II (1995)                        Comedy

# 파일 합치기

In [7]:
# Inner-join every rating row with the columns of the user who gave it.
df = ratings.merge(users, how='inner', on='user_id')


In [8]:
# Inner-join the title and genres of the rated movie onto each row.
df = df.merge(movies, how='inner', on='movie_id')


In [9]:
# Preview the first five rows of the merged frame.
preview = df.head(5)
print(preview)


   user_id  movie_id  rating  timestamp gender  age  occupation zip_code  \
0        1      1193       5  978300760      F    1          10    48067   
1        1       661       3  978302109      F    1          10    48067   
2        1       914       3  978301968      F    1          10    48067   
3        1      3408       4  978300275      F    1          10    48067   
4        1      2355       5  978824291      F    1          10    48067   

                                    title                        genres  
0  One Flew Over the Cuckoo's Nest (1975)                         Drama  
1        James and the Giant Peach (1996)  Animation|Children's|Musical  
2                     My Fair Lady (1964)               Musical|Romance  
3                  Erin Brockovich (2000)                         Drama  
4                    Bug's Life, A (1998)   Animation|Children's|Comedy  


In [10]:
# Remove fully duplicated rows in place, keeping the first occurrence of each.
df.drop_duplicates(subset=None, keep='first', inplace=True)


In [11]:
# Mean rating and number of ratings for every (user_id, movie_id) pair,
# with the two keys returned as ordinary columns.
ratings_per_pair = df.groupby(['user_id', 'movie_id'])['rating']
summary = ratings_per_pair.agg(['mean', 'count']).reset_index()


# 연령대별로 분류

In [12]:
# Lookup from the numeric code stored in the 'age' column to a readable
# age-bracket label.
age_map = dict(zip(
    (1, 18, 25, 35, 45, 50, 56),
    ('Under 18', '18-24', '25-34', '35-44', '45-49', '50-55', '56+'),
))

# Codes missing from the lookup become NaN in the new column.
age_labels = df['age'].map(age_map)
df['age_group'] = age_labels


In [13]:
# Count ratings per individual genre within each age group.
# The 'genres' column holds pipe-delimited strings such as
# "Animation|Children's|Comedy"; counting it directly would tally genre
# *combinations*, so each string is split and exploded into one row per genre
# first. A movie with several genres therefore contributes to each of them.
# Result: one row per age_group, one column per genre, 0 where a pair is absent.
genre_by_age = (
    df.assign(genres=df['genres'].str.split('|'))
    .explode('genres')
    .groupby('age_group')['genres']
    .value_counts()
    .unstack()
    .fillna(0)
)


In [14]:
# Mean rating for each age-group label, ordered by the label text.
ratings_by_group = df.groupby('age_group')['rating']
avg_rating_by_age = ratings_by_group.mean().sort_index()
print(avg_rating_by_age)


age_group
18-24       3.507573
25-34       3.545235
35-44       3.618162
45-49       3.638062
50-55       3.714512
56+         3.766632
Under 18    3.549520
Name: rating, dtype: float64


In [15]:
# Keep only rows whose age code is 50 or higher.
is_senior = df['age'] >= 50
senior_df = df.loc[is_senior]


In [16]:
# Display the first five rows, now including the age_group column.
df.head(n=5)

Unnamed: 0,user_id,movie_id,rating,timestamp,gender,age,occupation,zip_code,title,genres,age_group
0,1,1193,5,978300760,F,1,10,48067,One Flew Over the Cuckoo's Nest (1975),Drama,Under 18
1,1,661,3,978302109,F,1,10,48067,James and the Giant Peach (1996),Animation|Children's|Musical,Under 18
2,1,914,3,978301968,F,1,10,48067,My Fair Lady (1964),Musical|Romance,Under 18
3,1,3408,4,978300275,F,1,10,48067,Erin Brockovich (2000),Drama,Under 18
4,1,2355,5,978824291,F,1,10,48067,"Bug's Life, A (1998)",Animation|Children's|Comedy,Under 18


# CSV 파일로 내보내기

In [23]:

# Prerequisites: df must be the merged ratings + users + movies frame, and it
# must contain the 'age', 'rating' and 'genres' columns.

# Age code -> age-bracket label.
age_map = {
    1: 'Under 18',
    18: '18-24',
    25: '25-34',
    35: '35-44',
    45: '45-49',
    50: '50-55',
    56: '56+',
}
df['age_group'] = df['age'].map(age_map)

# Split the pipe-delimited genre string (e.g. "Animation|Children's") and
# expand it so that every row carries exactly one genre. df itself is left
# untouched; the work happens on a copy.
df_exploded = df.assign(genres=df['genres'].str.split('|')).explode('genres')

# Number of ratings and mean rating for every (age group, genre) pair.
ratings_by_group_genre = df_exploded.groupby(['age_group', 'genres'])['rating']
genre_summary = ratings_by_group_genre.agg(
    watch_count='count',
    avg_rating='mean',
).reset_index()

# Write the summary to CSV without the index column.
genre_summary.to_csv("연령대별_장르_선호_및_평점.csv", index=False)

# Optional: print a few rows as a sanity check.
print(genre_summary.head())


  age_group      genres  watch_count  avg_rating
0     18-24      Action        50186    3.447097
1     18-24   Adventure        26324    3.408525
2     18-24   Animation        10269    3.624014
3     18-24  Children's        16924    3.294257
4     18-24      Comedy        69980    3.460417
