# 1. 사용 라이브러리 및 드라이브 연결

*   사용할 라이브러리 import
*   구글 드라이브 mount


In [1]:
import numpy as np
import pandas as pd
import chardet

In [2]:
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 2. 데이터 전처리

1.   사용할 데이터셋 및 연결할 데이터프레임 오픈
2.   하나의 데이터프레임으로 병합




## 1. 사용할 데이터셋 및 연결할 데이터프레임 오픈

In [151]:
# Helper for diagnosing a UnicodeDecodeError when opening a file: reports the
# encoding that chardet guesses for the file's raw bytes.
def check_encoding(file_path):
    """Detect, print, and return the likely text encoding of a file.

    Args:
        file_path: Path of the file to inspect; it is opened in binary mode.

    Returns:
        The dict produced by ``chardet.detect`` for the file's bytes. Its
        ``'encoding'`` and ``'confidence'`` entries are the ones printed.
    """
    with open(file_path, 'rb') as rawdata:
        raw_bytes = rawdata.read()
    # Detection only needs the bytes, so the file is already closed here.
    encoding_data = chardet.detect(raw_bytes)
    print(f"인코딩 타입 : {encoding_data['encoding']}")
    print(f"신뢰도 : {encoding_data['confidence']}")
    # Hand the result back so callers can reuse it (e.g. as an ``encoding=``
    # argument) instead of copying it from the printed text.
    return encoding_data

In [153]:
# Inspect the data file that will be joined in: report its detected encoding.
file_path = "/content/u.item"
check_encoding(file_path)

인코딩 타입 : utf-8
신뢰도 : 0.99


{'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}

In [156]:
# Load the pipe-delimited item file into a DataFrame. Column labels are
# supplied here and the first line of the file is read as data, not a header.
col_names = [
    # descriptive fields
    "movie_id", "movie_title", "release_date", "video_release_date", "IMDb_URL",
] + [
    # genre columns
    "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]
movie_df = pd.read_csv(file_path, sep='|', names=col_names, header=None)
movie_df.head()

Unnamed: 0,movie_id,movie_title,release_date,video_release_date,IMDb_URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [157]:
# Summarize movie_df: per-column dtypes and non-null counts.
movie_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1682 entries, 0 to 1681
Data columns (total 24 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   movie_id            1682 non-null   int64  
 1   movie_title         1682 non-null   object 
 2   release_date        1681 non-null   object 
 3   video_release_date  0 non-null      float64
 4   IMDb_URL            1679 non-null   object 
 5   unknown             1682 non-null   int64  
 6   Action              1682 non-null   int64  
 7   Adventure           1682 non-null   int64  
 8   Animation           1682 non-null   int64  
 9   Children's          1682 non-null   int64  
 10  Comedy              1682 non-null   int64  
 11  Crime               1682 non-null   int64  
 12  Documentary         1682 non-null   int64  
 13  Drama               1682 non-null   int64  
 14  Fantasy             1682 non-null   int64  
 15  Film-Noir           1682 non-null   int64  
 16  Horror

In [159]:
# Load the provided DataFrame from CSV and preview it; the file's first column
# is used as the row index.
df = pd.read_csv("/content/user_data.csv", index_col = 0)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,age,gender,occupation
0,196,242,3,881250949,49,M,writer
1,196,393,4,881251863,49,M,writer
2,196,381,4,881251728,49,M,writer
3,196,251,3,881251274,49,M,writer
4,196,655,5,881251793,49,M,writer


In [160]:
# Summarize df: per-column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 7 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   user_id     100000 non-null  int64 
 1   item_id     100000 non-null  int64 
 2   rating      100000 non-null  int64 
 3   timestamp   100000 non-null  int64 
 4   age         100000 non-null  int64 
 5   gender      100000 non-null  object
 6   occupation  100000 non-null  object
dtypes: int64(5), object(2)
memory usage: 6.1+ MB


## 2. 하나의 데이터프레임으로 병합

In [162]:
# Rename the provided DataFrame's item_id column to movie_id (modifies df in
# place), then display the result.
df.rename(columns={'item_id':'movie_id'}, inplace=True)
df

Unnamed: 0,user_id,movie_id,rating,timestamp,age,gender,occupation
0,196,242,3,881250949,49,M,writer
1,196,393,4,881251863,49,M,writer
2,196,381,4,881251728,49,M,writer
3,196,251,3,881251274,49,M,writer
4,196,655,5,881251793,49,M,writer
...,...,...,...,...,...,...,...
99995,941,919,5,875048887,20,M,student
99996,941,273,3,875049038,20,M,student
99997,941,1,5,875049144,20,M,student
99998,941,294,4,875048532,20,M,student


In [177]:
# Discard the descriptive columns that are not needed for the merge, leaving
# movie_id plus the remaining (genre) columns.
moive_id_genre = movie_df.drop(
    columns=[
        "movie_title",
        "release_date",
        "video_release_date",
        "IMDb_URL",
    ]
)
moive_id_genre

Unnamed: 0,movie_id,unknown,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
3,4,0,1,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,1678,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1678,1679,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0
1679,1680,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0
1680,1681,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


In [182]:
# Merge the two frames into a single DataFrame.
# The join key and join type are spelled out so the merge cannot silently pick
# up a different set of shared columns, and validate= makes pandas raise a
# MergeError if movie_id is duplicated in moive_id_genre (which would
# otherwise duplicate rows of df in the result).
use_df = pd.merge(
    df,
    moive_id_genre,
    on="movie_id",
    how="inner",
    validate="many_to_one",
)
use_df

Unnamed: 0,user_id,movie_id,rating,timestamp,age,gender,occupation,unknown,Action,Adventure,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,196,242,3,881250949,49,M,writer,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,305,242,5,886307828,23,M,programmer,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6,242,4,883268170,42,M,executive,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,234,242,4,891033261,60,M,retired,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,63,242,3,875747190,31,M,marketing,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99995,863,1679,3,889289491,17,M,student,0,0,0,...,0,0,0,0,0,1,0,1,0,0
99996,863,1678,1,889289570,17,M,student,0,0,0,...,0,0,0,0,0,0,0,0,0,0
99997,863,1680,2,889289570,17,M,student,0,0,0,...,0,0,0,0,0,1,0,0,0,0
99998,896,1681,3,887160722,28,M,writer,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [184]:
# Save the merged DataFrame as a CSV file.
# NOTE(review): index= is not passed, so by pandas' default the row index is
# written as an unnamed first column — confirm downstream readers expect that.
# NOTE(review): the path is relative, so the file lands in the current working
# directory — confirm that is the intended save location.
use_df.to_csv('movie_ratings.csv')