## import package

In [3]:
import requests
import pandas as pd
import numpy as np
import os
import time
import datetime
import json

## 2004~2016 영화데이터 수집

영화진흥위원회의 연도별 박스오피스 목록 CSV 파일들을 불러와 하나의 DataFrame으로 합칩니다.
누적 관객수가 10,000명을 초과하는 영화만 남기고, total 행(index)을 drop합니다.
그 후 영화명이 중복된 데이터를 삭제하여 movie_df에 저장합니다.

In [21]:
# Collect the yearly box-office CSV exports from the current working directory.
# "boxoffice.csv" itself is the final output this notebook writes, and it also
# matches the suffix, so it is excluded to keep a re-run from reading its own
# output back in as input.
# NOTE: os.listdir() returns names in arbitrary order, so the row order of the
# combined data depends on the file system.
csv_list = [
    file
    for file in os.listdir()
    if file.endswith("boxoffice.csv") and file != "boxoffice.csv"
]

In [22]:
# Read every collected CSV and stack them into one DataFrame with a fresh
# 0..n-1 index. Concatenating once avoids re-copying the accumulated frame for
# each additional file, and an empty file list fails here with a clear message
# instead of leaving `df` undefined for the cells that follow.
if not csv_list:
    raise FileNotFoundError(
        "no '*boxoffice.csv' files found in the current directory"
    )
df = pd.concat([pd.read_csv(data) for data in csv_list], ignore_index=True)

In [23]:
# Strip the thousands separators so the audience count parses as an integer.
df["관객수"] = df["관객수"].str.replace(",","").astype(int)

# Keep only rows whose audience count is strictly greater than 10000.
movie_df = df.loc[df["관객수"] > 10000].reset_index(drop=True)

# NOTE(review): position 3191 is hard-coded; presumably it is the "total" row
# mentioned in the notebook text. It depends on the input files and their load
# order -- verify before re-running on different data.
movie_df = (
    movie_df.drop(movie_df.index[[3191]])
    .drop_duplicates("영화명")  # keep the first row for each movie title
    .reset_index(drop=True)
)

### movie_df to csv

In [25]:
# Persist the filtered movie list; with pandas' defaults the DataFrame index is
# written as the first (unnamed) column.
movie_df.to_csv(path_or_buf="movie.csv", encoding="utf-8")

## 영화 기본 정보 수집

영화진흥위원회 오픈 API를 사용합니다.

영화 제목으로 검색하여 1차적으로 영화코드, 영화제목, 감독, 제작연도, 개봉일, 상영타입, 제작국가, 대표장르, 배급사 정보를 수집합니다.

In [26]:
# Titles used as the search keys for the movie-list API lookups.
movieNm = movie_df.loc[:, "영화명"]

In [29]:
def get_movie_data(movieNm, timeout=10):
    """Query the KOBIS ``searchMovieList`` endpoint by title and return its JSON.

    Args:
        movieNm: Movie title sent as the ``movieNm`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole crawl forever.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: On connection failure, timeout, or an HTTP
            error status.
        ValueError: If the response body is not valid JSON.
    """
    url = "http://www.kobis.or.kr/kobisopenapi/webservice/rest/movie/searchMovieList.json"
    # NOTE(review): the API key is hard-coded in the source; consider loading
    # it from configuration so it is not published with the notebook.
    params = {"key":"74ab5419e95d3e66cd760d6f498dc423", "movieNm":movieNm}
    r = requests.get(url, params=params, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    r.raise_for_status()
    return r.json()

In [30]:
def make_movie_df(movieNm, fetch=None):
    """Build a table of basic movie information for the given titles.

    Every title is looked up through *fetch*, and each entry of the returned
    ``movieListResult.movieList`` becomes one row. Only the first director and
    the first company of an entry are kept; when an entry lists none, the cell
    is an empty string rather than a value left over from the previous entry.

    Args:
        movieNm: Iterable of movie titles to search for.
        fetch: Callable taking a title and returning the decoded API response.
            Defaults to ``get_movie_data``.

    Returns:
        A DataFrame with the columns ``movieCd``, ``movieNm``, ``director``,
        ``prdtYear``, ``openDt``, ``typeNm``, ``repNationNm``, ``repGenreNm``
        and ``companyNm``.

    Titles whose lookup or parsing fails are printed and skipped. Rows that
    were already collected for that title before the failure are kept.
    """
    if fetch is None:
        fetch = get_movie_data
    columns = [
        "movieCd",
        "movieNm",
        "director",
        "prdtYear",
        "openDt",
        "typeNm",
        "repNationNm",
        "repGenreNm",
        "companyNm",
    ]
    # Collect plain rows and build the DataFrame once at the end; growing a
    # DataFrame one `.loc` assignment at a time is slow.
    rows = []
    for title in movieNm:
        try:
            for data in fetch(title)["movieListResult"]["movieList"]:
                directors = data["directors"]
                companies = data["companys"]
                rows.append([
                    data["movieCd"],
                    data["movieNm"],
                    directors[0]["peopleNm"] if directors else "",
                    data["prdtYear"],
                    data["openDt"],
                    data["typeNm"],
                    data["repNationNm"],
                    data["repGenreNm"],
                    companies[0]["companyNm"] if companies else "",
                ])
        except Exception:
            # Best-effort crawl: report the failing title and move on.
            # `Exception` (not a bare except) keeps Ctrl-C working.
            print(title)
    return pd.DataFrame(rows, columns=columns)

In [31]:
# Look up every title; titles whose lookup fails are printed below the cell.
movie_info_df = make_movie_df(movieNm=movieNm)

주홍글씨
스텔스
사하라
황후화
도쿄!
그들 각자의 영화관
시네마 천국


In [33]:
# Drop rows whose title contains "시네마정동" or whose nation is "기타".
title_excluded = movie_info_df["movieNm"].str.contains("시네마정동")
nation_excluded = movie_info_df["repNationNm"] == "기타"
movie_info_df = movie_info_df[~title_excluded & ~nation_excluded]

# De-duplicate first by movie code, then by title (the first occurrence wins).
movie_info_df = movie_info_df.drop_duplicates("movieCd").drop_duplicates("movieNm")

# Finally discard entries without a release date and renumber the rows.
has_open_date = movie_info_df["openDt"] != ""
movie_info_df = movie_info_df[has_open_date].reset_index(drop=True)

### movie_info_df to csv

In [42]:
# Persist the cleaned basic-information table; with pandas' defaults the
# DataFrame index is written as the first (unnamed) column.
movie_info_df.to_csv(path_or_buf="movie_info.csv", encoding="utf-8")

## 영화 상세 정보 수집

영화진흥위원회 오픈 API를 사용합니다.

영화 코드를 이용하여 영화제목, 상영시간, 관람등급, 배우, 배급사 정보를 추출합니다.

In [53]:
# First 3000 movie codes (positional slice).
movieCd = movie_info_df["movieCd"].iloc[:3000]

In [54]:
# Remaining movie codes, from position 3000 onward.
movieCd1 = movie_info_df["movieCd"].iloc[3000:]

In [63]:
def get_movie_detail(movieCd, timeout=10):
    """Query the KOBIS ``searchMovieInfo`` endpoint by code and return its JSON.

    Args:
        movieCd: Movie code sent as the ``movieCd`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the whole crawl forever.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.RequestException: On connection failure, timeout, or an HTTP
            error status.
        ValueError: If the response body is not valid JSON.
    """
    url = "http://www.kobis.or.kr/kobisopenapi/webservice/rest/movie/searchMovieInfo.json"
    # NOTE(review): the API key is hard-coded in the source; consider loading
    # it from configuration so it is not published with the notebook.
    params = {"key":"d7aeec3963622626f334411333e0121e", "movieCd":movieCd}
    r = requests.get(url, params=params, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
    r.raise_for_status()
    return r.json()

In [64]:
def make_movie_detail_df(movie_info_df, fetch=None):
    """Build a table of detailed movie information for the given movie codes.

    Each code is looked up through *fetch* and ``movieInfoResult.movieInfo``
    becomes one row. The first audit's grade, the first company and up to
    three actors are kept; missing values are stored as empty strings.

    Args:
        movie_info_df: The movie codes to look up. Either an iterable of codes
            (for example a Series) or a DataFrame with a ``movieCd`` column.
            The codes are taken from this argument, not from any module-level
            variable.
        fetch: Callable taking a movie code and returning the decoded API
            response. Defaults to ``get_movie_detail``.

    Returns:
        A DataFrame with the columns ``movieCd``, ``movieNm``, ``showTm``,
        ``watchGradeNm``, ``actor_1``, ``actor_2``, ``actor_3`` and
        ``companyNm``.

    Codes whose lookup or parsing fails are printed and skipped.
    """
    if fetch is None:
        fetch = get_movie_detail
    if isinstance(movie_info_df, pd.DataFrame):
        codes = movie_info_df["movieCd"]
    else:
        codes = movie_info_df
    columns = [
        "movieCd",
        "movieNm",
        "showTm",
        "watchGradeNm",
        "actor_1",
        "actor_2",
        "actor_3",
        "companyNm",
    ]
    # Collect plain rows and build the DataFrame once at the end; growing a
    # DataFrame one `.loc` assignment at a time is slow.
    rows = []
    for code in codes:
        try:
            data = fetch(code)["movieInfoResult"]["movieInfo"]
            audits = data["audits"]
            companies = data["companys"]
            # Keep at most three actor names and pad with "" up to three.
            actor_list = [actor["peopleNm"] for actor in data["actors"][:3]]
            actor_list += [""] * (3 - len(actor_list))
            rows.append([
                data["movieCd"],
                data["movieNm"],
                data["showTm"],
                audits[0]["watchGradeNm"] if audits else "",
                *actor_list,
                companies[0]["companyNm"] if companies else "",
            ])
        except Exception:
            # Best-effort crawl: report the failing code and move on.
            # `Exception` (not a bare except) keeps Ctrl-C working.
            print(code)
    return pd.DataFrame(rows, columns=columns)

In [57]:
# Call the detail builder once per code slice, then stack the two results
# into a single table with a fresh 0..n-1 index.
movie_detail_df = make_movie_detail_df(movieCd)
movie_detail1_df = make_movie_detail_df(movieCd1)
movie_detail = pd.concat(
    [movie_detail_df, movie_detail1_df],
    ignore_index=True,
)

In [80]:
# Inner-join the basic and detailed tables on the movie code. Columns present
# in both tables get the "_x" (basic) / "_y" (detail) suffixes: the title is
# taken from the basic table and the company from the detail table, and both
# are renamed back to their plain names.
boxoffice_df = (
    pd.merge(movie_info_df, movie_detail, on="movieCd")
    .loc[
        :,
        [
            "movieCd",
            "movieNm_x",
            "director",
            "openDt",
            "prdtYear",
            "repNationNm",
            "repGenreNm",
            "showTm",
            "watchGradeNm",
            "actor_1",
            "actor_2",
            "actor_3",
            "companyNm_y",
        ],
    ]
    .rename(columns={"movieNm_x": "movieNm", "companyNm_y": "companyNm"})
)

### boxoffice_df to csv

In [84]:
# Persist the merged table; with pandas' defaults the DataFrame index is
# written as the first (unnamed) column.
# NOTE(review): this file name also ends with "boxoffice.csv", the suffix used
# to select the input files at the top of the notebook.
boxoffice_df.to_csv(path_or_buf="boxoffice.csv", encoding="utf-8")