# Naver movie 개봉 전 review Crawling

In [70]:
import re
import math
import pandas as pd
from urllib.request import urlopen
from bs4 import BeautifulSoup

class NaverMovieScraping:
    """Scraper for pre-release scores, cast and reviews on Naver Movie.

    Every method except ``URLcode`` downloads a ``movie.naver.com`` page for
    a numeric movie code and therefore performs network I/O.
    """

    def __init__(self, url):
        """Remember the movie page URL this scraper was created for."""
        self.url = url

    @staticmethod
    def _fetch_soup(url):
        """Download *url* and return the page parsed by BeautifulSoup (lxml)."""
        # The context manager closes the HTTP response as soon as the body
        # has been read instead of leaving the connection open.
        with urlopen(url) as response:
            html = response.read().decode('utf-8')
        return BeautifulSoup(html, 'lxml')

    def URLcode(self, url):
        """Return the digits following ``code=`` in *url* as a string.

        Also stores *url* on the instance. Raises ``AttributeError`` when
        *url* contains no ``code=<digits>`` part.
        """
        self.url = url
        return re.search(r'code=([0-9]+)', url).group(1)

    def ReviewSCORE(self, code):
        """Return a one-row DataFrame with the score summary of a movie.

        Columns: '보고싶어요', '글쎄요', '평점', '참여인원', '영화정보'.
        The four numeric fields fall back to ``0`` when the expected page
        elements are missing.
        """
        self.code = code
        url_base = "http://movie.naver.com/movie/bi/mi/point.nhn?code={}".format(str(code))
        soup = self._fetch_soup(url_base)

        exp_idx = [elem.text for elem in soup.findAll('span', {'class': 'exp_cnt'})]
        star_idx = [s.text for s in soup.findAll('div', {'class': 'sc_area b_star'})]
        movie_info = soup.findAll('dl', {'class': 'info_spec'})

        try:
            smile = ''.join(re.findall('[0-9]', exp_idx[0]))
            not_smile = ''.join(re.findall('[0-9]', exp_idx[1]))
            # Fixed character offsets into the star block's text.
            # NOTE(review): presumably the score and the participant count
            # in the current page layout - confirm against a live page.
            star_val = star_idx[0][17:21]
            num = star_idx[0][26:31]
        except IndexError:
            # Any missing element resets all four values together.
            smile = not_smile = star_val = num = 0

        # Flatten each info block to a single line of text. Nothing here can
        # raise ValueError, so no exception handling is needed.
        text = [
            info.get_text().replace('\n', '').replace('\t', '').replace('\r', '')
            for info in movie_info
        ]

        return pd.DataFrame({'보고싶어요': [smile],
                             '글쎄요': [not_smile],
                             '평점': [star_val],
                             '참여인원': [num],
                             '영화정보': [text]})

    def getActors(self, code):
        """Return a one-row DataFrame with cast names, roles and director.

        The last name found on the detail page goes into '감독' and all the
        preceding ones into '배우'. When the page lists no names at all,
        '감독' is ``None`` instead of raising ``IndexError``.
        """
        self.code = code
        url_role = "http://movie.naver.com/movie/bi/mi/detail.nhn?code={}".format(str(code))
        soup = self._fetch_soup(url_role)

        names = [act.text for act in soup.findAll('a', {'class': 'k_name'})]
        role = [r.text.replace('\n', '') for r in soup.findAll('p', {'class': 'in_prt'})]
        director = names[-1] if names else None

        return pd.DataFrame({'배우': [names[:-1]],
                             '역할': [role],
                             '감독': [director]})

    def review_page(self, code):
        """Return the number of pre-release review pages (10 reviews each).

        Returns ``0`` when the page carries no review count to read.
        """
        self.code = code
        # The pre-release reviews live under a separate address (type=before).
        url_base = "http://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code={}&type=before".format(str(code))
        soup = self._fetch_soup(url_base)

        score_total = soup.findAll('div', {'class': 'score_total'})
        if not score_total:
            return 0
        counters = score_total[0].findChildren('em')
        if len(counters) < 2:
            return 0

        # Counts of 1000 and above contain ',' separators; strip them.
        review_count = int(counters[1].get_text().replace(",", ""))
        return math.ceil(review_count / 10)

    def getReview(self, code, page_num):
        """Collect the pre-release reviews from pages 1..*page_num*.

        *page_num* may be an int or a numeric string; zero or a negative
        value yields an empty result without any request. Returns a
        DataFrame with columns 'score', 'reple', 'good' and 'bad'.
        """
        self.page_num = int(page_num)  # total number of review pages
        self.code = code

        reple_list = []
        score_list = []
        good_list = []
        bad_list = []

        # Loop on the converted count so string input works and a negative
        # count cannot spin forever.
        for page in range(1, self.page_num + 1):
            url_base = "http://movie.naver.com/movie/bi/mi/pointWriteFormList.nhn?code={}&type=before&page={}".format(str(code), str(page))
            soup = self._fetch_soup(url_base)

            score_result = soup.find('div', {'class': 'score_result'})
            if score_result is None:
                # No review container on this page: nothing to collect.
                continue

            for li in score_result.find_all('li'):
                reple = li.find('div', {'class': 'score_reple'}).find('p').get_text()

                # Entries without a star rating are recorded as 0; a missing
                # tag makes find() return None, hence AttributeError.
                try:
                    score = li.find('div', {'class': 'star_score'}).find('em').get_text()
                except AttributeError:
                    score = 0

                # Look the button area up once and read both counters.
                buttons = li.find('div', {'class': 'btn_area'}).get_text().split('\n')
                good = buttons[1]
                bad = buttons[2]

                reple_list.append(reple)
                score_list.append(score)
                good_list.append(''.join([s for s in str(good) if s.isdigit()]))
                bad_list.append(''.join([s for s in str(bad) if s.isdigit()]))

        return pd.DataFrame({'score': score_list, 'reple': reple_list,
                             'good': good_list, 'bad': bad_list})

### Naver movie review

In [121]:
import os
# List the current working directory; the result is not stored or used.
os.listdir(os.getcwd())
# Load the movie list from url_list.csv, decoding the file as cp949.
# Later cells read its 'url', 'title' and 'Y (손익분기점)' columns.
movie_df = pd.read_csv('url_list.csv', encoding="cp949")

In [122]:
def movie_score(movie_df):
    """Scrape the score summary of every movie listed in *movie_df*.

    Args:
        movie_df: DataFrame with a 'url' column of Naver movie page URLs.

    Returns:
        A DataFrame with one ``ReviewSCORE`` row per URL, in input order and
        re-indexed from 0; an empty DataFrame when there are no URLs.
    """
    frames = []

    for i, url in enumerate(movie_df['url']):
        print('url 을 긁었습니다.', 'pos :', i)
        crawler = NaverMovieScraping(url)
        frames.append(crawler.ReviewSCORE(crawler.URLcode(url)))

    # Concatenate once at the end: growing a DataFrame inside the loop
    # copies every previously collected row on each iteration.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [123]:
def movie_actor(movie_df):
    """Scrape cast, roles and director of every movie listed in *movie_df*.

    Args:
        movie_df: DataFrame with a 'url' column of Naver movie page URLs.

    Returns:
        A DataFrame with one ``getActors`` row per URL, in input order and
        re-indexed from 0; an empty DataFrame when there are no URLs.
    """
    frames = []

    for i, url in enumerate(movie_df['url']):
        print('url 을 긁었습니다.', 'pos :', i)
        crawler = NaverMovieScraping(url)
        frames.append(crawler.getActors(crawler.URLcode(url)))

    # Concatenate once at the end: growing a DataFrame inside the loop
    # copies every previously collected row on each iteration.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [124]:
# Crawl the score summary and the cast information for every URL in
# movie_df; each call downloads one page per movie.
data = movie_score(movie_df)
actor = movie_actor(movie_df)

In [125]:
# Copy the title, URL and target column from the input list onto the scraped
# scores. Assignment aligns on the index, so rows match by index label.
data["title"] = movie_df["title"]
data["url"] = movie_df["url"]
data["y"] = movie_df["Y (손익분기점)"]

In [126]:
# Join the score columns and the cast columns side by side (axis=1).
df = pd.concat([data, actor], axis=1) # movie information DataFrame

## Review 크롤링

In [116]:
def movie_review(movie_df):
    """Scrape all pre-release reviews of every movie listed in *movie_df*.

    Args:
        movie_df: DataFrame with 'url', 'title' and 'y' columns.

    Returns:
        A DataFrame holding the ``getReview`` rows of all movies, each row
        tagged with the movie title in 'id' and the movie's 'y' value;
        re-indexed from 0. Empty when *movie_df* has no rows.
    """
    frames = []

    # Walk the three columns in lockstep so each review is tagged with the
    # values of its own row, whatever the index labels of movie_df are.
    rows = zip(movie_df['url'], movie_df['title'], movie_df['y'])
    for i, (url, title, y) in enumerate(rows):
        print('url 을 긁었습니다.', 'pos :', i)
        crawler = NaverMovieScraping(url)
        code = crawler.URLcode(url)
        page = crawler.review_page(code)
        print(page)
        review = crawler.getReview(code, page)
        review["id"] = title
        review['y'] = y
        frames.append(review)

    # Concatenate once at the end: growing a DataFrame inside the loop
    # copies every previously collected row on each iteration.
    if not frames:
        return pd.DataFrame()
    return pd.concat(frames, ignore_index=True)

In [119]:
# Crawl the reviews for every movie in df, using its 'url', 'title' and 'y' columns.
review = movie_review(df) # movie review DataFrame

In [117]:
# Write the collected reviews to review.csv in the working directory.
review.to_csv("review.csv")