In [2]:
import requests
from bs4 import BeautifulSoup
import re
import os
import pandas as pd
import json
import time

In [3]:
page = 1
url_tmpl = 'http://www.kinopoisk.ru/top/navigator/m_act[num_vote]/100/m_act[rating]/1:/order/rating/page/%d/#results'
url = url_tmpl % page

In [4]:
# initialize session
s = requests.Session() 
s.headers.update({
        'Referer': 'http://www.kinopoisk.ru',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:45.0) Gecko/20100101 Firefox/45.0'
        #'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.110 Safari/537.36'
    })

pattern = re.compile('.*/(\d+)/?$') # pattern to get movie_id from link

In [4]:
! mkdir all_movies_list
! ls

mkdir: all_movies_list: File exists
[34mall_movies_data_thread[m[m       loading_movies_kp_data.ipynb
[34mall_movies_list[m[m              [34muser_data[m[m
all_movies_list.csv


# Список фильмов
## Загружаем список всех фильмов 
Будем брать все фильмы у которых есть хотя бы 100 оценок.

In [22]:
# loading list of all movies with at least 100 votes
page = 1
while True:
    url = url_tmpl % page
    r = s.get(url)
    soup = BeautifulSoup(r.text)
    if soup.find(id = 'itemList'):
        with open('./all_movies_list/page_%d.html' % page, 'w') as output_file:
            output_file.write(r.text.encode('cp1251'))
            page += 1
    else:
        break

## Парсим выгруженные данные

In [10]:
movies_list = {}

for filename in os.listdir('./all_movies_list/'):
    if filename.find('html') == -1:
        continue
    filepath = './all_movies_list/' + filename
    
    with open(filepath) as input_file:
        text = input_file.read()     
    
    soup = BeautifulSoup(text)
    items = soup.find(id = 'itemList').find_all('div', {'class': "item _NO_HIGHLIGHT_"})
    
    for item in items:
        name_node = item.find('div', {'class': 'name'})
        name_link_node = name_node.find('a')
        name = name_link_node.text
        link = name_link_node.get('href')
        movie_id = pattern.match(link).groups()[0]
        
        movies_list[movie_id] = name

In [14]:
movies_list_df = pd.DataFrame.from_dict(movies_list, orient = 'index').reset_index()
movies_list_df.columns = ['movie_id', 'name']
movies_list_df.to_csv('all_movies_list.csv', encoding = 'utf-8')

# Загружаем детальную информацию по фильмам

In [26]:
def load_movie_data(movie_id, session):
    url = 'http://www.kinopoisk.ru/film/%s/' % (movie_id)
    #print url
    request = session.get(url)
    return request.text

In [27]:
# ! rm -r ./all_movies_data_thread/
# ! mkdir all_movies_data_thread

In [28]:
loaded_movies = set(map(lambda x: x.replace('.html', ''), list(os.listdir('./all_movies_data_thread'))))
len(loaded_movies)

36419

In [29]:
movies_list_df = pd.read_csv('all_movies_list.csv')
all_movies = set(map(str, movies_list_df['movie_id'].values.tolist()))
len(all_movies)

36418

In [30]:
movies_to_load = list(all_movies - loaded_movies)
len(movies_to_load)

0

In [23]:
N = 7
kvant = len(movies_to_load)/N
movies_chunks = []
for i in range(N):
    if i != N-1:
        movies_chunks.append(movies_to_load[i*kvant:(i+1)*kvant])
    else:
        movies_chunks.append(movies_to_load[i*kvant:])

In [24]:
import threading

class myThread (threading.Thread):
    def __init__(self, movies):
        threading.Thread.__init__(self)
        self.movies = movies
    
    def run(self):
        global loaded_dict
        print 'Starting thread...', len(self.movies)
        for movie_id in self.movies:
            time.sleep(0.1)
            tmp = load_movie_data(movie_id, s)
            if tmp.find('www.google.com/recaptcha/') != -1:
                print 'Looks like CAPTCHA'
                break
            with open('./all_movies_data_thread/%s.html' % (movie_id), 'w') as output_file:
                output_file.write(tmp.encode('utf-8'))

In [25]:
for i in range(N):
    thread = myThread(movies_chunks[i])
    thread.start()

Starting thread... 1
Starting thread... 1
Starting thread... 1
Starting thread... 1
Starting thread... 1
Starting thread... 1
Starting thread... 1


# Распарсим данные о фильмах

In [51]:
! mkdir all_movies_data_parsed

In [101]:
loaded_movies = map(lambda x: x.replace('.html', ''), list(os.listdir('./all_movies_data_thread/')))
parsed_movies = map(lambda x: x.replace('.csv', ''), list(os.listdir('./all_movies_data_parsed/')))

parsed_movies.remove('.DS_Store')
loaded_movies.remove('.DS_Store')

loaded_movies = set(loaded_movies)
parsed_movies = set(parsed_movies)

len(loaded_movies), len(parsed_movies)

(36417, 36416)

In [102]:
movies_to_parse = loaded_movies - parsed_movies
len(movies_to_parse)

1

In [106]:
def parse_movie_datafile(filename):
    with open('./all_movies_data_thread/' + filename) as input_file:
        text = input_file.read().decode('utf-8')
        
    movie_id = filename.replace('.html', '')
    
    soup = BeautifulSoup(text)
    
    name_eng = soup.find('span', {'itemprop': 'alternativeHeadline'}).text
    name_rus = soup.find('h1', {'class': 'moviename-big'}).text
    
    kp_rating = None
    if soup.find('span', {'class': 'rating_ball'}):
        kp_rating = float(soup.find('span', {'class': 'rating_ball'}).text)
    
    critics_rating = None
    critics_rating_block = soup.find('div', {'class': 'criticsRating'})
    if critics_rating_block:
        critics_rating_num_block = critics_rating_block.find('div', {'class': 'ratingNum'})
        if critics_rating_num_block:
            critics_rating = critics_rating_num_block.find('span').text
      
    imdb_rating = None
    if soup.find('div', {'class': 'block_2'}):
        imdb_block_text = soup.find('div', {'class': 'block_2'}).find(text = re.compile('^IMDb'))
        if imdb_block_text:
            imdb_rating = imdb_block_text.split()[1]
    
    info_table = soup.find('table', {'class': 'info'})
    
    movie_year = info_table.find('td', {'class': 'type'}, text = u'год').nextSibling.nextSibling.text[:5]
    movie_duration = info_table.find('td', {'class': 'type'}, text = u'время').nextSibling.text.split()[0]
    
    genres = None
    if info_table.find('span', {'itemprop': 'genre'}):
        genres = map(lambda x: x.text, info_table.find('span', {'itemprop': 'genre'}).find_all('a'))
    
    countries = map(lambda x: x.text, info_table.find('td', {'class': 'type'}, text = u'страна').nextSibling.nextSibling.find_all('a'))
    
    movie = {
    'movie_id': movie_id,
    'name_eng': name_eng,
    'name_rus': name_rus,
    'kp_rating': kp_rating,
    'critics_rating': critics_rating,
    'imdb_rating': imdb_rating,
    'movie_year': movie_year,
    'movie_duration': movie_duration,
    'genres': genres,
    'countries': countries
    }
    
    return movie


In [107]:
failed_movie_ids = []

for movie_id in movies_to_parse:
    filename = movie_id + '.html'
    movie = parse_movie_datafile(filename)
    with open('./all_movies_data_parsed/%s.csv' % movie_id, 'w') as output_file:
        output_file.write(pd.DataFrame.from_dict(movie, orient = 'index').T.to_csv(encoding = 'utf-8', index = False))

## Соберем распарсенные данные

In [111]:
tmp_dfs = []

for filename in os.listdir('./all_movies_data_parsed/'):
    if filename == '.DS_Store':
        continue
    tmp_dfs.append(pd.read_csv('./all_movies_data_parsed/' + filename))

In [112]:
df = pd.concat(tmp_dfs)

In [116]:
df.to_csv('kinopoisk_data.csv')