## TASK: IMDb scraping (deadline: 3 martie ora 23:59)

The first code block contains all imports and global variables.

In [203]:
import bs4
import pandas as pd
import requests
import json
import re
from tqdm import tqdm

# Root address of the IMDb site; the other URLs are built on top of it
base_url = 'https://www.imdb.com/'
# Address of the top chart page
url_top = f'{base_url}chart/top/'
# Request header asking for English (en-US) content
headers = {
    "Accept-Language": "en-US,en;q=0.5",
}
# Must be True to request more than the first page of reviews
# (the first page holds only the first 25 reviews)
LOAD_MORE = True
# Number of reviews collected per movie (50 means a single "load more" request)
TOTAL_LOADED = 50
# When True only the reviews of the first 10 movies are loaded
FAST_RUN = True

def parse_html(html):
    """
    Parses raw markup with the 'html.parser' backend
    :param html: markup to parse (the callers pass the raw response content)
    :return:     the resulting bs4.BeautifulSoup object
    """
    soup = bs4.BeautifulSoup(html, features='html.parser')
    return soup


1. Pornind de la lista cu cele mai populare 250 de filme de pe IMDb ([https://www.imdb.com/chart/top/](https://www.imdb.com/chart/top/)), identificati pentru toate aceste filme link-ul catre pagina sa de recenzii.

Exemplu: aici se gaseste pagina cu recenzii pentru "The Shawshank Redemption": [https://www.imdb.com/title/tt0111161/reviews](https://www.imdb.com/title/tt0111161/reviews)


In [204]:

def get_titles(url):
    """
    Prints the link found in every title cell of the chart table
    :param url: address of the chart page to download
    :return:    None, the links are only printed
    """
    response = requests.get(url, headers=headers)
    page = parse_html(response.content)
    title_cells = page.select("table.chart.full-width tbody.lister-list tr td.titleColumn")
    for cell in title_cells:
        anchor = cell.select_one("a")
        print(anchor["href"])


def get_data(url):
    """
    Downloads the chart page and collects one row of data per listed movie.

    Table rows in which the poster image or the title link cannot be found
    are skipped.

    :param url: address of the chart page (e.g. the top 250 list)
    :return:    pd.DataFrame with the columns rank, title, year, rating,
                ratings_number, url_title and url_image; the values are kept
                as the strings extracted from the page
    """
    html = requests.get(url, headers=headers).content
    soup = parse_html(html)
    rows = soup.select("table.chart.full-width tbody.lister-list tr")

    # Compiled once instead of once per row. Raw string, so the backslashes
    # reach the regex engine untouched. Matches an optionally signed number
    # that may contain '.' or ',' separators, e.g. "9.2" or "2,545,876".
    number_pattern = re.compile(r"[+-]?((\d+[\.\,])+)?\d+")

    data = {
        "rank": [],
        "title": [],
        "year": [],
        "rating": [],
        "ratings_number": [],
        "url_title": [],
        "url_image": [],
    }

    for row in rows:
        columns = row.select("td")

        # getting urls
        try:
            url_image = columns[0].select_one("a").select_one("img")["src"]
            url_title = columns[1].select_one("a")["href"]
        except (AttributeError, IndexError, KeyError, TypeError):
            # a cell, anchor, image or attribute is missing: not a movie row
            continue

        # getting rank, title and year
        details = list(columns[1].stripped_strings)
        rank = details[0].replace('.', '')
        title = details[1]
        year = details[2].replace('(', '').replace(')', '')

        # getting rating and number ratings: both are read from the title
        # attribute of the <strong> element, first the score, then the count
        rating_data = columns[2].select_one("strong")['title']
        rating_numbers = [found.group() for found in number_pattern.finditer(rating_data)]
        rating_score = rating_numbers[0]
        ratings_number = rating_numbers[1]

        # putting all data in dictionary
        data["rank"].append(rank)
        data["title"].append(title)
        data["year"].append(year)
        data["rating"].append(rating_score)
        data["ratings_number"].append(ratings_number)
        data["url_title"].append(url_title)
        data["url_image"].append(url_image)
    return pd.DataFrame(data)

# pd_data = get_data(url_top)
# pd_data.describe()
# pd_data.head()

2. Pentru fiecare film colectati date despre recenziile sale (titlu, text, rating, data, utilizator, etc.)

In [206]:
def extract_text_from_element(element:bs4.BeautifulSoup):
    """
    Returns the text of an element with the surrounding whitespace removed
    :param element: -- parsed element, or None when nothing was found
    :return:        -- the stripped text, or None when element is None
    """
    if element is None:
        return None
    return element.text.strip()

def extract_review_data(review_data: bs4.BeautifulSoup):
    """
    This gets the whole div.lister data and extracts the next load_more key and all the current reviews

    Missing pieces no longer raise: a review without a title, rating, text,
    user or date gets None in that column, and a missing container or a
    missing load-more marker yields None as the key.

    :param review_data: -- soup type object that has the div.lister data (may be None)
    :return:            -- returns review_data(pd.DataFrame) and data_key
                           (None when there is no load-more key)
    """
    data = {
        "title": [],
        "text": [],
        "rating": [],
        "user": [],
        "date": []
    }

    # nothing to parse, e.g. the caller's lookup of the container found nothing
    if review_data is None:
        return pd.DataFrame(data), None

    # The marker (or its data-key attribute) may be absent -- presumably on the
    # last page of reviews -- so it must not be subscripted blindly.
    load_more_element = review_data.select_one("div.load-more-data")
    data_key = None
    if load_more_element is not None:
        data_key = load_more_element.get('data-key')

    data_review = review_data.select("div.lister-list div.review-container div.lister-item-content")

    for review in data_review:
        # getting the rating
        # NOTE(review): class_=False looks like it is ignored by the CSS
        # selector API; kept unchanged -- confirm before removing.
        rating_element = review.select_one("div.ipl-ratings-bar span.rating-other-user-rating span", class_=False)
        rating = extract_text_from_element(rating_element)

        # getting the title
        title_element = review.select_one("a.title")
        title = extract_text_from_element(title_element)

        # getting the username and date (first and second span); either one
        # may be missing, in which case None is stored
        user_date_data = review.select("div.display-name-date > span")
        user = extract_text_from_element(user_date_data[0]) if len(user_date_data) > 0 else None
        date = extract_text_from_element(user_date_data[1]) if len(user_date_data) > 1 else None

        # getting the text
        text_element = review.select_one("div.content > div.text")
        text = extract_text_from_element(text_element)

        data["title"].append(title)
        data["text"].append(text)
        data["rating"].append(rating)
        data["user"].append(user)
        data["date"].append(date)
    return pd.DataFrame(data), data_key

def get_reviews(title_url, load_more=False, total_loaded=50):
    """
    Gets the reviews given a movie title as a panda dataframe

    When load_more is set, further pages are requested until total_loaded
    reviews were collected, no pagination key is left, or a page comes back
    without reviews; the result is then cut to total_loaded rows and its index
    is renumbered from 0.

    :param title_url:       the url used in the link (e.g. '/title/tt0111161/')
    :param load_more:       boolean whether we load more than once reviews (basically we press the load more button)
    :param total_loaded:    number of reviews loaded in total
    :return:                returns all the reviews as panda dataframe, with an
                            extra 'movie_title' column
    """
    # base_url ends with '/' and the chart links start with '/', so plain
    # concatenation would produce 'https://www.imdb.com//title/...'.
    review_url = base_url.rstrip('/') + '/' + title_url.strip('/') + '/reviews/'
    html = requests.get(review_url, headers=headers).content
    soup = parse_html(html)

    # getting the title of the movie
    title_data = soup.select_one("section.article div.subpage_title_block")
    movie_title = title_data.select_one("div.subpage_title_block__right-column h3 a").text

    # getting the review data
    review_data = soup.select_one("section.article div.lister")
    total_data, next_key = extract_review_data(review_data)

    if load_more:
        # pages are gathered in a list and concatenated once at the end
        pages = [total_data]
        current_loaded = total_data.shape[0]
        # without a pagination key there is nothing more to request
        while current_loaded < total_loaded and next_key:
            more_reviews = review_url + "_ajax?ref_=undefined&paginationKey=" + next_key
            reviews_html = requests.get(more_reviews, headers=headers).content
            more_review_data = parse_html(reviews_html).select_one("div")
            new_data, next_key = extract_review_data(more_review_data)
            if new_data.empty:
                # an empty page would never advance the counter
                break
            pages.append(new_data)
            current_loaded += new_data.shape[0]

        # limit the total_data to correspond to the total_loaded param;
        # copy() so the column assignment below does not act on a slice
        total_data = pd.concat(pages, ignore_index=True)[: total_loaded].copy()

    total_data['movie_title'] = movie_title

    return total_data



3. Creati un dataset de recenzii, pentru fiecare recenzie stocati:
 * filmul caruia ii apartine
 * titlul recenziei
 * textul recenziei
 * ratingul
 * data
 * utilizator

 Salvati datasetul intr-un fisier JSON.


In [207]:
def get_titles_reviews(links, load_more=True, total_loaded=50, fast_run=True):
    """
    get all the movie data

    :param total_loaded: total loaded reviews for each movie
    :param links:       all links to the movies
    :param load_more:   if we want to request more, basically pressing "load more"
    :param fast_run:    reads only first 10 movies, so it runs faster
    :return:            pd.DataFrame holding the reviews of all processed
                        movies, with the index renumbered from 0; an empty
                        DataFrame when there is no link to process
    """
    # The documented fast-run size is 10 movies; the former len(links) // 25
    # only gave 10 for exactly 250 links (and 0 for fewer than 25).
    fast_run_limit = 10
    selected = links[:fast_run_limit] if fast_run else links

    # len() instead of truthiness: links may be a numpy array.
    # pd.concat raises on an empty list, so return early.
    if len(selected) == 0:
        return pd.DataFrame()

    all_data = [
        get_reviews(title_url, load_more=load_more, total_loaded=total_loaded)
        for title_url in tqdm(selected)
    ]
    return pd.concat(all_data, ignore_index=True)


# Driver for point 3: download the chart, collect the reviews and save them.

# getting all data from the chart page (url_top)
top_data = get_data(url_top)
# getting the title links (the 'url_title' column) as a numpy array
titles_url = top_data['url_title'].to_numpy()

# here if we want to, we can do a join between movie title and review movie title
# so that each review has all data about movie

# the arguments are the global params that can be changed from first codeblock
all_reviews = get_titles_reviews(
    links=titles_url,
    load_more=LOAD_MORE,
    total_loaded=TOTAL_LOADED,
    fast_run=FAST_RUN
)
# serialise the reviews as a list of records (one JSON object per review) ...
json_str = all_reviews.to_json(orient='records')
# ... and parse that string back so json.dump below can write it with
# indentation, sorted keys and non-ASCII characters left unescaped
json_result = json.loads(json_str)

# writes the dataset to reviews_data.json in the current working directory
with open('reviews_data.json', 'w', encoding='utf8') as fout:
    json.dump(json_result, fout, indent=4, sort_keys=True, ensure_ascii=False)



100%|██████████| 10/10 [00:25<00:00,  2.59s/it]


4. Pe o pagina cu recenzii putem gasi un numar mic de astfel de date. Butonul de "Load more" de la final, cand este apasat, produce un request care returneaza HTML-ul urmatoarelor recenzii. Folosind aceasta logica colectati automat pentru fiecare film un numar mai mare de recenzii.

In [208]:
# The function from point 2 (get_reviews) was changed to accept a specific
# number of reviews: its load_more param turns the extra requests on and
# total_loaded sets the amount.

# Take a look at top_data (data collected from the chart page); uncomment the
# second line to look at all_reviews instead.
top_data.head()
# all_reviews.head()


Unnamed: 0,rank,title,year,rating,ratings_number,url_title,url_image
0,1,The Shawshank Redemption,1994,9.2,2545876,/title/tt0111161/,https://m.media-amazon.com/images/M/MV5BMDFkYT...
1,2,The Godfather,1972,9.1,1751329,/title/tt0068646/,https://m.media-amazon.com/images/M/MV5BM2MyNj...
2,3,The Godfather: Part II,1974,9.0,1214561,/title/tt0071562/,https://m.media-amazon.com/images/M/MV5BMWMwMG...
3,4,The Dark Knight,2008,9.0,2496211,/title/tt0468569/,https://m.media-amazon.com/images/M/MV5BMTMxNT...
4,5,12 Angry Men,1957,8.9,752273,/title/tt0050083/,https://m.media-amazon.com/images/M/MV5BMWU4N2...
