## Scraping from scratch

In [None]:
!pip install -q requests lxml bs4 pandas 

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the dataset and output paths used in
# this notebook all live under that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
from tqdm.auto import tqdm

# Browser-like request headers sent with the IMDb page requests.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0",
    "Accept-Encoding": "gzip, deflate",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "DNT": "1",
    "Connection": "close",
    "Upgrade-Insecure-Requests": "1",
}

# Site root that relative links found in scraped pages are appended to.
BASE_URL = 'https://imdb.com'

# Folder that receives the per-genre CSV files and the images/ tree.
# PLEASE CHANGE BASED ON YOUR OWN PATH
OUTPUT_DIR = '/content/drive/MyDrive/ITMO Master/datasets/ml-industry/task2_1/'

In [None]:
BASE_PATH_DRIVE = '/content/drive/MyDrive/ITMO Master/'

# Read through the absolute Drive path so the cell does not depend on the
# kernel's current working directory.
df_processed = pd.read_csv(BASE_PATH_DRIVE + 'datasets/ml-industry/task2/processed.csv')

# Image paths that start with './' are re-rooted at the Drive project folder.
# Only the leading prefix is rewritten (a './' later in the path is left
# alone), and non-string cells such as NaN pass through unchanged.
df_processed['img_local_path'] = df_processed['img_local_path'].apply(
    lambda x: BASE_PATH_DRIVE + x[2:] if isinstance(x, str) and x.startswith('./') else x
)
df_processed.head()

Unnamed: 0,title,simple_desc,genre,img_local_path
0,Andor,Prequel series to Star Wars' 'Rogue One'. In a...,Action,/content/drive/MyDrive/ITMO Master/datasets/ml...
1,House of the Dragon,An internal succession war within House Targar...,Action,/content/drive/MyDrive/ITMO Master/datasets/ml...
2,Game of Thrones,Nine noble families fight for control over the...,Action,/content/drive/MyDrive/ITMO Master/datasets/ml...
3,The Lord of the Rings: The Rings of Power,Epic drama set thousands of years before the e...,Action,/content/drive/MyDrive/ITMO Master/datasets/ml...
4,Titans,A team of young superheroes combat evil and ot...,Action,/content/drive/MyDrive/ITMO Master/datasets/ml...


In [None]:
# Keep one row per (title, description) pair, then show the row counts
# before and after de-duplication.
df_drop = df_processed.drop_duplicates(subset=['title', 'simple_desc'])
(df_processed.shape[0], df_drop.shape[0])

(12656, 6246)

In [None]:
# Count the de-duplicated records per genre and list the rarest genres first.
df_sort = (
    df_drop
    .groupby('genre', as_index=False)
    .count()
    .sort_values(by='title', ignore_index=True)
)
df_sort.head()

Unnamed: 0,genre,title,simple_desc,img_local_path
0,Film Noir,7,7,7
1,Thriller,21,21,21
2,Mystery,35,35,35
3,Sci-Fi,66,66,66
4,Fantasy,102,102,102


In [None]:
# Genres with fewer than 500 unique titles. The order follows df_sort
# (rarest first), i.e. sorted by priority to avoid data duplicates.
genres = df_sort.loc[df_sort['title'] < 500, 'genre'].tolist()
genres

['Film Noir',
 'Thriller',
 'Mystery',
 'Sci-Fi',
 'Fantasy',
 'Drama',
 'Romance',
 'Family',
 'Adventure',
 'History',
 'Animation',
 'Horror',
 'Crime',
 'War',
 'Comedy',
 'Western',
 'Sport',
 'Musical',
 'Music',
 'Documentary']

In [None]:
# Genre names to scrape; each one is substituted into the search URL below.
genres = [
    "Action", "Adventure", "Animation", "Biography",
    "Comedy", "Crime", "Documentary", "Drama",
    "Family", "Fantasy", "Film Noir", "History",
    "Horror", "Music", "Musical", "Mystery",
    "Romance", "Sci-Fi", "Short Film", "Sport",
    "Superhero", "Thriller", "War", "Western",
]

# Search URL template for TV series / mini series; '{}' receives the genre name.
url = "https://www.imdb.com/search/title/?genres={}&title_type=tv_series,mini_series&explore=genres&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=b4e1d6fb-9821-4c7d-ad14-31ed10854442&pf_rd_r=HJNGZFN3WH8KD7HHVR4N&pf_rd_s=center-7&pf_rd_t=15051&pf_rd_i=genre&ref_=ft_gnr_tvpop_3"
# Alternative template (feature films sorted by user rating), kept for reference:
# url = "https://www.imdb.com/search/title/?genres={}&sort=user_rating,desc&title_type=feature&num_votes=25000,&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=5aab685f-35eb-40f3-95f7-c53f09d542c3&pf_rd_r=N97GEQS6R7J9EV7V770D&pf_rd_s=right-6&pf_rd_t=15506&pf_rd_i=top&ref_=chttp_gnr_16"

# Map every genre name to its formatted listing URL, preserving list order.
url_dict = {name: url.format(name) for name in genres}

print(url_dict)

{'Action': 'https://www.imdb.com/search/title/?genres=Action&title_type=tv_series,mini_series&explore=genres&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=b4e1d6fb-9821-4c7d-ad14-31ed10854442&pf_rd_r=HJNGZFN3WH8KD7HHVR4N&pf_rd_s=center-7&pf_rd_t=15051&pf_rd_i=genre&ref_=ft_gnr_tvpop_3', 'Adventure': 'https://www.imdb.com/search/title/?genres=Adventure&title_type=tv_series,mini_series&explore=genres&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=b4e1d6fb-9821-4c7d-ad14-31ed10854442&pf_rd_r=HJNGZFN3WH8KD7HHVR4N&pf_rd_s=center-7&pf_rd_t=15051&pf_rd_i=genre&ref_=ft_gnr_tvpop_3', 'Animation': 'https://www.imdb.com/search/title/?genres=Animation&title_type=tv_series,mini_series&explore=genres&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=b4e1d6fb-9821-4c7d-ad14-31ed10854442&pf_rd_r=HJNGZFN3WH8KD7HHVR4N&pf_rd_s=center-7&pf_rd_t=15051&pf_rd_i=genre&ref_=ft_gnr_tvpop_3', 'Biography': 'https://www.imdb.com/search/title/?genres=Biography&title_type=tv_series,mini_series&explore=genres&pf_rd_m=A2FGELUUNOQJNL&pf_rd_p=b4e1d6fb-9821-4c7d-ad14-31

In [None]:
def _fetch_poster(detail_url, timeout):
    """Download the poster image shown on a title's detail page.

    Args:
        detail_url: Absolute URL of the title's detail page.
        timeout: Timeout in seconds applied to each HTTP request.

    Returns:
        A ``(image_bytes, extension)`` tuple, or ``None`` when the page has no
        poster element, a request fails, or the image request does not return
        HTTP 200. ``extension`` includes the leading dot.
    """
    from urllib.parse import urlparse

    try:
        page = requests.get(detail_url, headers=HEADERS, timeout=timeout)
        detail_page = BeautifulSoup(page.content, 'lxml')

        media = detail_page.select_one('div.ipc-media')
        image = media.select_one('img.ipc-image') if media is not None else None
        img_url = image.get('src') if image is not None else None
        if not img_url:
            return None

        img_res = requests.get(img_url, timeout=timeout)
    except requests.RequestException:
        # Network problems or a malformed image URL: skip this title only.
        return None

    if img_res.status_code != 200:
        return None

    # Take the extension from the URL path instead of the last four characters,
    # so e.g. '.jpeg' or a URL with a query string is handled; fall back to
    # '.jpg' when the URL carries no extension at all.
    extension = os.path.splitext(urlparse(img_url).path)[1] or '.jpg'
    return img_res.content, extension


def get_movies(url, file_name, amount_to_collect=200, df=None, timeout=30):
    """Scrape one paginated IMDb listing, download posters and save a CSV.

    Starting at ``url`` the listing pages are followed through their "next"
    link. For each entry the title, short description and genre labels are
    read from the listing, the poster is fetched from the entry's detail page
    and stored under ``OUTPUT_DIR/images/<genre>/``. All collected records are
    written to ``OUTPUT_DIR/<file_name>`` when scraping stops.

    Args:
        url: URL of the first listing page.
        file_name: Name of the CSV file to write; the name without its
            extension is used as the genre.
        amount_to_collect: Target number of records. The target is checked
            once per listing page, so the result can exceed it by up to one
            page of entries.
        df: Optional DataFrame of records collected so far, with at least the
            columns 'title', 'simple_desc' and 'genre'. Rows already stored
            for this genre reduce the target, and entries whose
            (title, simple_desc) pair is already present are skipped.
        timeout: Timeout in seconds for every HTTP request.

    Returns:
        DataFrame with the columns 'title', 'simple_desc', 'genre',
        'img_local_path' and 'labels' (these columns are present even when
        nothing was collected).
    """
    columns = ['title', 'simple_desc', 'genre', 'img_local_path', 'labels']
    genre = os.path.splitext(file_name)[0]

    # (title, simple_desc) pairs that must not be collected again, and image
    # paths that are already taken.
    seen = set()
    used_image_paths = set()

    if df is not None and not df.empty:
        # get number of data that already collected for specific genre
        amount_to_collect -= int((df['genre'] == genre).sum())
        seen.update(zip(df['title'], df['simple_desc']))
        if 'img_local_path' in df.columns:
            used_image_paths.update(df['img_local_path'].dropna())

    # prepare all level directories
    image_dir = os.path.join(OUTPUT_DIR, 'images', genre)
    os.makedirs(image_dir, exist_ok=True)

    movie_list = []
    total_duplicate = 0

    while len(movie_list) < amount_to_collect:
        try:
            resp = requests.get(url, headers=HEADERS, timeout=timeout)
        except requests.RequestException as err:
            # Stop paging but still save what has been collected so far.
            print(f'[{genre.upper()}] Listing request failed: {err}')
            break
        content = BeautifulSoup(resp.content, 'lxml')

        local_duplicate = 0

        for movie in content.select('.lister-item-content'):
            header = movie.select_one('.lister-item-header > a')
            muted = movie.select('.text-muted')
            href = header.get('href') if header is not None else None

            # The description is the third '.text-muted' element; entries
            # without it (or without a title link) cannot be recorded.
            if not href or len(muted) < 3:
                continue

            # get task mandatory data ('/' would break the image file name)
            title = header.get_text().strip().replace('/', ' ')
            simple_desc = muted[2].get_text().strip()

            # early check data duplicate (against df and this run)
            key = (title, simple_desc)
            if key in seen:
                total_duplicate += 1
                local_duplicate += 1
                continue

            # Other listing fields (year, votes, runtime, rating, certificate,
            # metascore) are intentionally not scraped.
            label_tag = movie.select_one('.genre')
            if label_tag is None:
                continue
            labels = label_tag.get_text().strip()

            # get image on the detail page
            poster = _fetch_poster(BASE_URL + href, timeout)
            if poster is None:
                continue
            image_bytes, extension = poster

            # Different entries can share a title (same name, different
            # description); number the file so they do not overwrite each other.
            save_path = os.path.join(image_dir, title + extension)
            suffix = 2
            while save_path in used_image_paths:
                save_path = os.path.join(image_dir, f'{title} ({suffix}){extension}')
                suffix += 1

            try:
                with open(save_path, 'wb') as f:
                    f.write(image_bytes)
            except OSError:
                # e.g. a title that is not a valid file name: skip the entry.
                continue

            used_image_paths.add(save_path)
            seen.add(key)
            movie_list.append({
                'title': title,
                'simple_desc': simple_desc,
                'genre': genre,
                'img_local_path': save_path,
                'labels': labels,
            })

        print(f'[{genre.upper()}] Amount to collect left: {amount_to_collect - len(movie_list)}. Found {local_duplicate} duplicated movie(s) in this page.')

        next_page = content.select_one('a.lister-page-next.next-page')
        next_href = next_page.get('href') if next_page is not None else None
        if not next_href:
            print('NEXT PAGE NOT FOUND.')
            break
        url = BASE_URL + next_href

    print(f'Found {total_duplicate} duplicated movie(s)')
    print(f'Got {len(movie_list)} record(s) in {genre} genre.')

    # Passing the columns explicitly keeps the schema stable for an empty
    # result, so a later call can still filter the frame on 'genre'.
    dataframe = pd.DataFrame(movie_list, columns=columns)
    dataframe.to_csv(os.path.join(OUTPUT_DIR, file_name))

    return dataframe

In [None]:
# Scrape every genre in turn. The records gathered so far are handed to
# get_movies on each call so it can skip titles that were already collected.
df = None
for genre, url in tqdm(url_dict.items()):
    df_genre = get_movies(url, f'{genre}.csv', 250, df=df)
    df = df_genre if df is None else pd.concat([df, df_genre], ignore_index=True)
    print("Saved:", f'{genre}.csv')

# Write the combined records of all genres next to the per-genre files.
df.to_csv(OUTPUT_DIR + 'compiled.csv')

  0%|          | 0/24 [00:00<?, ?it/s]

[ACTION] Amount to collect left: 200. Found 0 duplicated movie(s) in this page.
[ACTION] Amount to collect left: 150. Found 0 duplicated movie(s) in this page.
[ACTION] Amount to collect left: 100. Found 0 duplicated movie(s) in this page.
[ACTION] Amount to collect left: 50. Found 0 duplicated movie(s) in this page.
[ACTION] Amount to collect left: 0. Found 0 duplicated movie(s) in this page.
Found 0 duplicated movie(s)
Got 250 record(s) in Action genre.
Saved: Action.csv
[ADVENTURE] Amount to collect left: 241. Found 41 duplicated movie(s) in this page.
[ADVENTURE] Amount to collect left: 235. Found 44 duplicated movie(s) in this page.
[ADVENTURE] Amount to collect left: 224. Found 39 duplicated movie(s) in this page.
[ADVENTURE] Amount to collect left: 192. Found 18 duplicated movie(s) in this page.
[ADVENTURE] Amount to collect left: 142. Found 0 duplicated movie(s) in this page.
[ADVENTURE] Amount to collect left: 92. Found 0 duplicated movie(s) in this page.
[ADVENTURE] Amount to

In [None]:
# just check the output
# Read the compiled CSV back from OUTPUT_DIR, the folder the scraping cell
# wrote it to, so this check keeps working after OUTPUT_DIR is changed.
df = pd.read_csv(OUTPUT_DIR + 'compiled.csv', index_col=0)
df.head()

Unnamed: 0,title,simple_desc,genre,img_local_path,labels
0,Andor,Prequel series to Star Wars' 'Rogue One'. In a...,Action,/content/drive/MyDrive/ITMO Master/datasets/ml...,"Action, Adventure, Drama"
1,House of the Dragon,An internal succession war within House Targar...,Action,/content/drive/MyDrive/ITMO Master/datasets/ml...,"Action, Adventure, Drama"
2,Warrior Nun,"After waking up in a morgue, an orphaned teen ...",Action,/content/drive/MyDrive/ITMO Master/datasets/ml...,"Action, Drama, Fantasy"
3,Game of Thrones,Nine noble families fight for control over the...,Action,/content/drive/MyDrive/ITMO Master/datasets/ml...,"Action, Adventure, Drama"
4,The Lord of the Rings: The Rings of Power,Epic drama set thousands of years before the e...,Action,/content/drive/MyDrive/ITMO Master/datasets/ml...,"Action, Adventure, Drama"


In [None]:
# Count the titles per genre to confirm each one has at least 250 records.
df.groupby('genre', as_index=False)['title'].count()

Unnamed: 0,genre,title
0,Action,250
1,Adventure,255
2,Animation,283
3,Biography,286
4,Comedy,260
5,Crime,261
6,Documentary,254
7,Drama,258
8,Family,291
9,Fantasy,275


In [None]:
# The same title can appear with different descriptions; such rows seem to
# be different instalments that share a name.
df.loc[df['title'] == 'Doctor Who']

Unnamed: 0,title,simple_desc,genre,img_local_path,labels
250,Doctor Who,The further adventures in time and space of th...,Adventure,/content/drive/MyDrive/ITMO Master/datasets/ml...,"Adventure, Drama, Sci-Fi"
261,Doctor Who,The adventures in time and space of the Doctor...,Adventure,/content/drive/MyDrive/ITMO Master/datasets/ml...,"Adventure, Drama, Family"


https://www.youtube.com/watch?v=Hz8Au_vX_fY