# TAGgle
A Wordle-like game based on IMDb film keywords.

The code here scrapes data from IMDb and writes it to a JSON file for use in the external app.

The only variable that needs to be changed here is `size`, which sets how many pages to scrape. Each page lists 50 films, so this lets the external app choose from several difficulty levels.


In [2]:
#import dependencies
from splinter import Browser
from bs4 import BeautifulSoup as bs
import time
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import re
import fnmatch
import random
import string
import json

In [64]:

# Number of IMDB result pages to scrape, at 50 films per page.
# Pages are requested sorted by number of votes (descending), so a larger value
# pulls in less widely rated films. This effectively sets a difficulty level:
#   1: easy, 5: medium, 20: severe
# The value is also embedded in the name of the exported JSON file.
size = 20


In [65]:
# Set up Splinter.
# ChromeDriverManager().install() resolves a chromedriver binary for the
# installed Chrome (see the log output below) and returns its path, which is
# handed to Splinter as the 'executable_path' keyword argument.
executable_path = {'executable_path': ChromeDriverManager().install()}
# headless=False runs Chrome with a visible window. `browser` is the single
# shared session used by populate_films and get_keywords.
browser = Browser('chrome', **executable_path, headless=False)



Current google-chrome version is 105.0.5195
Get LATEST chromedriver version for 105.0.5195 google-chrome
Driver [C:\Users\David\.wdm\drivers\chromedriver\win32\105.0.5195.52\chromedriver.exe] found in cache


In [66]:
# Scrape one IMDB listing page and record every film found on it.
def populate_films(url):
    """Visit *url* and append each listed film to the module-level lists.

    Titles go to ``film_names``, IMDB ids to ``film_ids`` and release-year
    labels to ``years``. Nothing is returned; the lists are extended in place
    using the shared ``browser`` session.
    """
    browser.visit(url)
    page = bs(browser.html, 'html.parser')

    # The poster image next to each film carries most of what is needed:
    # the title in its alt text and the film id in 'data-tconst'.
    for poster in page.find_all('img', class_='loadlate'):
        film_names.append(poster['alt'])
        film_ids.append(poster['data-tconst'])

    # The year is collected as well so that films sharing a name can be
    # told apart.
    year_spans = page.find_all('span', class_='lister-item-year text-muted unbold')
    years.extend(span.contents[0] for span in year_spans)

In [67]:
# Start from empty accumulators; populate_films appends to these in place.
film_names, film_ids, years = [], [], []

print(f"grabbing the names and id of the first {size*50} films, ranked by most rated")
# IMDB numbers its result pages from 1, so iterate over 1..size directly.
for page_number in range(1, size + 1):
    url = f"https://www.imdb.com/search/keyword/?ref_=kw_nxt&mode=detail&page={page_number}&sort=num_votes,desc&title_type=movie"
    populate_films(url)
    print(f"getting films from page {page_number} of {size}")
    

grabbing the names and id of the first 1000 films, ranked by most rated
getting films from page 1 of 20
getting films from page 2 of 20
getting films from page 3 of 20
getting films from page 4 of 20
getting films from page 5 of 20
getting films from page 6 of 20
getting films from page 7 of 20
getting films from page 8 of 20
getting films from page 9 of 20
getting films from page 10 of 20
getting films from page 11 of 20
getting films from page 12 of 20
getting films from page 13 of 20
getting films from page 14 of 20
getting films from page 15 of 20
getting films from page 16 of 20
getting films from page 17 of 20
getting films from page 18 of 20
getting films from page 19 of 20
getting films from page 20 of 20


In [68]:
# List every scraped film as "<rank>: <id>, <title>, <year>".
for position, title in enumerate(film_names, start=1):
    # ids and years are parallel to film_names, so look them up by position.
    print(f"{position}: {film_ids[position - 1]}, {title}, {years[position - 1]}")

1: tt0111161, The Shawshank Redemption, (1994)
2: tt0468569, The Dark Knight, (2008)
3: tt1375666, Inception, (2010)
4: tt0137523, Fight Club, (1999)
5: tt0109830, Forrest Gump, (1994)
6: tt0110912, Pulp Fiction, (1994)
7: tt0133093, The Matrix, (1999)
8: tt0120737, The Lord of the Rings: The Fellowship of the Ring, (2001)
9: tt0068646, The Godfather, (1972)
10: tt0167260, The Lord of the Rings: The Return of the King, (2003)
11: tt0816692, Interstellar, (2014)
12: tt1345836, The Dark Knight Rises, (2012)
13: tt0167261, The Lord of the Rings: The Two Towers, (2002)
14: tt0114369, Seven, (1995)
15: tt1853728, Django Unchained, (2012)
16: tt0172495, Gladiator, (2000)
17: tt0372784, Batman Begins, (2005)
18: tt0361748, Inglourious Basterds, (2009)
19: tt0102926, The Silence of the Lambs, (1991)
20: tt0993846, The Wolf of Wall Street, (2013)
21: tt0848228, Avengers Assemble, (2012)
22: tt0120815, Saving Private Ryan, (1998)
23: tt0076759, Star Wars, (1977)
24: tt0108052, Schindler's List, 

In [69]:
# Scrape the keywords from the individual keywords page of the given film id.
def get_keywords(film_id):
    """Return the list of keywords shown on the film's IMDB keywords page.

    The page at ``https://www.imdb.com/title/<film_id>/keywords`` is loaded
    through the shared ``browser`` session and parsed with BeautifulSoup.
    """
    browser.visit(f"https://www.imdb.com/title/{film_id}/keywords")
    page = bs(browser.html, 'html.parser')
    # In the keywords table each keyword lives in a 'sodatext' div; only the
    # content of the anchor inside it is wanted.
    return [cell.find('a').contents[0] for cell in page.find_all('div', class_='sodatext')]
    

In [70]:
# Code to remove punctuation, capitals and the filler words 'the' / 'and'
# from any text, so that it can be used as a forgiving lookup key.

# Built once instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)
# A filler word is removed only when it is a whole word followed by a space.
# The leading \b prevents matches inside longer words: a plain
# str.replace('and ', '') would turn 'grand budapest' into 'grbudapest'.
_FILLER_WORD_RE = re.compile(r'\b(?:the|and) ')


def SPAG_remover(word):
    """Return *word* lower-cased, without punctuation and filler words.

    Args:
        word: Any text, typically a film title.

    Returns:
        The text in lower case with every ``string.punctuation`` character
        deleted and each whole word 'the' or 'and' that is followed by a space
        removed together with that space. A filler word at the very end of the
        text (no trailing space) is kept.
    """
    cleaned = word.lower().translate(_PUNCTUATION_TABLE)
    return _FILLER_WORD_RE.sub('', cleaned)

In [71]:
# Build the main dictionary that is exported for the game. Each film gets an
# entry holding its IMDB id, its display name (title plus year) and the
# keywords scraped by get_keywords, stored under a cleaned-up version of its
# title.

# The three lists are filled side by side by populate_films. Check that they
# line up before starting, so a misaligned scrape fails here instead of part
# way through a long run of page visits.
if not (len(film_names) == len(film_ids) == len(years)):
    raise ValueError(
        f"scraped lists are misaligned: {len(film_names)} names, "
        f"{len(film_ids)} ids, {len(years)} years"
    )

film_dict = {}
total_films = len(film_names)
for position, (film_id, title, year) in enumerate(zip(film_ids, film_names, years), start=1):
    print(f"film {position} of {total_films}")
    new_film = {'id': film_id, 'punc_name': title + " " + year, 'tags': get_keywords(film_id)}

    key = SPAG_remover(title)
    if key in film_dict:
        # Another film with the same cleaned-up title is already stored (for
        # example a remake). Films arrive most-voted first, so the earlier one
        # keeps the plain key and this one is stored under a year-qualified
        # key instead of silently overwriting it.
        key = SPAG_remover(f"{title} {year}")
        print(f"name clash for '{title}': stored under '{key}'")
    film_dict[key] = new_film

film 1 of 1000
film 2 of 1000
film 3 of 1000
film 4 of 1000
film 5 of 1000
film 6 of 1000
film 7 of 1000
film 8 of 1000
film 9 of 1000
film 10 of 1000
film 11 of 1000
film 12 of 1000
film 13 of 1000
film 14 of 1000
film 15 of 1000
film 16 of 1000
film 17 of 1000
film 18 of 1000
film 19 of 1000
film 20 of 1000
film 21 of 1000
film 22 of 1000
film 23 of 1000
film 24 of 1000
film 25 of 1000
film 26 of 1000
film 27 of 1000
film 28 of 1000
film 29 of 1000
film 30 of 1000
film 31 of 1000
film 32 of 1000
film 33 of 1000
film 34 of 1000
film 35 of 1000
film 36 of 1000
film 37 of 1000
film 38 of 1000
film 39 of 1000
film 40 of 1000
film 41 of 1000
film 42 of 1000
film 43 of 1000
film 44 of 1000
film 45 of 1000
film 46 of 1000
film 47 of 1000
film 48 of 1000
film 49 of 1000
film 50 of 1000
film 51 of 1000
film 52 of 1000
film 53 of 1000
film 54 of 1000
film 55 of 1000
film 56 of 1000
film 57 of 1000
film 58 of 1000
film 59 of 1000
film 60 of 1000
film 61 of 1000
film 62 of 1000
film 63 of 1000
f

film 490 of 1000
film 491 of 1000
film 492 of 1000
film 493 of 1000
film 494 of 1000
film 495 of 1000
film 496 of 1000
film 497 of 1000
film 498 of 1000
film 499 of 1000
film 500 of 1000
film 501 of 1000
film 502 of 1000
film 503 of 1000
film 504 of 1000
film 505 of 1000
film 506 of 1000
film 507 of 1000
film 508 of 1000
film 509 of 1000
film 510 of 1000
film 511 of 1000
film 512 of 1000
film 513 of 1000
film 514 of 1000
film 515 of 1000
film 516 of 1000
film 517 of 1000
film 518 of 1000
film 519 of 1000
film 520 of 1000
film 521 of 1000
film 522 of 1000
film 523 of 1000
film 524 of 1000
film 525 of 1000
film 526 of 1000
film 527 of 1000
film 528 of 1000
film 529 of 1000
film 530 of 1000
film 531 of 1000
film 532 of 1000
film 533 of 1000
film 534 of 1000
film 535 of 1000
film 536 of 1000
film 537 of 1000
film 538 of 1000
film 539 of 1000
film 540 of 1000
film 541 of 1000
film 542 of 1000
film 543 of 1000
film 544 of 1000
film 545 of 1000
film 546 of 1000
film 547 of 1000
film 548 of 10

film 972 of 1000
film 973 of 1000
film 974 of 1000
film 975 of 1000
film 976 of 1000
film 977 of 1000
film 978 of 1000
film 979 of 1000
film 980 of 1000
film 981 of 1000
film 982 of 1000
film 983 of 1000
film 984 of 1000
film 985 of 1000
film 986 of 1000
film 987 of 1000
film 988 of 1000
film 989 of 1000
film 990 of 1000
film 991 of 1000
film 992 of 1000
film 993 of 1000
film 994 of 1000
film 995 of 1000
film 996 of 1000
film 997 of 1000
film 998 of 1000
film 999 of 1000
film 1000 of 1000


In [74]:
# Spot-check: show one randomly chosen entry to confirm the dictionary was built.
sample_key = random.choice(list(film_dict))
print(sample_key)
print(film_dict[sample_key])

get out


In [73]:
# Export the dictionary to an external JSON file for portability.
import os

output_path = f'data/imdb_tag_game_{size}.json'
# open() does not create missing directories, so make sure 'data/' exists
# before writing; otherwise the whole scrape is lost to a FileNotFoundError.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
# An explicit encoding keeps the file independent of the platform default.
with open(output_path, 'w', encoding='utf-8') as fp:
    json.dump(film_dict, fp)