# TAGgle
A Wordle-like game based on IMDb film keywords.

The code here scrapes data from IMDb and writes it to a JSON file for use in the external app.

The only variable that needs to be altered here is `size`, which sets how many pages to scrape. Each page is 50 films. This enables the external app to choose from several difficulty levels.


In [52]:
#import dependencies
from splinter import Browser
from bs4 import BeautifulSoup as bs
import time
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import re
import fnmatch
import random
import string
import json

In [53]:

# number of IMDB result pages to scrape, at 50 films per page
# (so size * 50 films are gathered in total).
# this effectively sets a difficulty level; 1: easy, 5: medium, 20: severe
size = 1


In [54]:
# Set up Splinter
# ChromeDriverManager().install() supplies the value passed on to Splinter as
# 'executable_path' (presumably the path of the chromedriver binary - confirm).
executable_path = {'executable_path': ChromeDriverManager().install()}
# 'browser' is the shared module-level session used by the scraping functions below;
# headless=False is passed, so Chrome is not started in headless mode.
browser = Browser('chrome', **executable_path, headless=False)



Current google-chrome version is 105.0.5195
Get LATEST chromedriver version for 105.0.5195 google-chrome
Driver [C:\Users\David\.wdm\drivers\chromedriver\win32\105.0.5195.52\chromedriver.exe] found in cache


In [55]:
# method to grab every film listed on one IMDB results page
def populate_films(url):
    """Scrape one IMDB listing page and record its films.

    Visits `url` with the shared `browser`, parses the rendered HTML and
    appends to the module-level lists `film_names`, `film_ids` and `years`.
    Nothing is returned.

    Titles/ids and years are collected in two independent passes, so the
    three lists only stay aligned if the page yields one year span per image.
    """
    browser.visit(url)
    page = bs(browser.html, 'html.parser')

    # the poster image carries most of what we need: the film title is its
    # alt-text and the film id is its 'data-tconst' attribute.
    for poster in page.find_all('img', class_='loadlate'):
        film_names.append(poster['alt'])
        film_ids.append(poster['data-tconst'])

    # the year is kept as well, in case two films share a name.
    year_spans = page.find_all('span', class_='lister-item-year text-muted unbold')
    for span in year_spans:
        years.append(span.contents[0])

In [56]:
# accumulators filled in by populate_films, one entry per film
film_names, film_ids, years = [], [], []

print(f"grabbing the names and id of the first {size*50} films, ranked by most rated")
# IMDB pages are numbered from 1, so iterate over the page numbers directly
for page_number in range(1, size + 1):
    url = f"https://www.imdb.com/search/keyword/?ref_=kw_nxt&mode=detail&page={page_number}&sort=num_votes,desc&title_type=movie"
    populate_films(url)
    print(f"getting films from page {page_number} of {size}")
    

grabbing the names and id of the first 50 films, ranked by most rated
getting films from page 1 of 1


In [57]:
# print a numbered list of every film gathered: id, title and year
for index, title in enumerate(film_names):
    print(f"{index+1}: {film_ids[index]}, {title}, {years[index]}")

1: tt0111161, The Shawshank Redemption, (1994)
2: tt0468569, The Dark Knight, (2008)
3: tt1375666, Inception, (2010)
4: tt0137523, Fight Club, (1999)
5: tt0109830, Forrest Gump, (1994)
6: tt0110912, Pulp Fiction, (1994)
7: tt0133093, The Matrix, (1999)
8: tt0120737, The Lord of the Rings: The Fellowship of the Ring, (2001)
9: tt0068646, The Godfather, (1972)
10: tt0167260, The Lord of the Rings: The Return of the King, (2003)
11: tt0816692, Interstellar, (2014)
12: tt1345836, The Dark Knight Rises, (2012)
13: tt0167261, The Lord of the Rings: The Two Towers, (2002)
14: tt0114369, Seven, (1995)
15: tt1853728, Django Unchained, (2012)
16: tt0172495, Gladiator, (2000)
17: tt0372784, Batman Begins, (2005)
18: tt0361748, Inglourious Basterds, (2009)
19: tt0102926, The Silence of the Lambs, (1991)
20: tt0993846, The Wolf of Wall Street, (2013)
21: tt0848228, Avengers Assemble, (2012)
22: tt0120815, Saving Private Ryan, (1998)
23: tt0076759, Star Wars, (1977)
24: tt0108052, Schindler's List, 

In [58]:
# this method scrapes the keywords from the individual page for the given film id.
def get_keywords(film_id):
    """Return the keywords listed on the IMDB keywords page of `film_id`.

    Visits https://www.imdb.com/title/<film_id>/keywords with the shared
    `browser` and parses the rendered HTML. The result is a list holding,
    for every 'sodatext' div on the page, the first child of the first
    anchor inside that div.
    """
    browser.visit(f"https://www.imdb.com/title/{film_id}/keywords")
    page = bs(browser.html, 'html.parser')
    # in the keywords table each keyword sits in its own 'sodatext' div;
    # only the text shown inside the anchor is wanted.
    return [
        entry.find('a').contents[0]
        for entry in page.find_all('div', class_='sodatext')
    ]
    

In [59]:
# code to remove punctuation, capitals and spaces from any text.
# the word 'the' (when followed by a space) and any ' and ' between words are dropped too.

def SPAG_remover(word):
    """Return `word` squashed into a lower-case key with no punctuation or spaces.

    Steps, in order:
      1. lower-case the text;
      2. delete every character in string.punctuation, plus the middle dot
         '·' (as in 'WALL·E'), which string.punctuation does not contain;
      3. delete the word 'the' wherever it starts a word and is followed by
         a space;
      4. delete ' and ' (with a space on both sides);
      5. delete all remaining spaces.

    A trailing 'the' and a leading or trailing 'and' are left in place,
    because steps 3 and 4 need the surrounding spaces to match.
    """
    strip_table = str.maketrans('', '', string.punctuation + '·')
    cleaned = word.lower().translate(strip_table)
    # \b anchors the match to the start of a word, so the tail of a longer
    # word such as 'breathe ' or 'bathe ' is no longer cut out by mistake.
    cleaned = re.sub(r'\bthe ', '', cleaned)
    return cleaned.replace(' and ', '').replace(' ', '')

In [60]:
# walk through the scraped films one by one, fetching the keywords of each via 'get_keywords'.
# every film becomes a small dictionary (id, display name with year, tags),
# stored in the main dictionary under its cleaned-up name followed by the year.
film_dict = {}
film_total = len(film_names)
for index, title in enumerate(film_names):
    print(f"film {index+1} of {film_total}")
    film_id, year = film_ids[index], years[index]
    entry = {'id': film_id, 'punc_name': title + " " + year, 'tags': get_keywords(film_id)}
    film_dict[SPAG_remover(title) + year] = entry

film 1 of 50
film 2 of 50
film 3 of 50
film 4 of 50
film 5 of 50
film 6 of 50
film 7 of 50
film 8 of 50
film 9 of 50
film 10 of 50
film 11 of 50
film 12 of 50
film 13 of 50
film 14 of 50
film 15 of 50
film 16 of 50
film 17 of 50
film 18 of 50
film 19 of 50
film 20 of 50
film 21 of 50
film 22 of 50
film 23 of 50
film 24 of 50
film 25 of 50
film 26 of 50
film 27 of 50
film 28 of 50
film 29 of 50
film 30 of 50
film 31 of 50
film 32 of 50
film 33 of 50
film 34 of 50
film 35 of 50
film 36 of 50
film 37 of 50
film 38 of 50
film 39 of 50
film 40 of 50
film 41 of 50
film 42 of 50
film 43 of 50
film 44 of 50
film 45 of 50
film 46 of 50
film 47 of 50
film 48 of 50
film 49 of 50
film 50 of 50


In [65]:
# spot-check: pick one key at random and show it together with its stored entry
sample_key = random.choice(list(film_dict))
print(sample_key, film_dict[sample_key], sep='\n')

lordofringsreturnofking(2003)
{'id': 'tt0167260', 'punc_name': 'The Lord of the Rings: The Return of the King (2003)', 'tags': ['epic', 'orc', 'middle earth', 'hobbit', 'ring', 'battle', 'quest', 'journey', 'good versus evil', 'wizard', 'king', 'fate', 'falling into lava', 'lava', 'volcano', 'volcanic eruption', 'curse', 'courage', 'elrond character', 'galadriel character', 'legolas character', 'gollum character', 'frodo baggins character', 'saruman the white character', 'orcs character', 'gandalf character', 'elves character', 'bilbo baggins character', 'third part', 'sequel', 'male protagonist', 'violence', 'ensemble cast', 'battleaxe', 'tree', 'dead tree', 'barefoot', 'swordsman', 'mountain', 'royal', 'death', 'bowing', 'throne', 'fireplace', 'hatred', 'army', 'flying creature', 'ruins', 'ambush', 'outpost', 'signal', 'singing', 'blockbuster', 'cult classic', 'famous score', 'no opening credits', 'part computer animation', 'orchestral music score', 'symphonic music score', 'third in

In [66]:
# export the dictionary to an external json file for portability
from pathlib import Path

# create the output folder first: without it open() raises FileNotFoundError,
# and that would only surface after the whole scrape has already run.
output_path = Path('data') / f'imdb_tag_game_{size}.json'
output_path.parent.mkdir(parents=True, exist_ok=True)
# explicit encoding so the file does not depend on the platform default
with open(output_path, 'w', encoding='utf-8') as fp:
    json.dump(film_dict, fp)

In [63]:
# echo the scraped titles as the cell output, for a quick visual check
film_names

['The Shawshank Redemption',
 'The Dark Knight',
 'Inception',
 'Fight Club',
 'Forrest Gump',
 'Pulp Fiction',
 'The Matrix',
 'The Lord of the Rings: The Fellowship of the Ring',
 'The Godfather',
 'The Lord of the Rings: The Return of the King',
 'Interstellar',
 'The Dark Knight Rises',
 'The Lord of the Rings: The Two Towers',
 'Seven',
 'Django Unchained',
 'Gladiator',
 'Batman Begins',
 'Inglourious Basterds',
 'The Silence of the Lambs',
 'The Wolf of Wall Street',
 'Avengers Assemble',
 'Saving Private Ryan',
 'Star Wars',
 "Schindler's List",
 'The Prestige',
 'The Departed',
 'Shutter Island',
 'The Green Mile',
 'Star Wars: Episode V - The Empire Strikes Back',
 'The Godfather: Part II',
 'Joker',
 'Memento',
 'Avatar',
 'Back to the Future',
 'Guardians of the Galaxy',
 'Titanic',
 'Leon',
 'American Beauty',
 'Goodfellas',
 'Pirates of the Caribbean: The Curse of the Black Pearl',
 'V for Vendetta',
 'American History X',
 'WALL·E',
 'Kill Bill: Vol. 1',
 'Avengers: Endg

In [64]:
# echo the generated dictionary keys (cleaned-up name + year) as the cell output
film_dict.keys()

dict_keys(['shawshankredemption(1994)', 'darkknight(2008)', 'inception(2010)', 'fightclub(1999)', 'forrestgump(1994)', 'pulpfiction(1994)', 'matrix(1999)', 'lordofringsfellowshipofring(2001)', 'godfather(1972)', 'lordofringsreturnofking(2003)', 'interstellar(2014)', 'darkknightrises(2012)', 'lordofringstwotowers(2002)', 'seven(1995)', 'djangounchained(2012)', 'gladiator(2000)', 'batmanbegins(2005)', 'inglouriousbasterds(2009)', 'silenceoflambs(1991)', 'wolfofwallstreet(2013)', 'avengersassemble(2012)', 'savingprivateryan(1998)', 'starwars(1977)', 'schindlerslist(1993)', 'prestige(2006)', 'departed(2006)', 'shutterisland(2010)', 'greenmile(1999)', 'starwarsepisodevempirestrikesback(1980)', 'godfatherpartii(1974)', 'joker(I) (2019)', 'memento(2000)', 'avatar(2009)', 'backtofuture(1985)', 'guardiansofgalaxy(2014)', 'titanic(1997)', 'leon(1994)', 'americanbeauty(1999)', 'goodfellas(1990)', 'piratesofcaribbeancurseofblackpearl(2003)', 'vforvendetta(2005)', 'americanhistoryx(1998)', 'walle(2