# TAGgle
A Wordle-like game based on IMDb film keywords.

The code here scrapes data from IMDb and writes it to a JSON file for use in the external app.

The only variable that needs to be altered here is `size`, which sets how many pages to scrape. Each page holds 50 films. This enables the external app to choose from several difficulty levels.


In [11]:
#import dependencies
from splinter import Browser
from bs4 import BeautifulSoup as bs
import time
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import re
import fnmatch
import random
import string
import json

In [12]:

# Number of the last IMDB result page to scrape, at 50 films per page.
# This effectively sets a difficulty level; 1: easy, 5: medium, 20: severe.
# The value is also embedded in the file name of the exported JSON.
size = 100


In [13]:
# EDIT: start from the previously exported JSON and add to it, because
# web scraping broke at 1390 (Wayne's World).
with open('../data/imdb_tag_game_60.json', 'r') as existing_file:
    film_dict = json.loads(existing_file.read())
# Notebook display: number of films already present.
len(film_dict)

3000

In [14]:
# Set up Splinter
# The value returned by ChromeDriverManager().install() is handed to Splinter as
# the chromedriver 'executable_path' (the cell output shows the driver being
# downloaded and cached on first use).
executable_path = {'executable_path': ChromeDriverManager().install()}
# headless=False requests a visible (non-headless) Chrome window.
browser = Browser('chrome', **executable_path, headless=False)



Current google-chrome version is 105.0.5195
Get LATEST chromedriver version for 105.0.5195 google-chrome
Trying to download new driver from https://chromedriver.storage.googleapis.com/105.0.5195.52/chromedriver_win32.zip
Driver has been saved in cache [C:\Users\David\.wdm\drivers\chromedriver\win32\105.0.5195.52]


In [15]:
# method to grab all the films listed on one IMDB search-results URL
def populate_films(url):
    """Scrape one IMDB results page and record the films it lists.

    Visits ``url`` with the module-level Splinter ``browser`` and appends, in
    page order, to the module-level lists ``film_names``, ``film_ids`` and
    ``years``. Nothing is returned.

    NOTE(review): titles/ids and years come from two separate element queries,
    while the rest of the notebook indexes the three lists in parallel. This
    assumes every film on the page has exactly one year span -- confirm
    against the live markup.
    """
    browser.visit(url)
    soup = bs(browser.html, 'html.parser')
    # the image next to the film contains most of the data we need: film title in the alt-text, and film id in 'data-tconst'.
    for poster in soup.find_all('img', class_='loadlate'):
        film_names.append(poster['alt'])
        film_ids.append(poster['data-tconst'])
    # now grab the year of the film in case of name conflict
    for year_span in soup.find_all('span', class_='lister-item-year text-muted unbold'):
        # str() detaches the text from the soup; a bare NavigableString keeps a
        # reference to the whole parsed page for as long as the string lives.
        years.append(str(year_span.contents[0]))

In [16]:
# Parallel accumulators filled by populate_films (one entry per film).
film_names = []
film_ids = []
years = []

# Pages 1-60 are already covered by the JSON file loaded earlier in this
# notebook, so scraping resumes after them.
start_page = 60

# Report the range actually scraped in this run, not the size of the final data set.
print(f"grabbing the names and id of films {start_page*50 + 1} to {size*50}, ranked by most rated")
for i in range(start_page, size):
    # Announce the page before visiting it so a failure can be traced to its page.
    print(f"getting films from page {i+1} of {size}")
    url = f"https://www.imdb.com/search/keyword/?ref_=kw_nxt&mode=detail&page={i+1}&sort=num_votes,desc&title_type=movie"
    populate_films(url)
    

grabbing the names and id of the first 5000 films, ranked by most rated
getting films from page 61 of 100
getting films from page 62 of 100
getting films from page 63 of 100
getting films from page 64 of 100
getting films from page 65 of 100
getting films from page 66 of 100
getting films from page 67 of 100
getting films from page 68 of 100
getting films from page 69 of 100
getting films from page 70 of 100
getting films from page 71 of 100
getting films from page 72 of 100
getting films from page 73 of 100
getting films from page 74 of 100
getting films from page 75 of 100
getting films from page 76 of 100
getting films from page 77 of 100
getting films from page 78 of 100
getting films from page 79 of 100
getting films from page 80 of 100
getting films from page 81 of 100
getting films from page 82 of 100
getting films from page 83 of 100
getting films from page 84 of 100
getting films from page 85 of 100
getting films from page 86 of 100
getting films from page 87 of 100
getting fi

In [17]:
# List every scraped film as "position: id, title, year" for a visual check.
for i, title in enumerate(film_names):
    print(f"{i+1}: {film_ids[i]}, {title}, {years[i]}")

1: tt0120053, The Saint, (1997)
2: tt0327679, Ella Enchanted, (2004)
3: tt0112682, The City of Lost Children, (1995)
4: tt0120177, Spawn, (1997)
5: tt0486551, Beerfest, (2006)
6: tt0120686, Stepmom, (1998)
7: tt2106361, Into the Storm, (2014)
8: tt0219965, Bandits, (2001)
9: tt0082418, Friday the 13th: Part 2, (1981)
10: tt2125608, Searching for Sugar Man, (2012)
11: tt0114436, Showgirls, (1995)
12: tt0086006, Never Say Never Again, (1983)
13: tt2101341, Dead Man Down, (2013)
14: tt0450345, The Wicker Man, (2006)
15: tt2870756, Magic in the Moonlight, (2014)
16: tt2402157, The November Man, (2014)
17: tt0425413, Run Fatboy Run, (2007)
18: tt0356680, The Family Stone, (2005)
19: tt10059518, Unhinged, (I) (2020)
20: tt5215952, The Wailing, (2016)
21: tt2356180, Bhaag Milkha Bhaag, (2013)
22: tt7775622, Free Solo, (2018)
23: tt8359848, Climax, (I) (2018)
24: tt0287717, Spy Kids 2: Island of Lost Dreams, (2002)
25: tt2545118, Blackfish, (2013)
26: tt1242422, Cell 211, (2009)
27: tt1583420,

In [18]:
# this method scrapes the keywords from the individual page for the given film id.
def get_keywords(film_id):
    """Return the keywords found on the IMDB keywords page of ``film_id``.

    Visits ``https://www.imdb.com/title/<film_id>/keywords`` with the
    module-level Splinter ``browser`` and collects the first child of the
    anchor inside every 'sodatext' div.

    Args:
        film_id: IMDB title id as used in the URL (e.g. ``'tt0120053'``).

    Returns:
        A list of plain ``str`` keywords in page order; empty when the page
        contains no matching entries.
    """
    keyword_url = f"https://www.imdb.com/title/{film_id}/keywords"
    browser.visit(keyword_url)
    soup = bs(browser.html, 'html.parser')

    keys = []
    # in the keywords table, each keyword can be found in a 'sodatext' div.
    for div in soup.find_all('div', class_='sodatext'):
        # we just want the text from what is in the page as an anchor.
        anchor = div.find('a')
        # Skip an entry without a usable anchor instead of aborting the whole
        # scrape with an AttributeError/IndexError.
        if anchor is None or not anchor.contents:
            continue
        # str() detaches the keyword from the soup; a bare NavigableString keeps
        # a reference to the whole parsed page for as long as the string lives.
        keys.append(str(anchor.contents[0]))
    return keys
    

In [19]:
# code to turn any text into a compact lookup key: no capitals, no punctuation, no spaces.

def SPAG_remover(word):
    """Return ``word`` lower-cased with punctuation, filler words and spaces removed.

    Steps, in order: lower-case the text, delete every character in
    ``string.punctuation``, then delete the substrings '·', 'the ', ' and '
    and finally every remaining space.

    Note that 'the ' and ' and ' are removed as plain substrings, so 'the '
    is also cut out of the tail of a longer word followed by a space
    (e.g. 'breathe easy' becomes 'breaeasy').
    """
    strip_punctuation = str.maketrans('', '', string.punctuation)
    cleaned = word.lower().translate(strip_punctuation)
    # The order matters: the word fragments rely on spaces that the last step removes.
    for fragment in ('·', 'the ', ' and ', ' '):
        cleaned = cleaned.replace(fragment, '')
    return cleaned

In [20]:
# Walk the scraped films in order and, for each one, fetch its keywords with 'get_keywords'.
# Id, display name (title plus year) and keywords are bundled into one record.
# The record is stored in the main dictionary under the cleaned-up film name followed by the year.
for i, title in enumerate(film_names):
    print(f"film {i+1} of {len(film_names)}")
    new_film = {
        'id': film_ids[i],
        'punc_name': title + " " + years[i],
        'tags': get_keywords(film_ids[i]),
    }
    film_dict[SPAG_remover(title) + years[i]] = new_film

film 1 of 2000
film 2 of 2000
film 3 of 2000
film 4 of 2000
film 5 of 2000
film 6 of 2000
film 7 of 2000
film 8 of 2000
film 9 of 2000
film 10 of 2000
film 11 of 2000
film 12 of 2000
film 13 of 2000
film 14 of 2000
film 15 of 2000
film 16 of 2000
film 17 of 2000
film 18 of 2000
film 19 of 2000
film 20 of 2000
film 21 of 2000
film 22 of 2000
film 23 of 2000
film 24 of 2000
film 25 of 2000
film 26 of 2000
film 27 of 2000
film 28 of 2000
film 29 of 2000
film 30 of 2000
film 31 of 2000
film 32 of 2000
film 33 of 2000
film 34 of 2000
film 35 of 2000
film 36 of 2000
film 37 of 2000
film 38 of 2000
film 39 of 2000
film 40 of 2000
film 41 of 2000
film 42 of 2000
film 43 of 2000
film 44 of 2000
film 45 of 2000
film 46 of 2000
film 47 of 2000
film 48 of 2000
film 49 of 2000
film 50 of 2000
film 51 of 2000
film 52 of 2000
film 53 of 2000
film 54 of 2000
film 55 of 2000
film 56 of 2000
film 57 of 2000
film 58 of 2000
film 59 of 2000
film 60 of 2000
film 61 of 2000
film 62 of 2000
film 63 of 2000
f

film 490 of 2000
film 491 of 2000
film 492 of 2000
film 493 of 2000
film 494 of 2000
film 495 of 2000
film 496 of 2000
film 497 of 2000
film 498 of 2000
film 499 of 2000
film 500 of 2000
film 501 of 2000
film 502 of 2000
film 503 of 2000
film 504 of 2000
film 505 of 2000
film 506 of 2000
film 507 of 2000
film 508 of 2000
film 509 of 2000
film 510 of 2000
film 511 of 2000
film 512 of 2000
film 513 of 2000
film 514 of 2000
film 515 of 2000
film 516 of 2000
film 517 of 2000
film 518 of 2000
film 519 of 2000
film 520 of 2000
film 521 of 2000
film 522 of 2000
film 523 of 2000
film 524 of 2000
film 525 of 2000
film 526 of 2000
film 527 of 2000
film 528 of 2000
film 529 of 2000
film 530 of 2000
film 531 of 2000
film 532 of 2000
film 533 of 2000
film 534 of 2000
film 535 of 2000
film 536 of 2000
film 537 of 2000
film 538 of 2000
film 539 of 2000
film 540 of 2000
film 541 of 2000
film 542 of 2000
film 543 of 2000
film 544 of 2000
film 545 of 2000
film 546 of 2000
film 547 of 2000
film 548 of 20

film 972 of 2000
film 973 of 2000
film 974 of 2000
film 975 of 2000
film 976 of 2000
film 977 of 2000
film 978 of 2000
film 979 of 2000
film 980 of 2000
film 981 of 2000
film 982 of 2000
film 983 of 2000
film 984 of 2000
film 985 of 2000
film 986 of 2000
film 987 of 2000
film 988 of 2000
film 989 of 2000
film 990 of 2000
film 991 of 2000
film 992 of 2000
film 993 of 2000
film 994 of 2000
film 995 of 2000
film 996 of 2000
film 997 of 2000
film 998 of 2000
film 999 of 2000
film 1000 of 2000
film 1001 of 2000
film 1002 of 2000
film 1003 of 2000
film 1004 of 2000
film 1005 of 2000
film 1006 of 2000
film 1007 of 2000
film 1008 of 2000
film 1009 of 2000
film 1010 of 2000
film 1011 of 2000
film 1012 of 2000
film 1013 of 2000
film 1014 of 2000
film 1015 of 2000
film 1016 of 2000
film 1017 of 2000
film 1018 of 2000
film 1019 of 2000
film 1020 of 2000
film 1021 of 2000
film 1022 of 2000
film 1023 of 2000
film 1024 of 2000
film 1025 of 2000
film 1026 of 2000
film 1027 of 2000
film 1028 of 2000
fi

film 1429 of 2000
film 1430 of 2000
film 1431 of 2000
film 1432 of 2000
film 1433 of 2000
film 1434 of 2000
film 1435 of 2000
film 1436 of 2000
film 1437 of 2000
film 1438 of 2000
film 1439 of 2000
film 1440 of 2000
film 1441 of 2000
film 1442 of 2000
film 1443 of 2000
film 1444 of 2000
film 1445 of 2000
film 1446 of 2000
film 1447 of 2000
film 1448 of 2000
film 1449 of 2000
film 1450 of 2000
film 1451 of 2000
film 1452 of 2000
film 1453 of 2000
film 1454 of 2000
film 1455 of 2000
film 1456 of 2000
film 1457 of 2000
film 1458 of 2000
film 1459 of 2000
film 1460 of 2000
film 1461 of 2000
film 1462 of 2000
film 1463 of 2000
film 1464 of 2000
film 1465 of 2000
film 1466 of 2000
film 1467 of 2000
film 1468 of 2000
film 1469 of 2000
film 1470 of 2000
film 1471 of 2000
film 1472 of 2000
film 1473 of 2000
film 1474 of 2000
film 1475 of 2000
film 1476 of 2000
film 1477 of 2000
film 1478 of 2000
film 1479 of 2000
film 1480 of 2000
film 1481 of 2000
film 1482 of 2000
film 1483 of 2000
film 1484 

film 1885 of 2000
film 1886 of 2000
film 1887 of 2000
film 1888 of 2000
film 1889 of 2000
film 1890 of 2000
film 1891 of 2000
film 1892 of 2000
film 1893 of 2000
film 1894 of 2000
film 1895 of 2000
film 1896 of 2000
film 1897 of 2000
film 1898 of 2000
film 1899 of 2000
film 1900 of 2000
film 1901 of 2000
film 1902 of 2000
film 1903 of 2000
film 1904 of 2000
film 1905 of 2000
film 1906 of 2000
film 1907 of 2000
film 1908 of 2000
film 1909 of 2000
film 1910 of 2000
film 1911 of 2000
film 1912 of 2000
film 1913 of 2000
film 1914 of 2000
film 1915 of 2000
film 1916 of 2000
film 1917 of 2000
film 1918 of 2000
film 1919 of 2000
film 1920 of 2000
film 1921 of 2000
film 1922 of 2000
film 1923 of 2000
film 1924 of 2000
film 1925 of 2000
film 1926 of 2000
film 1927 of 2000
film 1928 of 2000
film 1929 of 2000
film 1930 of 2000
film 1931 of 2000
film 1932 of 2000
film 1933 of 2000
film 1934 of 2000
film 1935 of 2000
film 1936 of 2000
film 1937 of 2000
film 1938 of 2000
film 1939 of 2000
film 1940 

In [21]:
# Spot check: pick one random key from the dictionary and show it with its entry.
keith = random.choice(list(film_dict))
print(keith, film_dict[keith], sep="\n")

lionsforlambs(2007)
{'id': 'tt0891527', 'punc_name': 'Lions for Lambs (2007)', 'tags': ['infrared', 'the white house', 'woman wears eyeglasses', 'mgm', 'student', 'afghanistan', 'professor', 'reporter', 'university', 'republican', 'california', 'political science', 'iran', 'behind enemy lines', 'class attendance', 'special forces', 'ridge', 'rescue', 'college', 'taliban', 'washington d.c.', 'reference to abu ghraib', 'vietnam war veteran', 'arlington national cemetery', 'shorthand', 'condescension', 'reference to the who', 'national mall', 'reference to watergate', 'reference to richard nixon', 'mexican american', 'colin powell', 'night vision', 'reference to chicago convention', 'condoleezza rice', 'military offensive', 'reference to aaron copland', 'reference to creighton abrams', 'hindu kush', 'reference to 9 11', 'reference to alexander the great', 'lincoln memorial', 'reference to saddam hussein', 'arrogance', 'reference to hamid karzai', 'reference to the holocaust', 'corpse', 'd

In [22]:
# Write the dictionary to an external JSON file (named after 'size') for portability.
with open(f'../data/imdb_tag_game_{size}.json', 'w') as out_file:
    json.dump(film_dict, out_file)

In [23]:
# Notebook display: echo the list of scraped film titles for a visual check.
film_names

['The Saint',
 'Ella Enchanted',
 'The City of Lost Children',
 'Spawn',
 'Beerfest',
 'Stepmom',
 'Into the Storm',
 'Bandits',
 'Friday the 13th: Part 2',
 'Searching for Sugar Man',
 'Showgirls',
 'Never Say Never Again',
 'Dead Man Down',
 'The Wicker Man',
 'Magic in the Moonlight',
 'The November Man',
 'Run Fatboy Run',
 'The Family Stone',
 'Unhinged',
 'The Wailing',
 'Bhaag Milkha Bhaag',
 'Free Solo',
 'Climax',
 'Spy Kids 2: Island of Lost Dreams',
 'Blackfish',
 'Cell 211',
 'Larry Crowne',
 'Rust and Bone',
 'Whip It',
 'Legally Blonde 2: Red, White & Blonde',
 'Sliding Doors',
 'Mystery Men',
 'El mariachi',
 'James And The Giant Peach',
 'Domino',
 'Cube 2: Hypercube',
 'The Net',
 'Top Secret!',
 'Dead Snow',
 'Just Mercy',
 'Breaking the Waves',
 'The King of Staten Island',
 'Aloha',
 'The Dark Crystal',
 'Elle',
 'Urban Legend',
 'Dream House',
 'Sisters',
 'The Finest Hours',
 '21 Bridges',
 'Brotherhood of the Wolf',
 'Ice Age: Collision Course',
 'Joan of Arc',
 

In [24]:
# Notebook display: echo every key currently held in the film dictionary.
film_dict.keys()

dict_keys(['shawshankredemption(1994)', 'darkknight(2008)', 'inception(2010)', 'fightclub(1999)', 'forrestgump(1994)', 'pulpfiction(1994)', 'matrix(1999)', 'lordofringsfellowshipofring(2001)', 'godfather(1972)', 'lordofringsreturnofking(2003)', 'interstellar(2014)', 'darkknightrises(2012)', 'lordofringstwotowers(2002)', 'seven(1995)', 'djangounchained(2012)', 'gladiator(2000)', 'batmanbegins(2005)', 'inglouriousbasterds(2009)', 'silenceoflambs(1991)', 'wolfofwallstreet(2013)', 'avengersassemble(2012)', 'savingprivateryan(1998)', 'starwars(1977)', 'schindlerslist(1993)', 'prestige(2006)', 'departed(2006)', 'shutterisland(2010)', 'greenmile(1999)', 'starwarsepisodevempirestrikesback(1980)', 'godfatherpartii(1974)', 'joker(I) (2019)', 'memento(2000)', 'avatar(2009)', 'backtofuture(1985)', 'guardiansofgalaxy(2014)', 'titanic(1997)', 'leon(1994)', 'americanbeauty(1999)', 'goodfellas(1990)', 'piratesofcaribbeancurseofblackpearl(2003)', 'vforvendetta(2005)', 'americanhistoryx(1998)', 'walle(2

In [25]:
# Scraping is finished: shut down the Splinter browser session.
browser.quit()