# TAGgle
A Wordle-like game based on IMDb film keywords.

The code here scrapes data from IMDb and writes it to a JSON file for use in the external app.

The only variable that needs to be altered here is `size`, which sets how many pages to scrape. Each page holds 50 films. This enables the external app to choose from several difficulty levels.


In [1]:
#import dependencies
from splinter import Browser
from bs4 import BeautifulSoup as bs
import time
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import re
import fnmatch
import random
import string
import json

In [25]:

# Number of IMDb result pages to scrape, at 50 films per page (so the run
# collects up to size * 50 titles).
# Pages are requested in descending order of vote count, so this effectively
# sets a difficulty level; 1: easy, 5: medium, 20: severe
# The value is also embedded in the name of the exported JSON file.
size = 100


In [26]:
# Set up Splinter: obtain a chromedriver via webdriver_manager and pass the
# value returned by install() to Splinter as the `executable_path` keyword
# (presumably the path of the driver binary - confirm against webdriver_manager).
executable_path = {'executable_path': ChromeDriverManager().install()}
# `browser` is the single shared session that the scraping functions in this
# notebook call visit() on; headless=False requests a visible Chrome window.
browser = Browser('chrome', **executable_path, headless=False)



Current google-chrome version is 105.0.5195
Get LATEST chromedriver version for 105.0.5195 google-chrome
Driver [C:\Users\David\.wdm\drivers\chromedriver\win32\105.0.5195.52\chromedriver.exe] found in cache


In [27]:
# method to grab all 50 films from an IMDB URL
def populate_films(url):
    """Scrape one IMDb listing page and record each film's title and id.

    Visits ``url`` with the shared Splinter ``browser``, parses the rendered
    HTML and appends to the module-level ``film_names`` and ``film_ids``
    lists, which must already exist. The two lists are parallel: entry k of
    each describes the same film.

    Args:
        url: address of an IMDb listing page.

    Returns:
        None. Results are delivered through the two module-level lists.
    """
    browser.visit(url)
    soup = bs(browser.html, 'html.parser')
    # the image next to the film contains all the data we need: film title in the alt-text, and film id in 'data-tconst'.
    for poster in soup.find_all('img', class_='loadlate'):
        title = poster.get('alt')
        film_id = poster.get('data-tconst')
        # Read both attributes before appending either one. Appending the
        # title first and then failing on a missing id would leave the two
        # parallel lists misaligned for every later film.
        if title is None or film_id is None:
            continue
        film_names.append(title)
        film_ids.append(film_id)

In [28]:
# Parallel result lists filled in by populate_films(): position k in each
# list refers to the same film.
film_names, film_ids = [], []

print(f"grabbing the names and id of the first {size*50} films, ranked by most rated")

# The page number in the URL is 1-based, hence i+1 on pass i.
for i in range(size):
    url = f"https://www.imdb.com/search/keyword/?ref_=kw_nxt&mode=detail&page={i+1}&sort=num_votes,desc&title_type=movie"
    populate_films(url)
    print(f"getting films from page {i+1} of {size}")

# Echo everything that was collected as a numbered "id, title" listing.
listing = [f"{i+1}: {film_ids[i]}, {film_names[i]}" for i in range(len(film_names))]
for line in listing:
    print(line)

grabbing the names and id of the first 5000 films, ranked by most rated
getting films from page 1 of 100
getting films from page 2 of 100
getting films from page 3 of 100
getting films from page 4 of 100
getting films from page 5 of 100
getting films from page 6 of 100
getting films from page 7 of 100
getting films from page 8 of 100
getting films from page 9 of 100
getting films from page 10 of 100
getting films from page 11 of 100
getting films from page 12 of 100
getting films from page 13 of 100
getting films from page 14 of 100
getting films from page 15 of 100
getting films from page 16 of 100
getting films from page 17 of 100
getting films from page 18 of 100
getting films from page 19 of 100
getting films from page 20 of 100
getting films from page 21 of 100
getting films from page 22 of 100
getting films from page 23 of 100
getting films from page 24 of 100
getting films from page 25 of 100
getting films from page 26 of 100
getting films from page 27 of 100
getting films from 

2184: tt0298814, The Core
2185: tt2241351, Money Monster
2186: tt2431286, Philomena
2187: tt11271038, Licorice Pizza
2188: tt0395584, The Devil's Rejects
2189: tt1781769, Anna Karenina
2190: tt0396555, Meet the Robinsons
2191: tt0452637, Lady in the Water
2192: tt0248667, Ali
2193: tt1172049, Demolition
2194: tt0096061, Scrooged
2195: tt1502404, Drive Angry
2196: tt0109444, Clear and Present Danger
2197: tt5503686, Hustlers
2198: tt0122718, Small Soldiers
2199: tt2140379, Self/less
2200: tt0127536, Elizabeth
2201: tt0816462, Conan the Barbarian
2202: tt0124315, The Cider House Rules
2203: tt1853739, You're Next
2204: tt1195478, The Five-Year Engagement
2205: tt4530422, Overlord
2206: tt0163187, Runaway Bride
2207: tt0115759, Broken Arrow
2208: tt1592525, Lockout
2209: tt0093428, The Living Daylights
2210: tt0105417, Sister Act
2211: tt1477076, Saw 3D
2212: tt5638642, The Ritual
2213: tt3606752, Cars 3
2214: tt1235166, A Prophet
2215: tt0318649, Sahara
2216: tt2357129, Jobs
2217: tt1045

4493: tt0196216, Small Time Crooks
4494: tt5001718, Everything, Everything
4495: tt1265990, The Roommate
4496: tt0089173, Friday the 13th V: A New Beginning
4497: tt0045061, The Quiet Man
4498: tt10895576, Mimi
4499: tt0162677, Summer of Sam
4500: tt1031969, The Rocker
4501: tt9354842, To All the Boys: P.S. I Still Love You
4502: tt0490084, Because I Said So
4503: tt1620981, The Addams Family
4504: tt0838247, After.Life
4505: tt3787590, We Are Your Friends
4506: tt3966404, Mustang
4507: tt0247199, The Believer
4508: tt8522006, Happiest Season
4509: tt0338309, Evil
4510: tt0078446, Up in Smoke
4511: tt7008872, Boy Erased
4512: tt1563742, Overboard
4513: tt0259288, Dragonfly
4514: tt0073312, Love and Death
4515: tt0052077, Plan 9 from Outer Space
4516: tt0120910, Fantasia 2000
4517: tt0034248, Suspicion
4518: tt0048750, The Trouble with Harry
4519: tt4154916, Replicas
4520: tt0139462, Message in a Bottle
4521: tt0115783, Bulletproof
4522: tt1783732, John Dies at the End
4523: tt0242193, 

In [29]:
# this method scrapes the keywords from the individual page for the given film id.
def get_keywords(film_id):
    """Return the keyword strings listed on a film's IMDb keywords page.

    Visits ``https://www.imdb.com/title/<film_id>/keywords`` with the shared
    Splinter ``browser`` and collects the anchor text found inside every
    ``div`` of class ``sodatext``.

    Args:
        film_id: the IMDb title id (e.g. ``tt0298814``) placed in the URL.

    Returns:
        A list of plain ``str`` keywords in page order; empty if the page has
        no matching elements.
    """
    keyURL = f"https://www.imdb.com/title/{film_id}/keywords"
    browser.visit(keyURL)
    soup = bs(browser.html, 'html.parser')

    keys = []
    # in the keywords table, each keyword can be found in a 'sodatext' div.
    for div in soup.find_all('div', class_='sodatext'):
        anchor = div.find('a')
        if anchor is None:
            # A div without a link carries no keyword; skip it instead of
            # failing on None.
            continue
        # get_text() yields an ordinary str. Keeping the NavigableString from
        # .contents would hold a reference to this page's whole parse tree
        # for as long as the keyword is stored.
        text = anchor.get_text()
        if text:
            keys.append(text)
    return keys
    

In [30]:
# code to remove punctuation and capitals from any text.
# perhaps common words like 'the' should be removed too?

def SPAG_remover(word):
    """Return ``word`` lower-cased with every ``string.punctuation`` character dropped.

    Only the ASCII punctuation listed in ``string.punctuation`` is removed;
    spaces, digits and all other characters are kept as they are.
    """
    lowered = word.lower()
    kept_chars = [ch for ch in lowered if ch not in string.punctuation]
    return "".join(kept_chars)

In [None]:
# this loop walks through the collected films and, for each one, calls the 'get_keywords' method.
# all this data is then wrapped up in a dictionary.
# the dictionary is then added to the main dictionary, keyed by a cleaned-up film name.
film_dict = {}
for i, (film_id, film_name) in enumerate(zip(film_ids, film_names)):
    print(f"film {i+1} of {len(film_names)}")
    clean_name = SPAG_remover(film_name)
    if clean_name in film_dict:
        # Different films can share a cleaned-up title (remakes, for example).
        # The list was requested most-voted first, so the entry already stored
        # is the better-known film: keep it, and skip the slow keyword scrape
        # for the duplicate rather than silently overwriting the earlier entry.
        print(f"skipping {film_id}: '{clean_name}' is already used by {film_dict[clean_name]['id']}")
        continue
    film_dict[clean_name] = {'id': film_id, 'punc_name': film_name, 'tags': get_keywords(film_id)}

film 1 of 5000
film 2 of 5000
film 3 of 5000
film 4 of 5000
film 5 of 5000
film 6 of 5000
film 7 of 5000
film 8 of 5000
film 9 of 5000
film 10 of 5000
film 11 of 5000
film 12 of 5000
film 13 of 5000
film 14 of 5000
film 15 of 5000
film 16 of 5000
film 17 of 5000
film 18 of 5000
film 19 of 5000
film 20 of 5000
film 21 of 5000
film 22 of 5000
film 23 of 5000
film 24 of 5000
film 25 of 5000
film 26 of 5000
film 27 of 5000
film 28 of 5000
film 29 of 5000
film 30 of 5000
film 31 of 5000
film 32 of 5000
film 33 of 5000
film 34 of 5000
film 35 of 5000
film 36 of 5000
film 37 of 5000
film 38 of 5000
film 39 of 5000
film 40 of 5000
film 41 of 5000
film 42 of 5000
film 43 of 5000
film 44 of 5000
film 45 of 5000
film 46 of 5000
film 47 of 5000
film 48 of 5000
film 49 of 5000
film 50 of 5000
film 51 of 5000
film 52 of 5000
film 53 of 5000
film 54 of 5000
film 55 of 5000
film 56 of 5000
film 57 of 5000
film 58 of 5000
film 59 of 5000
film 60 of 5000
film 61 of 5000
film 62 of 5000
film 63 of 5000
f

film 490 of 5000
film 491 of 5000
film 492 of 5000
film 493 of 5000
film 494 of 5000
film 495 of 5000
film 496 of 5000
film 497 of 5000
film 498 of 5000
film 499 of 5000
film 500 of 5000
film 501 of 5000
film 502 of 5000
film 503 of 5000
film 504 of 5000
film 505 of 5000
film 506 of 5000
film 507 of 5000
film 508 of 5000
film 509 of 5000
film 510 of 5000
film 511 of 5000
film 512 of 5000
film 513 of 5000
film 514 of 5000
film 515 of 5000
film 516 of 5000
film 517 of 5000
film 518 of 5000
film 519 of 5000
film 520 of 5000
film 521 of 5000
film 522 of 5000
film 523 of 5000
film 524 of 5000
film 525 of 5000
film 526 of 5000
film 527 of 5000
film 528 of 5000
film 529 of 5000
film 530 of 5000
film 531 of 5000
film 532 of 5000
film 533 of 5000
film 534 of 5000
film 535 of 5000
film 536 of 5000
film 537 of 5000
film 538 of 5000
film 539 of 5000
film 540 of 5000
film 541 of 5000
film 542 of 5000
film 543 of 5000
film 544 of 5000
film 545 of 5000
film 546 of 5000
film 547 of 5000
film 548 of 50

film 972 of 5000
film 973 of 5000
film 974 of 5000
film 975 of 5000
film 976 of 5000
film 977 of 5000
film 978 of 5000
film 979 of 5000
film 980 of 5000
film 981 of 5000
film 982 of 5000
film 983 of 5000
film 984 of 5000
film 985 of 5000
film 986 of 5000
film 987 of 5000
film 988 of 5000
film 989 of 5000
film 990 of 5000
film 991 of 5000
film 992 of 5000
film 993 of 5000
film 994 of 5000
film 995 of 5000
film 996 of 5000
film 997 of 5000
film 998 of 5000
film 999 of 5000
film 1000 of 5000
film 1001 of 5000
film 1002 of 5000
film 1003 of 5000
film 1004 of 5000
film 1005 of 5000
film 1006 of 5000
film 1007 of 5000
film 1008 of 5000
film 1009 of 5000
film 1010 of 5000
film 1011 of 5000
film 1012 of 5000
film 1013 of 5000
film 1014 of 5000
film 1015 of 5000
film 1016 of 5000
film 1017 of 5000
film 1018 of 5000
film 1019 of 5000
film 1020 of 5000
film 1021 of 5000
film 1022 of 5000
film 1023 of 5000
film 1024 of 5000
film 1025 of 5000
film 1026 of 5000
film 1027 of 5000
film 1028 of 5000
fi

film 1429 of 5000
film 1430 of 5000
film 1431 of 5000
film 1432 of 5000
film 1433 of 5000
film 1434 of 5000
film 1435 of 5000
film 1436 of 5000
film 1437 of 5000
film 1438 of 5000
film 1439 of 5000
film 1440 of 5000
film 1441 of 5000
film 1442 of 5000
film 1443 of 5000
film 1444 of 5000
film 1445 of 5000
film 1446 of 5000
film 1447 of 5000
film 1448 of 5000
film 1449 of 5000
film 1450 of 5000
film 1451 of 5000
film 1452 of 5000
film 1453 of 5000
film 1454 of 5000
film 1455 of 5000
film 1456 of 5000
film 1457 of 5000
film 1458 of 5000
film 1459 of 5000
film 1460 of 5000
film 1461 of 5000
film 1462 of 5000
film 1463 of 5000
film 1464 of 5000
film 1465 of 5000
film 1466 of 5000
film 1467 of 5000
film 1468 of 5000
film 1469 of 5000
film 1470 of 5000
film 1471 of 5000
film 1472 of 5000
film 1473 of 5000
film 1474 of 5000
film 1475 of 5000
film 1476 of 5000
film 1477 of 5000
film 1478 of 5000
film 1479 of 5000
film 1480 of 5000
film 1481 of 5000
film 1482 of 5000
film 1483 of 5000
film 1484 

In [None]:
# sanity check: pick one film record at random from the dictionary and show it
sample_entry = random.choice(list(film_dict.values()))
print(sample_entry)

In [None]:
# export the dictionary to an external json file for portability
import os

# Create the output folder first: open() does not create missing directories,
# and failing here would throw away the whole scrape held in film_dict.
os.makedirs('data', exist_ok=True)
# An explicit encoding keeps the file identical regardless of the platform's
# default text encoding.
with open(f'data/imdb_tag_game_{size}.json', 'w', encoding='utf-8') as fp:
    json.dump(film_dict, fp)