In [45]:
import re

# Pattern applied to lines of the *.list data files: an optionally quoted title,
# a parenthesised all-digit year, any run of "(...)" / "{...}" annotations, and
# finally the next word. A parenthesis holding anything but digits, such as
# "(????)", is not accepted as the year marker.
name_re = re.compile(
    r'"?(.*?)"?'                     # group 1: title, surrounding double quotes optional
    r'\s*\((\d+?)\)'                 # group 2: digits of the "(NNNN)" year marker
    r'\s*(?:\(.*?\)\s*|\{.*\}\s*)*'  # skip any number of (...) or {...} annotations
    r'\s*(\w*)',                     # group 3: following run of word characters (may be empty)
    flags=re.UNICODE,
)

def get_movie(regex_res):
    """Build the ``(title, year)`` key for a movie from a regex match.

    ``regex_res`` is a match object whose group 1 holds the title and whose
    group 2 holds the year as a digit string; the year is converted to ``int``.
    """
    title, year_text = regex_res.group(1, 2)
    return title, int(year_text)

def base_movies(path = 'movies.list'):
    """Collect every ``(title, year)`` pair that can be parsed from the movie list.

    ``path`` is the list file to read (decoded as latin-1); it defaults to
    ``'movies.list'`` in the working directory, which is what the function
    always read before the parameter existed.

    Each line is searched with ``name_re``; lines that do not match are
    skipped. Returns a set of ``(title, year)`` tuples as produced by
    ``get_movie``, so repeated entries collapse into one.
    """
    movies = set()
    with open(path, 'r', encoding = 'latin-1') as fp:
        # Iterating the file object yields the same lines as repeated readline().
        for line in fp:
            res = name_re.search(line)
            if res:
                movies.add(get_movie(res))
    return movies

# Load every parseable (title, year) pair. The loader defined in this cell is
# `base_movies`; the previous call to `list_movies` referenced a name that is
# not defined in the notebook and fails on a fresh kernel.
movies = base_movies()
print(len(movies))

1253650


Eliminate movies released before 1990 (movies from 1990 itself are kept).

In [46]:
def year_filter(movies, year_after = 1990):
    """Keep only the movies whose year is ``year_after`` or later.

    ``movies`` is an iterable of ``(title, year)`` pairs; the result is a new
    set of the pairs that pass the (inclusive) year test.
    """
    kept = set()
    for title, year in movies:
        if year >= year_after:
            kept.add((title, year))
    return kept

# Apply the default cut-off (year >= 1990) and report how many movies remain.
movies = year_filter(movies)
print(len(movies))

836953


Eliminate non-English movies.

In [47]:
def language_filter(movies, lang = 'English', path = 'language.list'):
    """Restrict ``movies`` to those listed with language ``lang``.

    ``movies`` must be a set of ``(title, year)`` pairs (it is intersected
    with ``&``). ``path`` is the language list to read, decoded as latin-1;
    it defaults to ``'language.list'``, the file this function always used.

    A line counts for ``lang`` when group 3 of ``name_re`` (the first word
    after the title, year and any annotations) equals ``lang`` exactly.
    Returns a new set containing only the movies found that way.
    """
    lang_movies = set()
    with open(path, 'r', encoding = 'latin-1') as fp:
        for line in fp:
            res = name_re.search(line)
            if res and res.group(3) == lang:
                lang_movies.add(get_movie(res))
    return movies & lang_movies

# Keep only movies listed under the default language ('English') and report the count.
movies = language_filter(movies)
print(len(movies))

509430


In [19]:
import random
# random.sample needs a sequence: passing a set is deprecated since Python 3.9
# and raises TypeError from 3.11 on, so materialise the collection as a list first.
print(random.sample(list(movies), 10))

[('Brief Reunion', 2011), ('Delta Station', 2012), ('Barney: Furry Friends', 2010), ('Speed Dating Webseries', 2013), ('Faultless: The American Orphan', 2012), ('Strand', 2013), ('Second to None', 2010), ('What Should We Watch?', 2013), ('Edison Manor', 2014), ('The xXx Factor 2: The Next Level', 2005)]


Eliminate movies that do not have plots.

In [48]:
def parse_lines(lines):
    """Extract the movie key and plot text from the lines of one record.

    Lines starting with ``MV:`` are parsed with ``name_re`` (text from the
    fifth character on); when several parse successfully the last one wins.
    Lines starting with ``PL:`` contribute their text from the fifth character
    on. All other lines are ignored.

    Returns ``(movie, plot)`` where ``movie`` is a ``(title, year)`` tuple or
    ``None`` if no ``MV:`` line could be parsed, and ``plot`` is the ``PL:``
    fragments joined with single spaces (empty string when there are none).
    """
    movie = None
    plot_parts = []
    for line in lines:
        # The tag is the first three characters; the payload starts after the
        # separator character that follows it.
        tag, payload = line[:3], line[4:]
        if tag == 'MV:':
            match = name_re.search(payload)
            if match:
                movie = get_movie(match)
        elif tag == 'PL:':
            plot_parts.append(payload)
    return movie, ' '.join(plot_parts)

def plot_filter(movies, path = 'plot.list'):
    """Attach plot text to ``movies`` and drop the ones that have no plot record.

    ``movies`` is a collection of ``(title, year)`` keys. ``path`` is the plot
    list to read, decoded as latin-1; it defaults to ``'plot.list'``, the file
    this function always used.

    The file is split into records at lines consisting of exactly 79 dashes.
    Each record is handed to ``parse_lines``; plots of records that resolve to
    the same movie are concatenated with a single space. Records for which no
    movie could be parsed are skipped.

    Returns a dict ``{(title, year): plot}`` restricted to keys present in
    ``movies``.
    """
    movie_plots = {}
    delim = '-' * 79

    def record(lines):
        # Fold one record's plot into movie_plots, ignoring unparsable records.
        movie, plot = parse_lines(lines)
        if movie is None:
            return
        if movie in movie_plots:
            movie_plots[movie] = movie_plots[movie] + ' ' + plot
        else:
            movie_plots[movie] = plot

    lines = []
    with open(path, 'r', encoding = 'latin-1') as fp:
        for line in fp:
            # Compare without the line terminator so a delimiter on the very
            # last line (no trailing newline) is still recognised.
            if line.rstrip('\n') == delim:
                record(lines)
                lines = []
            else:
                lines.append(line.strip())
    # The last record is only followed by end-of-file when the list does not
    # end with a delimiter line; flush it so it is not silently dropped.
    record(lines)

    keys = movie_plots.keys() & movies
    return {k: movie_plots[k] for k in keys}

# plot_filter returns a dict keyed by (title, year), so from here on `movies`
# maps each remaining movie to its plot text; the count is the number of keys.
movies = plot_filter(movies)
print(len(movies))

147238


In [44]:
# Spot-check four randomly chosen movies: print each (title, year) key with its plot.
# list(movies) turns the dict's keys into a sequence that random.sample accepts.
for s in random.sample(list(movies),4):
    print(s,movies[s])

('Blindness', 2010) What does it mean to see? So many claim to be those who will see God; to be the ones that do all the right things, and so deserve heaven. Yet, what if that is actually an expression of blindness? What if the one who truly sees is the one who we least expect? Who doesn't look that fancy, but trusts in God's power to change them? 'Blindness' exposes our assumptions as we journey with a blind man who 'sees', desperate for those who are 'blind' to truly see.
('Airplanes', 2006) Jen Heck's Airplanes tells the love story of two teenage girls from the moment they first see each other to their eventual break-up. While an educational-style voice-over discusses the science and art of flight, the characters, like airplanes themselves, take off, struggle to fly, and eventually come in for an emergency landing.
('Melancholia', 2013) Eugene is just a melancholic character in a play - at least this is the idea of the author of the film. But the protagonist has his own ideas and st

Write the data out in JSON format.

In [39]:
import io, json

def write_movies(movies, path):
    """Write the ``{(title, year): plot}`` mapping to ``path`` as a JSON array.

    Each entry becomes an object with the keys ``title``, ``year`` and
    ``plot``. Non-ASCII characters are emitted as JSON unicode escapes, so the
    file is pure ASCII: it parses to the same data whether a reader decodes it
    as UTF-8 or as latin-1, and characters outside latin-1 no longer raise
    ``UnicodeEncodeError`` while writing.
    """
    # Build the payload before opening the file so a failure here does not
    # leave a truncated output file behind.
    data = [{'title':title,'year':year,'plot':plot} for (title,year),plot in movies.items()]
    with io.open(path, 'w', encoding='utf-8') as f:
        # json.dump escapes non-ASCII characters by default (ensure_ascii=True).
        json.dump(data, f)
        
# Persist the filtered {(title, year): plot} mapping to data.json in the working directory.
write_movies(movies, 'data.json')

Create a wrapper function for the whole pipeline.

In [50]:
def prepare_movies(path = None):
    """Run the full pipeline: load, filter by year, by language, then by plot.

    Every stage is called with its default arguments. When ``path`` is given
    the resulting ``{(title, year): plot}`` dict is also written there with
    ``write_movies``. The dict is returned in either case.
    """
    recent = year_filter(base_movies())
    in_language = language_filter(recent)
    movies = plot_filter(in_language)

    if path is not None:
        write_movies(movies, path)

    return movies