In [49]:
import re

# Shared pattern for one line of the IMDb-style *.list files used in this notebook.
#   group 1: the title; an optional pair of surrounding double quotes is left out
#   group 2: the first parenthesised token made only of digits, '/', 'I', 'V', 'X'
#            or '?' -- later cells treat it as the year (e.g. '2004', '2015/II')
#   after that, any number of '(...)' or '{...}' annotations are skipped
#   group 3: the run of word characters that follows (may be empty);
#            language_filter compares it with the requested language
name_re = re.compile(R'"?(.*?)"?\s*\(([\d/IVX?]+?)\)\s*(?:\(.*?\)\s*|\{.*\}\s*)*\s*(\w*)',re.U)

def get_movie(regex_res):
    """Turn a regex match into the ``(title, year)`` key used across the notebook.

    Args:
        regex_res: a match object whose group 1 holds the title and whose
            group 2 holds the year token.

    Returns:
        A 2-tuple of strings with surrounding whitespace removed from both.
    """
    title, year = (regex_res.group(index).strip() for index in (1, 2))
    return title, year

def base_movies():
    """Collect the ``(title, year)`` key of every parsable line in ``movies.list``.

    The file is read as Latin-1 from the current working directory. Lines
    that ``name_re`` cannot match are ignored.

    Returns:
        A set of ``(title, year)`` string tuples as produced by ``get_movie``.
    """
    with open('movies.list', 'r', encoding = 'latin-1') as fp:
        matches = (name_re.search(row) for row in fp)
        return {get_movie(found) for found in matches if found}

# Build the initial set of (title, year) keys from movies.list and report its size.
movies = base_movies()
print(len(movies))

1338639


Eliminate movies released before 1990.

In [50]:
def year_filter(movies, year_after = 1990):
    """Keep the movies whose year is ``year_after`` or later.

    Only the first four characters of the year token are interpreted, so a
    token such as ``'2015/II'`` counts as 2015. Entries whose year prefix is
    not an integer (for example ``'????'``) are dropped silently.

    Args:
        movies: iterable of ``(title, year)`` pairs, with ``year`` a string.
        year_after: smallest year (inclusive) that is kept.

    Returns:
        A new set of ``(title, year)`` tuples.
    """
    def _recent_enough(year):
        # A year token that does not start with a number disqualifies the entry.
        try:
            return int(year[0:4]) >= year_after
        except ValueError:
            return False

    return {(title, year) for title, year in movies if _recent_enough(year)}

# Keep only the titles dated 1990 or later (year_filter's default threshold).
movies = year_filter(movies)
print(len(movies))

883181


Eliminate non-English movies.

In [51]:
def language_filter(movies, lang = 'English'):
    """Keep the movies that ``language.list`` tags with the language ``lang``.

    Each line of ``language.list`` (read as Latin-1 from the current working
    directory) is matched with ``name_re``. The language compared against
    ``lang`` is group 3 of that pattern, i.e. only the first run of word
    characters after the title, year and annotations. A multi-word
    ``lang`` therefore never matches.

    Args:
        movies: set of ``(title, year)`` tuples.
        lang: language name to keep.

    Returns:
        The intersection of ``movies`` with the titles tagged as ``lang``.
    """
    lang_movies = set()
    with open('language.list', 'r', encoding = 'latin-1') as fp:
        for line in fp:
            res = name_re.search(line)
            if res and res.group(3) == lang:
                lang_movies.add(get_movie(res))
    return movies & lang_movies

# Intersect with the titles that language.list tags as English (the default lang).
movies = language_filter(movies)
print(len(movies))

544791


In [52]:
import random
# `movies` is a set here. random.sample() requires a sequence: passing a set was
# deprecated in Python 3.9 and raises TypeError from 3.11 on, so convert first.
print(random.sample(list(movies),10))

[("Russell Coight's Celebrity Challenge", '2004'), ('The Savior Is Born', '1992'), ("'Til Death", '2015/II'), ('Savage Wild', '2009'), ('The Casting Call', '2015'), ('The Dark', '2008'), ('Sports Bloopers Encyclopedia', '1996'), ('Thou Shalt Laugh 5', '2010'), ('Stages', '2016'), ('Basques in the West', '2012')]


Eliminate movies that do not have plots.

In [53]:
def parse_lines(lines):
    """Extract the movie key and the plot text from one record of ``plot.list``.

    Lines starting with ``MV:`` are matched with ``name_re`` (if several
    match, the last one wins). The text of every line starting with ``PL:``
    is collected in order. All other lines are ignored.

    Args:
        lines: iterable of already-stripped text lines belonging to one record.

    Returns:
        ``(movie, plot)`` where ``movie`` is a ``(title, year)`` tuple, or
        ``None`` when no ``MV:`` line matched, and ``plot`` is the collected
        plot text joined with single spaces (``''`` when there is none).
    """
    movie_key = None
    plot_parts = []
    for text in lines:
        tag, payload = text[:3], text[4:]
        if tag == 'MV:':
            found = name_re.search(payload)
            if found is not None:
                movie_key = get_movie(found)
        elif tag == 'PL:':
            plot_parts.append(payload)
    return movie_key, ' '.join(plot_parts)

def plot_filter(movies):
    """Keep the movies that have an entry in ``plot.list`` and attach their plot.

    ``plot.list`` (read as Latin-1 from the current working directory) is
    split into records at lines consisting of exactly 79 dashes. Each record
    is handed to ``parse_lines``. Plots of records that resolve to the same
    ``(title, year)`` key are concatenated with a single space.

    Records without a parsable ``MV:`` line are skipped, and a final record
    that is not followed by a delimiter line is still processed.

    Args:
        movies: set of ``(title, year)`` tuples.

    Returns:
        A dict mapping each kept ``(title, year)`` key to ``{'plot': text}``.
    """
    movie_plots = {}

    def _store(record_lines):
        # Fold one delimiter-separated record into movie_plots.
        movie, plot = parse_lines(record_lines)
        if movie is None:
            # Nothing identifiable in this chunk (file header, empty record, ...).
            return
        if movie in movie_plots:
            movie_plots[movie] = movie_plots[movie] + ' ' + plot
        else:
            movie_plots[movie] = plot

    delim = '-'*79+'\n'
    lines = []
    with open('plot.list', 'r', encoding = 'latin-1') as fp:
        for line in fp:
            if line == delim:
                _store(lines)
                lines = []
            else:
                lines.append(line.strip())
    # Do not lose the last record when the file does not end with a delimiter.
    _store(lines)

    keys = movie_plots.keys() & movies
    return {k:{'plot':movie_plots[k]} for k in keys}

# From here on `movies` is a dict keyed by (title, year) holding a 'plot' entry per movie.
movies = plot_filter(movies)
print(len(movies))

157526


In [54]:
# Spot-check four random entries: print each key next to its {'plot': ...} record.
for s in random.sample(list(movies),4):
    print(s,movies[s])

('Cubanoson: The Story', '2011') {'plot': '"Cubanoson: The Story" is a short documentary on Papo Ortega\'s Cubanoson, New York\'s Cuban Orchestra. The film is presented by MeLu Films and directed by Lucio Fernandez. It is a behind the scenes look at what makes this orchestra so unique in the world of Latin music. The film features interviews with all of Cubanoson\'s musicians; with music by Papo Ortega as well as a dynamic live performance by Cubanoson of Guillermo Portabales\' classic "El Carretero". The film is produced by MeLu Films in association with Cubanoson Productions and El Gitano Discreto Productions. The film is dedicated to all who love Latin music and particularly Cuban music.'}
('Belgrade Pride', '1997') {'plot': 'In June 2001 Belgrade lesbians, gay men, bisexuals and trans people organized a Pride event in celebration of the new era of democracy and tolerance that was believed to have come to Serbia. About 40 people came to celebrate. Several thousand counter-demonstrat

In [55]:
def genre_filter(movies, header_lines = 380):
    """Attach genres from ``genres.list`` and keep only the movies listed there.

    ``genres.list`` is read as Latin-1 from the current working directory.
    The first ``header_lines`` lines are skipped before parsing starts.
    Every following line that yields a title, a year and a non-empty genre
    adds that genre to the movie's ``'genres'`` set.

    Note: the per-movie dicts inside ``movies`` are updated in place (the
    ``'genres'`` key is created or extended). The returned dict shares them.

    Args:
        movies: dict mapping ``(title, year)`` to a per-movie dict.
        header_lines: number of leading lines of ``genres.list`` to skip
            (defaults to 380, the value previously hard-coded here).

    Returns:
        A new dict restricted to the movies that received at least one genre
        line in this call.
    """
    # Same shape as name_re, but the trailing group also accepts '-' so that
    # hyphenated genre names are captured whole.
    genre_re = re.compile(R'"?(.*?)"?\s*\(([\d/IVX?]+?)\)\s*(?:\(.*?\)\s*|\{.*\}\s*)*\s*([\w-]*)', re.U)
    matched = set()
    with open('genres.list','r',encoding = 'latin-1') as fp:
        for _ in range(header_lines):
            if not fp.readline():
                break  # file is shorter than the header
        for line in fp:
            res = genre_re.search(line)
            if not res:
                continue
            genre = res.group(3)
            if not genre:
                # A title/year with nothing after it carries no genre information.
                continue
            mov = get_movie(res)
            if mov in movies:
                movies[mov].setdefault('genres', set()).add(genre)
                matched.add(mov)

    return {k:v for k,v in movies.items() if k in matched}
    
# Attach genres and keep only the movies that appear in genres.list.
movies = genre_filter(movies)
print(len(movies))

155241


In [56]:
def cast_filter(movies, file_name = 'actresses'):
    """Attach cast names from ``<file_name>.list`` and keep only credited movies.

    The file is read as Latin-1. Two line shapes are recognised:

    * ``Name<TAB(s)>Title (year) ...`` starts a new performer and credits
      them with the title on the same line;
    * ``<TAB(s)>Title (year) ...`` credits the most recent performer.

    The performer name is everything before the first tab, so names without
    a comma are recognised as well instead of having their credits assigned
    to the preceding performer. Credit lines seen before any performer name
    are ignored.

    Note: the per-movie dicts inside ``movies`` are updated in place (the
    ``'cast'`` key is created or extended). The returned dict shares them.

    Args:
        movies: dict mapping ``(title, year)`` to a per-movie dict.
        file_name: path of the list file without its ``.list`` extension.

    Returns:
        A new dict restricted to the movies credited at least once in this file.
    """
    # group 1: performer name (empty on continuation lines), group 2: title,
    # group 3: year token. Anchored at the start of the line so that a tab
    # further right can never be mistaken for the start of a credit.
    credit_re = re.compile(R'^([^\t]*)\t+"?(.*?)"?\s*\(([\d/IVX?]+?)\)', re.U)
    matched = set()
    current_cast = ''
    with open(file_name+'.list','r',encoding = 'latin-1') as fp:
        for line in fp:
            res = credit_re.match(line)
            if not res:
                continue
            name = res.group(1).strip()
            if name:
                current_cast = name
            if not current_cast:
                # Credit-looking line before any performer: nothing to attribute it to.
                continue
            mov = (res.group(2).strip(), res.group(3).strip())
            if mov in movies:
                movies[mov].setdefault('cast', set()).add(current_cast)
                matched.add(mov)
    return {k:v for k,v in movies.items() if k in matched}

# First cast pass uses the default file, actresses.list; movies not credited there are dropped.
movies = cast_filter(movies)
print(len(movies))

108804


In [57]:
# Second cast pass over actors.list; movies that are not credited in that file are dropped too.
movies = cast_filter(movies, file_name = 'actors')
print(len(movies))

103798


Write the data out in JSON format.

In [58]:
import io, json

def write_movies(movies, path):
    """Serialise the filtered movies to ``path`` as a JSON array.

    Each array element is an object with the keys ``title``, ``year``,
    ``plot``, ``cast`` and ``genres``. Movies are written in sorted
    ``(title, year)`` order and the cast/genre sets as sorted lists, so the
    same input always produces the same file.

    The file is encoded as Latin-1 with non-ASCII characters written
    verbatim (``ensure_ascii=False``). Readers must open it with
    ``encoding='latin-1'``, and a character outside Latin-1 raises
    ``UnicodeEncodeError``.

    Args:
        movies: dict mapping ``(title, year)`` to a dict holding ``'plot'``
            (str), ``'cast'`` and ``'genres'`` (iterables of str).
        path: destination file path; an existing file is overwritten.

    Raises:
        KeyError: if a movie lacks ``'plot'``, ``'cast'`` or ``'genres'``.
    """
    data = [
        {'title': title, 'year': year, 'plot': val['plot'],
         'cast': sorted(val['cast']), 'genres': sorted(val['genres'])}
        for (title, year), val in sorted(movies.items(), key=lambda item: item[0])
    ]
    with io.open(path, 'w', encoding='latin-1') as f:
        json.dump(data, f, ensure_ascii=False)
        
# Persist the fully filtered movies to data.json in the working directory.
write_movies(movies, 'data.json')

Create a wrapper function that runs the whole pipeline.

In [59]:
def prepare_movies(path = None):
    """Run every filtering stage in order and optionally save the result.

    Stages: base list, year filter, language filter, plot filter, genre
    filter, then the cast filter twice (first with its default file, then
    with ``'actors'``). All stages use their default settings.

    Args:
        path: when not ``None``, the result is also written there with
            ``write_movies``.

    Returns:
        The dict of movies that survived every stage.
    """
    candidates = base_movies()
    recent = year_filter(candidates)
    in_language = language_filter(recent)
    with_plot = plot_filter(in_language)
    with_genres = genre_filter(with_plot)
    with_actresses = cast_filter(with_genres)
    movies = cast_filter(with_actresses, 'actors')

    if path is not None:
        write_movies(movies, path)

    return movies
