In [2]:
import re

# Pattern for one title entry in the .list files:
#   group 1 - the title, matched lazily, with an optional surrounding double quote left out;
#   group 2 - a run of digits enclosed in parentheses (used as the year by get_movie);
#   then any number of "(...)" or "{...}" annotations are skipped;
#   group 3 - the next run of word characters (may be empty).
# NOTE(review): a parenthesised year carrying a suffix such as "(2005/I)" does not
# satisfy `\((\d+?)\)` - confirm whether such entries are meant to be dropped.
name_re = re.compile(R'"?(.*?)"?\s*\((\d+?)\)\s*(?:\(.*?\)\s*|\{.*\}\s*)*\s*(\w*)',re.U)

def get_movie(regex_res):
    """Convert a regex match into a ``(title, year)`` tuple.

    Args:
        regex_res: a match object whose group 1 is the title and whose
            group 2 is a string of digits.

    Returns:
        tuple: ``(title, year)`` with the title unchanged and the year as int.
    """
    title = regex_res.group(1)
    year = int(regex_res.group(2))
    return title, year

def base_movies():
    """Collect every ``(title, year)`` pair that can be parsed from ``movies.list``.

    The file is read as latin-1, one line at a time; lines that ``name_re``
    does not match are ignored.

    Returns:
        set: unique ``(title, year)`` tuples as produced by ``get_movie``.
    """
    with open('movies.list', 'r', encoding = 'latin-1') as fp:
        matches = (name_re.search(line) for line in fp)
        return {get_movie(res) for res in matches if res}

# Parse movies.list into a set of (title, year) tuples and report how many were found.
movies = base_movies()
print(len(movies))

1253650


Eliminate movies released before 1990.

In [3]:
def year_filter(movies, year_after = 1990):
    """Keep only the movies whose year is at least ``year_after``.

    Args:
        movies: iterable of ``(title, year)`` pairs.
        year_after: inclusive lower bound on the year.

    Returns:
        set: the ``(title, year)`` tuples with ``year >= year_after``.
    """
    kept = set()
    for title, year in movies:
        if year >= year_after:
            kept.add((title, year))
    return kept

# Drop everything older than the default cut-off year (1990) and report the remaining count.
movies = year_filter(movies)
print(len(movies))

836953


Eliminate non-English movies.

In [4]:
def language_filter(movies, lang = 'English'):
    """Restrict ``movies`` to the titles that ``language.list`` tags with ``lang``.

    Each line of ``language.list`` (read as latin-1) is matched with
    ``name_re``; a line counts when group 3 of the match equals ``lang``.

    Args:
        movies: set of ``(title, year)`` tuples.
        lang: the word that must follow the title entry on the line.

    Returns:
        set: the intersection of ``movies`` with the titles found for ``lang``.
    """
    lang_movies = set()
    with open('language.list', 'r', encoding = 'latin-1') as fp:
        for line in fp:
            res = name_re.search(line)
            # group 3 is the first word after the title, year and annotations.
            if res and res.group(3) == lang:
                lang_movies.add(get_movie(res))
    return movies & lang_movies

# Keep only the titles tagged with the default language ('English') and report the remaining count.
movies = language_filter(movies)
print(len(movies))

509430


In [5]:
import random
# `movies` is a set here; random.sample() requires a sequence (sampling from a
# set was deprecated in Python 3.9 and raises TypeError from 3.11), so convert first.
print(random.sample(list(movies),10))

[('The Two Ravens', 2016), ('Eyes of Christmas', 2013), ('Sonny', 2011), ('Glitch', 2003), ('Goodbye Lullaby: Bonus DVD', 2011), ('Sex and the City: A Farewell', 2004), ('Broken Pieces', 2015), ('Forbidden Games', 1990), ('Live with John Legend', 2012), ('Jezebel', 2006)]


Eliminate movies that do not have plots.

In [6]:
def parse_lines(lines):
    """Extract the movie key and the plot text from one record's lines.

    Lines starting with ``MV:`` are parsed with ``name_re`` (after dropping the
    first four characters); if several match, the last one wins. Lines
    starting with ``PL:`` contribute their text after the first four
    characters. All other lines are ignored.

    Args:
        lines: iterable of already-stripped strings.

    Returns:
        tuple: ``(movie, plot)`` where ``movie`` is a ``(title, year)`` tuple or
        ``None`` when no ``MV:`` line matched, and ``plot`` is the ``PL:``
        fragments joined by single spaces.
    """
    movie = None
    plot_parts = []
    for line in lines:
        if line.startswith('MV:'):
            match = name_re.search(line[4:])
            if match:
                movie = get_movie(match)
            continue
        if line.startswith('PL:'):
            plot_parts.append(line[4:])
    return movie, ' '.join(plot_parts)

def plot_filter(movies):
    """Attach plot text to the movies that have an entry in ``plot.list``.

    ``plot.list`` (read as latin-1) is split into records on lines consisting
    of 79 dashes. Each record is handed to ``parse_lines``; plots belonging to
    the same ``(title, year)`` key are concatenated with a single space.

    Args:
        movies: collection of ``(title, year)`` tuples to keep.

    Returns:
        dict: ``{(title, year): {'plot': text}}`` for every key present both in
        ``movies`` and in ``plot.list``.
    """
    movie_plots = {}
    delim = '-' * 79

    def record(lines):
        # Store one record's plot; records without a parsable MV: line carry no key and are skipped.
        movie, plot = parse_lines(lines)
        if movie is None:
            return
        if movie in movie_plots:
            movie_plots[movie] = movie_plots[movie] + ' ' + plot
        else:
            movie_plots[movie] = plot

    lines = []
    with open('plot.list', 'r', encoding = 'latin-1') as fp:
        for line in fp:
            # Compare without the newline so a delimiter on the very last line still counts.
            if line.rstrip('\n') == delim:
                record(lines)
                lines = []
            else:
                lines.append(line.strip())
    # The final record is not necessarily followed by a delimiter line; flush it too.
    if lines:
        record(lines)

    keys = movie_plots.keys() & movies
    return {k: {'plot': movie_plots[k]} for k in keys}

# From here on `movies` is a dict keyed by (title, year) whose values hold the plot text.
movies = plot_filter(movies)
print(len(movies))

147238


In [7]:
# Print four randomly chosen (title, year) keys together with their plot entries.
for s in random.sample(list(movies),4):
    print(s,movies[s])

('I Am a Man', 2010) {'plot': "Personal. Humorous. Artistic. 'I Am A Man' is a personal documentary that focuses on the difficulties of being a single male in contemporary American society through the eyes of Renowned artist Charly Palmer and filmmaker Justin J.Jackson. 'I Am A Man' explores what it is to be a man through two drastically different individuals living in present day America."}
('Noam Chomsky: Rebel Without a Pause', 2003) {'plot': "Linguist, intellectual and activist, Noam Chomsky discusses and reflects on the state of world events including the War in Iraq, September 11th, the War on Terror, Media Manipulation and Control, Social Activism, Fear, and American Foreign Policy in both large forums and in small interactive discussions with other intellectuals, activists, fans, students and critics. Interwoven, is Dr. Carol Chomsky, Noam's wife and manager who reflects on what drives Noam and what life is like with him. Other candid reflections about Noam Chomsky and his thou

Eliminate movies that have no cast information (first actresses, then actors), then write the data out in JSON format.

In [8]:
def cast_filter(movies, file_name = 'actresses'):
    """Attach cast names from ``<file_name>.list`` and keep the movies found there.

    The file is read as latin-1. A line matching ``cast_re`` starts a new
    performer (the text before the tab run, stripped) and names their first
    title; any other line matching ``film_re`` is a further title credited to
    the most recently seen performer.

    Note that the inner dicts of ``movies`` are modified in place: the
    performer's name is appended to the ``'cast'`` list of every credited
    movie (the list is created on first use).

    Args:
        movies: dict mapping ``(title, year)`` to a dict of movie attributes.
        file_name: path of the cast file without its ``.list`` extension.

    Returns:
        dict: the entries of ``movies`` that were credited at least once in
        this file, in the iteration order of ``movies``.
    """
    title_pattern = R'"?(.*?)"?\s*\((\d+?)\)\s*(?:\(.*?\)\s*|\{.*\}\s*)*\s*(\w*)'
    # NOTE(review): cast_re only recognises a performer line when the name contains a
    # comma; a name without one falls through to film_re and its titles are credited
    # to the previous performer - confirm whether that is intended.
    cast_re = re.compile(R'([^\t]+?,.*)\t+' + title_pattern, re.U)
    film_re = re.compile(R'\t+' + title_pattern, re.U)

    credited = set()
    current_cast = ''
    with open(file_name+'.list','r',encoding = 'latin-1') as fp:
        for line in fp:
            res = cast_re.search(line)
            if res:
                current_cast = res.group(1).strip()
                mov = (res.group(2), int(res.group(3)))
            else:
                res = film_re.search(line)
                if not res:
                    continue
                mov = (res.group(1), int(res.group(2)))

            # Only titles we already track matter; remembering the rest would just cost memory.
            if mov in movies:
                movies[mov].setdefault('cast', []).append(current_cast)
                credited.add(mov)
    return {k: v for k, v in movies.items() if k in credited}

# First cast pass: uses the default file_name, i.e. actresses.list.
movies = cast_filter(movies)

In [11]:
# Report how many movies are currently held in `movies`.
print(len(movies))

102783


In [12]:
# Second cast pass over actors.list; names are appended to the existing 'cast' lists.
movies = cast_filter(movies, file_name = 'actors')
print(len(movies))

97880


In [13]:
import io, json

def write_movies(movies, path):
    """Serialise ``movies`` as a JSON array and write it to ``path``.

    Every entry becomes an object with the keys ``title``, ``year``, ``plot``
    and ``cast``. Non-ASCII characters are written literally and the file is
    encoded as latin-1.

    Args:
        movies: dict mapping ``(title, year)`` to a dict holding ``'plot'``
            and ``'cast'``.
        path: destination file; it is overwritten.
    """
    with io.open(path, 'w', encoding='latin-1') as f:
        records = []
        for (title, year), val in movies.items():
            records.append({
                'title': title,
                'year': year,
                'plot': val['plot'],
                'cast': val['cast'],
            })
        f.write(json.dumps(records, ensure_ascii=False))
        
# Persist the filtered movies (title, year, plot, cast) to data.json.
write_movies(movies, 'data.json')

Create a wrapper function for the whole pipeline.

In [14]:
def prepare_movies(path = None):
    """Run the whole filtering pipeline and optionally save the result.

    The stages run in the same order as the individual notebook cells:
    base titles, year filter, language filter, plot filter, then the cast
    filter over ``actresses.list`` followed by ``actors.list`` (so actresses
    come first in each ``'cast'`` list).

    Args:
        path: if not ``None``, the result is also written there with
            ``write_movies``.

    Returns:
        dict: the movies surviving every filter, keyed by ``(title, year)``.
    """
    titles = base_movies()
    recent = year_filter(titles)
    english = language_filter(recent)
    with_plots = plot_filter(english)
    with_actresses = cast_filter(with_plots)
    movies = cast_filter(with_actresses, 'actors')

    if path is not None:
        write_movies(movies, path)

    return movies
