In [18]:
import re

# Pattern shared by the list parsers below:
#   group 1 - title text (optionally wrapped in double quotes)
#   group 2 - digits inside the first "(...)" that follows the title
#   then any number of "(...)" / "{...}" qualifiers are skipped
#   group 3 - the next run of word characters (may be empty)
name_re = re.compile(
    r'"?(.*?)"?\s*\((\d+?)\)\s*(?:\(.*?\)\s*|\{.*\}\s*)*\s*(\w*)',
    flags=re.U,
)

def get_movie(regex_res):
    """Turn a regex match into a ``(title, year)`` key.

    Group 1 of the match is taken as the title (surrounding whitespace
    removed) and group 2 is converted to ``int`` as the year.
    """
    raw_title, raw_year = regex_res.group(1, 2)
    title = raw_title.strip()
    year = int(raw_year)
    return title, year

def base_movies():
    """Collect every ``(title, year)`` pair found in ``movies.list``.

    The file is read as latin-1, one line at a time; each line that
    matches ``name_re`` contributes one ``get_movie`` tuple.  Duplicates
    collapse because the result is a set.
    """
    found = set()
    with open('movies.list', 'r', encoding='latin-1') as handle:
        for raw_line in handle:
            match = name_re.search(raw_line)
            if match is None:
                continue
            found.add(get_movie(match))
    return found

# Build the starting set of (title, year) pairs and report how many there are.
movies = base_movies()
print(len(movies))

1253650


Eliminate movies released before 1990.

In [19]:
def year_filter(movies, year_after = 1990):
    """Return the ``(title, year)`` pairs whose year is at least ``year_after``.

    ``movies`` is any iterable of two-element tuples; the result is a new set
    and the input is left untouched.
    """
    return {
        (title, year)
        for title, year in movies
        if not year < year_after
    }

# Apply the default cut-off (year >= 1990) and report the remaining count.
movies = year_filter(movies)
print(len(movies))

836953


Eliminate non-English movies.

In [20]:
def language_filter(movies, lang = 'English'):
    """Keep only the movies that ``language.list`` tags with ``lang``.

    Every line of ``language.list`` (read as latin-1) is matched against
    ``name_re``; when the trailing word (group 3) equals ``lang`` the line's
    ``(title, year)`` pair is remembered.

    Parameters
    ----------
    movies : set of (title, year) tuples
    lang : str
        Exact, case-sensitive language word to look for.

    Returns
    -------
    set
        The intersection of ``movies`` with the pairs found for ``lang``.
    """
    lang_movies = set()
    with open('language.list', 'r', encoding = 'latin-1') as fp:
        for line in fp:
            res = name_re.search(line)
            if res and res.group(3) == lang:
                lang_movies.add(get_movie(res))
    return movies & lang_movies

# Restrict to the default language ('English') and report the remaining count.
movies = language_filter(movies)
print(len(movies))

509430


In [21]:
import random
# `movies` is still a set here. random.sample() requires a sequence (sets are
# rejected with TypeError on Python 3.11+), so materialise a list first.
print(random.sample(list(movies),10))

[("Annie's Bar", 1996), ('No Reply', 2006), ('Scorsese on Imamura', 2007), ('Grammy', 2015), ('NOOMA Shells 020', 2007), ('Lab Art Show', 2015), ('Mofos: Real Slut Party 9', 2012), ('No Parking', 2011), ('Ménage à Moi', 2016), ('Forged', 2007)]


Eliminate movies that do not have plots.

In [22]:
def parse_lines(lines):
    """Extract the movie key and plot text from one record's lines.

    Lines beginning with ``MV:`` are parsed with ``name_re`` (skipping the
    first four characters); the last one that matches decides the movie.
    Lines beginning with ``PL:`` contribute their text after the first four
    characters.  All other lines are ignored.

    Returns ``(movie, plot)`` where ``movie`` is a ``(title, year)`` tuple or
    ``None`` when no ``MV:`` line matched, and ``plot`` is the ``PL:``
    fragments joined with single spaces.
    """
    found_movie = None
    fragments = []
    for entry in lines:
        tag, payload = entry[:3], entry[4:]
        if tag == 'MV:':
            match = name_re.search(payload)
            if match is not None:
                found_movie = get_movie(match)
        elif tag == 'PL:':
            fragments.append(payload)
    plot_text = ' '.join(fragments)
    return found_movie, plot_text

def plot_filter(movies):
    """Keep only movies that have a plot in ``plot.list`` and attach it.

    ``plot.list`` (read as latin-1) is split into records by lines made of
    exactly 79 dashes.  Each record is handed to ``parse_lines``; plots that
    resolve to the same ``(title, year)`` are concatenated with a space.

    Records for which ``parse_lines`` finds no movie (for example whatever
    precedes the first delimiter) are skipped instead of being accumulated
    under a ``None`` key, and the trailing record is processed even when the
    file does not end with a delimiter line.

    Parameters
    ----------
    movies : collection of (title, year) tuples

    Returns
    -------
    dict
        ``{(title, year): {'plot': text}}`` for every movie of ``movies``
        that has a plot.
    """
    delim = '-' * 79
    movie_plots = {}

    def _store(record_lines):
        # Parse one record and merge its plot into movie_plots.
        movie, plot = parse_lines(record_lines)
        if movie is None:
            return
        if movie in movie_plots:
            movie_plots[movie] = movie_plots[movie] + ' ' + plot
        else:
            movie_plots[movie] = plot

    lines = []
    with open('plot.list', 'r', encoding = 'latin-1') as fp:
        for line in fp:
            # Compare without the newline so a delimiter on the very last
            # line (no trailing '\n') is still recognised.
            if line.rstrip('\n') == delim:
                _store(lines)
                lines = []
            else:
                lines.append(line.strip())
    # Flush whatever follows the final delimiter.
    _store(lines)

    keys = movie_plots.keys() & movies
    return {k: {'plot': movie_plots[k]} for k in keys}

# From here on `movies` is a dict keyed by (title, year), not a set.
movies = plot_filter(movies)
print(len(movies))

147238


In [24]:
# Spot-check: print four randomly chosen keys together with their records.
for s in random.sample(list(movies),4):
    print(s,movies[s])

('Daedalus', 2012) {'plot': "Daedalus is the story of a man who after an accident discovers that he's being hunted by a hit-man. A stranger offers to help, but in return requires him to do a job as an assassin. In a twisted turn of events, Daedalus discovers the truth."}
("I Don't Want to Kill Myself", 2011) {'plot': "When James fakes a suicide attempt to get his friend out of a DUI, he is sentenced to an out-patient recovery group for suicidal people, where he must convince everyone, including himself, that he doesn't want to kill himself."}
('Modern Music', 2013) {'plot': 'In Modern Music we follow an ad-hoc family (label, band, and management) as they navigate the ever changing music business. The results are a hilarious look at a struggling art form and the inter-personal relationships of the people who keep it alive.'}
('Louie Anderson Presents', 2011) {'plot': 'Louie Anderson is one of the most versatile and successful comedians working in Hollywood today, loved by the television

In [35]:
def genre_filter(movies, header_lines = 380):
    """Attach genres from ``genres.list`` and keep only movies that got one.

    The first ``header_lines`` lines of the file are skipped without being
    parsed.  Every following line is matched with a variant of the shared
    title/year pattern whose last group also accepts hyphens; that last group
    is recorded as a genre.

    Parameters
    ----------
    movies : dict
        ``{(title, year): record_dict}``.  Matching records are updated in
        place: a ``'genres'`` set is created or extended.
    header_lines : int
        Number of leading lines to skip (default 380, the value that was
        previously hard-coded).

    Returns
    -------
    dict
        The entries of ``movies`` that appeared in ``genres.list``.
    """
    genre_re = re.compile(R'"?(.*?)"?\s*\((\d+?)\)\s*(?:\(.*?\)\s*|\{.*\}\s*)*\s*([\w-]*)', re.U)
    tagged = set()
    with open('genres.list','r',encoding = 'latin-1') as fp:
        # Consume the preamble; stop early if the file is shorter than that.
        for _ in range(header_lines):
            if not fp.readline():
                break
        for line in fp:
            res = genre_re.search(line)
            if not res:
                continue
            mov = get_movie(res)
            if mov in movies:
                movies[mov].setdefault('genres', set()).add(res.group(3))
                tagged.add(mov)

    return {k: v for k, v in movies.items() if k in tagged}
    
# Add a 'genres' set to each record and drop movies absent from genres.list.
movies = genre_filter(movies)
print(len(movies))

144988


In [36]:
def cast_filter(movies, file_name = 'actresses'):
    """Attach cast names from ``<file_name>.list`` and keep credited movies.

    The file (read as latin-1) is scanned line by line.  A line whose text
    before the first tab is non-empty starts a new performer; that text is
    the performer's name.  Every line, whether it starts a performer or is a
    tab-indented continuation, may carry one film credit, parsed with the
    title/year pattern below.

    Compared with the previous implementation:

    * the performer name no longer depends on the line containing a comma or
      a numeric year, so credits of single-name performers are not assigned
      to the preceding performer;
    * titles are stripped, matching what ``get_movie`` produces for the keys
      of ``movies``;
    * credits seen before any performer name are ignored rather than being
      stored under an empty name.

    Parameters
    ----------
    movies : dict
        ``{(title, year): record_dict}``.  Matching records are updated in
        place: a ``'cast'`` set is created or extended.
    file_name : str
        Path of the list file without its ``.list`` extension.

    Returns
    -------
    dict
        The entries of ``movies`` that received at least one credit from
        this file.
    """
    film_re = re.compile(R'\t+"?(.*?)"?\s*\((\d+?)\)\s*(?:\(.*?\)\s*|\{.*\}\s*)*\s*(\w*)', re.U)
    credited = set()
    current_cast = ''
    with open(file_name+'.list','r',encoding = 'latin-1') as fp:
        for line in fp:
            # Text before the first tab (if any) names a new performer;
            # continuation lines start with a tab, so `head` is empty there.
            head, tab, _ = line.partition('\t')
            if tab and head.strip():
                current_cast = head.strip()

            res = film_re.search(line)
            if res is None or not current_cast:
                continue

            mov = (res.group(1).strip(), int(res.group(2)))
            if mov in movies:
                movies[mov].setdefault('cast', set()).add(current_cast)
                credited.add(mov)
    return {k: v for k, v in movies.items() if k in credited}

# First cast pass: uses the default file_name ('actresses' -> actresses.list).
movies = cast_filter(movies)
print(len(movies))

101278


In [37]:
# Second cast pass over actors.list; movies with no credit there are dropped as well.
movies = cast_filter(movies, file_name = 'actors')
print(len(movies))

96585


Write the data out in JSON format.

In [38]:
import io, json

def write_movies(movies, path):
    """Serialise ``movies`` to ``path`` as a JSON array.

    Each ``(title, year) -> record`` entry becomes one object with the keys
    ``title``, ``year``, ``plot``, ``cast`` and ``genres``; the ``cast`` and
    ``genres`` collections are converted to lists.  The file is written with
    latin-1 encoding and non-ASCII characters are emitted unescaped.
    """
    with open(path, 'w', encoding='latin-1') as out:
        records = []
        for (title, year), info in movies.items():
            record = {
                'title': title,
                'year': year,
                'plot': info['plot'],
                'cast': list(info['cast']),
                'genres': list(info['genres']),
            }
            records.append(record)
        serialised = json.dumps(records, ensure_ascii=False)
        out.write(serialised)
        
# Persist the fully filtered data set to data.json in the working directory.
write_movies(movies, 'data.json')

Create a wrapper function for the whole pipeline.

In [39]:
def prepare_movies(path = None):
    """Run the complete filtering pipeline and optionally save the result.

    Stages, in order: ``base_movies`` -> ``year_filter`` ->
    ``language_filter`` -> ``plot_filter`` -> ``genre_filter`` ->
    ``cast_filter`` on ``'actors'`` -> ``cast_filter`` on its default file.
    All filters run with their default arguments apart from the explicit
    ``'actors'`` file name.

    When ``path`` is given the result is also written there with
    ``write_movies``.  The resulting dict is returned in either case.
    """
    candidates = base_movies()
    candidates = year_filter(candidates)
    candidates = language_filter(candidates)
    with_plots = plot_filter(candidates)
    with_genres = genre_filter(with_plots)
    with_actors = cast_filter(with_genres, 'actors')
    movies = cast_filter(with_actors)

    if path is None:
        return movies

    write_movies(movies, path)
    return movies
