# WIKIPEDIA DATASET

In [None]:
import json
import re
import numpy as np
import urllib
import urllib2
from IPython.display import Image

The code in this notebook downloads movie data from Wikipedia and builds a dictionary from it, in order to obtain a clean dataset.

Movies since 1989 are obtained from specific Wikipedia pages (titled xxxx_in_film, where xxxx is the year), since from that year onward all the pages follow a similar structure. Each of these Wikipedia pages contains tables listing the movies of that year, split by the quarter of release. From these tables, we extract the title and the genre (except for movies from 2004 to 2011 that don't have this info). The other information is obtained from the infobox that each movie has on its Wikipedia page.

## Defined functions

In [None]:
# Download a wikipedia page
def download_wiki(name):
    """Download the wikitext of an English Wikipedia page through the MediaWiki API.

    Parameters
    ----------
    name : unicode or str
        Page title. It is placed into the query URL as given (only UTF-8
        encoded), so the caller is responsible for percent-encoding any
        special characters.

    Returns
    -------
    unicode or None
        The content of the first revision returned by the API, or None when
        the response does not contain any page content (a message is printed
        in that case).
    """
    # set the parameters (explained in detail here https://www.mediawiki.org/wiki/API:Tutorial)
    baseurl = "https://en.wikipedia.org/w/api.php?"
    action = "action=query"
    title = "titles="
    content = "prop=revisions&rvprop=content"
    dataformat = "format=json"

    # construct the query
    query = "%s%s&%s%s&%s&%s" % (baseurl,action,title,name.encode('utf-8'),content,dataformat)
    print(query)

    # Always release the connection, even if reading the body fails.
    wikiresponse = urllib2.urlopen(query)
    try:
        wikisource = wikiresponse.read()
    finally:
        wikiresponse.close()
    wikijson = json.loads(wikisource)

    try:
        pages = wikijson["query"]["pages"]
        # The pages are keyed by page id; a single title was requested, so
        # the first (only) entry is the one we want.
        first_key = list(pages.keys())[0]
        text = pages[first_key]["revisions"][0]["*"]
    except (KeyError, IndexError):
        # KeyError: "query", "revisions" or "*" is missing from the response.
        # IndexError: the "pages" or "revisions" collection is empty.
        print("Invalid wikipage name or page does not exist")
        return None
    return text

In [None]:
# Returns the link part of a wikipedia link
def get_movie_link(f):
    """Return the target (link part) of the first wikilink found in ``f``.

    For ``[[target]]`` and ``[[target|label]]`` the result is ``target`` with
    spaces replaced by underscores, encoded as UTF-8.

    Parameters
    ----------
    f : unicode
        Wikitext fragment expected to contain a ``[[...]]`` link.

    Returns
    -------
    str
        The UTF-8 encoded link target, or "" (after printing a message) when
        ``f`` contains no link or the link has an empty target.
    """
    error_str = "Movie without wikipage"

    try:
        # Text between the first "[[" and the "]]" that follows it.
        movie_link = re.search(r'\[\[(.+?)\]\]', f).group(1).replace(" ", "_")
    except (AttributeError, TypeError):
        # AttributeError: no "[[...]]" in f (re.search returned None).
        # TypeError: f is not a string at all (e.g. None).
        print(error_str)
        return ""

    # In "[[target|label]]" the page title is on the LEFT of the first "|".
    movie_link = movie_link.partition("|")[0]
    if not movie_link:
        # "[[|label]]" has no target to link to.
        print(error_str)
        return ""

    return movie_link.encode('utf-8')

In [None]:
# Returns the label part of a wikipedia link.
def get_link_name(f, must_have_link = True):
    """Return the label (displayed text) of the first wikilink found in ``f``.

    For ``[[target|label]]`` the result is ``label``; for ``[[target]]`` it is
    ``target``. The result is encoded as UTF-8.

    Parameters
    ----------
    f : unicode
        Wikitext fragment.
    must_have_link : bool, optional
        Controls the result when no usable link label is found: when true
        (default) "" is returned, otherwise ``f`` itself is returned, UTF-8
        encoded.

    Returns
    -------
    str
        The UTF-8 encoded label, or the fallback described above.
    """
    match = re.search(r'\[\[(.+?)\]\]', f)
    link_name = match.group(1) if match is not None else ""

    # In "[[target|label]]" the label is everything after the first "|".
    if '|' in link_name:
        link_name = link_name.partition("|")[2]

    if link_name:
        return link_name.encode('utf-8')

    # No link in f, or a link with an empty label ("[[target|]]").
    if not must_have_link:
        return f.encode("utf-8")
    return ""

In [None]:
# Tries to clean "string" with the regular expression "regex". If "regex" doesn't match the "string", the same string is returned
def clean_string_regex(string, regex):
    """Return the first capture group of ``regex`` found in ``string``.

    The search is done with ``re.DOTALL`` so "." also matches newlines.

    Parameters
    ----------
    string : str
        Text to clean.
    regex : str
        Pattern whose first group delimits the part of ``string`` to keep.

    Returns
    -------
    str
        The text captured by group 1, or ``string`` unchanged when the
        pattern does not match or defines no capture group.
    """
    match = re.search(regex, string, re.DOTALL)
    # No match, or nothing to extract because the pattern has no group:
    # hand the input back untouched.
    if match is None or match.re.groups < 1:
        return string
    return match.group(1)

In [None]:
# Returns the image from the infobox of the movie wiki
def get_cover_link(link, img_name):
    """Return the URL of the image named ``img_name`` used on a Wikipedia page.

    The images of the page are listed through the MediaWiki API and the
    first URL hosted under ``upload.wikimedia.org/wikipedia/en`` whose text
    contains ``img_name`` (case-insensitively) is returned.

    Parameters
    ----------
    link : unicode or str
        Page title, already percent-encoded by the caller; it is placed into
        the query URL as given (only UTF-8 encoded).
    img_name : str
        Image file name as written in the infobox, with spaces already
        replaced by underscores.

    Returns
    -------
    str
        The image URL, or "" when ``img_name`` is empty, when no URL matches,
        or when the response has no page list.
    """
    # An empty name is a substring of every URL, which would make an
    # arbitrary image of the page look like the cover.
    if not img_name:
        return ""

    baseurl = "http://en.wikipedia.org/w/api.php?action=query&titles="
    # Queries the wikipage for the list of images
    generator = "generator=images&gimlimit=100&prop=imageinfo&iiprop=url|dimensions|mime"
    dataformat = "format=json"
    query = "%s%s&%s&%s" % (baseurl,link.encode('utf-8'),generator,dataformat)

    # Always release the connection, even if reading the body fails.
    response = urllib2.urlopen(query)
    try:
        wikijson = json.loads(response.read())
    finally:
        response.close()

    try:
        pages = wikijson["query"]["pages"]
    except KeyError:
        print("Movie without cover image")
        return ""

    # The URLs are pulled out of the textual form of the page list; each one
    # ends at the quote that closes its string.
    urls = re.findall(r"(https?://upload\.wikimedia\.org/wikipedia/en.+?)\'", str(pages))

    # Looks for the images that have the same name as found in the wikipedia
    # infobox. If more than one matches, the first one found is returned.
    wanted = img_name.lower()
    for url in urls:
        if wanted in url.lower():
            return url
    return ""

In [None]:
# Given a string "movie_wiki", extracts the information related to "attribute". "j" was used for debugging
def get_movie_attribute(movie_wiki, attribute, j):
    """Extract the cleaned value(s) of one infobox parameter from wikitext.

    Parameters
    ----------
    movie_wiki : str
        Wikitext to search.
    attribute : str
        Parameter name, e.g. "director". It is interpolated unescaped into a
        regular expression, and an optional trailing "s" is accepted after it
        (so "language" also finds a "languages" row).
    j : int
        Zero-based position of the movie; only used (as ``j+1``) in the
        message printed when the parameter is missing.

    Returns
    -------
    str or list of str
        "" when the parameter is not found or nothing is left after
        cleaning; a single string when exactly one value is left; otherwise
        the list of distinct values in their order of appearance.
    """
    row_start = r'\|[\s\t]*%ss?[\s\t]*=' % attribute

    # Extracts the info between the attribute row and the next "| name =" row
    found = re.search(row_start + r'(.*?)\n[\s\t]*\|[\s\t]*[\w\.\s\-]+[\s\t]*=', movie_wiki, re.DOTALL)
    if found is None:
        # If the attribute is the last one in the infobox, it looks for the closing curly brackets
        found = re.search(row_start + r'(.+?)\}\}', movie_wiki, re.DOTALL)
    if found is None:
        print(str(j+1) + ". Movie does not contain %s" % attribute)
        return ""
    attr_dirty = found.group(1).strip()

    # Replaces links by their text: first the "[[target|label]]" form (keeps
    # the label), then the plain "[[target]]" form. The flag has to be passed
    # by keyword: the fourth positional argument of re.sub is "count".
    attr_clean = re.sub(r'\[\[[^\]]+?\|([^\]]+?)\]\]', r'\1', attr_dirty)
    attr_clean = re.sub(r'\[\[(.*?)\]\]', r'\1', attr_clean, flags=re.DOTALL)

    # List of values can be separated by "*", "|" or <br>
    starred = re.search(r'^\{\{(.+?)\*(.+)\}\}', attr_clean, re.DOTALL)
    if starred is not None:
        # "{{... * a * b }}": everything after the first "*" holds the items.
        attr_list = starred.group(2).split("*")
    else:
        piped = re.search(r'^\{\{(.+?)\|(.+)\}\}', attr_clean, re.DOTALL)
        if piped is not None:
            # "{{name|a|b}}": drop nested templates, then split on "|".
            attr_list = re.sub(r'\{\{(.+?)\}\}', "", piped.group(2)).split("|")
        else:
            attr_list = re.split("<br.*?>", attr_clean)

    # Only the text on the left side of some special characters (that represent the beginning of a comment, link, citation...)
    # is kept.
    attr_list = [clean_string_regex(attr, r'(.*?)[<\(\{]').strip() for attr in attr_list]

    # Quotation marks are used to put a text in bold, italics or both. We are not interested in this info, so we delete blocks of
    # two or more consecutive quotation marks (single quotes)
    attr_list = [re.sub(r"\'\'+","",attr).strip() for attr in attr_list]

    # Sometimes, in a list, some elements in exclude_list appear and are used to classify elements in a list, but contain no
    # relevant information
    exclude_list = ["Animation:", "Live-action:", "Co-director:", "Uncredited:"]

    # Drop the excluded labels and repeated values while keeping the order
    # in which the values were written.
    seen = set(exclude_list)
    unique = []
    for attr in attr_list:
        if attr not in seen:
            seen.add(attr)
            unique.append(attr)

    if len(unique) == 1:
        return unique[0]

    # We delete the array elements that became empty
    unique = [attr for attr in unique if attr != ""]
    if len(unique) == 1:
        return unique[0]
    if not unique:
        # Nothing usable is left: same result as a missing attribute.
        return ""
    return unique

In [None]:
# To flatten the list of movie attributes
def flatten_to_strings(listOfLists):
    """Flatten a list of (lists of (lists of strings)) for any level of nesting.

    Parameters
    ----------
    listOfLists : iterable
        Strings and/or nested iterables of strings.

    Returns
    -------
    list
        All the strings found, in depth-first order of appearance.
    """
    try:
        # Python 2: common superclass of str and unicode.
        string_types = basestring
    except NameError:
        # basestring does not exist on Python 3.
        string_types = (str, bytes)

    result = []
    for i in listOfLists:
        # Strings are leaves: they are kept as they are
        if isinstance(i, string_types):
            result.append(i)
        # Anything else is treated as a nested collection and flattened recursively
        else:
            result.extend(flatten_to_strings(i))
    return result

In [None]:
# Extracts the gross of a movie. It can't be done with the function get_movie_attribute as it requires special processing
def get_gross(wikitext,j):
    """Extract the gross of a movie from its infobox wikitext, in US dollars.

    Every line of the "gross" field that contains a currency symbol (dollar,
    euro, pound or yen) contributes one amount: the first number after the
    symbol, scaled by "million"/"billion" when one of those words follows,
    and converted to dollars with an approximated exchange rate. The amounts
    are added up.

    Parameters
    ----------
    wikitext : unicode or str
        Infobox wikitext. Byte strings are decoded as UTF-8.
    j : int
        Not used; kept so the signature matches get_movie_attribute.

    Returns
    -------
    float or str
        The total gross in dollars, or "" when there is no gross field or no
        amount could be read from it.
    """
    # Approximated exchange rates to US dollars, keyed by currency symbol.
    usd_rates = {u'\u0024': 1.0,       # Dollars
                 u'\u20AC': 1.06,      # Euros
                 u'\u00A3': 1.24,      # Pounds
                 u'\u00A5': 0.00888}   # Yens
    symbols = u"[$\u20AC\u00A3\u00A5]"
    factors = {"million": 1e6, "billion": 1e9}

    # The currency symbols are not ASCII, so the search has to run on text,
    # not on UTF-8 bytes.
    if isinstance(wikitext, bytes):
        wikitext = wikitext.decode("utf-8", "replace")

    # The gross field runs from "| gross" up to the next "| name =" row, or to
    # the end of the text when it is the last field.
    field = re.search(r"\|[\s\t]*gross(.*?)(?:\n[\s\t]*\|[\s\t]*[\w\.\s\-]+[\s\t]*=|\Z)", wikitext, re.DOTALL)
    if field is None:
        return ""

    total_gross = 0
    # One candidate per line: from the first currency symbol to the end of the line.
    for elem in re.findall(symbols + u".*", field.group(1)):
        # We delete all the parts of the string in between of "{}" or "<>"
        elem = re.sub(r"\{\{.*?\}\}", r"", elem)
        elem = re.sub(r"\<.*?\>", r"", elem)

        # Currency and value are extracted
        amount = re.search(u"(" + symbols + u")(.*)", elem)
        if amount is None:
            continue
        currency = amount.group(1)
        value = amount.group(2).replace(",", "")

        # If the text million or billion appears, the value is multiplied by
        # the corresponding factor. Only the first of the two words counts, so
        # "$1.2 billion ($900 million ...)" is read as 1.2 billion.
        factor = 1
        unit = re.search(r"(.*?) ?(million|billion)", value)
        if unit is not None:
            value = unit.group(1)
            factor = factors[unit.group(2)]

        number = re.search(r'[\d\.]+', value)
        if number is None:
            continue
        try:
            value = float(number.group())
        except ValueError:
            # Something like "." or "1.2.3": not a readable amount.
            continue

        total_gross += value * factor * usd_rates[currency]

    if total_gross == 0: total_gross = ""
    return total_gross

## Building the dataset

At first, we obtain the list of movies and their genre from the xxxx_in_film pages

In [None]:
# Extracts table information from wikipages year_in_film. In each of the pages, the table contains for each movie: title, studio,
# cast and crew, genre and medium. Only the title and the genre is extracted from each film, as the other parameters will be
# obtained from each movie wikipedia page.

year_range = range(1989,2017)
# One slot per year, filled in as each year page is processed
movies_list= [""]*len(year_range)
movies_names = [""]*len(year_range)
# One entry per movie, over all the years, in processing order
genres_list = []

path = "/Users/Ferran/Downloads/year_in_film_final/"
path2 = "/Users/Ferran/Downloads/year_in_film_title_final/"

l = 0
while l < len(year_range):
    k = year_range[l]
    name = "%s_in_film" % k
    text = download_wiki(name)

    # The movies are listed by release month, so the tables start at the
    # "=== January" subsection heading and go on until the blank line and
    # "== " heading that open the next section.
    dirty_table = re.search(r'=== ?January(.+?)[\}\]]\n\n== ?\w', text, re.DOTALL).group(1)

    # The pages of the years 2004 to 2010 are parsed with three cells per row
    # (no "Genre" cell); all the other years with five.
    if 2004 <= k < 2011:
        clean_table = re.findall(r"\| \'\'(.+?)\|\|(.*?)\|\|(.*?)\|", dirty_table, re.DOTALL)
        # No genre available: one empty placeholder per movie
        genres_list.extend([""] * len(clean_table))
    else:
        clean_table = re.findall(r"\| \'\'(.+?)\|\|(.*?)\|\|(.*?)\|\|(.*?)\|\|(.*?)\|", dirty_table, re.DOTALL)
        for row in clean_table:
            # The fourth cell holds the genres, separated by commas
            genres_clean = [get_link_name(genre, must_have_link=False).strip() for genre in row[3].split(",")]
            genres_list.append(genres_clean)

    # The first cell of each row is the link to the movie: its link part
    # goes to movies_list and its name part to movies_names
    movies_list[l] = [get_movie_link(row[0]) for row in clean_table]
    movies_names[l] = [get_link_name(row[0]) for row in clean_table]

    # Links saved one per line ("/" replaced by ".")
    with open("%s%s_in_film.txt" %(path, k) , "w") as text_file:
        for movie in movies_list[l]:
            text_file.write(movie.replace("/",".") + "\n")

    # Names saved one per line, in the same order as the links
    with open("%s%s_in_film_title.txt" %(path2, k) , "w") as text_file:
        for title in movies_names[l]:
            text_file.write(title + "\n")

    l += 1

Then, we download all movie wikis and save them to text files

In [None]:
# Downloads the wikitext of every movie collected in movies_list and saves
# each page in its own text file.
i = 0   # running movie counter over all the years, used in the error message
path = "/Users/Ferran/Downloads/movie_wikipages/"
for movies_in_year in movies_list:
    for movie in movies_in_year:
        # Downloads movie wikis. urllib.quote() is used to convert special characters to % syntax.
        movie_wiki = download_wiki(urllib.quote(movie))
        if movie_wiki is None:
            # download_wiki returns None when the page content could not be obtained
            print("Error: " + str(i+1) + ". " + movie)
        else:
            # Each Wikipedia page is saved in a file ("/" is not valid in a file name).
            with open("%s%s.txt" % (path,movie.replace("/",".")), "w") as text_file:
                text_file.write(movie_wiki.encode("utf-8"))

        i+=1

Finally, we get the movies attributes lists using the functions defined above and the result is saved to a clean dictionary that only contains relevant information.

In [None]:
# Backslashes are written doubled so the paths do not depend on how
# unrecognised escape sequences such as "\U" are treated.
path_year = "C:\\Users\\s161328\\Downloads\\year_in_film_final\\"
#path_year = "/Users/Ferran/Downloads/year_in_film_final/"

#path_title = "/Users/Ferran/Downloads/year_in_film_title_final/"
path_title = "C:\\Users\\s161328\\Downloads\\year_in_film_title_final\\"

#path_wikis = "/Users/Ferran/Downloads/movie_wikipages/"
path_wikis = "C:\\Users\\s161328\\Downloads\\movie_wikipages\\"

year_range = range(1989,2017)
movies_dict = dict()

# Currency symbols (pound, dollar, yen, euro). Kept at module level for code
# that looks up "currencies" as a global.
currencies = [u'\u00A3', u'\u0024', u'\u00A5', u'\u20AC']

# i counts the movies over all the years and indexes genres_list;
# j counts the movies inside the current year.
i = 0
for year in year_range:
    # Link parts and names of the movies of the year, one per line. The text
    # after the last newline is dropped.
    with open('%s%s_in_film.txt' %(path_year, year)) as f:
        movies_list = f.read().split("\n")[0:-1]

    with open('%s%s_in_film_title.txt' %(path_title, year)) as g:
        movies_names = g.read().split("\n")[0:-1]
    j = 0

    for movie in movies_list:
        try:
            with open('%s%s.txt' % (path_wikis, movie)) as wiki_file:
                wikitext = wiki_file.read()
            # We first extract the infobox area: from "Infobox film" to the
            # first run of five single quotes that follows it
            infobox = re.search(r'[iI]nfobox [fF]ilm(.+?)\'\'\'\'\'', wikitext, re.DOTALL).group(1)
            # Then we look for each attribute and write it in a dictionary.
            # Only the first movie with a given name is kept.
            if movies_names[j] not in movies_dict:
                movies_dict[movies_names[j]] = {"Director": get_movie_attribute(infobox, "director",j),
                                               "Starring": get_movie_attribute(infobox, "starring",j),
                                               "Language": get_movie_attribute(infobox, "language",j),
                                               "Country": get_movie_attribute(infobox, "country",j),
                                               "Image": get_cover_link(urllib2.quote(movie), get_movie_attribute(infobox, "image",j).replace(" ", "_")),
                                               "Year": year,
                                               "Gross": get_gross(infobox, j),
                                               "Genre": genres_list[i]}
        except Exception as e:
            # Best effort: a movie that cannot be processed is reported and skipped
            print(e)
            print(str(j+1) + ". Movie without infobox or wikipage does not exist")

        i += 1
        j += 1

# The obtained dictionary is saved in a txt file
with open("movies_dict.txt", 'w') as dict_file:
    json.dump(movies_dict, dict_file)