# Python code profiling
- Code / Example inspired by Sebastian Mathot: 
    - https://www.youtube.com/watch?v=8qEnExGLZfY

---

### What is pop?

In [9]:
# Build a ten-element list (0 through 9) to demonstrate list.pop() on
list_ = list(range(10))
list_

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [10]:
# pop() with no argument removes the last element of the list in place and returns it
nine = list_.pop()
print(nine, list_)

9 [0, 1, 2, 3, 4, 5, 6, 7, 8]


## Goal: Find all duplicate movies from a text file of 10,000 movie titles.

In [5]:
def read_movies(src):
    """Load movie titles from the text file at ``src``.

    The file is read in one go and split into lines; every title is
    lower-cased before being returned.

    Args:
        src: Path of the text file, one movie title per line.

    Returns:
        list[str]: The lower-cased titles, in file order.
    """
    with open(src) as handle:
        raw_titles = handle.read().splitlines()

    return [title.lower() for title in raw_titles]

In [59]:
def is_duplicate(item, collection):
    """Report whether ``item`` occurs in ``collection``.

    Args:
        item: The value (e.g. a movie title) to look for.
        collection: The container (e.g. a list of titles) to search.

    Returns:
        bool: True if ``item`` is found in ``collection``, otherwise False.
    """
    # The built-in membership test stands in for an explicit element-by-element loop.
    return item in collection

In [None]:
@profile
def find_duplicate_movies(src='movies.txt'):
    """Collect the titles in ``src`` that occur more than once.

    Titles are taken off the end of the list one at a time; a title is
    recorded when it is still present among the titles that remain.

    Args:
        src: Path of the text file of movie titles (default 'movies.txt').

    Returns:
        list: The duplicated titles, in the order they were found
        (working backwards from the end of the file).
    """
    remaining = read_movies(src)
    found = []

    while remaining:
        candidate = remaining.pop()
        # `remaining` no longer holds `candidate` itself, so a hit means another copy exists.
        if is_duplicate(candidate, remaining):
            found.append(candidate)

    return found

In [3]:
@profile
def faster_find_duplicate_movies(src='movies.txt'):
    """Return the titles in ``src`` that occur more than once, using a sorted index.

    Sorting places identical titles next to each other, so duplicates can be
    found by comparing each title with its neighbour in a single pass instead
    of searching the whole list for every title.

    Args:
        src: Path of the text file of movie titles (default 'movies.txt').

    Returns:
        list: The duplicated titles in sorted order. A title that occurs
        k times is listed k - 1 times (once per adjacent equal pair).
    """
    sorted_movie_list = sorted(read_movies(src))

    # Pair every title with the one after it; zip stops at the shorter
    # argument, so this yields exactly the adjacent pairs.
    return [
        current
        for previous, current in zip(sorted_movie_list, sorted_movie_list[1:])
        if previous == current
    ]

In [11]:
# %timeit runs the statement repeatedly and reports the mean and spread of the run time
%timeit [x for x in range(10000000)]

1.01 s ± 14.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


---

### cProfile decorator (run this cell before the `@profile`-decorated definitions above):

In [2]:
import cProfile
import functools
import io
import pstats


def profile(fnc):
    """A decorator that uses cProfile to profile a function.

    Starts the profiler before executing the function, then executes the
    function, then stops the profiler and prints a diagnostics report sorted
    by cumulative time. The report is printed even if the function raises.

    Lots of boilerplate code from the Python 3 documentation:
    https://docs.python.org/3/library/profile.html#profile.Profile

    Args:
        fnc: The callable to profile.

    Returns:
        A wrapper with the same name, docstring and call signature as ``fnc``
        that returns whatever ``fnc`` returns and re-raises whatever it raises.
    """

    @functools.wraps(fnc)  # keep the wrapped function's __name__ and __doc__
    def inner(*args, **kwargs):
        pr = cProfile.Profile()
        pr.enable()
        try:
            return fnc(*args, **kwargs)
        finally:
            # Always stop profiling and emit the report, so a failing call
            # neither leaves the profiler enabled nor loses its timings.
            pr.disable()
            s = io.StringIO()
            sortby = 'cumulative'
            ps = pstats.Stats(pr, stream=s).sort_stats(sortby)
            ps.print_stats()
            print(s.getvalue())

    return inner

In [22]:
# hash() maps a string to an integer (presumably shown to motivate hash-based lookups — TODO confirm)
hash('strings')

3634254388487460745

In [27]:
# Run 1: per the report below, str.lower was called ~98 million times during this run
find_duplicate_movies()

         98214041 function calls in 42.888 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.075    0.075   42.888   42.888 <ipython-input-26-87c1d2ff59d4>:1(find_duplicate_movies)
    10000   25.021    0.003   42.791    0.004 <ipython-input-25-bb5709c6a925>:1(is_duplicate)
 98193766   17.770    0.000   17.770    0.000 {method 'lower' of 'str' objects}
    10000    0.018    0.000    0.018    0.000 {method 'pop' of 'list' objects}
        1    0.000    0.000    0.004    0.004 <ipython-input-1-27c53f1fe273>:1(read_movies)
        1    0.002    0.002    0.002    0.002 {method 'splitlines' of 'str' objects}
        1    0.001    0.001    0.001    0.001 {built-in method io.open}
        1    0.000    0.000    0.000    0.000 {method 'read' of '_io.TextIOWrapper' objects}
      263    0.000    0.000    0.000    0.000 {method 'append' of 'list' objects}
        1    0.000    0.000    0.000    0.000 /home/tommu/anacon

['Zookeeper (2011)',
 'Miracle on 34th Street (1994)',
 'Babylon 5: Thirdspace (1998)',
 'Police Academy 6: City Under Siege (1989)',
 'War of the Worlds (2005)',
 'Chaplin (1992)',
 'Twelfth Night (1996)',
 'Memento (2000)',
 'Fire and Ice (2008)',
 'Stan Helsing (2009)',
 'Intimate Strangers (Confidences trop intimes) (2004)',
 'Anything for Her (Pour elle) (2008)',
 'Simpatico (1999)',
 'High School Musical 2 (2007)',
 'Big Blue, The (Grand bleu, Le) (1988)',
 'Bedazzled (1967)',
 'Remember Me (Ricordati di me) (2003)',
 'Saturn 3 (1980)',
 '11:14 (2003)',
 "Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)",
 'Thousand Words, A (2012)',
 'Carnosaur (1993)',
 'Cold Fish (Tsumetai nettaigyo) (2010)',
 'Very Potter Sequel, A (2010)',
 'Antichrist (2009)',
 'Captain Horatio Hornblower R.N. (1951)',
 'Postman Always Rings Twice, The (1981)',
 'Red Violin, The (Violon rouge, Le) (1998)',
 'Sorority House Massacre II (1990)',
 'Just Jim (2015)'

In [31]:
# Run 2: str.lower is now called only 10,000 times, from the list comprehension in read_movies (see report below)
find_duplicate_movies()

         30276 function calls in 3.394 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.028    0.028    3.394    3.394 <ipython-input-30-87c1d2ff59d4>:1(find_duplicate_movies)
    10000    3.349    0.000    3.349    0.000 <ipython-input-29-6689f546dda9>:1(is_duplicate)
        1    0.001    0.001    0.011    0.011 <ipython-input-28-0d67e89540a9>:1(read_movies)
        1    0.003    0.003    0.006    0.006 <ipython-input-28-0d67e89540a9>:6(<listcomp>)
    10000    0.006    0.000    0.006    0.000 {method 'pop' of 'list' objects}
    10000    0.003    0.000    0.003    0.000 {method 'lower' of 'str' objects}
        1    0.002    0.002    0.002    0.002 {method 'splitlines' of 'str' objects}
        1    0.002    0.002    0.002    0.002 {built-in method io.open}
        1    0.000    0.000    0.001    0.001 {method 'read' of '_io.TextIOWrapper' objects}
        1    0.000    0.000    0.000    0.000 /home/tommu

['zookeeper (2011)',
 'miracle on 34th street (1994)',
 'babylon 5: thirdspace (1998)',
 'police academy 6: city under siege (1989)',
 'war of the worlds (2005)',
 'chaplin (1992)',
 'twelfth night (1996)',
 'memento (2000)',
 'fire and ice (2008)',
 'stan helsing (2009)',
 'intimate strangers (confidences trop intimes) (2004)',
 'anything for her (pour elle) (2008)',
 'simpatico (1999)',
 'high school musical 2 (2007)',
 'big blue, the (grand bleu, le) (1988)',
 'bedazzled (1967)',
 'remember me (ricordati di me) (2003)',
 'saturn 3 (1980)',
 '11:14 (2003)',
 "harry potter and the sorcerer's stone (a.k.a. harry potter and the philosopher's stone) (2001)",
 'thousand words, a (2012)',
 'carnosaur (1993)',
 'cold fish (tsumetai nettaigyo) (2010)',
 'very potter sequel, a (2010)',
 'antichrist (2009)',
 'captain horatio hornblower r.n. (1951)',
 'postman always rings twice, the (1981)',
 'red violin, the (violon rouge, le) (1998)',
 'sorority house massacre ii (1990)',
 'just jim (2015)'

In [61]:
# Run 3: same call counts as run 2, but is_duplicate accounts for less time (see report below)
find_duplicate_movies()

         30276 function calls in 1.324 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.016    0.016    1.324    1.324 <ipython-input-60-0838555b9ebd>:1(find_duplicate_movies)
    10000    1.281    0.000    1.281    0.000 <ipython-input-59-ea5199d89b98>:1(is_duplicate)
        1    0.001    0.001    0.022    0.022 <ipython-input-58-0d67e89540a9>:1(read_movies)
        1    0.002    0.002    0.009    0.009 <ipython-input-58-0d67e89540a9>:6(<listcomp>)
        1    0.007    0.007    0.007    0.007 {built-in method io.open}
    10000    0.006    0.000    0.006    0.000 {method 'lower' of 'str' objects}
    10000    0.004    0.000    0.004    0.000 {method 'pop' of 'list' objects}
        1    0.000    0.000    0.003    0.003 {method 'read' of '_io.TextIOWrapper' objects}
        1    0.003    0.003    0.003    0.003 {method 'splitlines' of 'str' objects}
        1    0.000    0.000    0.003    0.003 /home/tommu

['zookeeper (2011)',
 'miracle on 34th street (1994)',
 'babylon 5: thirdspace (1998)',
 'police academy 6: city under siege (1989)',
 'war of the worlds (2005)',
 'chaplin (1992)',
 'twelfth night (1996)',
 'memento (2000)',
 'fire and ice (2008)',
 'stan helsing (2009)',
 'intimate strangers (confidences trop intimes) (2004)',
 'anything for her (pour elle) (2008)',
 'simpatico (1999)',
 'high school musical 2 (2007)',
 'big blue, the (grand bleu, le) (1988)',
 'bedazzled (1967)',
 'remember me (ricordati di me) (2003)',
 'saturn 3 (1980)',
 '11:14 (2003)',
 "harry potter and the sorcerer's stone (a.k.a. harry potter and the philosopher's stone) (2001)",
 'thousand words, a (2012)',
 'carnosaur (1993)',
 'cold fish (tsumetai nettaigyo) (2010)',
 'very potter sequel, a (2010)',
 'antichrist (2009)',
 'captain horatio hornblower r.n. (1951)',
 'postman always rings twice, the (1981)',
 'red violin, the (violon rouge, le) (1998)',
 'sorority house massacre ii (1990)',
 'just jim (2015)'

In [6]:
# Run 4: the sort-based faster_find_duplicate_movies
faster_find_duplicate_movies()

         10016 function calls in 0.024 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    0.024    0.024 <ipython-input-3-664c1380db4a>:1(faster_find_duplicate_movies)
        1    0.000    0.000    0.012    0.012 <ipython-input-5-0d67e89540a9>:1(read_movies)
        1    0.008    0.008    0.008    0.008 {built-in method builtins.sorted}
        1    0.002    0.002    0.005    0.005 <ipython-input-5-0d67e89540a9>:6(<listcomp>)
        1    0.004    0.004    0.004    0.004 <ipython-input-3-664c1380db4a>:13(<listcomp>)
        1    0.003    0.003    0.003    0.003 {method 'splitlines' of 'str' objects}
    10000    0.003    0.000    0.003    0.000 {method 'lower' of 'str' objects}
        1    0.001    0.001    0.002    0.002 {method 'read' of '_io.TextIOWrapper' objects}
        1    0.000    0.000    0.001    0.001 /home/tommu/anaconda3/envs/nlp/lib/python3.7/codecs.py:319(decode)
        1    

['11:14 (2003)',
 '12 years a slave (2013)',
 '3 ninjas (1992)',
 '7 faces of dr. lao (1964)',
 '9 (2009)',
 'a christmas story live! (2017)',
 'adventures of ichabod and mr. toad, the (1949)',
 'alice (2009)',
 'alice (neco z alenky) (1988)',
 'alien: covenant (2017)',
 'american hardcore (2006)',
 'american outlaws (2001)',
 "amy's o (a.k.a. amy's orgasm) (2001)",
 'antichrist (2009)',
 'anything for her (pour elle) (2008)',
 'arrival (2016)',
 'aspen extreme (1993)',
 'azumi (2003)',
 'babylon 5: thirdspace (1998)',
 'bad lieutenant (1992)',
 'battle royale 2: requiem (batoru rowaiaru ii: chinkonka) (2003)',
 'baywatch (2017)',
 'bedazzled (1967)',
 'big blue, the (grand bleu, le) (1988)',
 "big momma's house (2000)",
 'big trouble (2002)',
 'birdman of alcatraz (1962)',
 'black orpheus (orfeu negro) (1959)',
 'blair witch project, the (1999)',
 'blankman (1994)',
 'blood simple (1984)',
 'blood: the last vampire (2009)',
 'blue juice (1995)',
 'boiling point (1993)',
 'bridge of sp