# Popular Role Function

#### Function variables

In [237]:
year = 2017 # sets the focus for file titles
plusYears = 0 # controls the upper range for a desired (Optional)
minusYears = 10 # controls the lower range for a desired year
filmCount = 10 # controls the number of film counts for selection
results = 10 # controls the number of results to be captured
validation = 'off' # controls verbose validation messages

#### Imports & global variable initialization

In [238]:
import pandas as pd
mostPopularStagingFile = pd.DataFrame()
leastPopularStagingFile = pd.DataFrame()

#### Reads for required data sets 

In [228]:
movies = pd.read_csv('./data/us_movies_ratings.csv', sep=',',low_memory=False)
if validation == 'on': display('Rows loaded: ' + str(len(movies)))

principals = pd.read_csv('./data/title.principals.tsv', sep='\t',low_memory=False)
if validation == 'on': display('Rows loaded: ' + str(len(principals)))

names = pd.read_csv('./data/name.basics.tsv', sep='\t',low_memory=False)
if validation == 'on': display('Rows loaded: ' + str(len(names)))

#### Calcuate the desired range of years and select film titles

In [229]:
# calculate the desired selection range by year
fromRange = year - minusYears
toRange = year + plusYears
if validation == 'on': display(fromRange)
if validation == 'on': display(toRange)
    
# select all titles for the desired year
movieSelection = movies[(movies.startYear >= fromRange) & (movies.startYear<= toRange)]
if validation == 'on': display(str(len(movieSelection)))

#### Merge principals data set into movie selections to assign roles to film titles

In [230]:
# merge selected titles 
movieCastSelections = movieSelection.merge(principals, left_on='tconst', right_on='tconst', how='inner')
if validation == 'on': display(str(len(movieCastSelections)))
if validation == 'on': display(movieCastSelections.head())

#### Most & least popular function

In [241]:
def mostPopular(role):
    global mostPopularStagingFile, leastPopularStagingFile
    
    # select all roles from the move/cast data set
    role = movieCastSelections[movieCastSelections.category == role]
    if validation == 'on': display('Roles selected: ' + str(len(actors)))
    
    # aggergate roles by film count and average votes
    roleAgg = (role.groupby('nconst')
       .agg({'tconst':'count', 'numVotes': 'mean'})
       .reset_index()  
       )
    if validation == 'on': display('Rows after combined: ' + str(len(roleAgg)))
    
    # rename columns for clarity
    roleAgg.rename(columns={'tconst': 'totalFilms', 'numVotes': 'avgNumVotes'}, inplace=True)
   
    # drop all roles that fall under the desired film count
    roleSelection = roleAgg[roleAgg.totalFilms >= filmCount]
    if validation == 'on': display('Roles selected by file count: ' + str(len(roleSelection)))
    
    # sort the results by average number of votes in decending order
    roleSelection.sort_values('avgNumVotes', ascending=False, inplace=True)
    
    # merge principals data to remaining roles to get role name
    popularRole = roleSelection.merge(names, left_on='nconst', right_on='nconst', how='inner')
    if validation == 'on': display('Role after prinicpals merge: ' + str(len(popularRole)))
    
    # select the desired top ranked roles
    mostPopularRole = popularRole[:results]
    if validation == 'on': display('Total role selected: ' + str(len(mostPopularRole)))
    
    # select the desired bottom ranked roles
    leastPopularRole = popularRole[-results:]
    if validation == 'on': display('Total role selected: ' + str(len(leastPopularRole)))
    
    # drop non-needed columns from final data set
    mostPopularRole.drop(['birthYear','deathYear','primaryProfession','knownForTitles'], axis=1, inplace=True)
    
    # drop non-needed columns from final data set
    leastPopularRole.drop(['birthYear','deathYear','primaryProfession','knownForTitles'], axis=1, inplace=True)
    
    mostPopularStagingFile = mostPopularStagingFile.append(mostPopularRole)
    leastPopularStagingFile = leastPopularStagingFile.append(leastPopularRole)
    
    # display final results for most popular role
    display(mostPopularRole)
    
    # display final results for least popular role
    display(leastPopularRole)    

In [242]:
mostPopular('actor')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,nconst,totalFilms,avgNumVotes,primaryName
0,nm0000375,16,435877.0625,Robert Downey Jr.
1,nm0000288,13,418878.230769,Christian Bale
2,nm0000093,14,344362.285714,Brad Pitt
3,nm0695435,10,290905.1,Chris Pratt
4,nm0004874,11,274158.0,Vin Diesel
5,nm1165110,11,270048.909091,Chris Hemsworth
6,nm0262635,16,268700.4375,Chris Evans
7,nm0000226,10,267559.0,Will Smith
8,nm0413168,11,261304.272727,Hugh Jackman
9,nm0177896,21,254875.47619,Bradley Cooper


Unnamed: 0,nconst,totalFilms,avgNumVotes,primaryName
117,nm0000598,14,54947.071429,Dennis Quaid
118,nm0005377,13,54290.461538,Sam Rockwell
119,nm0519043,16,52067.125,Justin Long
120,nm0005188,15,50547.333333,James Marsden
121,nm0000112,12,46394.083333,Pierce Brosnan
122,nm0001427,11,41026.090909,Greg Kinnear
123,nm0029400,10,38817.0,Michael Angarano
124,nm0005048,11,36053.636364,Thomas Jane
125,nm0000199,10,33960.2,Al Pacino
126,nm0000501,11,32232.818182,Ray Liotta


In [243]:
mostPopular('actress')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,nconst,totalFilms,avgNumVotes,primaryName
0,nm2225369,14,324810.428571,Jennifer Lawrence
1,nm0757855,10,320470.1,Zoe Saldana
2,nm0424060,13,289443.461538,Scarlett Johansson
3,nm0004266,16,273949.6875,Anne Hathaway
4,nm0680983,11,227383.818182,Ellen Page
5,nm0010736,19,213156.684211,Amy Adams
6,nm0000234,13,208468.153846,Charlize Theron
7,nm0000949,15,205541.933333,Cate Blanchett
8,nm1567113,13,201298.461538,Jessica Chastain
9,nm1297015,18,201079.666667,Emma Stone


Unnamed: 0,nconst,totalFilms,avgNumVotes,primaryName
49,nm0000295,14,59016.428571,Kate Beckinsale
50,nm0647634,10,57946.2,Elizabeth Olsen
51,nm0731075,14,57448.0,Emma Roberts
52,nm0915208,17,56980.529412,Naomi Watts
53,nm0000161,10,55449.1,Salma Hayek
54,nm0939697,10,54378.7,Evan Rachel Wood
55,nm0000215,14,52648.071429,Susan Sarandon
56,nm0266824,10,38119.8,Dakota Fanning
57,nm0005028,10,37099.9,Kate Hudson
58,nm1950086,11,23994.636364,Greta Gerwig


#### Write results to a CSV file

In [244]:
mostPopularStagingFile.to_csv('./data/most_popular_roles.csv', sep=',')
leastPopularStagingFile.to_csv('./data/least_popular_roles.csv', sep=',')