# Popular Role Function

#### Function variables

In [87]:
year = 2017 # sets the focus for file titles
plusYears = 0 # controls the upper range for a desired (Optional)
minusYears = 10 # controls the lower range for a desired year
results = 10 # controls the number of results to be captured for actors & actress
validation = 'off' # controls verbose validation messages
viewResults = 'off'

#### Most & least popular function

In [88]:
import pandas as pd
import pickle
mostPopularStagingFile = pd.DataFrame()
leastPopularStagingFile = pd.DataFrame()

def startUp():
    movies = pd.read_csv('./data/us_movies_ratings.csv', sep=',',low_memory=False)
    if validation == 'on': display('Rows loaded: ' + str(len(movies)))

    principals = pd.read_csv('./data/title.principals.tsv', sep='\t',low_memory=False)
    if validation == 'on': display('Rows loaded: ' + str(len(principals)))

    names = pd.read_csv('./data/name.basics.tsv', sep='\t',low_memory=False)
    if validation == 'on': display('Rows loaded: ' + str(len(names)))

    # calculate the desired selection range by year
    fromRange = year - minusYears
    toRange = year + plusYears
    if validation == 'on': display(fromRange)
    if validation == 'on': display(toRange)

    # select all titles for the desired year
    movieSelection = movies[(movies.startYear >= fromRange) & (movies.startYear<= toRange)]
    if validation == 'on': display(str(len(movieSelection)))

    # merge selected titles 
    movieCastSelections = movieSelection.merge(principals, left_on='tconst', right_on='tconst', how='inner')
    if validation == 'on': display(str(len(movieCastSelections)))
    if validation == 'on': display(movieCastSelections.head())

def mostPopular(role, filmCount):
    global mostPopularStagingFile, leastPopularStagingFile
    
    # select all roles from the move/cast data set
    role = movieCastSelections[movieCastSelections.category == role]
    if validation == 'on': display('Roles selected: ' + str(len(role)))
    
    # aggergate roles by film count and average votes
    roleAgg = (role.groupby('nconst')
       .agg({'tconst':'count', 'numVotes': 'mean'})
       .reset_index()  
       )
    if validation == 'on': display('Rows after combined: ' + str(len(roleAgg)))
    
    # rename columns for clarity
    roleAgg.rename(columns={'tconst': 'totalFilms', 'numVotes': 'avgNumVotes'}, inplace=True)
   
    # drop all roles that fall under the desired film count
    roleSelection = roleAgg[roleAgg.totalFilms >= filmCount]
    if validation == 'on': display('Roles selected by file count: ' + str(len(roleSelection)))
    
    # sort the results by average number of votes in decending order
    roleSelection.sort_values('avgNumVotes', ascending=False, inplace=True)
    
    # merge principals data to remaining roles to get role name
    popularRole = roleSelection.merge(names, left_on='nconst', right_on='nconst', how='inner')
    if validation == 'on': display('Role after prinicpals merge: ' + str(len(popularRole)))
    
    # select the desired top ranked roles
    mostPopularRole = popularRole[:results]
    if validation == 'on': display('Total role selected: ' + str(len(mostPopularRole)))
    
    # select the desired bottom ranked roles
    leastPopularRole = popularRole[-results:]
    if validation == 'on': display('Total role selected: ' + str(len(leastPopularRole)))
    
    # drop non-needed columns from final data set
    mostPopularRole.drop(['birthYear','deathYear','primaryProfession','knownForTitles'], axis=1, inplace=True)
    
    # drop non-needed columns from final data set
    leastPopularRole.drop(['birthYear','deathYear','primaryProfession','knownForTitles'], axis=1, inplace=True)
    
    mostPopularStagingFile = mostPopularStagingFile.append(mostPopularRole)
    leastPopularStagingFile = leastPopularStagingFile.append(leastPopularRole)
    
    # display final results for most popular role
    if viewResults == 'on': display(mostPopularRole)
    
    # display final results for least popular role
    if viewResults == 'on': display(leastPopularRole)

def buildBinaryMatrix():
    # place all role ids into a list
    popularRoles = mostPopularStagingFile['nconst'].values
    if validation == 'on': display('Count of popular roles list: ' + str(len(popularRoles)))
    
    # using the movie/cast dataframe, pull all titles that have a popular role listed
    titlesWithAPopularRole = movieCastSelections[movieCastSelections['nconst'].isin(popularRoles)]
    if validation == 'on': display('Total files with a popular role: ' + str(len(titlesWithAPopularRole.shape)))
    
    # select all titles that have a popular actor
    titlesWithAPopularActor = titlesWithAPopularRole[titlesWithAPopularRole.category == 'actor']
    if validation == 'on': display('Total files with a popular actor: ' + star(len(titlesWithAPopularActor.shape)))  
    # place all tconsts in to a list
    popularActors = titlesWithAPopularActor['tconst'].values
    
    # select all titles that have a popular actress
    titlesWithAPopularActress = titlesWithAPopularRole[titlesWithAPopularRole.category == 'actress']
    if validation == 'on': display('Total files with a popular actress: ' + star(len(titlesWithAPopularActress.shape)))
    # place all tconsts in to a list
    popularActresses = titlesWithAPopularActress['tconst'].values
        
    # select all titles that have a popular director
    titlesWithAPopularDirector = titlesWithAPopularRole[titlesWithAPopularRole.category == 'director']
    if validation == 'on': display('Total files with a popular director: ' + star(len(titlesWithAPopularDirector.shape)))
    popularDirectors = titlesWithAPopularDirector['tconst'].values
    
    # select all titles that have a popular director
    titlesWithAPopularWriter = titlesWithAPopularRole[titlesWithAPopularRole.category == 'writer']
    if validation == 'on': display('Total files with a popular writer: ' + star(len(titlesWithAPopularWriter.shape)))
    popularWriters = titlesWithAPopularWriter['tconst'].values
    
    # make a copy of the orignal movie file (Vidit's file)
    binaryMatrix = movies.copy()
    
    # drop non-needed columns from final data set
    binaryMatrix.drop(['primaryTitle','startYear', 'averageRating'], axis=1, inplace=True)
    
    # build the binary mapping for actor, actress, director and popular writers
    binaryMatrix['popularGenre'] = 0
    binaryMatrix['popularActor'] = (binaryMatrix['tconst'].isin(popularActors)).astype(int)
    binaryMatrix['popularActress'] = (binaryMatrix['tconst'].isin(popularActresses)).astype(int)
    binaryMatrix['popularDirector'] = (binaryMatrix['tconst'].isin(popularDirectors)).astype(int)
    binaryMatrix['popularWriter'] = (binaryMatrix['tconst'].isin(popularWriters)).astype(int)
    
    if validation == 'on': display('Total counts for actor: ' + str(binaryMatrix.PopularActor.value_counts()))
    if validation == 'on': display('Total counts for actor: ' + str(binaryMatrix.PopularActress.value_counts()))
    if validation == 'on': display('Total counts for actor: ' + str(binaryMatrix.PopularDirector.value_counts()))
    if validation == 'on': display('Total counts for actor: ' + str(binaryMatrix.PopularPopular.value_counts()))


In [90]:
startUp()
mostPopular('actor', 10)
mostPopular('actress', 10)
mostPopular('director', 5)
mostPopular('writer', 5)
buildBinaryMatrix()
mostPopularStagingFile.to_csv('./data/most_popular_roles.csv', sep=',')
leastPopularStagingFile.to_csv('./data/least_popular_roles.csv', sep=',')
binaryMatrix.to_csv('./data/model_data_set3.csv', sep=',')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
