In [4]:
import pandas as pd

# Load the movie table and the tagline table from their pickle files.
movies = pd.read_pickle('movies.p')
taglines = pd.read_pickle('taglines.p')

# Left join on the shared 'id' key: every row of movies is kept, whether or
# not a matching row exists in taglines.
movies_taglines = pd.merge(movies, taglines, how='left', on='id')

print(movies_taglines.shape)

(4803, 5)


In [None]:
financials = pd.read_pickle('financials.p')

# Left join so that every movie is kept; a movie with no match in financials
# ends up with missing values in the financial columns.
movies_financials = pd.merge(movies, financials, how='left', on='id')

# Number of rows whose 'budget' value is missing after the join.
number_of_missing_fin = pd.isna(movies_financials['budget']).sum()

# Report how many movies have no financial data.
print(number_of_missing_fin)

In [18]:
import pandas as pd

# Dataset with information about a number of movies.
movies = pd.read_pickle('movies.p')

# How the pattern reads:
#   ^   the text must start with the pattern that follows
#   .   any single character
#   *   zero or more occurrences of the preceding character
#   .*  so anything at all may come after "Toy Story"
regex = "^Toy Story.*"

# Keep the rows of movies whose title satisfies the regex and bind the
# result to toy_story; the bare name on the last line displays it.
toy_story = movies.loc[movies['title'].str.match(regex)]
toy_story

Unnamed: 0,id,title,popularity,release_date
103,10193,Toy Story 3,59.995418,2010-06-16
2637,863,Toy Story 2,73.575118,1999-10-30
3716,862,Toy Story,73.640445,1995-10-30


In [19]:
# Inner join (merge's default) of toy_story with taglines on 'id': only ids
# present in both tables survive.
toystory_tag = pd.merge(toy_story, taglines, how='inner', on='id')

# Show the joined rows, then the (rows, columns) shape.
print(toystory_tag)
print(toystory_tag.shape)

      id        title  popularity release_date                   tagline
0  10193  Toy Story 3   59.995418   2010-06-16  No toy gets left behind.
1    863  Toy Story 2   73.575118   1999-10-30        The toys are back!
(2, 5)


# Merging with columns that have different names

In [21]:
import pandas as pd
movie_to_genres = pd.read_pickle('movie_to_genres.p')

# Boolean mask picking out the genre rows labelled 'TV Movie'.
m = movie_to_genres['genre'].eq('TV Movie')
tv_genre = movie_to_genres.loc[m]

# The key is named 'id' in movies but 'movie_id' in tv_genre, so each side's
# key is named explicitly. The right join keeps every row of tv_genre.
tv_movies = pd.merge(movies, tv_genre, how='right',
                     left_on='id', right_on='movie_id')
print(tv_movies.head())

       id                      title  popularity release_date  movie_id  \
0   10947        High School Musical   16.536374   2006-01-20     10947   
1   13187  A Charlie Brown Christmas    8.701183   1965-12-09     13187   
2   22488         Love's Abiding Joy    1.128559   2006-10-06     22488   
3   78814       We Have Your Husband    0.102003   2011-11-12     78814   
4  153397                   Restless    0.812776   2012-12-07    153397   

      genre  
0  TV Movie  
1  TV Movie  
2  TV Movie  
3  TV Movie  
4  TV Movie  


# Merge with outer join

In [22]:
# First three genre rows labelled 'Family'.
m = movie_to_genres['genre'].eq('Family')
family = movie_to_genres.loc[m].head(3)

# First three genre rows labelled 'Comedy' (m is rebound to the new mask).
m = movie_to_genres['genre'].eq('Comedy')
comedy = movie_to_genres.loc[m].head(3)

# Outer join keeps movie_ids from either side; both tables have a 'genre'
# column, so the suffixes tell the two copies apart in the result.
family_comedy = pd.merge(family, comedy, how='outer', on='movie_id',
                         suffixes=('_fam', '_com'))
print(family_comedy)

   movie_id genre_fam genre_com
0        12    Family       NaN
1        35    Family    Comedy
2       105    Family       NaN
3         5       NaN    Comedy
4        13       NaN    Comedy


In [None]:
# Build the action and sci-fi tables with the same logic used for the
# Family/Comedy tables: filter movie_to_genres on its 'genre' column.
# (The cell previously bound both names to empty lists; a list has no .merge
# method, so the merge below raised AttributeError.)
# NOTE(review): 'Action' and 'Science Fiction' are assumed to be the exact
# labels stored in movie_to_genres['genre'] -- confirm against the data.
action_movies = movie_to_genres[movie_to_genres['genre'] == 'Action']
scifi_movies = movie_to_genres[movie_to_genres['genre'] == 'Science Fiction']

# Merge action_movies to the scifi_movies with right join: every sci-fi row is
# kept, and genre_act is filled only where the movie is also an action movie.
action_scifi = action_movies.merge(scifi_movies, on='movie_id', how='right',
                                   suffixes=('_act', '_sci'))

# From action_scifi, select only the rows where the genre_act column is null,
# i.e. sci-fi movies that have no action row.
scifi_only = action_scifi[action_scifi['genre_act'].isnull()]

# Merge the movies and scifi_only tables with an inner join; the key is 'id'
# in movies and 'movie_id' in scifi_only.
movies_and_scifi_only = movies.merge(scifi_only, left_on='id', right_on='movie_id')

# Print the first few rows and shape of movies_and_scifi_only
print(movies_and_scifi_only.head())
print(movies_and_scifi_only.shape)

# NOTE(review): pop_movies and plt are not defined in the cells visible here;
# both must exist (plt is conventionally matplotlib.pyplot) before this runs.

# Right join of movie_to_genres onto pop_movies: every row of pop_movies is
# kept. The key is 'movie_id' on the left and 'id' on the right.
genres_movies = pd.merge(movie_to_genres, pop_movies, how='right',
                         left_on='movie_id', right_on='id')

# One row per genre holding the count of non-null 'id' values in that genre.
genre_count = genres_movies.groupby('genre').agg({'id': 'count'})

# Bar chart of the per-genre counts.
genre_count.plot.bar()
plt.show()

In [None]:
# The cell previously bound iron_1_actors / iron_2_actors to empty lists; a
# list has no .merge method, so the merge below raised AttributeError. Build
# the two cast tables from the casts file instead.
# NOTE(review): assumes casts.csv has 'movie_id', 'id' and 'name' columns and
# that the titles below match rows of movies exactly -- confirm against the data.
casts_flat = pd.read_csv('casts.csv')


def _cast_of(title):
    """Return the ('id', 'name') rows of casts_flat for the movie titled `title`.

    The result is empty when no row of movies carries that title.
    """
    movie_ids = movies.loc[movies['title'] == title, 'id']
    return casts_flat.loc[casts_flat['movie_id'].isin(movie_ids), ['id', 'name']]


iron_1_actors = _cast_of('Iron Man')
iron_2_actors = _cast_of('Iron Man 2')

# Merge iron_1_actors to iron_2_actors on id with outer join using suffixes,
# so ids found in only one of the two tables are kept as well.
iron_1_and_2 = iron_1_actors.merge(iron_2_actors,
                                   on='id',
                                   how='outer',
                                   suffixes=('_1', '_2'))

# Create an index that returns true if name_1 or name_2 are null, i.e. the id
# appears in only one of the two tables.
m = ((iron_1_and_2['name_1'].isna()) |
     (iron_1_and_2['name_2'].isna()))

# Print the first few rows of iron_1_and_2 selected by that mask
print(iron_1_and_2[m].head())

In [32]:
import pandas as pd

sequels = pd.read_pickle('sequels.p')

# Self join: each row's 'sequel' value is matched against the 'id' of another
# row of the same table. Columns from the left copy get the '_org' suffix and
# columns from the right copy get '_seq'.
original_sequels = pd.merge(sequels, sequels, how='inner',
                            left_on='sequel', right_on='id',
                            suffixes=('_org', '_seq'))

# Order by original title, then sequel title; save to CSV and preview.
original_sequels_sorted = original_sequels.sort_values(["title_org", "title_seq"])
original_sequels_sorted.to_csv('original_sequels.csv')
print(original_sequels_sorted.head())

    id_org                                    title_org  sequel_org  id_seq  \
38   64688                               21 Jump Street      187017  187017   
71   10072  A Nightmare on Elm Street 3: Dream Warriors       10131   10131   
51    3049                   Ace Ventura: Pet Detective        9273    9273   
52     348                                        Alien         679     679   
29    6477                      Alvin and the Chipmunks       23398   23398   

                                        title_seq  sequel_seq  
38                                 22 Jump Street        <NA>  
71  A Nightmare on Elm Street 4: The Dream Master        <NA>  
51                 Ace Ventura: When Nature Calls        <NA>  
52                                         Aliens        <NA>  
29        Alvin and the Chipmunks: The Squeakquel        <NA>  


In [33]:
import pandas as pd

crews = pd.read_pickle('crews.p')

# Self join on 'id': every row is paired with every row sharing the same id.
# Left-copy columns are suffixed '_dir', right-copy columns '_crew'.
crews_self_merged = pd.merge(crews, crews, how='inner', on='id',
                             suffixes=('_dir', '_crew'))

# Keep pairs whose left side is a Director and whose right side is any other job.
boolean_filter = (crews_self_merged['job_dir'].eq('Director')
                  & crews_self_merged['job_crew'].ne('Director'))
direct_crews = crews_self_merged.loc[boolean_filter]

# Preview the director / crew-member pairs.
print(direct_crews.head())

        id department_dir   job_dir       name_dir department_crew  \
156  19995      Directing  Director  James Cameron         Editing   
157  19995      Directing  Director  James Cameron           Sound   
158  19995      Directing  Director  James Cameron      Production   
160  19995      Directing  Director  James Cameron         Writing   
161  19995      Directing  Director  James Cameron             Art   

           job_crew          name_crew  
156          Editor  Stephen E. Rivkin  
157  Sound Designer  Christopher Boyes  
158         Casting          Mali Finn  
160          Writer      James Cameron  
161    Set Designer    Richard F. Mays  


In [41]:
import pandas as pd

# This time movies is read from CSV with 'id' as its index, not as a column.
movies = pd.read_csv('movies.csv', index_col=['id'])
taglines = pd.read_pickle('taglines.p')

# merge's `on` accepts index level names as well as column names, so 'id'
# still identifies the key on the movies side. Left join keeps every movie.
movies_taglines = pd.merge(movies, taglines, how='left', on='id')
print(movies_taglines.head())

      id  Unnamed: 0                 title  popularity release_date  \
0    257           0          Oliver Twist   20.415572   2005-09-23   
1  14290           1  Better Luck Tomorrow    3.877036   2002-01-12   
2  38365           2             Grown Ups   38.864027   2010-06-24   
3   9672           3              Infamous    3.680896   2006-11-16   
4  12819           4       Alpha and Omega   12.300789   2010-09-17   

                                           tagline  
0                                              NaN  
1             Never underestimate an overachiever.  
2  Boys will be boys. . . some longer than others.  
3          There's more to the story than you know  
4                           A Pawsome 3D Adventure  


In [None]:
# Both tables are read with a two-level index made of movie_id and cast_id.
samuel = pd.read_csv('samuel.csv', index_col=['movie_id', 'cast_id'])
print(samuel.head(5))

casts = pd.read_csv('casts.csv', index_col=['movie_id', 'cast_id'])
print(casts.head(5))

# MultiIndex merge

In [None]:
# Inner join (merge's default) on both levels of the shared index; the names
# in `on` refer to index levels here, since both tables are indexed by them.
samuel_casts = pd.merge(samuel, casts, how='inner', on=['movie_id', 'cast_id'])

# Preview the joined rows, then show the (rows, columns) shape.
print(samuel_casts.head())
print(samuel_casts.shape)

# Index merge with left_on and right_on

In [None]:
# Join movies (indexed by 'id' since the read_csv cell) to movie_to_genres,
# where the key is the regular column 'movie_id'.
# Each side is given exactly one key specification: pandas raises MergeError
# when left_on and left_index=True are passed together, and right_index=True
# would have matched on movie_to_genres' row index instead of 'movie_id'.
# NOTE(review): if movie_to_genres is re-loaded with movie_id as its index,
# switch the right side to right_index=True (and drop right_on).
movies_genres = movies.merge(movie_to_genres, left_index=True,
                             right_on='movie_id')
print(movies_genres.head())

In [None]:
import pandas as pd
ratings = pd.read_pickle('ratings.p')
movies = pd.read_pickle('movies.p')

# Inner join (merge's default) of movies with ratings on the shared 'id' key.
movies_ratings = pd.merge(movies, ratings, how='inner', on='id')

# Preview the combined table.
print(movies_ratings.head())

In [None]:
financials = pd.read_pickle('financials.p')

# Merge sequels and financials on id; the left join keeps every row of sequels.
sequels_fin = sequels.merge(financials, on='id', how='left')

# Self merge as an inner join: each row's 'sequel' value is matched against
# the 'id' of another row. Left-copy columns get '_org', right-copy '_seq'.
# right_index=True is deliberately not passed: together with left_on it makes
# pandas ignore right_on and match 'sequel' against the right table's row
# index, which pairs unrelated movies when 'id' is an ordinary column.
# NOTE(review): assumes 'id' is a column of sequels_fin, as in the earlier
# sequels self-merge; if the tables are re-loaded with id as the index, use
# right_index=True without right_on instead.
orig_seq = sequels_fin.merge(sequels_fin, how='inner', left_on='sequel',
                             right_on='id',
                             suffixes=('_org', '_seq'))

# Revenue difference: sequel revenue minus original revenue.
orig_seq['diff'] = orig_seq['revenue_seq'] - orig_seq['revenue_org']

# Select the title_org, title_seq, and diff
titles_diff = orig_seq[['title_org', 'title_seq', 'diff']]

# Print the first rows of titles_diff sorted by diff, largest first
print(titles_diff.sort_values(by='diff', ascending=False).head())