In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter

In [2]:
data = pd.read_csv('movie_bd_v5.csv')
data.sample(5)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year
1392,tt0358273,28000000,186438883,Walk the Line,Joaquin Phoenix|Reese Witherspoon|Ginnifer Goo...,James Mangold,Love is a burning thing.,A chronicle of country music legend Johnny Cas...,136,Drama|Music|Romance,Tree Line Films|Konrad Pictures|Catfish Produc...,9/13/2005,7.0,2005
1131,tt0324216,9500000,107071655,The Texas Chainsaw Massacre,Jessica Biel|Jonathan Tucker|Erica Leerhsen|Mi...,Marcus Nispel,What you know about fear... doesn't even come ...,After picking up a traumatized young hitchhike...,98,Horror,New Line Cinema|Next Entertainment|Platinum Du...,10/17/2003,6.0,2003
1781,tt0841046,35000000,18317151,Walk Hard: The Dewey Cox Story,John C. Reilly|Jenna Fischer|Tim Meadows|Krist...,Jake Kasdan,Life made him tough. Love made him strong. Mus...,Singer Dewey Cox overcomes adversity to become...,96,Comedy|Music,Columbia Pictures Corporation|Apatow Productions,12/21/2007,6.5,2007
1001,tt1259521,30000000,66486080,The Cabin in the Woods,Kristen Connolly|Chris Hemsworth|Anna Hutchiso...,Drew Goddard,If you hear a strange sound outside... have sex,Five college friends spend the weekend at a re...,95,Horror|Thriller,Lionsgate|Mutant Enemy Productions,4/12/2012,6.5,2012
732,tt1568346,90000000,232617430,The Girl with the Dragon Tattoo,Daniel Craig|Rooney Mara|Christopher Plummer|G...,David Fincher,Evil shall with evil be expelled.,This English-language adaptation of the Swedis...,158,Thriller|Crime|Mystery|Drama,Columbia Pictures|Scott Rudin Productions|Film...,12/14/2011,7.1,2011


In [3]:
data.describe()

Unnamed: 0,budget,revenue,runtime,vote_average,release_year
count,1889.0,1889.0,1889.0,1889.0,1889.0
mean,54310830.0,155365300.0,109.658549,6.140762,2007.860773
std,48587210.0,214669800.0,18.017041,0.764763,4.468841
min,5000000.0,2033165.0,63.0,3.3,2000.0
25%,20000000.0,34560580.0,97.0,5.6,2004.0
50%,38000000.0,83615410.0,107.0,6.1,2008.0
75%,72000000.0,178262600.0,120.0,6.6,2012.0
max,380000000.0,2781506000.0,214.0,8.1,2015.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1889 entries, 0 to 1888
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   imdb_id               1889 non-null   object 
 1   budget                1889 non-null   int64  
 2   revenue               1889 non-null   int64  
 3   original_title        1889 non-null   object 
 4   cast                  1889 non-null   object 
 5   director              1889 non-null   object 
 6   tagline               1889 non-null   object 
 7   overview              1889 non-null   object 
 8   runtime               1889 non-null   int64  
 9   genres                1889 non-null   object 
 10  production_companies  1889 non-null   object 
 11  release_date          1889 non-null   object 
 12  vote_average          1889 non-null   float64
 13  release_year          1889 non-null   int64  
dtypes: float64(1), int64(4), object(9)
memory usage: 206.7+ KB


# Предобработка

In [5]:
answers = {} # создадим словарь для ответов

# в указанной колонке конвертируем строки с разделителями в list, после чего делаем explode:
def df_explode_str(df, colname):
    new_df = df.copy()
    new_df[colname] = new_df[colname].apply(lambda s: s.split('|'))
    return new_df.explode(colname)

# генерим поле title для ответов:
data['title'] = data.apply(lambda row: f'{row.original_title} ({row.imdb_id})', axis=1)

# вычисляем прибыль:
data['profit'] = data.revenue - data.budget;

# конвертируем дату из str в datetime
data.release_date = pd.to_datetime(data.release_date, format='%m/%d/%Y')

In [6]:
data.sample(5)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,title,profit
822,tt1540133,6000000,19560274,The Guard,Brendan Gleeson|Don Cheadle|Liam Cunningham|Ma...,John Michael McDonagh,The FBI are about to discover that things work...,An unorthodox Irish policeman with a confronta...,96,Action|Comedy|Thriller|Crime,UK Film Council|Crescendo Productions|Element ...,2011-07-07,6.6,2011,The Guard (tt1540133),13560274
1468,tt0410297,40000000,114830111,The Lake House,Keanu Reeves|Sandra Bullock|Shohreh Aghdashloo...,Alejandro Agresti,How do you hold on to someone you've never met?,A lonely doctor who once occupied an unusual l...,99,Romance|Drama|Mystery,Village Roadshow Pictures|Vertigo Entertainmen...,2006-06-16,6.3,2006,The Lake House (tt0410297),74830111
1562,tt0317198,50000000,40203020,Bridget Jones: The Edge of Reason,RenÃ©e Zellweger|Hugh Grant|Colin Firth|Jim Br...,Beeban Kidron,Same Bridget. Brand new diary.,The story picks up four weeks after the first ...,108,Comedy|Romance,Miramax Films|Universal Pictures|Studio Canal|...,2004-11-10,6.0,2004,Bridget Jones: The Edge of Reason (tt0317198),-9796980
237,tt2528814,17000000,29789000,God's Not Dead,Kevin Sorbo|Shane Harper|David A.R. White|Dean...,Harold Cronk,What do you believe?,College philosophy professor Mr. Radisson's cu...,113,Drama,Pure Flix Entertainment,2014-03-21,6.1,2014,God's Not Dead (tt2528814),12789000
1589,tt0346156,70000000,57958696,Sky Captain and the World of Tomorrow,Jude Law|Gwyneth Paltrow|Giovanni Ribisi|Angel...,Kerry Conran,Who will save us?,"When gigantic robots attack New York City, ""Sk...",107,Mystery|Action|Thriller|Science Fiction|Adventure,Paramount Pictures|Natural Nylon Entertainment...,2004-09-17,5.7,2004,Sky Captain and the World of Tomorrow (tt0346156),-12041304


# 1. У какого фильма из списка самый большой бюджет?

Использовать варианты ответов в коде решения запрещено.    
Вы думаете и в жизни у вас будут варианты ответов?)

In [7]:
data[data.budget == data.budget.max()]

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,title,profit
723,tt1298650,380000000,1021683000,Pirates of the Caribbean: On Stranger Tides,Johnny Depp|PenÃ©lope Cruz|Geoffrey Rush|Ian M...,Rob Marshall,Live Forever Or Die Trying.,Captain Jack Sparrow crosses paths with a woma...,136,Adventure|Action|Fantasy,Walt Disney Pictures|Jerry Bruckheimer Films|M...,2011-05-11,6.3,2011,Pirates of the Caribbean: On Stranger Tides (t...,641683000


ВАРИАНТ 2

In [8]:
data.sort_values(by='budget', ascending=False).head(1)

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,title,profit
723,tt1298650,380000000,1021683000,Pirates of the Caribbean: On Stranger Tides,Johnny Depp|PenÃ©lope Cruz|Geoffrey Rush|Ian M...,Rob Marshall,Live Forever Or Die Trying.,Captain Jack Sparrow crosses paths with a woma...,136,Adventure|Action|Fantasy,Walt Disney Pictures|Jerry Bruckheimer Films|M...,2011-05-11,6.3,2011,Pirates of the Caribbean: On Stranger Tides (t...,641683000


In [9]:
answers[1] = data[data.budget == data.budget.max()].iloc[0].title

# 2. Какой из фильмов самый длительный (в минутах)?

In [10]:
data[data.runtime == data.runtime.max()]

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,title,profit
1157,tt0279111,56000000,12923936,Gods and Generals,Stephen Lang|Jeff Daniels|Robert Duvall|Kevin ...,Ronald F. Maxwell,The nations heart was touched by...,The film centers mostly around the personal an...,214,Drama|History|War,Turner Pictures|Antietam Filmworks,2003-02-21,5.8,2003,Gods and Generals (tt0279111),-43076064


In [11]:
answers[2] = data[data.runtime == data.runtime.max()].iloc[0].title

# 3. Какой из фильмов самый короткий (в минутах)?





In [12]:
data[data.runtime == data.runtime.min()]

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,title,profit
768,tt1449283,30000000,14460000,Winnie the Pooh,Jim Cummings|Travis Oates|Jim Cummings|Bud Luc...,Stephen Anderson|Don Hall,Oh Pooh.,"During an ordinary day in Hundred Acre Wood, W...",63,Animation|Family,Walt Disney Pictures|Walt Disney Animation Stu...,2011-04-13,6.8,2011,Winnie the Pooh (tt1449283),-15540000


In [13]:
answers[3] = 'Winnie the Pooh (tt1449283)'

# 4. Какова средняя длительность фильмов?


In [14]:
round(data.runtime.mean())

110

In [15]:
answers[4] = round(data.runtime.mean())

# 5. Каково медианное значение длительности фильмов? 

In [16]:
round(data.runtime.median())

107

In [17]:
answers[5] = round(data.runtime.median())

# 6. Какой самый прибыльный фильм?
#### Внимание! Здесь и далее под «прибылью» или «убытками» понимается разность между сборами и бюджетом фильма. (прибыль = сборы - бюджет) в нашем датасете это будет (profit = revenue - budget) 

In [18]:
data[data.profit == data.profit.max()]

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,title,profit
239,tt0499549,237000000,2781505847,Avatar,Sam Worthington|Zoe Saldana|Sigourney Weaver|S...,James Cameron,Enter the World of Pandora.,"In the 22nd century, a paraplegic Marine is di...",162,Action|Adventure|Fantasy|Science Fiction,Ingenious Film Partners|Twentieth Century Fox ...,2009-12-10,7.1,2009,Avatar (tt0499549),2544505847


In [19]:
answers[6] = data[data.profit == data.profit.max()].iloc[0].title

# 7. Какой фильм самый убыточный? 

In [20]:
data[data.profit == data.profit.min()]

Unnamed: 0,imdb_id,budget,revenue,original_title,cast,director,tagline,overview,runtime,genres,production_companies,release_date,vote_average,release_year,title,profit
1245,tt1210819,255000000,89289910,The Lone Ranger,Johnny Depp|Armie Hammer|William Fichtner|Hele...,Gore Verbinski,Never Take Off the Mask,The Texas Rangers chase down a gang of outlaws...,149,Action|Adventure|Western,Walt Disney Pictures|Jerry Bruckheimer Films|I...,2013-07-03,6.0,2013,The Lone Ranger (tt1210819),-165710090


In [21]:
answers[7] = data[data.profit == data.profit.min()].iloc[0].title

# 8. У скольких фильмов из датасета объем сборов оказался выше бюджета?

In [22]:
len(data.query('revenue > budget'))

1478

In [23]:
answers[8] = len(data.query('revenue > budget'))

# 9. Какой фильм оказался самым кассовым в 2008 году?

In [24]:
data[data.release_year == 2008].sort_values(by='revenue').iloc[-1]

imdb_id                                                         tt0468569
budget                                                          185000000
revenue                                                        1001921825
original_title                                            The Dark Knight
cast                    Christian Bale|Michael Caine|Heath Ledger|Aaro...
director                                                Christopher Nolan
tagline                                                   Why So Serious?
overview                Batman raises the stakes in his war on crime. ...
runtime                                                               152
genres                                        Drama|Action|Crime|Thriller
production_companies    DC Comics|Legendary Pictures|Warner Bros.|Syncopy
release_date                                          2008-07-16 00:00:00
vote_average                                                          8.1
release_year                          

In [25]:
answers[9] = data[data.release_year == 2008].sort_values(by='revenue').iloc[-1].title

# 10. Самый убыточный фильм за период с 2012 по 2014 г. (включительно)?


In [26]:
data.query('2012 <= release_year <= 2014').sort_values(by='profit').iloc[0]

imdb_id                                                         tt1210819
budget                                                          255000000
revenue                                                          89289910
original_title                                            The Lone Ranger
cast                    Johnny Depp|Armie Hammer|William Fichtner|Hele...
director                                                   Gore Verbinski
tagline                                           Never Take Off the Mask
overview                The Texas Rangers chase down a gang of outlaws...
runtime                                                               149
genres                                           Action|Adventure|Western
production_companies    Walt Disney Pictures|Jerry Bruckheimer Films|I...
release_date                                          2013-07-03 00:00:00
vote_average                                                            6
release_year                          

In [27]:
answers[10] = data.query('2012 <= release_year <= 2014').sort_values(by='profit').iloc[0].title

# 11. Какого жанра фильмов больше всего?

In [28]:
df_explode_str(data, 'genres')['genres'].value_counts()

Drama              782
Comedy             683
Thriller           596
Action             582
Adventure          415
Crime              315
Romance            308
Family             260
Science Fiction    248
Fantasy            222
Horror             176
Mystery            168
Animation          139
Music               64
History             62
War                 58
Western             19
Documentary          8
Foreign              2
Name: genres, dtype: int64

ВАРИАНТ 2

In [29]:
genres_list = [genre for genres in data.genres for genre in genres.split('|')] #создаем список со всеми значениями поля genres
Counter(genres_list).most_common() #cчитаем кол-во повторов с помощью Counter

[('Drama', 782),
 ('Comedy', 683),
 ('Thriller', 596),
 ('Action', 582),
 ('Adventure', 415),
 ('Crime', 315),
 ('Romance', 308),
 ('Family', 260),
 ('Science Fiction', 248),
 ('Fantasy', 222),
 ('Horror', 176),
 ('Mystery', 168),
 ('Animation', 139),
 ('Music', 64),
 ('History', 62),
 ('War', 58),
 ('Western', 19),
 ('Documentary', 8),
 ('Foreign', 2)]

In [30]:
answers[11] = 'Drama'

# 12. Фильмы какого жанра чаще всего становятся прибыльными? 

In [31]:
df_explode_str(data.query('profit > 0'), 'genres')['genres'].value_counts()

Drama              560
Comedy             551
Thriller           446
Action             444
Adventure          337
Romance            242
Crime              231
Family             226
Science Fiction    195
Fantasy            188
Horror             150
Animation          120
Mystery            119
Music               47
History             46
War                 41
Western             12
Documentary          7
Name: genres, dtype: int64

In [32]:
answers[12] = 'Drama'

# 13. У какого режиссера самые большие суммарные кассовые сборы?

In [33]:
df_explode_str(data, 'director').groupby('director')['revenue'].sum().sort_values()

director
Simon Hunter            2033165
Keanu Reeves            2054941
Paul Schrader           2062066
Steven Shainberg        2281089
David MichÃ´d           2295423
                        ...    
J.J. Abrams          3579169916
Michael Bay          3886938960
David Yates          4154295625
Christopher Nolan    4167548502
Peter Jackson        6490593685
Name: revenue, Length: 997, dtype: int64

In [34]:
answers[13] = 'Peter Jackson'

# 14. Какой режисер снял больше всего фильмов в стиле Action?

In [35]:
df_explode_str(data[data.genres.str.contains('Action')], 'director')['director'].value_counts()

Robert Rodriguez      9
Paul W.S. Anderson    7
Michael Bay           7
Ridley Scott          6
Antoine Fuqua         6
                     ..
Boaz Yakin            1
Peter Segal           1
Brian Robbins         1
John A. Davis         1
Oxide Pang Chun       1
Name: director, Length: 364, dtype: int64

In [36]:
answers[14] = 'Robert Rodriguez'

# 15. Фильмы с каким актером принесли самые высокие кассовые сборы в 2012 году? 

In [37]:
df_explode_str(data.query('release_year == 2012'), 'cast').groupby('cast')['revenue'].sum().sort_values()

cast
Nicolas Cage            2106557
Josh Lucas              2106557
Sami Gayle              2106557
Danny Huston            2106557
Jason Bateman           3428048
                        ...    
Robert Downey Jr.    1519557910
Chris Evans          1519557910
Anne Hathaway        1522851057
Denis Leary          1629460639
Chris Hemsworth      2027450773
Name: revenue, Length: 466, dtype: int64

In [38]:
answers[15]='Chris Hemsworth'

# 16. Какой актер снялся в большем количестве высокобюджетных фильмов?

In [39]:
df_explode_str(data[data.budget > data.budget.mean()], 'cast')['cast'].value_counts()

Matt Damon         18
Adam Sandler       17
Angelina Jolie     16
Eddie Murphy       15
Tom Cruise         15
                   ..
Rila Fukushima      1
Madeleine Stowe     1
Suraj Sharma        1
Richard Briers      1
George Takei        1
Name: cast, Length: 1505, dtype: int64

In [40]:
answers[16] = 'Matt Damon'

# 17. В фильмах какого жанра больше всего снимался Nicolas Cage? 

In [41]:
df_explode_str(data[data.cast.str.contains('Nicolas Cage')], 'genres')['genres'].value_counts()

Action             17
Thriller           15
Drama              12
Crime              10
Fantasy             8
Adventure           7
Comedy              6
Science Fiction     4
Mystery             3
Animation           3
Family              3
History             2
Romance             1
War                 1
Horror              1
Name: genres, dtype: int64

In [42]:
answers[17] = 'Action'

# 18. Самый убыточный фильм от Paramount Pictures

In [43]:
data[data.production_companies.str.contains('Paramount Pictures')].sort_values(by='profit').iloc[0]

imdb_id                                                         tt0267626
budget                                                          100000000
revenue                                                          35168966
original_title                                       K-19: The Widowmaker
cast                    Harrison Ford|Liam Neeson|Peter Sarsgaard|Joss...
director                                                  Kathryn Bigelow
tagline                                          Fate has found its hero.
overview                When Russia's first nuclear submarine malfunct...
runtime                                                               138
genres                                             Thriller|Drama|History
production_companies    Paramount Pictures|Intermedia Films|National G...
release_date                                          2002-07-19 00:00:00
vote_average                                                            6
release_year                          

In [44]:
answers[55] = 'K-19: The Widowmaker (tt0267626)'

# 19. Какой год стал самым успешным по суммарным кассовым сборам?

In [45]:
data.groupby('release_year')[['revenue']].sum().sort_values(by='revenue')

Unnamed: 0_level_0,revenue
release_year,Unnamed: 1_level_1
2000,10664099805
2001,13017764865
2002,14136361487
2003,14346123312
2006,14775042320
2005,15309425558
2004,15663430720
2007,18162406801
2008,18252781990
2009,20261791024


In [46]:
answers[19] = 2015

# 20. Какой самый прибыльный год для студии Warner Bros?

In [47]:
data[data.production_companies.str.contains('Warner Bros')].groupby('release_year')[['profit']].sum().sort_values(by='profit')

Unnamed: 0_level_0,profit
release_year,Unnamed: 1_level_1
2000,452631386
2006,620170743
2015,870368348
2002,1022709901
2012,1258020056
2001,1343545668
2005,1551980298
2004,1631933725
2013,1636453400
2009,1822454136


In [48]:
answers[20] = 2014

# 21. В каком месяце за все годы суммарно вышло больше всего фильмов?

In [49]:
data.release_date.dt.month.value_counts()

9     227
12    190
10    186
8     161
3     156
4     149
6     147
11    146
7     142
5     140
2     135
1     110
Name: release_date, dtype: int64

In [50]:
answers[21] = 'Сентябрь'

# 22. Сколько суммарно вышло фильмов летом? (за июнь, июль, август)

In [51]:
len(data[data.release_date.apply(lambda dt: dt.month in [6,7,8])])

450

In [52]:
answers[22] = 450

# 23. Для какого режиссера зима – самое продуктивное время года? 

In [53]:
df_explode_str(data[data.release_date.apply(lambda dt: dt.month in [1,2,12])], 'director')['director'].value_counts()

Peter Jackson        7
Steven Soderbergh    6
Clint Eastwood       6
Adam Shankman        4
Martin Scorsese      4
                    ..
Len Wiseman          1
Brad Bird            1
Jake Kasdan          1
Jason Moore          1
Pat O'Connor         1
Name: director, Length: 358, dtype: int64

In [54]:
answers[23] = 'Peter Jackson'

# 24. Какая студия дает самые длинные названия своим фильмам по количеству символов?

In [55]:
df = data.assign(title_len = data.original_title.str.len())
df_explode_str(df, 'production_companies').groupby('production_companies')['title_len'].mean().sort_values()

production_companies
Global Entertainment Group     2.0
Ixtlan Productions             2.0
XM2 Productions                2.0
Berlanti Productions           3.0
Everest Entertainment          3.0
                              ... 
Polsky Films                  46.0
Museum Canada Productions     46.0
Dos Corazones                 47.0
Jim Henson Company, The       59.0
Four By Two Productions       83.0
Name: title_len, Length: 1771, dtype: float64

In [56]:
answers[24] = 'Four By Two Productions'

# 25. Описание фильмов какой студии в среднем самые длинные по количеству слов?

In [57]:
df = data.assign(overview_word_count = data.overview.apply(lambda s: len(s.split())))
df_explode_str(df, 'production_companies').groupby('production_companies')['overview_word_count'].mean().sort_values()

production_companies
Motion Picture Corporation of America     11.0
Empire Pictures                           11.0
Henceforth                                13.0
Phantom Four                              13.0
London Boulevard                          13.0
                                         ...  
Brookwell-McNamara Entertainment         156.0
Heineken Branded Entertainment           159.0
98 MPH Productions                       159.0
Room 9 Entertainment                     161.0
Midnight Picture Show                    175.0
Name: overview_word_count, Length: 1771, dtype: float64

In [58]:
answers[25] = 'Midnight Picture Show'

# 26. Какие фильмы входят в 1 процент лучших по рейтингу? 
по vote_average

In [59]:
data.sort_values(by='vote_average')[['original_title', 'vote_average']].tail(int(len(data)*0.01))

Unnamed: 0,original_title,vote_average
155,The Theory of Everything,7.8
863,The Lord of the Rings: The Two Towers,7.8
1688,There Will Be Blood,7.8
283,Mr. Nobody,7.8
1191,12 Years a Slave,7.9
872,The Pianist,7.9
128,Gone Girl,7.9
1800,Memento,7.9
119,Guardians of the Galaxy,7.9
370,Inception,7.9


In [60]:
answers[26] = 'Inside Out, The Dark Knight, 12 Years a Slave'

# 27. Какие актеры чаще всего снимаются в одном фильме вместе?


In [61]:
# применяем explode к полю cast:
df = df_explode_str(data, 'cast')[['imdb_id', 'cast']]

# делаем merge с самим собой по полю imdb_id 
df = df.merge(df, on='imdb_id').query('cast_x != cast_y')

# после merge у нас появились дубли, когда 2 актера встречаются в разных полях:
#|   imdb_id|        cast_x|        cast_y|
#|----------|--------------|--------------|
#| tt0290334|   Halle Berry| Famke Janssen|
#| tt0290334| Famke Janssen|   Halle Berry|
#
# для удаления таких дублей создаем поле pair = tuple(sorted([cast_x, cast_y])):
df['pair'] = df.apply(lambda row: tuple(sorted([row.cast_x, row.cast_y])), axis=1)

# удаляем дубли и считаем кол-во
df[['imdb_id', 'pair']].drop_duplicates()['pair'].value_counts()

(Daniel Radcliffe, Rupert Grint)       8
(Emma Watson, Rupert Grint)            8
(Daniel Radcliffe, Emma Watson)        8
(Helena Bonham Carter, Johnny Depp)    6
(Ben Stiller, Owen Wilson)             6
                                      ..
(50 Cent, Forest Whitaker)             1
(Mark Wahlberg, Tina Fey)              1
(Kerry Washington, Vince Vaughn)       1
(Johnny Depp, Julie Christie)          1
(Lochlyn Munro, Richard Gant)          1
Name: pair, Length: 17942, dtype: int64

ВАРИАНТ 2

In [62]:
from itertools import combinations
pairs = data['cast'].apply(lambda s: list(combinations(sorted(s.split('|')), 2)))
Counter(pairs.explode()).most_common(5)

[(('Daniel Radcliffe', 'Emma Watson'), 8),
 (('Daniel Radcliffe', 'Rupert Grint'), 8),
 (('Emma Watson', 'Rupert Grint'), 8),
 (('Ben Stiller', 'Owen Wilson'), 6),
 (('Helena Bonham Carter', 'Johnny Depp'), 6)]

In [63]:
answers[27] = 'Daniel Radcliffe & Rupert Grint'

# Submission

In [64]:
# в конце можно посмотреть свои ответы к каждому вопросу
answers

{1: 'Pirates of the Caribbean: On Stranger Tides (tt1298650)',
 2: 'Gods and Generals (tt0279111)',
 3: 'Winnie the Pooh (tt1449283)',
 4: 110,
 5: 107,
 6: 'Avatar (tt0499549)',
 7: 'The Lone Ranger (tt1210819)',
 8: 1478,
 9: 'The Dark Knight (tt0468569)',
 10: 'The Lone Ranger (tt1210819)',
 11: 'Drama',
 12: 'Drama',
 13: 'Peter Jackson',
 14: 'Robert Rodriguez',
 15: 'Chris Hemsworth',
 16: 'Matt Damon',
 17: 'Action',
 55: 'K-19: The Widowmaker (tt0267626)',
 19: 2015,
 20: 2014,
 21: 'Сентябрь',
 22: 450,
 23: 'Peter Jackson',
 24: 'Four By Two Productions',
 25: 'Midnight Picture Show',
 26: 'Inside Out, The Dark Knight, 12 Years a Slave',
 27: 'Daniel Radcliffe & Rupert Grint'}

In [65]:
# и убедиться что ни чего не пропустил)
len(answers)

27