In [1]:
import pandas as pd

In [2]:
# Load the 2018 title basics from the CSV file that sits next to this notebook.
title_basics_csv_path = './title_basics_2018.csv'
df_title_basics_2018 = pd.read_csv(title_basics_csv_path)

# Sanity check: print the first five rows to confirm the file parsed as expected.
# Note: the preview shows the literal text \N in `genres`; it is kept unchanged here.
title_basics_preview = df_title_basics_2018.head(5)
print(title_basics_preview.to_string())

      tconst                primaryTitle               originalTitle  year  runtimeMinutes       genres
0  tt0069049  The Other Side of the Wind  The Other Side of the Wind  2018             122        Drama
1  tt0111414                 A Thin Life                 A Thin Life  2018              75       Comedy
2  tt0170651        T.G.M. - osvoboditel        T.G.M. - osvoboditel  2018              60  Documentary
3  tt0192528               Heaven & Hell              Reverse Heaven  2018             104        Drama
4  tt0253093                Gangavataran                Gangavataran  2018             134           \N


In [3]:
# Load the ratings table (tconst, averageRating, numVotes) from its CSV file.
title_ratings_csv_path = './title_ratings.csv'
df_title_ratings = pd.read_csv(title_ratings_csv_path)

# Sanity check: print the first five rows to confirm the file parsed as expected.
title_ratings_preview = df_title_ratings.head(5)
print(title_ratings_preview.to_string())

      tconst  averageRating  numVotes
0  tt0000001            5.6      1543
1  tt0000002            6.1       186
2  tt0000003            6.5      1201
3  tt0000004            6.2       114
4  tt0000005            6.1      1921


In [4]:
# Join the basics and the ratings on their shared 'tconst' identifier column.
# An inner join (the pandas default) keeps only the titles present in both frames.
merged_df_title = df_title_basics_2018.merge(df_title_ratings, how='inner', on='tconst')

# Sanity check: the preview should show the basics columns followed by the rating columns.
merged_preview = merged_df_title.head(5)
print(merged_preview.to_string())

      tconst                primaryTitle                originalTitle  year  runtimeMinutes       genres  averageRating  numVotes
0  tt0069049  The Other Side of the Wind   The Other Side of the Wind  2018             122        Drama            6.9      4937
1  tt0170651        T.G.M. - osvoboditel         T.G.M. - osvoboditel  2018              60  Documentary            7.5         6
2  tt0192528               Heaven & Hell               Reverse Heaven  2018             104        Drama            3.9        74
3  tt0253093                Gangavataran                 Gangavataran  2018             134           \N            6.6         8
4  tt0262759    Seven Jews from My Class  Siedmiu Zydów z mojej klasy  2018              40  Documentary            7.0         6


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns in the merged basics + ratings data.
merged_summary = merged_df_title.describe()
print(merged_summary)

         year  runtimeMinutes  averageRating       numVotes
count  7229.0     7229.000000    7229.000000    7229.000000
mean   2018.0       96.271130       6.246217    2588.912989
std       0.0       27.400867       1.494149   19831.779889
min    2018.0        7.000000       1.000000       5.000000
25%    2018.0       83.000000       5.400000      18.000000
50%    2018.0       93.000000       6.400000      69.000000
75%    2018.0      106.000000       7.300000     346.000000
max    2018.0      840.000000      10.000000  719146.000000


In [6]:
# Keep only the rows whose 'year' equals 2018.
released_in_2018 = merged_df_title['year'].eq(2018)
films_year_2018 = merged_df_title.loc[released_in_2018]

# Summary statistics of the filtered rows; the 'year' column should read 2018 throughout.
films_2018_summary = films_year_2018.describe()
print(films_2018_summary)

         year  runtimeMinutes  averageRating       numVotes
count  7229.0     7229.000000    7229.000000    7229.000000
mean   2018.0       96.271130       6.246217    2588.912989
std       0.0       27.400867       1.494149   19831.779889
min    2018.0        7.000000       1.000000       5.000000
25%    2018.0       83.000000       5.400000      18.000000
50%    2018.0       93.000000       6.400000      69.000000
75%    2018.0      106.000000       7.300000     346.000000
max    2018.0      840.000000      10.000000  719146.000000


In [7]:
# Step 1: keep the 2018 films whose vote count is at least the mean vote count
# across all 2018 films.
mean_num_votes = films_year_2018['numVotes'].mean()
has_enough_votes = films_year_2018['numVotes'].ge(mean_num_votes)
best_films_2018 = films_year_2018.loc[has_enough_votes]

# Preview the first five rows that passed the vote-count filter.
print(best_films_2018.head(5).to_string())

       tconst                primaryTitle               originalTitle  year  runtimeMinutes                 genres  averageRating  numVotes
0   tt0069049  The Other Side of the Wind  The Other Side of the Wind  2018             122                  Drama            6.9      4937
9   tt0360556              Fahrenheit 451              Fahrenheit 451  2018             100  Drama,Sci-Fi,Thriller            4.9     15202
10  tt0365545          Nappily Ever After          Nappily Ever After  2018              98   Comedy,Drama,Romance            6.4      6687
15  tt0859635            Super Troopers 2            Super Troopers 2  2018              99   Comedy,Crime,Mystery            6.1     21337
17  tt0972544                  Back Roads                  Back Roads  2018             101   Crime,Drama,Thriller            7.0      3482


In [8]:
# Step 2: among the films with an at-least-average vote count, keep those whose
# rating is at least the mean rating of that vote-filtered group.
#
# The vote-filtered frame is rebuilt from films_year_2018 here instead of reading
# best_films_2018 back and overwriting it. That keeps this cell idempotent:
# re-running it no longer narrows the selection (and shifts the mean) each time.
films_with_enough_votes = films_year_2018[
    films_year_2018['numVotes'] >= films_year_2018['numVotes'].mean()
]
mean_rating_of_voted_films = films_with_enough_votes['averageRating'].mean()
best_films_2018 = films_with_enough_votes[
    films_with_enough_votes['averageRating'] >= mean_rating_of_voted_films
]

# Preview the first five rows that passed both filters.
print(best_films_2018.head().to_string())

       tconst                primaryTitle               originalTitle  year  runtimeMinutes                  genres  averageRating  numVotes
0   tt0069049  The Other Side of the Wind  The Other Side of the Wind  2018             122                   Drama            6.9      4937
10  tt0365545          Nappily Ever After          Nappily Ever After  2018              98    Comedy,Drama,Romance            6.4      6687
17  tt0972544                  Back Roads                  Back Roads  2018             101    Crime,Drama,Thriller            7.0      3482
59  tt1034415                    Suspiria                    Suspiria  2018             152  Fantasy,Horror,Mystery            6.8     45418
97  tt1137450                  Death Wish                  Death Wish  2018             107      Action,Crime,Drama            6.4     54054


In [9]:
# Mean runtime (in minutes) of the films kept in best_films_2018. This value is
# used further down as the threshold that separates longer from shorter films.
best_film_runtimes = best_films_2018['runtimeMinutes']
average_film_run_time = best_film_runtimes.mean()

# Report the computed mean runtime.
runtime_label = "Average / mean of a film for year 2018: "
print(runtime_label + str(average_film_run_time))

Average / mean of a film for year 2018: 118.68646864686468


In [10]:
# Keep the 2018 films whose runtime is at or above average_film_run_time.
runs_at_least_average = films_year_2018['runtimeMinutes'].ge(average_film_run_time)
longer_view_time_films = films_year_2018.loc[runs_at_least_average]

# Summary statistics of the longer films.
longer_films_summary = longer_view_time_films.describe()
print(longer_films_summary)

         year  runtimeMinutes  averageRating       numVotes
count  1027.0     1027.000000    1027.000000    1027.000000
mean   2018.0      138.746835       6.524245    8446.654333
std       0.0       41.581107       1.315267   43917.999202
min    2018.0      119.000000       1.500000       5.000000
25%    2018.0      123.000000       5.800000      36.500000
50%    2018.0      132.000000       6.700000     218.000000
75%    2018.0      142.500000       7.400000    1183.000000
max    2018.0      840.000000       9.500000  719146.000000


In [11]:
# Keep the 2018 films whose runtime is strictly below average_film_run_time.
runs_below_average = films_year_2018['runtimeMinutes'].lt(average_film_run_time)
shorter_view_time_films = films_year_2018.loc[runs_below_average]

# Summary statistics of the shorter films.
shorter_films_summary = shorter_view_time_films.describe()
print(shorter_films_summary)

         year  runtimeMinutes  averageRating       numVotes
count  6202.0     6202.000000    6202.000000    6202.000000
mean   2018.0       89.237504       6.200177    1618.919381
std       0.0       15.517578       1.516928   11518.721608
min    2018.0        7.000000       1.000000       5.000000
25%    2018.0       81.000000       5.300000      16.000000
50%    2018.0       90.000000       6.400000      59.000000
75%    2018.0      100.000000       7.300000     282.000000
max    2018.0      118.000000      10.000000  329157.000000


In [12]:
# Mean rating of the longer films, rounded to four decimal places for display.
longer_rating_rounded = round(longer_view_time_films['averageRating'].mean(), 4)
print("Average rating score score of longer run or view time films: " + str(longer_rating_rounded))

Average rating score score of longer run or view time films: 6.5242


In [13]:
# Mean rating of the shorter films, rounded to four decimal places for display.
shorter_rating_rounded = round(shorter_view_time_films['averageRating'].mean(), 4)
print("Average rating score score of shorter run or view time films: " + str(shorter_rating_rounded))

Average rating score score of shorter run or view time films: 6.2002


In [14]:
# Compare the mean rating of the longer films against that of the shorter films
# and report which group is rated higher.
longer_mean_rating = longer_view_time_films['averageRating'].mean()
shorter_mean_rating = shorter_view_time_films['averageRating'].mean()

if pd.isna(longer_mean_rating) or pd.isna(shorter_mean_rating):
    # The mean of an empty group is NaN and every comparison with NaN is False,
    # which would otherwise fall through to the "shorter" conclusion.
    print("Not enough data to compare longer and shorter run view time films")
elif longer_mean_rating > shorter_mean_rating:
    print("Audiences in year 2018 prefer film with longer run view time")
elif longer_mean_rating < shorter_mean_rating:
    print("Audiences in year 2018 prefer film with shorter run view time")
else:
    # Equal means: do not report a preference for either group.
    print("Audiences in year 2018 show no preference between longer and shorter run view time")


Audiences in year 2018 prefer film with longer run view time
