In [1]:
import pandas as pd

In [2]:
# Load the 2018 title basics CSV; the path is relative to the working directory.
# NOTE(review): the recorded preview shows the literal string '\N' in `genres`,
# so that marker is not parsed as NaN here -- consider `na_values` if missing
# genres matter downstream.
df_title_basics_2018 = pd.read_csv(filepath_or_buffer='./title_basics_2018.csv')

# Sanity check: dump the first five rows as plain text to confirm the file parsed.
print(df_title_basics_2018.head(n=5).to_string())

      tconst                primaryTitle               originalTitle  year  runtimeMinutes       genres
0  tt0069049  The Other Side of the Wind  The Other Side of the Wind  2018             122        Drama
1  tt0111414                 A Thin Life                 A Thin Life  2018              75       Comedy
2  tt0170651        T.G.M. - osvoboditel        T.G.M. - osvoboditel  2018              60  Documentary
3  tt0192528               Heaven & Hell              Reverse Heaven  2018             104        Drama
4  tt0253093                Gangavataran                Gangavataran  2018             134           \N


In [3]:
# Load the ratings CSV; the path is relative to the working directory.
# The recorded preview shows the columns tconst, averageRating and numVotes.
df_title_ratings = pd.read_csv(filepath_or_buffer='./title_ratings.csv')

# Sanity check: dump the first five rows as plain text to confirm the file parsed.
print(df_title_ratings.head(n=5).to_string())

      tconst  averageRating  numVotes
0  tt0000001            5.6      1543
1  tt0000002            6.1       186
2  tt0000003            6.5      1201
3  tt0000004            6.2       114
4  tt0000005            6.1      1921


In [4]:
# Join the basics frame with the ratings frame on the shared 'tconst' key.
# The join type is inner (the pandas default, spelled out here), so titles
# that have no matching ratings row are dropped from the result.
merged_df_title = df_title_basics_2018.merge(df_title_ratings, how='inner', on='tconst')

# Preview the first five merged rows to confirm the rating columns were attached.
print(merged_df_title.head(n=5).to_string())

      tconst                primaryTitle                originalTitle  year  runtimeMinutes       genres  averageRating  numVotes
0  tt0069049  The Other Side of the Wind   The Other Side of the Wind  2018             122        Drama            6.9      4937
1  tt0170651        T.G.M. - osvoboditel         T.G.M. - osvoboditel  2018              60  Documentary            7.5         6
2  tt0192528               Heaven & Hell               Reverse Heaven  2018             104        Drama            3.9        74
3  tt0253093                Gangavataran                 Gangavataran  2018             134           \N            6.6         8
4  tt0262759    Seven Jews from My Class  Siedmiu Zydów z mojej klasy  2018              40  Documentary            7.0         6


In [5]:
# Keep only the rows whose 'year' column equals 2018.
films_year_2018 = merged_df_title.loc[merged_df_title['year'].eq(2018)]

# Summary statistics of the numeric columns; the 'year' column should show
# min == max == 2018 if the filter worked.
print(films_year_2018.describe())

         year  runtimeMinutes  averageRating       numVotes
count  7229.0     7229.000000    7229.000000    7229.000000
mean   2018.0       96.271130       6.246217    2588.912989
std       0.0       27.400867       1.494149   19831.779889
min    2018.0        7.000000       1.000000       5.000000
25%    2018.0       83.000000       5.400000      18.000000
50%    2018.0       93.000000       6.400000      69.000000
75%    2018.0      106.000000       7.300000     346.000000
max    2018.0      840.000000      10.000000  719146.000000


In [6]:
# Keep only the 2018 films whose 'averageRating' is at least 8.0 (inclusive).
films_scored_8_or_higer = films_year_2018.loc[films_year_2018['averageRating'].ge(8.0)]

# Summary statistics of the numeric columns; the 'averageRating' minimum
# should be no lower than 8.0 if the filter worked.
print(films_scored_8_or_higer.describe())

         year  runtimeMinutes  averageRating       numVotes
count   780.0      780.000000     780.000000     780.000000
mean   2018.0       93.455128       8.478590    2753.816667
std       0.0       37.425174       0.414829   32233.269853
min    2018.0       13.000000       8.000000       5.000000
25%    2018.0       75.000000       8.100000      10.000000
50%    2018.0       90.000000       8.400000      23.000000
75%    2018.0      104.250000       8.800000      71.250000
max    2018.0      601.000000      10.000000  719146.000000


In [7]:
# Report how many films remain after the >= 8.0 rating filter.
# `.count()` tallies the non-null 'averageRating' values in the filtered frame.
# NOTE(review): the message text contains typos ('fils', 'higer'); it is left
# unchanged so the printed line stays identical to the recorded output.
print(f"Total Number of fils with a score of 8.0 or higer: {films_scored_8_or_higer['averageRating'].count()}")

Total Number of fils with a score of 8.0 or higer: 780
