# IMDB Top 1000 analysis

## 1. Loading the data from a CSV and putting it in a list

In [3]:
import csv

# Read the whole CSV into memory as a list of dicts, one dict per row, keyed
# by the column names taken from the header row.
# newline="" is what the csv module documentation asks for when opening a
# file: the reader then handles line endings itself, so a line break inside a
# quoted field is kept as part of that field instead of being mis-parsed.
with open("imdb_top_1000.csv", encoding="utf-8", newline="") as csvfile:
    data = list(csv.DictReader(csvfile))

# Number of rows loaded (the header row is consumed as the field names)
print(len(data))

## 2. Calculating the average rating
Below we calculate the average IMDB rating across all movies. Because the Python `csv` module reads every value as a string, we need to convert each rating into a float before we can add them up.

In [4]:

# Every value read from the CSV is a string, so convert each rating to a float first
ratings = [float(movie["IMDB_Rating"]) for movie in data]

# Add the ratings up one at a time, in dataset order
total = 0
for rating in ratings:
    total += rating

# Mean = sum of all ratings / number of movies
rating_avg = total / len(ratings)
print(rating_avg)

7.949300000000012


## 3. Counting movies with specific director
Below we count all movies that were directed by Peter Jackson. We do this by looping through our dataset and only increasing the count if `movie["Director"]` is equal to `"Peter Jackson"`.

In [29]:
# Running tally of Peter Jackson movies; goes up by one for every match
count = 0

for movie in data:
    # Movies by any other director do not contribute to the tally
    if movie["Director"] != "Peter Jackson":
        continue
    count += 1

print(count)

5


## 4. Filtering movies
Instead of just counting, we may want to keep only the movies that match a certain criterion so that we can perform more analysis on that subset. Below we create a new list containing only the movies that Peter Jackson has directed.

In [5]:
# Keep only the rows whose director is Peter Jackson, in their original order.
# The list holds the same dict objects as `data`, not copies.
filtered_movies = [
    movie
    for movie in data
    if movie["Director"] == "Peter Jackson"
]

### 4.1 Average of a filtered list
We then use the filtered list to calculate the average in very much the same way as above.

In [6]:
# First pass: list each movie in the subset next to its rating
for movie in filtered_movies:
    title, rating = movie["Series_Title"], movie["IMDB_Rating"]
    print(f"{title} - {rating}")

# Second pass: add the ratings up (they are strings, hence the float conversion)
total = 0
for movie in filtered_movies:
    total += float(movie["IMDB_Rating"])

# Mean rating over the subset only
rating_avg = total / len(filtered_movies)

print(f"Average rating - {rating_avg}")

The Lord of the Rings: The Return of the King - 8.9
The Lord of the Rings: The Fellowship of the Ring - 8.8
The Lord of the Rings: The Two Towers - 8.7
The Hobbit: The Desolation of Smaug - 7.8
The Hobbit: An Unexpected Journey - 7.8
Average rating - 8.4


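## 5. Grouping by a value
Below we count how many movies each director has by using the director's name as a dictionary key. A `defaultdict(int)` starts every missing key at `0`, so we can increment a count without first checking whether the key exists.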


In [12]:
from pprint import pprint
from collections import defaultdict

# Maps director name -> number of movies; a missing key starts at int() == 0
group_by_director = defaultdict(int)

# Walk the director column in dataset order and bump that director's tally
for director in (movie["Director"] for movie in data):
    group_by_director[director] += 1

### 5.1 Sorting a dictionary
We can then sort this dictionary by its values using the built-in `sorted` function and passing the `key` keyword argument.

In [13]:
# Each item is a (director, movie_count) pair; order the pairs by the count,
# largest first. The result is a list of tuples, not a dictionary.
sorted_director = sorted(
    group_by_director.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

# Show only the five directors with the most movies
pprint(sorted_director[:5])

[('Alfred Hitchcock', 14),
 ('Steven Spielberg', 13),
 ('Hayao Miyazaki', 11),
 ('Martin Scorsese', 10),
 ('Akira Kurosawa', 10)]


## 6. Exercises

### 6.1 Find highest rated movie

In [9]:
# Using a for-loop

# Start with the first movie as the best seen so far, then replace it whenever
# a strictly higher rating is found (so the earliest movie wins a tie).
highest_rated_movie = data[0]

for movie in data:
    # Ratings are read from the CSV as strings. Comparing strings is
    # lexicographic ("9.3" > "10.0" is True), so convert to float before comparing.
    if float(movie["IMDB_Rating"]) > float(highest_rated_movie["IMDB_Rating"]):
        highest_rated_movie = movie

print(f"Highest rated movie is {highest_rated_movie['Series_Title']} with rating {highest_rated_movie['IMDB_Rating']}")

Highest rated movie is The Shawshank Redemption with rating 9.3


In [10]:
# Using the built-in max() function

# The key converts the rating to float: ratings are strings in the CSV, and
# string comparison is lexicographic ("9.3" > "10.0" is True), which would rank
# them incorrectly. max() returns the first movie that has the highest rating.
highest_rated_movie = max(data, key=lambda movie: float(movie["IMDB_Rating"]))

print(f"Highest rated movie is {highest_rated_movie['Series_Title']} with rating {highest_rated_movie['IMDB_Rating']}")

Highest rated movie is The Shawshank Redemption with rating 9.3


### 6.2 Find sum of all gross

In [23]:
total_gross = 0

for movie in data:
    raw_gross = movie["Gross"]

    # Not every movie has a gross; an empty field adds nothing to the total
    if not raw_gross:
        continue

    # The gross is text with commas between the thousands (example: 28,341,469),
    # so the commas have to be removed before it can be converted to an int
    total_gross += int(raw_gross.replace(",", ""))

print(total_gross)

56536877976


### 6.3 Find count of movies by director "Francis Ford Coppola"

In [25]:
# Using a for-loop

# Running tally of Francis Ford Coppola movies
count = 0

for movie in data:
    # Ignore movies directed by anyone else
    if movie["Director"] != "Francis Ford Coppola":
        continue
    count += 1

print(count)

5


In [26]:
# Using a list comprehension and len()

# Build the list of matching movies first; its length is the count we want
ffc_movies = [
    entry
    for entry in data
    if entry["Director"] == "Francis Ford Coppola"
]

ffc_count = len(ffc_movies)
print(ffc_count)

5


### 6.4 Find sum of gross of movies by "Christopher Nolan"

In [31]:
# Use list comprehension to filter only movies by Christopher Nolan
cn_movies = [movie for movie in data if movie["Director"] == "Christopher Nolan"]

# Re-use gross calculation from above but looping through cn_movies instead of all movies

# TODO: This cleaning is ideally something that should be done in the beginning. But more on that next week ;)
cn_gross = 0

for movie in cn_movies:
    # Skip movies whose gross field is empty
    if movie["Gross"]:
        # Remove the thousands separators, then convert the text to an int
        gross = int(movie["Gross"].replace(",", ""))

        cn_gross += gross

print(cn_gross)

# Can also be written as a list comprehension and using sum()
# But in this case I think it's harder to read and understand
#
# The `and movie["Gross"]` condition mirrors the check in the loop: int("")
# raises ValueError, so without it this version would fail on a movie with an
# empty gross field while the loop simply skips it.
cn_gross = sum([
    int(movie["Gross"].replace(",", ""))
    for movie in data
    if movie["Director"] == "Christopher Nolan" and movie["Gross"]
])

print(cn_gross)

1937454106
1937454106


### 6.5 Find the average length of a title

In [32]:
# Total number of characters across all titles (integer arithmetic, so exact)
total_length = sum(len(movie["Series_Title"]) for movie in data)

# Average = total characters / number of movies
average_length = total_length / len(data)
print(average_length)

15.452


### 6.6 Find count of movies per year

In [34]:
# Using a dictionary to group by year

from collections import defaultdict
from pprint import pprint

# Maps release year (kept as the string read from the CSV) -> number of movies
group_by_year = defaultdict(int)

for movie in data:
    year = movie["Released_Year"]
    # Reading a missing year yields 0, so the first movie of a year stores 1
    group_by_year[year] = group_by_year[year] + 1

pprint(group_by_year)

defaultdict(<class 'int'>,
            {'1920': 1,
             '1921': 1,
             '1922': 1,
             '1924': 1,
             '1925': 2,
             '1926': 1,
             '1927': 2,
             '1928': 2,
             '1930': 1,
             '1931': 3,
             '1932': 2,
             '1933': 3,
             '1934': 2,
             '1935': 3,
             '1936': 1,
             '1937': 1,
             '1938': 3,
             '1939': 5,
             '1940': 7,
             '1941': 2,
             '1942': 3,
             '1943': 1,
             '1944': 4,
             '1945': 2,
             '1946': 5,
             '1947': 2,
             '1948': 6,
             '1949': 3,
             '1950': 5,
             '1951': 5,
             '1952': 4,
             '1953': 5,
             '1954': 6,
             '1955': 6,
             '1956': 5,
             '1957': 9,
             '1958': 4,
             '1959': 7,
             '1960': 11,
             '1961': 5,
            

### 6.7 Find the average rating for each year

In [40]:
# Using a dictionary to group movies by year where the value is a list of movies

from collections import defaultdict

# Maps release year -> list of the movie dicts released that year.
# A missing year starts out as an empty list.
group_by_year = defaultdict(list)

for movie in data:
    same_year = group_by_year[movie["Released_Year"]]
    same_year.append(movie)

# Years are visited in the order they first appear in the dataset
for year in group_by_year:
    movies = group_by_year[year]

    # Ratings are strings in the CSV, so convert each one before summing
    sum_rating = sum(float(entry["IMDB_Rating"]) for entry in movies)
    avg_rating = sum_rating / len(movies)

    print(f"{year} - {avg_rating}")

1994 - 8.23846153846154
1972 - 8.087499999999999
2008 - 7.923809523809522
1974 - 8.05
1957 - 8.277777777777779
2003 - 7.9499999999999975
1993 - 7.904347826086957
2010 - 7.904347826086955
1999 - 8.017647058823528
2001 - 7.899999999999998
1966 - 8.057142857142859
2002 - 7.8999999999999995
1990 - 7.8875
1980 - 8.0875
1975 - 8.088888888888889
2020 - 8.133333333333331
2019 - 7.995652173913045
2014 - 7.9187499999999975
1998 - 7.929411764705882
1997 - 7.94736842105263
1995 - 8.010526315789473
1991 - 7.933333333333333
1977 - 8.066666666666668
1962 - 8.061538461538461
1954 - 8.166666666666666
1946 - 8.02
2011 - 7.877777777777776
2006 - 7.89230769230769
2000 - 7.957894736842104
1988 - 8.036363636363635
1985 - 7.966666666666665
1968 - 8.0
1960 - 7.972727272727273
1942 - 8.233333333333333
1936 - 8.5
1931 - 8.200000000000001
2018 - 7.994736842105263
2017 - 7.890909090909091
2016 - 7.93571428571429
2012 - 7.941666666666664
2009 - 7.86551724137931
2007 - 7.8769230769230765
1984 - 8.0
1981 - 7.9750000

### 6.8 Find count of movies per genre

In [41]:
from collections import defaultdict

# Maps genre name -> number of movies tagged with that genre
group_by_genre = defaultdict(int)

for movie in data:
    # The Genre field holds several genres in one string, separated by ", ".
    # Splitting on that separator gives one entry per genre, and a movie
    # counts once towards every genre it lists.
    for genre in movie["Genre"].split(", "):
        group_by_genre[genre] += 1

print(group_by_genre)

defaultdict(<class 'int'>, {'Drama': 724, 'Crime': 209, 'Action': 189, 'Adventure': 196, 'Biography': 109, 'History': 56, 'Sci-Fi': 67, 'Romance': 125, 'Western': 20, 'Fantasy': 66, 'Comedy': 233, 'Thriller': 137, 'Animation': 82, 'Family': 56, 'War': 51, 'Mystery': 99, 'Music': 35, 'Horror': 32, 'Musical': 17, 'Film-Noir': 19, 'Sport': 19})
