In [9]:
# import initial dependencies
import csv
import json
import time
from collections import Counter
from collections import defaultdict
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import scipy.stats as stats
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

from api_key import omdb_key

In [2]:
# Load the Academy Awards CSV, keeping only the columns this notebook uses
awards_csv = "academy_awards_data_2.csv"
awards_columns = ['Nominee', 'Year', 'Category', 'Won?']
awards_df = pd.read_csv(awards_csv, usecols=awards_columns, encoding='latin-1')

# Keep only the rows whose Category is Best Picture
is_best_picture = awards_df["Category"] == "Best Picture"
award_data = awards_df[is_best_picture]

# Nominee column (a Series) of the Best Picture rows
best_picture_noms = award_data["Nominee"]

In [None]:
# NO NEED TO RUN THIS CELL AGAIN BECAUSE DATA IS ALREADY READ INTO CSV BELOW,
# BUT TOTALLY FEEL FREE TO TEST IF YOU'VE INCLUDED ADDITIONAL/DIFFERENT AWARD CATEGORIES
#
# Queries OMDb once per Best Picture nominee title and collects selected
# response fields into the parallel lists below (one entry per processed movie).

# running number of the movie being processed, shown in the progress log
movie_number = 1

# empty lists for holding movie data
box_office = []
genre = []
meta_score = []
imdb_rating = []
title = []
poster_url = []
rated = []
release_date = []
studio = []

best_picture_noms = award_data["Nominee"]
base_url = "http://www.omdbapi.com/?"

# print statement as each movie is processed
print(f"Beginning Data Retrieval")
print(f"==============================")

# loop through the movies in the best picture noms dataframe
for movie in best_picture_noms:

    params = {
        "apikey": omdb_key,
        "t": movie
    }

    try:
        omdb_data_raw = requests.get(base_url, params=params)
        omdb_data = omdb_data_raw.json()

        # Read every field BEFORE appending anything: if a key is missing the
        # whole movie is skipped, so the lists can never end up with
        # different lengths (which would break the DataFrame built from them).
        movie_fields = (
            omdb_data["BoxOffice"],
            omdb_data["Genre"],
            omdb_data["Metascore"],
            omdb_data["imdbRating"],
            omdb_data["Title"],
            omdb_data["Poster"],
            omdb_data["Rated"],
            omdb_data["Released"],
            omdb_data["Production"],
        )

    # skip if the request failed, the body was not JSON, or a field is missing
    except (requests.RequestException, ValueError, KeyError):
        print("Data missing or movie not found. Skipping...")

    else:
        box_office.append(movie_fields[0])
        genre.append(movie_fields[1])
        meta_score.append(movie_fields[2])
        imdb_rating.append(movie_fields[3])
        title.append(movie_fields[4])
        poster_url.append(movie_fields[5])
        rated.append(movie_fields[6])
        release_date.append(movie_fields[7])
        studio.append(movie_fields[8])

        # The request URL is deliberately not printed: its query string
        # contains the API key, which would be saved in the notebook output.
        print(f"Processing Record {movie_number} | {movie_fields[4]}")

        # increase movie number by one for each processed movie
        movie_number = movie_number + 1

    # to avoid the 60 rpm api limit, wait just over 1 second after every
    # request (failed lookups are requests too)
    time.sleep(1.01)

print(f"==============================")
print(f"Data Retrieval Complete")
print(f"==============================")

In [None]:
# NO NEED TO RUN THIS CELL AGAIN BECAUSE DATA IS ALREADY READ INTO CSV BELOW,
# BUT TOTALLY FEEL FREE TO TEST IF YOU'VE INCLUDED ADDITIONAL/DIFFERENT AWARD CATEGORIES

# Column name -> list filled by the OMDb retrieval loop.
# The dict order determines the column order of the resulting frame / CSV.
omdb_columns = {
    "Title": title,
    "Genre": genre,
    "Meta_Score": meta_score,
    "imdb_Rating": imdb_rating,
    "Box_Office": box_office,
    "Rated": rated,
    "Studio": studio,
    "Release_Date": release_date,
    "Poster_URL": poster_url,
}
filtered_omdb_data_df = pd.DataFrame(omdb_columns)

# Persist to CSV so later cells can work from the file instead of re-querying the API
omdb_output_csv = 'filtered_omdb_data.csv'
filtered_omdb_data_df.to_csv(omdb_output_csv, index=False)

In [None]:
# Add a 'Season' column derived from the month in 'Release_Date' and save the result.
merged_movie_data = 'merged_movie_data.csv'
movie_data_df = pd.read_csv(merged_movie_data)

# Separate seasons into bins with the months that each contain inside
winter = ['Dec', 'Jan', 'Feb']
spring = ['Mar', 'Apr', 'May']
summer = ['Jun', 'Jul', 'Aug']
fall = ['Sep', 'Oct', 'Nov']

# Invert the bins into a month -> season lookup
month_to_season = {}
for season_name, season_months in (('Winter', winter), ('Spring', spring),
                                   ('Summer', summer), ('Fall', fall)):
    for month_name in season_months:
        month_to_season[month_name] = season_name

# The month is taken as the second space-separated word of 'Release_Date'
# (assumes a "DD Mon YYYY"-style value -- TODO confirm against the CSV).
# Missing dates or dates with fewer than two words yield NaN here.
release_month = movie_data_df['Release_Date'].astype(str).str.split(' ').str[1]

# Anything that cannot be mapped to a season (missing date, unexpected format,
# unrecognised month) is labelled 'Unknown' rather than left blank.
movie_data_df['Season'] = release_month.map(month_to_season).fillna('Unknown')

movie_data_df.to_csv('Seasons_Movie_Data.csv', index=False)

In [15]:
# Read the cached OMDb data back from CSV
filtered_omdb_csv = "filtered_omdb_data.csv"
filtered_omdb_csv_df = pd.read_csv(filtered_omdb_csv)

# Join the OMDb rows to the awards rows where the OMDb Title equals the awards Nominee
merged_movie_data_df = filtered_omdb_csv_df.merge(
    award_data,
    left_on="Title",
    right_on="Nominee",
)
# merged_movie_data_df.to_csv('merged_movie_data.csv', index=False)

# import Eric's csv that adds a seasons column to "merged_movie_data_df"
seasons_omdb_csv = "Seasons_Movie_Data_2.csv"
seasons_omdb_csv_df = pd.read_csv(seasons_omdb_csv)

In [16]:
# Collapse spelling/casing variants in the Rated column onto one label each
rated_aliases = {
    'NOT RATED': 'Not Rated',
    'PASSED': 'Passed',
    'UNRATED': 'Not Rated',
    'Unrated': 'Not Rated',
    'APPROVED': 'Approved',
}
seasons_omdb_csv_df['Rated'] = seasons_omdb_csv_df['Rated'].replace(rated_aliases)

In [None]:
# Grouped bar chart: count of each rating (R, PG, G, etc.) for winning vs. losing nominees

# set width of bar
barWidth = 0.25

# set height of bar (hard-coded counts, listed in the same order as rating_labels)
height_winning_rated = [22, 19, 12, 9, 7, 2, 6]
height_losing_rated = [89, 86, 67, 38, 39, 42, 20]
rating_labels = ['R', 'Not Rated', 'PG', 'PG-13', 'Passed', 'Approved', 'G']

# Set position of bar on X axis: the two bars of a group are centred at r1 and r1 + barWidth
r1 = np.arange(len(height_winning_rated))
r2 = [x + barWidth for x in r1]

# Make the plot
plt.bar(r1, height_winning_rated, color='navy', width=barWidth, edgecolor='white', label='Winning Noms')
plt.bar(r2, height_losing_rated, color='orange', width=barWidth, edgecolor='white', label='Losing Noms')

plt.title("'Best Picture' Nominee Movie Rating (1927-2010)", fontweight='bold')
plt.xlabel('Movie Rating', fontweight='bold')
plt.ylabel("Count of Movie Rating", fontweight='bold')

# Add xticks on the middle of the group bars: halfway between the two bar
# centres (r1 and r1 + barWidth), i.e. r1 + barWidth / 2
plt.xticks(r1 + barWidth / 2, rating_labels)

# create legend and show graphic
plt.legend()
# plt.savefig("count_of_rated_grouped.png")
plt.show()

In [None]:
# Split the nominees by the value of their "Seasons" column
season_column = seasons_omdb_csv_df["Seasons"]
winter_noms = seasons_omdb_csv_df[season_column == "Winter"]
spring_noms = seasons_omdb_csv_df[season_column == "Spring"]
summer_noms = seasons_omdb_csv_df[season_column == "Summer"]
fall_noms = seasons_omdb_csv_df[season_column == "Fall"]

# Number of nominees per Rated value within each season
winter_rated = winter_noms['Rated'].value_counts()
spring_rated = spring_noms['Rated'].value_counts()
summer_rated = summer_noms['Rated'].value_counts()
fall_rated = fall_noms['Rated'].value_counts()

In [None]:
# Grouped bar chart: count of each rating (R, PG, G, etc.) by release season

# set width of bar
barWidth = 0.25

# set height of bar (hard-coded counts, listed in the same order as rating_labels)
height_winter_rated = [60, 36, 30, 25, 11, 7, 4]
height_spring_rated = [12, 12, 22, 5, 10, 13, 5]
height_summer_rated = [14, 16, 19, 7, 13, 10, 10]
height_fall_rated = [25, 15, 30, 10, 10, 16, 7]
rating_labels = ['R', 'PG', 'Not Rated', 'PG-13', 'Approved', 'Passed', 'G']

# Set position of bar on X axis: each group has four bars centred at
# r1, r1 + barWidth, r1 + 2 * barWidth and r1 + 3 * barWidth
r1 = np.arange(len(height_winter_rated))
r2 = [x + barWidth for x in r1]
r3 = [x + barWidth for x in r2]
r4 = [x + barWidth for x in r3]

# Make the plot
plt.bar(r1, height_winter_rated, color='navy', width=barWidth, edgecolor='white', label='Winter')
plt.bar(r2, height_spring_rated, color='orange', width=barWidth, edgecolor='white', label='Spring')
plt.bar(r3, height_summer_rated, color='red', width=barWidth, edgecolor='white', label='Summer')
plt.bar(r4, height_fall_rated, color='black', width=barWidth, edgecolor='white', label='Fall')

plt.title("'Best Picture' Nominee Movie Rating by Season (1927-2010)", fontweight='bold')
plt.xlabel('Movie Rating', fontweight='bold')
plt.ylabel("Count of Movie Rating", fontweight='bold')

# Add xticks on the middle of the group bars: the midpoint between the first
# (r1) and last (r1 + 3 * barWidth) bar centres is r1 + 1.5 * barWidth
plt.xticks(r1 + 1.5 * barWidth, rating_labels)

# create legend and show graphic
plt.legend()
# plt.savefig("count_of_rated_by_season.png")
plt.show()

In [None]:
# Scatter plot of Metascore (x axis) against imdb rating (y axis) for every movie in the CSV
movie_data_to_read = 'Seasons_Movie_Data.csv'
movie_data_df = pd.read_csv(movie_data_to_read)

plt.scatter(movie_data_df['Meta_Score'], movie_data_df['imdb_Rating'], marker='o', s=10)

plt.title('Metascore vs. imdb Rating')
# Labels follow the plotted data: Meta_Score is the x argument, imdb_Rating the y argument
plt.xlabel('Metacritic Score')
plt.ylabel('imdb Rating')
plt.grid(True)

# plt.savefig('Metascore vs. imdb.png')

# call show() -- a bare `plt.show` only references the function and renders nothing
plt.show()

In [None]:
# Tally how many times each genre word occurs in merged_movie_data.csv
# (this is for all of the movies)
word_counts = {}
with open('merged_movie_data.csv', 'r') as csvfile:
    rows = csv.reader(csvfile)
    next(rows)  # skip the header row
    for record in rows:
        # the second CSV field is split on ", " into individual words
        # (presumably the Genre column -- verify against the file header)
        for genre_word in record[1].split(", "):
            word_counts[genre_word] = word_counts.get(genre_word, 0) + 1

word_counts

In [None]:
# Bar chart of hard-coded genre counts for all Best Picture nominees.
# `genre` and `count` are parallel lists: count[i] belongs to genre[i].
genre = [
    'Drama', 'Adventure', 'Biography', 'Thriller', 'Comedy', 'Musical', 'Romance',
    'Crime', 'Sci-Fi', 'War', 'History', 'Fantasy', 'Mystery', 'Music',
    'Action', 'Family', 'Western', 'Animation', 'Sport', 'Film-Noir', 'Horror',
]
count = [
    413, 49, 89, 57, 100, 36, 203,
    54, 4, 56, 65, 25, 35, 21,
    24, 25, 17, 4, 17, 10, 3,
]

plt.bar(genre, count, color='r', alpha=1.0, align="center")
# rotate the genre names so they do not overlap
plt.xticks(rotation=90)

plt.xlabel("Genre")
plt.ylabel("Count of Genre of Nominees")
plt.title("Genre Count Of Best Picture Nominees")

plt.show()

In [None]:
# Read the merged movie/awards CSV, restricted to the columns listed below
movies_csv = 'merged_movie_data.csv'
movie_columns = [
    'Title', 'Genre', 'Meta_Score', 'imdb_Rating',
    'Box_Office', 'Rated', 'Studio', 'Release_Date',
    'Poster_URL', 'Plot', 'Year', 'Category', 'Nominee',
    'Won?',
]
movies_df = pd.read_csv(movies_csv, usecols=movie_columns, encoding='latin-1')

# Keep only the rows marked as winners
is_winner = movies_df['Won?'] == 'YES'
movies_data = movies_df[is_winner]

# 'Won?' column of the winning rows
best_picture_wins = movies_data["Won?"]

movies_data.head()

In [None]:
# Save the winners-only frame so the next cell can read it back with csv.reader
winners_output_csv = 'best_picture_winners.csv'
movies_data.to_csv(winners_output_csv, index=False)

In [None]:
# Tally how many times each genre word occurs in best_picture_winners.csv
winner_word_counts = {}
with open('best_picture_winners.csv', 'r') as csvfile:
    rows = csv.reader(csvfile)
    next(rows)  # skip the header row
    for record in rows:
        # the second CSV field is split on ", " into individual words
        # (presumably the Genre column -- verify against the file header)
        for genre_word in record[1].split(", "):
            winner_word_counts[genre_word] = winner_word_counts.get(genre_word, 0) + 1

winner_word_counts

In [None]:
# Bar chart of hard-coded genre counts for Best Picture winners.
# `genre_winners` and `count_winners` are parallel lists.
genre_winners = [
    'Drama', 'Adventure', 'Biography', 'Thriller', 'Comedy', 'Musical', 'Romance',
    'Crime', 'Sci-Fi', 'War', 'History', 'Fantasy', 'Mystery', 'Music',
    'Action', 'Family', 'Western', 'Animation', 'Sport', 'Film-Noir', 'Horror',
]
count_winners = [
    72, 9, 17, 10, 12, 9, 29,
    11, 0, 15, 14, 1, 2, 2,
    3, 6, 3, 0, 3, 1, 0,
]

plt.bar(genre_winners, count_winners, color='r', alpha=1.0, align="center")
# rotate the genre names so they do not overlap
plt.xticks(rotation=90)

plt.xlabel("Genre")
plt.ylabel("Count of Genre of Winning Movies")
plt.title("Genre Count Of Best Picture Winning Movies")

plt.show()

In [None]:
# Grouped bar chart: genre counts of Best Picture winners next to the genre
# counts of all nominees (hard-coded, listed in the same order as genre_labels)

# set width of bar
barWidth = 0.25

# set height of bar
nominee_genre_count = [413, 49, 89, 57, 100, 36, 203, 54, 4, 56, 65, 25, 35, 21, 24, 25, 17, 4, 17, 10, 3]
winner_genre_count = [72, 9, 17, 10, 12, 9, 29, 11, 0, 15, 14, 1, 2, 2, 3, 6, 3, 0, 3, 1, 0]
genre_labels = ['Drama', 'Adventure', 'Biography',
                'Thriller', 'Comedy', 'Musical', 'Romance',
                'Crime', 'Sci-Fi', 'War', 'History', 'Fantasy',
                'Mystery', 'Music', 'Action', 'Family', 'Western',
                'Animation', 'Sport', 'Film-Noir', 'Horror']

# Set position of bar on X axis: the two bars of a group are centred at r1 and r1 + barWidth
r1 = np.arange(len(nominee_genre_count))
r2 = [x + barWidth for x in r1]

# Make the plot
plt.bar(r1, winner_genre_count, color='navy', width=barWidth, edgecolor='white', label='Winning Noms')
# nominee_genre_count is the tally over ALL nominees (winners included), so it
# is labelled accordingly rather than as "losing" nominees
plt.bar(r2, nominee_genre_count, color='orange', width=barWidth, edgecolor='white', label='All Noms')

plt.title("Genre Count Of Best Picture Winning Movies vs Nominee", fontweight='bold')
plt.xlabel('Movie Genre', fontweight='bold')
plt.ylabel("Count of Genre", fontweight='bold')

# Add xticks on the middle of the group bars: halfway between the two bar
# centres (r1 and r1 + barWidth), i.e. r1 + barWidth / 2
plt.xticks(r1 + barWidth / 2, genre_labels, rotation=90)

# create legend, show graphic
plt.legend()
plt.show()

In [None]:
# Word cloud built from the Genre column of every movie in merged_movie_data.csv
merged_movie_data_df = pd.read_csv("merged_movie_data.csv")

# read_csv turns blank and "N/A" cells into NaN (a float), and str.join raises
# TypeError on non-string items, so missing genres are dropped before joining
genre_text = ' '.join(merged_movie_data_df['Genre'].dropna())

# creating wordcloud (https://www.datacamp.com/community/tutorials/wordcloud-python)
word_cloud = WordCloud(max_font_size=75, max_words=100,
                       background_color="white").generate(genre_text)

# generate plot
plt.title("Genre Wordcloud")
plt.imshow(word_cloud)
plt.axis("off")
plt.show()

In [None]:
# Word cloud built from the Genre column of winners_only.csv
# NOTE(review): an earlier cell writes the winners to 'best_picture_winners.csv';
# confirm 'winners_only.csv' is the intended input here.
winners_only_df = pd.read_csv("winners_only.csv")

# read_csv turns blank and "N/A" cells into NaN (a float), and str.join raises
# TypeError on non-string items, so missing genres are dropped before joining
winners_genre_text = ' '.join(winners_only_df['Genre'].dropna())

# creating wordcloud (https://www.datacamp.com/community/tutorials/wordcloud-python)
word_cloud = WordCloud(max_font_size=75, max_words=100,
                       background_color="white").generate(winners_genre_text)

# generate plot
plt.title("Genre Winners Wordcloud")
plt.imshow(word_cloud)
plt.axis("off")
plt.show()

In [None]:
# Bar chart of the mean imdb rating of winning movies, grouped by release season
# NOTE(review): this reads a 'Seasons' column from "Seasons_Movie_data.csv", while
# another cell writes a 'Season' column to 'Seasons_Movie_Data.csv' -- confirm
# the file name and column name are the intended ones.
seasons_csv = "Seasons_Movie_data.csv"
seasons_df = pd.read_csv(seasons_csv, usecols = ['Title', 'imdb_Rating', 'Nominee', 'Won?', 'Seasons'], encoding = 'latin-1')
winning_noms = seasons_df[seasons_df["Won?"] == "YES"]

fall_movies = winning_noms[winning_noms['Seasons'] == 'Fall']
spring_movies = winning_noms[winning_noms['Seasons'] == 'Spring']
summer_movies = winning_noms[winning_noms['Seasons'] == 'Summer']
winter_movies = winning_noms[winning_noms['Seasons'] == 'Winter']

fall_movies_imdb = fall_movies['imdb_Rating']
spring_movies_imdb = spring_movies['imdb_Rating']
summer_movies_imdb = summer_movies['imdb_Rating']
winter_movies_imdb = winter_movies['imdb_Rating']

# Plot the means computed from the data (rounded to one decimal) instead of
# hand-copied numbers, so the chart cannot drift from the CSV contents.
# Ratings are listed in the same order as x_values.
x_values = ['Fall','Winter','Summer','Spring']
ratings_by_season = [fall_movies_imdb, winter_movies_imdb, summer_movies_imdb, spring_movies_imdb]
y_values = [round(float(ratings.mean()), 1) for ratings in ratings_by_season]

plt.ylim([0,10])
bars = plt.bar(x_values, y_values, color = 'g', alpha = 1.0, align="center")
plt.title('Average iMDB Rating Of Award-Winning Movies Per Season')
plt.xlabel('Seasons')
plt.ylabel('iMDB Rating')

# write each bar's value just above it
for bar in bars:
    yval = bar.get_height()
    plt.text(bar.get_x() + .25, yval + 0.5, yval)
plt.show()

In [None]:
# Example t-test: imdb ratings of Fall releases vs. Winter releases
# (equal_var=False, i.e. equal population variances are not assumed)
season_column = movie_data_df['Season']

fall_movies = movie_data_df[season_column == 'Fall']
spring_movies = movie_data_df[season_column == 'Spring']
summer_movies = movie_data_df[season_column == 'Summer']
winter_movies = movie_data_df[season_column == 'Winter']

# imdb rating column of each seasonal subset
fall_movies_imdb = fall_movies['imdb_Rating']
spring_movies_imdb = spring_movies['imdb_Rating']
summer_movies_imdb = summer_movies['imdb_Rating']
winter_movies_imdb = winter_movies['imdb_Rating']

stats.ttest_ind(
    fall_movies_imdb,
    winter_movies_imdb,
    equal_var=False,
)

In [18]:
# Example t-test: imdb ratings of winning nominees vs. losing nominees
# (equal_var=False, i.e. equal population variances are not assumed)
won_column = seasons_omdb_csv_df["Won?"]

# split the nominees on the value of "Won?"
winning_noms = seasons_omdb_csv_df[won_column == "YES"]
losing_noms = seasons_omdb_csv_df[won_column == "NO"]

winning_noms_imdb = winning_noms['imdb_Rating']
losing_noms_imdb = losing_noms['imdb_Rating']

stats.ttest_ind(
    winning_noms_imdb,
    losing_noms_imdb,
    equal_var=False,
)

Ttest_indResult(statistic=3.5733867271080175, pvalue=0.0005327364755244779)