# Notebook cell exported from Jupyter. The original "In [None]:" cell marker
# is not valid Python, so it survives only as this comment.

# pandas provides the DataFrame; matplotlib.pyplot is used for the plot below.
# Each import goes on its own line (two imports on one line do not parse).
import pandas as pd
import matplotlib.pyplot as plt

# Read the episode data into a DataFrame, parsing release_date as datetimes.
office_df = pd.read_csv('datasets/office_episodes.csv', parse_dates=['release_date'])

def _rating_color(rating):
    """Return the marker color for one scaled rating.

    Thresholds are checked in ascending order. Any value that is not below
    0.75 falls through to 'darkgreen'; that includes NaN, because every
    ``<`` comparison against NaN is False.
    """
    if rating < 0.25:
        return 'red'
    if rating < 0.50:
        return 'orange'
    if rating < 0.75:
        return 'lightgreen'
    return 'darkgreen'


# Build one color and one marker size per episode by walking the relevant
# column directly instead of iterating over whole rows twice.
cols = [_rating_color(rating) for rating in office_df['scaled_ratings']]

# Episodes flagged has_guests get the large size (250), the rest the small
# one (25).
# NOTE(review): assumes has_guests holds booleans -- confirm against the CSV.
sizes = [250 if has_guests else 25 for has_guests in office_df['has_guests']]

# Store both lists as columns so they stay aligned with the rows when the
# DataFrame is filtered later on.
office_df['colors'] = cols
office_df['sizes'] = sizes

# Split the episodes on the has_guests flag so each group can be drawn with
# its own marker: rows where the flag equals True go to guest_df, rows where
# it equals False go to non_guest_df.
guest_df = office_df.loc[office_df['has_guests'].eq(True)]
non_guest_df = office_df.loc[office_df['has_guests'].eq(False)]

# Set the default figure size and the plot style before creating the figure.
plt.rcParams['figure.figsize'] = [11, 7]
plt.style.use('fivethirtyeight')

# Create the figure that both scatter calls below draw into.
fig = plt.figure()

# Both scatter plots put the episode number on the x axis and the viewership
# (in millions) on the y axis, colored and sized by the per-episode columns.
# The arguments sit inside parentheses, so the lines continue implicitly and
# no trailing backslash is needed.

# Episodes without guests: default marker.
plt.scatter(
    x=non_guest_df['episode_number'],
    y=non_guest_df['viewership_mil'],
    c=non_guest_df['colors'],
    s=non_guest_df['sizes'],
)

# Episodes with guests: star marker.
plt.scatter(
    x=guest_df['episode_number'],
    y=guest_df['viewership_mil'],
    c=guest_df['colors'],
    s=guest_df['sizes'],
    marker='*',
)

# Title and axis labels.
plt.title("Popularity, Quality, and Guest Appearances on the Office", fontsize=28)
plt.xlabel("Episode Number", fontsize=18)
plt.ylabel("Viewership (Millions)", fontsize=18)

# Show the plot.
plt.show()

# Print the guest_stars entry of every episode whose viewership exceeds
# 20 million, to see who appeared in the most-watched episode(s).
print(office_df[office_df['viewership_mil'] > 20]['guest_stars'])

# Hard-coded answer; presumably picked by hand from the output printed above.
top_star = 'Jessica Alba'