In [1]:
#### Determine Outliers using the Quartile and Mean/SD Methods ####

# Importing necessary libraries
import pandas as pd

# Load the dataset
IMDb_df = pd.read_csv('IMDb.csv')

# Calculating the mean, standard deviation, quartiles and IQR for 'IMDB_Rating'
mean_IMDB_Rating = IMDb_df['IMDB_Rating'].mean()
std_IMDB_Rating = IMDb_df['IMDB_Rating'].std()
Q1 = IMDb_df['IMDB_Rating'].quantile(0.25)
Q3 = IMDb_df['IMDB_Rating'].quantile(0.75)
IQR = Q3 - Q1

# Quartile Method Thresholds
mild_outliers_q_low = Q1 - 1.5 * IQR
mild_outliers_q_high = Q3 + 1.5 * IQR
regular_outliers_q_low = Q1 - 3 * IQR
regular_outliers_q_high = Q3 + 3 * IQR

# Mean/SD Method Thresholds
mild_outliers_sd_low = mean_IMDB_Rating - 2 * std_IMDB_Rating
mild_outliers_sd_high = mean_IMDB_Rating + 2 * std_IMDB_Rating
regular_outliers_sd_low = mean_IMDB_Rating - 3 * std_IMDB_Rating
regular_outliers_sd_high = mean_IMDB_Rating + 3 * std_IMDB_Rating

# Creating a table to display the results
outlier_thresholds = pd.DataFrame({
    'Type': ['Mild Outliers', 'Regular Outliers', 'Mild Outliers', 'Regular Outliers'],
    'Method': ['Quartile Method', 'Quartile Method', 'Mean/SD Method', 'Mean/SD Method'],
    'Lower Threshold': [mild_outliers_q_low, regular_outliers_q_low, mild_outliers_sd_low, regular_outliers_sd_low],
    'Upper Threshold': [mild_outliers_q_high, regular_outliers_q_high, mild_outliers_sd_high, regular_outliers_sd_high]
})

print(outlier_thresholds)

# Counting the number of outliers above each UPPER threshold
outlier_count = {
     'High Mild Outliers (Quartile Method)': (IMDb_df['IMDB_Rating'] > mild_outliers_q_high).sum(),
     'High Regular Outliers (Quartile Method)': (IMDb_df['IMDB_Rating'] > regular_outliers_q_high).sum(),
     'High Mild Outliers (Mean/SD Method)': (IMDb_df['IMDB_Rating'] > mild_outliers_sd_high).sum(),
     'High Regular Outliers (Mean/SD Method)': (IMDb_df['IMDB_Rating'] > regular_outliers_sd_high).sum()
 }

# Print the count of outliers
print(outlier_count)

# Counting the number of outliers below each LOWER threshold
outlier_count = {
     'Low Mild Outliers (Quartile Method)': (IMDb_df['IMDB_Rating'] < mild_outliers_q_low).sum(),
     'Low Regular Outliers (Quartile Method)': (IMDb_df['IMDB_Rating'] < regular_outliers_q_low).sum(),
     'Low Mild Outliers (Mean/SD Method)': (IMDb_df['IMDB_Rating'] < mild_outliers_sd_low).sum(),
     'Low Regular Outliers (Mean/SD Method)': (IMDb_df['IMDB_Rating'] < regular_outliers_sd_low).sum()
 }

# Print the count of outliers
print(outlier_count)

               Type           Method  Lower Threshold  Upper Threshold
0     Mild Outliers  Quartile Method         7.100000         8.700000
1  Regular Outliers  Quartile Method         6.500000         9.300000
2     Mild Outliers   Mean/SD Method         7.380524         8.513339
3  Regular Outliers   Mean/SD Method         7.097320         8.796543
{'High Mild Outliers (Quartile Method)': 13, 'High Regular Outliers (Quartile Method)': 1, 'High Mild Outliers (Mean/SD Method)': 29, 'High Regular Outliers (Mean/SD Method)': 13}
{'Low Mild Outliers (Quartile Method)': 0, 'Low Regular Outliers (Quartile Method)': 0, 'Low Mild Outliers (Mean/SD Method)': 0, 'Low Regular Outliers (Mean/SD Method)': 0}
