In [3]:
import numpy as np
import pydp as dp  
from pydp.algorithms.laplacian import BoundedSum, BoundedMean, Count, Max,Count,Min
import pandas as pd
import re
import statistics

In [4]:
class DataClean:
    """Select, validate and normalise spreadsheet columns for the rating queries.

    An instance accumulates the validated column names it has been asked
    for; ``data_clean`` reads a spreadsheet and returns only those columns,
    with incomplete rows removed and a few columns transformed.
    """

    # Only ASCII letters and spaces are accepted in a column name.
    _NAME_PATTERN = re.compile(r'^[a-zA-Z ]+$')
    # Columns whose values are multiplied by 10 during cleaning
    # (presumably to bring them onto the same range as the other scores -- TODO confirm).
    SCALED_COLUMNS = ("Hidden Gem Score", "IMDb Score")
    # Column holding a comma-separated string that is split into a list of trimmed items.
    LIST_COLUMN = "Genre"

    def __init__(self):
        # Validated column names, in the order they were first added.
        self.column_name = []
        # Not written anywhere in this class; kept so existing readers of the attribute still work.
        self.titles = []
        # Maps each column passed to data_clean to its position in that call's `columns` list.
        self.find_score = dict()

    def add_column(self, column: str):
        """Validate ``column`` and remember it for later selection.

        Adding a name that is already registered is a no-op, so running
        ``data_clean`` more than once on the same instance does not produce
        duplicated column labels.

        Raises:
            RuntimeError: if ``column`` contains anything other than ASCII
                letters and spaces.
        """
        if not self._NAME_PATTERN.match(column):
            raise RuntimeError("illegal column adding")
        if column not in self.column_name:
            self.column_name.append(column)

    def data_clean(self, name: str, columns: list[str]):
        """Read the spreadsheet ``name`` and return the cleaned selection.

        Args:
            name: path handed to ``pandas.read_excel``.
            columns: column names to keep; each is validated by ``add_column``.

        Returns:
            A new DataFrame holding every registered column, with rows that
            contain any missing value dropped, the ``SCALED_COLUMNS``
            multiplied by 10 and ``LIST_COLUMN`` split on commas into a list
            of stripped strings. Transforms are skipped for columns that were
            not selected.

        Raises:
            RuntimeError: if a column name fails validation.
        """
        df = pd.read_excel(name)
        for index, column in enumerate(columns):
            self.add_column(column)
            self.find_score[column] = index
        df = df.loc[:, self.column_name].dropna(axis=0, how='any')

        # Build replacement columns and attach them with assign(): this returns
        # a fresh frame and replaces the columns wholesale instead of writing
        # into a slice of the original data.
        replacements = {
            column: df[column] * 10
            for column in self.SCALED_COLUMNS
            if column in df.columns
        }
        if self.LIST_COLUMN in df.columns:
            replacements[self.LIST_COLUMN] = df[self.LIST_COLUMN].apply(
                lambda x: [i.strip() for i in x.split(",")]
            )
        return df.assign(**replacements)

In [5]:
# Build the cleaned ratings frame that is passed to the query functions.
dc = DataClean()
# Spreadsheet file name, given without a directory component.
filename = "Netflix Dataset Latest 2021.xlsx"
# Column order matters: the query functions read Genre at position 1 and the
# score columns at positions 2-5 through positional (iloc) indexing.
column_name = ['Title','Genre','Hidden Gem Score', 'IMDb Score', 'Metacritic Score', 'Rotten Tomatoes Score']
df = dc.data_clean(filename, column_name)

In [20]:
def highest_rated_movie(df,privacy_budget):
    """Return the row whose mean score is nearest to a differentially private maximum.

    Args:
        df: frame whose columns at positions 2-5 hold the scores to average.
        privacy_budget: epsilon passed to PyDP's ``Max``.

    Returns:
        The row of ``df`` (selected by index label) whose per-row mean is
        closest to the value reported by ``Max`` with bounds 0 and 100.
    """
    row_means = []
    for position in range(df.shape[0]):
        row_means.append(df.iloc[position, 2:6].mean())

    dp_max = Max(epsilon = privacy_budget, lower_bound = 0, upper_bound = 100, dtype="float")
    reported_peak = dp_max.quick_result(row_means)

    # The reported value need not equal any real mean, so pick the nearest one.
    distances = [abs(value - reported_peak) for value in row_means]
    labels = list(df.index)
    nearest_label = labels[np.argmin(distances)]
    return df.loc[nearest_label, :]

def highest_rated_movie_per_genre(df,privacy_budget,genre):
    """Return the row of ``genre`` whose mean score is nearest to a DP maximum.

    Args:
        df: frame with the genre container at column position 1 and the
            scores to average at positions 2-5.
        privacy_budget: epsilon passed to PyDP's ``Max``.
        genre: value tested with ``in`` against each row's genre entry.

    Returns:
        The matching row whose per-row mean is closest to the value reported
        by ``Max`` (bounds 0 and 100) over the matching rows only.

    Raises:
        ValueError: if no row belongs to ``genre``.
    """
    # Remember WHICH rows matched: the argmin below is a position inside the
    # filtered list, so it must be mapped back through these positions rather
    # than through the full, unfiltered index.
    positions = [i for i in range(df.shape[0]) if genre in df.iloc[i, 1]]
    if not positions:
        raise ValueError("no movies found for the requested genre")

    means = [df.iloc[i, 2:6].mean() for i in positions]
    dp_max = Max(epsilon = privacy_budget, lower_bound = 0, upper_bound = 100, dtype="float")
    max_rating = dp_max.quick_result(means)

    nearest = int(np.argmin([abs(x - max_rating) for x in means]))
    return df.iloc[positions[nearest], :]

def lowest_rated_movie(df,privacy_budget):
    """Return the row whose mean score is nearest to a differentially private minimum.

    Mirrors ``highest_rated_movie``: the selection is driven by the value
    reported by PyDP's ``Min`` rather than by the exact minimum, so
    ``privacy_budget`` actually influences the answer.

    Args:
        df: frame whose columns at positions 2-5 hold the scores to average.
        privacy_budget: epsilon passed to PyDP's ``Min``.

    Returns:
        The row of ``df`` whose per-row mean is closest to the value reported
        by ``Min`` with bounds 0 and 100.

    Raises:
        ValueError: if ``df`` has no rows.
    """
    means = [(df.iloc[i, 2:6].mean()) for i in range(df.shape[0])]
    if not means:
        raise ValueError("cannot select a movie from an empty frame")

    dp_min = Min(epsilon = privacy_budget ,lower_bound = 0, upper_bound = 100, dtype="float")
    min_rating = dp_min.quick_result(means)

    # The reported value need not equal any real mean, so pick the nearest one.
    nearest = int(np.argmin([abs(x - min_rating) for x in means]))
    return df.iloc[nearest, :]

def movie_count_above_rating(df,privacy_budget,thresh):
    """Differentially private count of rows whose mean score exceeds ``thresh``.

    Args:
        df: frame whose columns at positions 2-5 hold the scores to average.
        privacy_budget: first positional argument given to PyDP's ``Count``.
        thresh: threshold; must lie within 0.0 and 100.0 inclusive.

    Returns:
        Whatever ``Count.quick_result`` reports for the qualifying means.

    Raises:
        ValueError: if ``thresh`` is outside the accepted range.
    """
    within_range = 0.0 <= thresh <= 100.0
    if not within_range:
        raise ValueError("invalid threshhold")

    qualifying = []
    for position in range(df.shape[0]):
        row_mean = df.iloc[position, 2:6].mean()
        if row_mean > thresh:
            qualifying.append(row_mean)

    counter = Count(privacy_budget, dtype="float")
    return counter.quick_result(qualifying)

def average_per_genre(df,privacy_budget,genre):
    """Differentially private average of the per-row mean score within ``genre``.

    Args:
        df: frame with the genre container at column position 1 and the
            scores to average at positions 2-5.
        privacy_budget: epsilon passed to PyDP's ``BoundedMean``.
        genre: value tested with ``in`` against each row's genre entry.

    Returns:
        Whatever ``BoundedMean.quick_result`` reports for the matching means.
    """
    means = [(df.iloc[i, 2:6].mean()) for i in range(df.shape[0]) if genre in df.iloc[i, 1] ]
    # Keyword arguments on purpose: the meaning of extra positional arguments
    # is easy to get wrong, and the bounds should be the same [0, 100] range
    # that the Max/Min queries in this notebook use.
    x = BoundedMean(epsilon=privacy_budget, lower_bound=0, upper_bound=100, dtype="float")
    return x.quick_result(means)

def average_per_genre_scoretype(df,privacy_budget,genre,scoretype):
    """Differentially private average of one score column within ``genre``.

    Args:
        df: frame with the genre container at column position 1.
        privacy_budget: epsilon passed to PyDP's ``BoundedMean``.
        genre: value tested with ``in`` against each row's genre entry.
        scoretype: label of the column to average.

    Returns:
        Whatever ``BoundedMean.quick_result`` reports for the matching values.

    Raises:
        ValueError: if ``scoretype`` cannot be resolved to a column of ``df``.
    """
    try:
        column_num = df.columns.get_loc(scoretype)
    except Exception as exc:
        # Ordinary lookup failures become ValueError as before, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        raise ValueError("invalid scoretype argument") from exc
    means = [df.iloc[i, column_num] for i in range(df.shape[0]) if genre in df.iloc[i, 1] ]
    # Keyword arguments on purpose: the meaning of extra positional arguments
    # is easy to get wrong, and the bounds should be the same [0, 100] range
    # that the Max/Min queries in this notebook use.
    x = BoundedMean(epsilon=privacy_budget, lower_bound=0, upper_bound=100, dtype="float")
    return x.quick_result(means)


In [22]:
# Non-DP baselines: exact versions of the queries, without any differential-privacy mechanism.
def highest_rated_movie_nodf(df):
    """Return the row of ``df`` with the highest mean over column positions 2-5.

    This is the exact counterpart of ``highest_rated_movie``. When several
    rows share the top mean, the first one is returned.
    """
    row_means = []
    for position in range(df.shape[0]):
        row_means.append(df.iloc[position, 2:6].mean())

    top_mean = max(row_means)
    distances = [abs(value - top_mean) for value in row_means]

    labels = list(df.index)
    winning_label = labels[np.argmin(distances)]
    return df.loc[winning_label, :]

def highest_rated_movie_per_genre_nodf( df, genre):
    """Return the row of ``genre`` with the highest mean over column positions 2-5.

    Args:
        df: frame with the genre container at column position 1 and the
            scores to average at positions 2-5.
        genre: value tested with ``in`` against each row's genre entry.

    Returns:
        The best-scoring row among those that belong to ``genre``; the first
        one when several share the top mean.

    Raises:
        ValueError: if no row belongs to ``genre``.
    """
    # Remember WHICH rows matched: the winning position refers to the
    # filtered list and must be mapped back through these positions, not
    # through the full, unfiltered index.
    positions = [i for i in range(df.shape[0]) if genre in df.iloc[i, 1]]
    if not positions:
        raise ValueError("no movies found for the requested genre")

    means = [df.iloc[i, 2:6].mean() for i in positions]
    winner = int(np.argmax(means))
    return df.iloc[positions[winner], :]

def movie_count_above_rating_nodf(df, thresh):
    """Exact count of rows whose mean over column positions 2-5 exceeds ``thresh``.

    Raises:
        ValueError: if ``thresh`` is not within 0.0 and 100.0 inclusive.
    """
    within_range = 0.0 <= thresh <= 100.0
    if not within_range:
        raise ValueError("invalid threshhold")

    hits = 0
    for position in range(df.shape[0]):
        if df.iloc[position, 2:6].mean() > thresh:
            hits += 1
    return hits


def average_per_genre_nodf( df, genre):
    """Exact average of the per-row mean score (column positions 2-5) within ``genre``.

    Returns 0.0 when no row belongs to ``genre``.
    """
    genre_means = []
    for position in range(df.shape[0]):
        if genre in df.iloc[position, 1]:
            genre_means.append(df.iloc[position, 2:6].mean())

    if len(genre_means) > 0:
        return sum(genre_means) / len(genre_means)
    return 0.0


def average_per_genre_scoretype_nodf( df, genre, scoretype):
    """Exact average of one score column over the rows that belong to ``genre``.

    Args:
        df: frame with the genre container at column position 1.
        genre: value tested with ``in`` against each row's genre entry.
        scoretype: label of the column to average.

    Returns:
        The arithmetic mean of the selected column over the matching rows,
        or 0.0 when no row matches.

    Raises:
        ValueError: if ``scoretype`` cannot be resolved to a column of ``df``.
    """
    try:
        column_num = df.columns.get_loc(scoretype)
    except Exception as exc:
        # Ordinary lookup failures become ValueError as before, but
        # KeyboardInterrupt / SystemExit are no longer swallowed.
        raise ValueError("invalid scoretype argument") from exc
    values = [df.iloc[i, column_num] for i in range(df.shape[0]) if genre in df.iloc[i, 1]]
    if not values:
        return 0.0
    return sum(values) / len(values)

def lowest_rated_movie_nodf( df):
    """Return the row of ``df`` with the lowest mean over column positions 2-5.

    When several rows share the lowest mean, the first one is returned.

    Raises:
        ValueError: if ``df`` has no rows.
    """
    means = [df.iloc[i, 2:6].mean() for i in range(df.shape[0])]
    if not means:
        raise ValueError("cannot select a movie from an empty frame")
    # Select by position so the answer does not depend on index labels being unique.
    return df.iloc[int(np.argmin(means)), :]


In [31]:
# Exact answer, used as the baseline for the comparison.
lisst= highest_rated_movie_nodf(df)
# Same query through the differentially private version at two privacy budgets.
lisst1= highest_rated_movie(df,0.1)
lisst2= highest_rated_movie(df,0.9)
# Bare expression: display the exact result as this cell's output.
lisst

Title                    Bending the Arc
Genre                      [Documentary]
Hidden Gem Score                    92.0
IMDb Score                          77.0
Metacritic Score                    80.0
Rotten Tomatoes Score              100.0
Name: 734, dtype: object

In [32]:
# Row returned by highest_rated_movie(df, 0.1).
lisst1

Title                        Screwball
Genre                    [Documentary]
Hidden Gem Score                  83.0
IMDb Score                        71.0
Metacritic Score                  72.0
Rotten Tomatoes Score             94.0
Name: 2514, dtype: object

In [33]:
# Row returned by highest_rated_movie(df, 0.9).
lisst2

Title                    Hotel by the River
Genre                               [Drama]
Hidden Gem Score                       85.0
IMDb Score                             67.0
Metacritic Score                       79.0
Rotten Tomatoes Score                  95.0
Name: 344, dtype: object