# TMDB Movie Dataset 

In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import random
import string
import hashlib
import json

# Load the raw TMDB export and derive the ISO calendar week of every release.
df = pd.read_csv('TMDB_movie_dataset_v11.csv', sep=',')

# Keep at most the first 16 characters of the date text, i.e. the length of a
# "%Y-%m-%d %H:%M" timestamp, then parse it with exactly that pattern.
release_text = df['release_date'].str.slice(stop=16)
df['release_date'] = pd.to_datetime(release_text, format="%Y-%m-%d %H:%M")

# ISO week number of the release; used further down when building the pseudo id.
df['week'] = df['release_date'].dt.isocalendar().week

In [2]:
# Columns removed before the anonymisation steps.
unused_columns = [
    'title', 'status', 'runtime', 'budget', 'adult', 'backdrop_path',
    'popularity', 'original_language', 'original_title', 'overview',
    'poster_path', 'tagline', 'genres', 'production_companies', 'imdb_id',
    'homepage', 'production_countries', 'spoken_languages',
]

# Drop those columns, discard rows whose release date is missing or whose
# revenue is exactly 0, then renumber the surviving rows from 0.
df = (
    df.drop(columns=unused_columns)
    .dropna(subset=["release_date"])
    .loc[lambda frame: frame['revenue'] != 0]
    .reset_index(drop=True)
)

In [3]:
# Inspect the filtered frame; the tail still shows rows whose vote_average and
# vote_count are 0.
df

Unnamed: 0,id,vote_average,vote_count,release_date,revenue,week
0,27205,8.364,34495,2010-07-15,825532764,28
1,157336,8.417,32571,2014-11-05,701729206,45
2,155,8.512,30619,2008-07-16,1004558444,29
3,19995,7.573,29815,2009-12-15,2923706026,51
4,24428,7.710,29166,2012-04-25,1518815515,17
...,...,...,...,...,...,...
18130,658886,0.000,0,2022-09-25,2000,38
18131,659129,0.000,0,2019-10-20,10,42
18132,655521,0.000,0,2019-12-14,278,50
18133,656654,0.000,0,1969-01-01,105000,1


In [4]:
# Replace every vote_average of exactly 0 (presumably "no rating yet") with the
# column mean rounded to 3 decimals.
# NOTE(review): the mean is computed over all rows, the zeros included, so the
# fill value is lower than the mean of the rated titles — confirm this is intended.
ratings = df["vote_average"]
mean_vote = round(ratings.mean(), 3)
df["vote_average"] = ratings.replace(to_replace=0.000, value=mean_vote)

In [5]:
# Replace every vote_count of exactly 0 with the column median.
# NOTE(review): the median is taken before the replacement, so the zero rows
# take part in it — confirm this is intended.
vote_counts = df["vote_count"]
median_vote = vote_counts.median()
df["vote_count"] = vote_counts.replace(to_replace=0, value=median_vote)

In [6]:
# Replace every revenue of exactly 0 with the column median.
# NOTE: despite its name, most_frequent_revenue holds the median, not the mode.
# NOTE(review): rows with revenue == 0 were already removed when the frame was
# filtered after loading, so this replacement is expected to match no rows.
revenues = df["revenue"]
most_frequent_revenue = revenues.median()
df["revenue"] = revenues.replace(to_replace=0, value=most_frequent_revenue)

In [7]:
# Inspect the frame after imputation: the former zero votes in the tail now
# read 5.773 (vote_average) and 109 (vote_count).
df

Unnamed: 0,id,vote_average,vote_count,release_date,revenue,week
0,27205,8.364,34495,2010-07-15,825532764,28
1,157336,8.417,32571,2014-11-05,701729206,45
2,155,8.512,30619,2008-07-16,1004558444,29
3,19995,7.573,29815,2009-12-15,2923706026,51
4,24428,7.710,29166,2012-04-25,1518815515,17
...,...,...,...,...,...,...
18130,658886,5.773,109,2022-09-25,2000,38
18131,659129,5.773,109,2019-10-20,10,42
18132,655521,5.773,109,2019-12-14,278,50
18133,656654,5.773,109,1969-01-01,105000,1


In [8]:
# Save the cleaned, not yet anonymised table: the helper week column is left
# out and the file is tab-separated with neither a header row nor an index.
df_ori = df.drop(columns=['week'])
df_ori.to_csv('original_2.csv', sep='\t', header=False, index=False)

In [9]:
# Preview the frame that was written to original_2.csv (no week column).
df_ori

Unnamed: 0,id,vote_average,vote_count,release_date,revenue
0,27205,8.364,34495,2010-07-15,825532764
1,157336,8.417,32571,2014-11-05,701729206
2,155,8.512,30619,2008-07-16,1004558444
3,19995,7.573,29815,2009-12-15,2923706026
4,24428,7.710,29166,2012-04-25,1518815515
...,...,...,...,...,...
18130,658886,5.773,109,2022-09-25,2000
18131,659129,5.773,109,2019-10-20,10
18132,655521,5.773,109,2019-12-14,278
18133,656654,5.773,109,1969-01-01,105000


In [10]:
# Summary statistics of vote_average after imputation. The 25% quantile equals
# the imputed fill value (5.773), so at least a quarter of the rows are at or
# below it.
df['vote_average'].describe()

count    18135.000000
mean         6.345845
std          1.223960
min          0.500000
25%          5.773000
50%          6.273000
75%          7.000000
max         10.000000
Name: vote_average, dtype: float64

In [11]:
# Summary statistics of vote_count after imputation (minimum is now 1).
df['vote_count'].describe()

count    18135.000000
mean       912.899752
std       2325.908380
min          1.000000
25%         28.000000
50%        109.000000
75%        646.000000
max      34495.000000
Name: vote_count, dtype: float64

In [12]:
# Summary statistics of revenue.
# NOTE(review): the minimum is negative (-12), so the data contains negative
# revenues that the cleaning steps above do not touch — confirm they are valid.
df['revenue'].describe()

count    1.813500e+04
mean     3.997324e+07
std      1.236662e+08
min     -1.200000e+01
25%      1.410270e+05
50%      3.003296e+06
75%      2.300000e+07
max      3.000000e+09
Name: revenue, dtype: float64

In [13]:
# Count missing values per column; every column is expected to report 0 here.
df.isnull().sum()

id              0
vote_average    0
vote_count      0
release_date    0
revenue         0
week            0
dtype: int64

In [14]:
# Count fully duplicated rows (all columns equal to an earlier row).
df.duplicated().sum()

0

In [15]:
# Perturb vote_average with a small deterministic offset,
# 0.017 * (sin(v) + cos(v)), then round the result to 3 decimals.
# Since |sin(v) + cos(v)| <= sqrt(2), the shift never exceeds about 0.024.
original_votes = df['vote_average']
wobble = np.sin(original_votes) + np.cos(original_votes)
df['vote_average'] = (original_votes + 0.017 * wobble).round(decimals=3)

In [16]:
# Inspect the frame after the vote_average perturbation.
df

Unnamed: 0,id,vote_average,vote_count,release_date,revenue,week
0,27205,8.371,34495,2010-07-15,825532764,28
1,157336,8.422,32571,2014-11-05,701729206,45
2,155,8.515,30619,2008-07-16,1004558444,29
3,19995,7.594,29815,2009-12-15,2923706026,51
4,24428,7.729,29166,2012-04-25,1518815515,17
...,...,...,...,...,...,...
18130,658886,5.780,109,2022-09-25,2000,38
18131,659129,5.780,109,2019-10-20,10,42
18132,655521,5.780,109,2019-12-14,278,50
18133,656654,5.780,109,1969-01-01,105000,1


In [17]:
#Shift a date by a few random days without leaving its ISO week
def change_date(date):
    """Return `date` moved by a random offset of at most two days.

    The offset is drawn from {-2, -1, 0, 1, 2}; 0 has probability 0.88 and
    every other value 0.03. The offset is first subtracted from `date`. If
    that result has a different ISO week number than `date`, the offset is
    added instead, which for shifts of at most two days stays in the week.

    Args:
        date: A date-like object supporting `isocalendar()` and arithmetic
            with `datetime.timedelta` (e.g. `datetime.date`, `pandas.Timestamp`).

    Returns:
        The shifted date, of the same type as `date`.
    """
    offsets = [-2, -1, 0, 1, 2]
    weights = [0.03, 0.03, 0.88, 0.03, 0.03]
    rand_day = int(np.random.choice(offsets, p=weights))

    shift = dt.timedelta(days=rand_day)
    candidate = date - shift
    if candidate.isocalendar()[1] == date.isocalendar()[1]:
        return candidate
    # Subtracting crossed a week boundary: mirror the shift instead.
    return date + shift

In [18]:
# Run change_date on every release date, one row at a time.
jittered_dates = df['release_date'].apply(change_date)
df['release_date'] = jittered_dates

In [19]:
#Inflate a vote count by a small random amount
def change_count(count):
    """Return `count` increased by a random integer between 1 and 99 inclusive.

    Args:
        count: The original number of votes.

    Returns:
        `count` plus one draw from `np.random.randint(1, 100)`; the result is
        always strictly greater than `count`.
    """
    return count + np.random.randint(1, 100)

In [20]:
# Run change_count on every vote_count, one row at a time.
noisy_counts = df['vote_count'].apply(change_count)
df['vote_count'] = noisy_counts

In [21]:
# Inspect the frame after the release_date and vote_count perturbations.
df

Unnamed: 0,id,vote_average,vote_count,release_date,revenue,week
0,27205,8.371,34506,2010-07-15,825532764,28
1,157336,8.422,32632,2014-11-05,701729206,45
2,155,8.515,30664,2008-07-16,1004558444,29
3,19995,7.594,29858,2009-12-15,2923706026,51
4,24428,7.729,29171,2012-04-27,1518815515,17
...,...,...,...,...,...,...
18130,658886,5.780,160,2022-09-25,2000,38
18131,659129,5.780,130,2019-10-20,10,42
18132,655521,5.780,133,2019-12-14,278,50
18133,656654,5.780,125,1969-01-01,105000,1


In [22]:
#Add revenue noise whose size depends on the revenue bracket
def change_revenue(revenue):
    """Return `revenue` plus a random offset chosen by revenue bracket.

    * revenue >= 700,000,000: offset from `randint(-500000, -100)`, always negative.
    * revenue <= 7,000: offset from `randint(50, 1000)`, always positive.
    * anything in between: offset from `randint(0, 10)`, i.e. 0 to 9.

    Args:
        revenue: The original revenue figure.

    Returns:
        The perturbed revenue.
    """
    if revenue >= 700000000:
        return revenue + np.random.randint(-500000, -100)
    if revenue <= 7000:
        return revenue + np.random.randint(50, 1000)
    return revenue + np.random.randint(0, 10)

In [23]:
# Run change_revenue on every revenue, one row at a time.
noisy_revenue = df['revenue'].apply(change_revenue)
df['revenue'] = noisy_revenue

In [24]:
# Inspect the frame after the revenue perturbation.
df

Unnamed: 0,id,vote_average,vote_count,release_date,revenue,week
0,27205,8.371,34506,2010-07-15,825480790,28
1,157336,8.422,32632,2014-11-05,701571283,45
2,155,8.515,30664,2008-07-16,1004518715,29
3,19995,7.594,29858,2009-12-15,2923472296,51
4,24428,7.729,29171,2012-04-27,1518435968,17
...,...,...,...,...,...,...
18130,658886,5.780,160,2022-09-25,2598,38
18131,659129,5.780,130,2019-10-20,345,42
18132,655521,5.780,133,2019-12-14,941,50
18133,656654,5.780,125,1969-01-01,105006,1


In [25]:
#Pseudo id (secret random salt + id + week ---> SHA-256 hash)
# The salt is 128 bits from the operating system's entropy source. A salt
# picked with randrange(3000, 3999) has only 999 possible values; because the
# ids are small integers and the week can be read off release_date in the same
# row, every hash could then be reversed by brute force.
# The ':' separators keep the hashed string unambiguous: without them, id 12 /
# week 33 and id 1233 / week 3 would yield the same digits and the same hash.
_id_salt = format(random.SystemRandom().getrandbits(128), '032x')
df['id'] = _id_salt + ':' + df['id'].astype(str) + ':' + df['week'].astype(str)
df['id'] = df['id'].apply(lambda x: hashlib.sha256(x.encode()).hexdigest())
# The salt is deliberately not kept, so the mapping cannot be recomputed later.
del _id_salt

In [26]:
# Inspect the frame after the ids were replaced by SHA-256 hex digests.
df

Unnamed: 0,id,vote_average,vote_count,release_date,revenue,week
0,bc9da647234904060b0aa8295c9536f99f212a5cda1c75...,8.371,34506,2010-07-15,825480790,28
1,5c421dab899b74c46d89ca7ad600ea712b1fe0261664ce...,8.422,32632,2014-11-05,701571283,45
2,940c05d8ce43d93508bf9d556a941c58338c147945ce3d...,8.515,30664,2008-07-16,1004518715,29
3,8f28ae544afa4fcbd393e539a37018ecf5e031cb289561...,7.594,29858,2009-12-15,2923472296,51
4,ab067a6c7f358e2f5d31271858356c2974c589ee5e319f...,7.729,29171,2012-04-27,1518435968,17
...,...,...,...,...,...,...
18130,df0899e502af7fcc32ee106206d40160327c420c15061c...,5.780,160,2022-09-25,2598,38
18131,576c4756a6ca35b839579e97170f235aa3bfc6e52cb29a...,5.780,130,2019-10-20,345,42
18132,1508f1164e8b445a2745253b8706bc49f80de5e3427934...,5.780,133,2019-12-14,941,50
18133,0128efe4bf0dbf1053f87f08b6d2950daabdb5e15cfb81...,5.780,125,1969-01-01,105006,1


In [27]:
# The helper week column is not part of the exported table.
df = df.drop(columns=['week'])

In [28]:
# Final check of the anonymised frame (id, vote_average, vote_count,
# release_date, revenue) before it is written to disk.
df

Unnamed: 0,id,vote_average,vote_count,release_date,revenue
0,bc9da647234904060b0aa8295c9536f99f212a5cda1c75...,8.371,34506,2010-07-15,825480790
1,5c421dab899b74c46d89ca7ad600ea712b1fe0261664ce...,8.422,32632,2014-11-05,701571283
2,940c05d8ce43d93508bf9d556a941c58338c147945ce3d...,8.515,30664,2008-07-16,1004518715
3,8f28ae544afa4fcbd393e539a37018ecf5e031cb289561...,7.594,29858,2009-12-15,2923472296
4,ab067a6c7f358e2f5d31271858356c2974c589ee5e319f...,7.729,29171,2012-04-27,1518435968
...,...,...,...,...,...
18130,df0899e502af7fcc32ee106206d40160327c420c15061c...,5.780,160,2022-09-25,2598
18131,576c4756a6ca35b839579e97170f235aa3bfc6e52cb29a...,5.780,130,2019-10-20,345
18132,1508f1164e8b445a2745253b8706bc49f80de5e3427934...,5.780,133,2019-12-14,941
18133,0128efe4bf0dbf1053f87f08b6d2950daabdb5e15cfb81...,5.780,125,1969-01-01,105006


In [29]:
# Save the anonymised table in the same layout as original_2.csv:
# tab-separated, with neither a header row nor an index column.
df.to_csv('self_anonymisation_2.csv', sep='\t', header=False, index=False)