# SDV applied to the film style data of Ingmar Bergman

## Importing the libraries

In [None]:
!pip install numpy==1.20
!pip install sdv
!pip install xlrd==1.2.0

In [3]:
import numpy as np
from numpy import log as ln
import matplotlib.pyplot as plt
import pandas as pd
from google.colab import files
import sdv
from sdv.evaluation import evaluate
import statistics

# Training data generation

The same training data set that was employed in the preliminary analysis of Module 1.5 of the CineMLA framework is employed in the synthetic data generation pipeline

Import the dataset

In [4]:
data = pd.read_excel('prelim_bergman_training_cleaned.xlsx')
data.pop('Director')
data.pop('Total') # The 'Total' camera moves will br calculated and added in for the synthetic data
data.pop('Year') # Does not serve as an input variale to the classification models
data.pop('COUNTRY') # Does not serve as an input variale to the classification models

data.head()

Unnamed: 0,Film,Pan,Tilt,Pan w. Tilt,Track,Track w. Pan & tilt,Crane,Zoom,Zoom w. Pan & Tilt,BCU,...,MS,MLS,LS,VLS,INS,POV,ASL,RA,CU,BERGMAN
0,Deep Blue Sea,14,7,8,27,9,7,2,0,140,...,46,51,94,9,23,10,2.6,24,106,0.0
1,Verboten!,35,7,19,19,22,12,2,0,56,...,53,68,155,28,13,20,10.7,22,77,0.0
2,"Reckless Moment, The",92,2,21,24,62,11,3,0,22,...,118,86,113,49,0,0,15.5,0,37,0.0
3,Kris,30,3,13,43,70,0,0,0,30,...,105,55,128,10,9,1,27.2,17,80,1.0
4,Shadows,55,4,11,2,2,2,0,0,122,...,50,46,22,0,1,1,7.1,31,182,0.0


Data pre-processing

In [5]:
# Split the data set based on the class label (BERGMAN) attribute
data_bergman = data[data['BERGMAN'] == 1]
data_bergman.head()

Unnamed: 0,Film,Pan,Tilt,Pan w. Tilt,Track,Track w. Pan & tilt,Crane,Zoom,Zoom w. Pan & Tilt,BCU,...,MS,MLS,LS,VLS,INS,POV,ASL,RA,CU,BERGMAN
3,Kris,30,3,13,43,70,0,0,0,30,...,105,55,128,10,9,1,27.2,17,80,1.0
9,Sommarnattens leende,58,2,16,26,35,0,1,0,20,...,103,53,52,19,8,3,16.0,40,132,1.0
12,Kvinnodröm,48,6,20,9,28,0,0,2,86,...,73,38,47,8,23,10,16.1,48,145,1.0
16,Smultronstället,26,3,22,30,17,0,0,0,36,...,65,39,97,12,12,7,9.3,49,123,1.0
26,En lektion I kärlek,67,8,46,32,40,0,2,0,29,...,67,38,74,19,13,3,23.7,28,97,1.0


In [6]:
data_non_bergman = data[data['BERGMAN'] == 0]
data_non_bergman.head()

Unnamed: 0,Film,Pan,Tilt,Pan w. Tilt,Track,Track w. Pan & tilt,Crane,Zoom,Zoom w. Pan & Tilt,BCU,...,MS,MLS,LS,VLS,INS,POV,ASL,RA,CU,BERGMAN
0,Deep Blue Sea,14,7,8,27,9,7,2,0,140,...,46,51,94,9,23,10,2.6,24,106,0.0
1,Verboten!,35,7,19,19,22,12,2,0,56,...,53,68,155,28,13,20,10.7,22,77,0.0
2,"Reckless Moment, The",92,2,21,24,62,11,3,0,22,...,118,86,113,49,0,0,15.5,0,37,0.0
4,Shadows,55,4,11,2,2,2,0,0,122,...,50,46,22,0,1,1,7.1,31,182,0.0
5,Eva,56,9,37,6,48,0,0,0,30,...,114,63,102,29,11,4,15.5,30,98,0.0


# Generate an instance of the Gaussian Copula model

Define SDV constraints

In [7]:
# BERGMAN Data sampling constraints

# Reasonable ASL
from sdv.constraints import ScalarRange

bergman_asl_constraint = ScalarRange(
  column_name='ASL',
  low_value=min(data_bergman['ASL']),
  high_value=max(data_bergman['ASL']),
  strict_boundaries=False
  )

# Constraints
constraints_bergman = [
    bergman_asl_constraint
  ]


In [8]:
# NON-BERGMAN Data sampling constraints

# Reasonable ASL
from sdv.constraints import ScalarRange

non_bergman_asl_constraint = ScalarRange(
  column_name='ASL',
  low_value=min(data['ASL']),
  high_value=max(data['ASL']),
  strict_boundaries=False
  )

# Constraints
constraints_non_bergman = [
    non_bergman_asl_constraint
  ]

Instantiate the Gaussian Copula model

In [9]:
from sdv.tabular import GaussianCopula

# BERGMAN data
model_gc_bergman = GaussianCopula(
    constraints=constraints_bergman,
    primary_key='Film' # ensures no duplicate key values
    )
model_gc_bergman.fit(data_bergman)

# NON-BERGMAN data
model_gc_non_bergman = GaussianCopula(
    constraints=constraints_non_bergman,
    primary_key='Film', # ensures no duplicate key values
    )
model_gc_non_bergman.fit(data_non_bergman)

  a = (self.min - loc) / scale
  b = (self.max - loc) / scale


## Sample new data from Gaussian Copula model

## Sample synthetic Bergman data

In [10]:
synth_data_gc_bergman = model_gc_bergman.sample(num_rows=len(data_bergman)) # Generate a synthetic data set with the same number of observations as the original data

#Calculate the 'Total' number of camera moves per film and add it to the dataframe
sum_camera_moves = synth_data_gc_bergman['Pan'] + synth_data_gc_bergman['Tilt'] + synth_data_gc_bergman['Pan w. Tilt'] + synth_data_gc_bergman['Track'] + synth_data_gc_bergman['Track w. Pan & tilt'] + synth_data_gc_bergman['Crane'] + synth_data_gc_bergman['Zoom'] + synth_data_gc_bergman['Zoom w. Pan & Tilt']
synth_data_gc_bergman.insert(10, 'Total', sum_camera_moves)

# Add the 'Total' column back into the original Bergman data
total_camera_moves = data_bergman['Pan'] + data_bergman['Tilt'] + data_bergman['Pan w. Tilt'] + data_bergman['Track'] + data_bergman['Track w. Pan & tilt'] + data_bergman['Crane'] + data_bergman['Zoom'] + data_bergman['Zoom w. Pan & Tilt']
data_bergman.insert(10, 'Total', total_camera_moves)

# Rename the films in the synthetic data to indicate that they are BERGMAN films
synth_data_gc_bergman['Film'] = 'BERGMAN_' + synth_data_gc_bergman['Film'].astype(str)

synth_data_gc_bergman.head()

Sampling rows: 100%|██████████| 21/21 [00:00<00:00, 421.04it/s]


Unnamed: 0,Film,Pan,Tilt,Pan w. Tilt,Track,Track w. Pan & tilt,Crane,Zoom,Zoom w. Pan & Tilt,BCU,...,MS,MLS,LS,VLS,INS,POV,ASL,RA,CU,BERGMAN
0,BERGMAN_a,18,7,25,36,13,1,0,1,80,...,70,55,70,13,7,3,12.275057,54,129,1.0
1,BERGMAN_b,25,11,33,21,32,2,2,6,228,...,40,35,106,11,9,4,24.832833,35,79,1.0
2,BERGMAN_c,40,14,51,8,18,3,3,5,229,...,35,20,72,5,17,2,16.419683,19,111,1.0
3,BERGMAN_d,25,7,20,11,22,2,3,7,158,...,76,41,88,5,5,2,20.274907,47,96,1.0
4,BERGMAN_e,61,6,20,20,49,3,3,3,61,...,103,61,69,6,7,1,22.991278,42,88,1.0


In [11]:
data_bergman.head()

Unnamed: 0,Film,Pan,Tilt,Pan w. Tilt,Track,Track w. Pan & tilt,Crane,Zoom,Zoom w. Pan & Tilt,BCU,...,MS,MLS,LS,VLS,INS,POV,ASL,RA,CU,BERGMAN
3,Kris,30,3,13,43,70,0,0,0,30,...,105,55,128,10,9,1,27.2,17,80,1.0
9,Sommarnattens leende,58,2,16,26,35,0,1,0,20,...,103,53,52,19,8,3,16.0,40,132,1.0
12,Kvinnodröm,48,6,20,9,28,0,0,2,86,...,73,38,47,8,23,10,16.1,48,145,1.0
16,Smultronstället,26,3,22,30,17,0,0,0,36,...,65,39,97,12,12,7,9.3,49,123,1.0
26,En lektion I kärlek,67,8,46,32,40,0,2,0,29,...,67,38,74,19,13,3,23.7,28,97,1.0


### Evaluate the Bergman data

In [12]:
# KS-Test and Chi-2 Test evaluation scores
eval_gs_KS_bergman=evaluate(synth_data_gc_bergman, data_bergman, metrics=['KSTest'])
eval_gs_Chi_bergman=evaluate(synth_data_gc_bergman, data_bergman, metrics=['CSTest'])



In [13]:
print("Gaussian Copula KS Test evaluation score =", eval_gs_KS_bergman)
print("Gaussian Copula Chi-Squared Test evaluation score =", eval_gs_Chi_bergman)

Gaussian Copula KS Test evaluation score = 0.7233560090702948
Gaussian Copula Chi-Squared Test evaluation score = 0.0


Perform KS-Test on Bergman data

In [14]:
# Calculate the KS-Test D-statistic
bergman_D = 1-eval_gs_KS_bergman

# Calculate KS-Test D_critical value
alpha = 0.05 # significance level
c_alpha = np.sqrt(-0.5*ln(alpha))
n = len(data_bergman)
m = len(synth_data_gc_bergman)
D_critical = c_alpha*np.sqrt((n+m)/(n*m))

print("The Gaussian Copula KS Test evaluation delivers a D-value of", bergman_D, 'with a critical D-value of', D_critical)

if bergman_D < D_critical:
  print('At a significance level of', alpha, 'there is insufficient evidence to reject the null hypothesis.')
else:
  print('At a significance level of', alpha, 'there is sufficient evidence to reject the null hypothesis.')

The Gaussian Copula KS Test evaluation delivers a D-value of 0.27664399092970515 with a critical D-value of 0.37769553583314336
At a significance level of 0.05 there is insufficient evidence to reject the null hypothesis.


## Sample synthetic non-Bergman data

In [15]:
synth_data_gc_non_bergman = model_gc_non_bergman.sample(num_rows=len(data_non_bergman)) # Generate a synthetic data set with the same number of observations as the original data

#Calculate the 'Total' number of camera moves per film and add it to the dataframe
sum_camera_moves_non_berg = synth_data_gc_non_bergman['Pan'] + synth_data_gc_non_bergman['Tilt'] + synth_data_gc_non_bergman['Pan w. Tilt'] + synth_data_gc_non_bergman['Track'] + synth_data_gc_non_bergman['Track w. Pan & tilt'] + synth_data_gc_non_bergman['Crane'] + synth_data_gc_non_bergman['Zoom'] + synth_data_gc_non_bergman['Zoom w. Pan & Tilt']
synth_data_gc_non_bergman.insert(10, 'Total', sum_camera_moves_non_berg)

# Add the 'Total' column back into the original Bergman data
total_camera_moves_non_berg = data_non_bergman['Pan'] + data_non_bergman['Tilt'] + data_non_bergman['Pan w. Tilt'] + data_non_bergman['Track'] + data_non_bergman['Track w. Pan & tilt'] + data_non_bergman['Crane'] + data_non_bergman['Zoom'] + data_non_bergman['Zoom w. Pan & Tilt']
data_non_bergman.insert(10, 'Total', total_camera_moves_non_berg)

# Rename the films in the synthetic data to indicate that they are BERGMAN films
synth_data_gc_non_bergman['Film'] = 'NON_BERGMAN_' + synth_data_gc_non_bergman['Film'].astype(str)

synth_data_gc_non_bergman.head()

Sampling rows: 100%|██████████| 74/74 [00:00<00:00, 1657.69it/s]


Unnamed: 0,Film,Pan,Tilt,Pan w. Tilt,Track,Track w. Pan & tilt,Crane,Zoom,Zoom w. Pan & Tilt,BCU,...,MS,MLS,LS,VLS,INS,POV,ASL,RA,CU,BERGMAN
0,NON_BERGMAN_a,48,10,14,5,5,4,9,4,19,...,103,60,106,49,6,6,3.118488,35,103,0.0
1,NON_BERGMAN_b,32,0,5,9,10,5,6,0,75,...,64,69,46,17,3,2,9.592118,20,140,0.0
2,NON_BERGMAN_c,50,4,19,16,45,3,1,0,26,...,108,117,105,23,8,6,19.546858,13,80,0.0
3,NON_BERGMAN_d,12,10,21,20,10,11,3,2,133,...,63,57,44,7,17,8,5.024535,29,145,0.0
4,NON_BERGMAN_e,32,4,16,51,68,8,15,1,24,...,58,61,55,17,12,9,13.494272,38,189,0.0


In [16]:
data_non_bergman.head()

Unnamed: 0,Film,Pan,Tilt,Pan w. Tilt,Track,Track w. Pan & tilt,Crane,Zoom,Zoom w. Pan & Tilt,BCU,...,MS,MLS,LS,VLS,INS,POV,ASL,RA,CU,BERGMAN
0,Deep Blue Sea,14,7,8,27,9,7,2,0,140,...,46,51,94,9,23,10,2.6,24,106,0.0
1,Verboten!,35,7,19,19,22,12,2,0,56,...,53,68,155,28,13,20,10.7,22,77,0.0
2,"Reckless Moment, The",92,2,21,24,62,11,3,0,22,...,118,86,113,49,0,0,15.5,0,37,0.0
4,Shadows,55,4,11,2,2,2,0,0,122,...,50,46,22,0,1,1,7.1,31,182,0.0
5,Eva,56,9,37,6,48,0,0,0,30,...,114,63,102,29,11,4,15.5,30,98,0.0


### Evaluate the non-Bergman data

In [17]:
# KS-Test and Chi-2 Test evaluation scores
eval_gs_KS_non_bergman=evaluate(synth_data_gc_non_bergman, data_non_bergman, metrics=['KSTest'])
eval_gs_Chi_non_bergman=evaluate(synth_data_gc_non_bergman, data_non_bergman, metrics=['CSTest'])



In [18]:
print("Non-Bergman data Gaussian Copula KS Test evaluation score =", eval_gs_KS_non_bergman)
print("Non-Bergman data Gaussian Copula Chi-Squared Test evaluation score =", eval_gs_Chi_non_bergman)

Non-Bergman data Gaussian Copula KS Test evaluation score = 0.8268983268983268
Non-Bergman data Gaussian Copula Chi-Squared Test evaluation score = 0.0


Perform KS-Test on non-Bergman data

In [19]:
# Calculate the KS-Test D-statistic
non_bergman_D = 1-eval_gs_KS_non_bergman

# Calculate KS-Test D_critical value
alpha_0 = 0.05 # significance level
c_alpha_0 = np.sqrt(-0.5*ln(alpha_0))
n_0 = len(data_non_bergman)
m_0 = len(synth_data_gc_non_bergman)
D_critical_0 = c_alpha_0*np.sqrt((n_0+m_0)/(n_0*m_0))

print("The non-Bergman data Gaussian Copula KS Test evaluation delivers a D-value of", non_bergman_D, 'with a critical D-value of', D_critical_0)

if non_bergman_D < D_critical_0:
  print('At a significance level of', alpha_0, 'there is insufficient evidence to reject the null hypothesis.')
else:
  print('At a significance level of', alpha_0, 'there is sufficient evidence to reject the null hypothesis.')

The non-Bergman data Gaussian Copula KS Test evaluation delivers a D-value of 0.17310167310167324 with a critical D-value of 0.20120355007191204
At a significance level of 0.05 there is insufficient evidence to reject the null hypothesis.


# Save the synthetic film style data

In [20]:
# Label the real and synthetic data
data['Synth']='0'
synth_data_gc_bergman['Synth']='1'
synth_data_gc_non_bergman['Synth']='1'

# Add the 'Total' column back into the original data
sum_camera_moves_data = data['Pan'] + data['Tilt'] + data['Pan w. Tilt'] + data['Track'] + data['Track w. Pan & tilt'] + data['Crane'] + data['Zoom'] + data['Zoom w. Pan & Tilt']
data.insert(10, 'Total', sum_camera_moves_data)

In [21]:
# Save the Bergman data
synth_data_gc_bergman.to_csv('synthetic_train_bergman.csv', encoding = 'utf-8-sig') 
# files.download('synthetic_train_bergman.csv') # download data set

# Save the non-Bergman data
synth_data_gc_non_bergman.to_csv('synthetic_train_non_bergman.csv', encoding = 'utf-8-sig') 
# files.download('synthetic_train_non_bergman.csv') # download data set

# Combine the data sets and save
synth_train_combined = synth_data_gc_non_bergman.append(synth_data_gc_bergman)
synth_train_combined.to_csv('synth_train_combined.csv', encoding = 'utf-8-sig') 

# Combine the real and the synthetic data

In [22]:
synth_train_combined

Unnamed: 0,Film,Pan,Tilt,Pan w. Tilt,Track,Track w. Pan & tilt,Crane,Zoom,Zoom w. Pan & Tilt,BCU,...,MLS,LS,VLS,INS,POV,ASL,RA,CU,BERGMAN,Synth
0,NON_BERGMAN_a,48,10,14,5,5,4,9,4,19,...,60,106,49,6,6,3.118488,35,103,0.0,1
1,NON_BERGMAN_b,32,0,5,9,10,5,6,0,75,...,69,46,17,3,2,9.592118,20,140,0.0,1
2,NON_BERGMAN_c,50,4,19,16,45,3,1,0,26,...,117,105,23,8,6,19.546858,13,80,0.0,1
3,NON_BERGMAN_d,12,10,21,20,10,11,3,2,133,...,57,44,7,17,8,5.024535,29,145,0.0,1
4,NON_BERGMAN_e,32,4,16,51,68,8,15,1,24,...,61,55,17,12,9,13.494272,38,189,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16,BERGMAN_q,51,10,32,7,52,4,2,4,156,...,17,60,21,10,8,15.439411,42,187,1.0,1
17,BERGMAN_r,37,10,44,22,35,0,3,5,124,...,40,78,5,13,5,10.802196,52,88,1.0,1
18,BERGMAN_s,41,4,29,11,20,5,1,5,227,...,32,57,13,9,4,11.863576,31,95,1.0,1
19,BERGMAN_t,48,7,21,34,19,0,2,2,79,...,29,73,8,10,3,12.022702,43,147,1.0,1


In [23]:
data

Unnamed: 0,Film,Pan,Tilt,Pan w. Tilt,Track,Track w. Pan & tilt,Crane,Zoom,Zoom w. Pan & Tilt,BCU,...,MLS,LS,VLS,INS,POV,ASL,RA,CU,BERGMAN,Synth
0,Deep Blue Sea,14,7,8,27,9,7,2,0,140,...,51,94,9,23,10,2.6,24,106,0.0,0
1,Verboten!,35,7,19,19,22,12,2,0,56,...,68,155,28,13,20,10.7,22,77,0.0,0
2,"Reckless Moment, The",92,2,21,24,62,11,3,0,22,...,86,113,49,0,0,15.5,0,37,0.0,0
3,Kris,30,3,13,43,70,0,0,0,30,...,55,128,10,9,1,27.2,17,80,1.0,0
4,Shadows,55,4,11,2,2,2,0,0,122,...,46,22,0,1,1,7.1,31,182,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,"Saint Strikes Back, The",52,6,10,24,42,0,0,0,55,...,107,32,15,9,7,8.4,27,135,0.0,0
91,"Five Pennies, The",28,4,11,30,34,8,4,2,23,...,132,90,4,7,7,15.9,34,47,0.0,0
92,Skammen,71,8,39,11,21,0,0,0,110,...,38,102,16,12,4,14.8,23,100,1.0,0
93,"Next of Kin, The",10,33,14,31,16,0,0,0,28,...,85,113,27,17,4,7.0,22,92,0.0,0


In [24]:
synth_train_real_synth_combined = synth_train_combined.append(data)
synth_train_real_synth_combined
synth_train_real_synth_combined.to_csv('synth_train_real_synth_combined.csv', encoding = 'utf-8-sig') 

In [25]:
synth_train_real_synth_combined

Unnamed: 0,Film,Pan,Tilt,Pan w. Tilt,Track,Track w. Pan & tilt,Crane,Zoom,Zoom w. Pan & Tilt,BCU,...,MLS,LS,VLS,INS,POV,ASL,RA,CU,BERGMAN,Synth
0,NON_BERGMAN_a,48,10,14,5,5,4,9,4,19,...,60,106,49,6,6,3.118488,35,103,0.0,1
1,NON_BERGMAN_b,32,0,5,9,10,5,6,0,75,...,69,46,17,3,2,9.592118,20,140,0.0,1
2,NON_BERGMAN_c,50,4,19,16,45,3,1,0,26,...,117,105,23,8,6,19.546858,13,80,0.0,1
3,NON_BERGMAN_d,12,10,21,20,10,11,3,2,133,...,57,44,7,17,8,5.024535,29,145,0.0,1
4,NON_BERGMAN_e,32,4,16,51,68,8,15,1,24,...,61,55,17,12,9,13.494272,38,189,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90,"Saint Strikes Back, The",52,6,10,24,42,0,0,0,55,...,107,32,15,9,7,8.400000,27,135,0.0,0
91,"Five Pennies, The",28,4,11,30,34,8,4,2,23,...,132,90,4,7,7,15.900000,34,47,0.0,0
92,Skammen,71,8,39,11,21,0,0,0,110,...,38,102,16,12,4,14.800000,23,100,1.0,0
93,"Next of Kin, The",10,33,14,31,16,0,0,0,28,...,85,113,27,17,4,7.000000,22,92,0.0,0


# Test data generation

In [26]:
test_data = pd.read_excel('prelim_bergman_test_cleaned.xlsx')
test_data.pop('Director')
test_data.pop('Total') # The 'Total' camera moves will br calculated and added in for the synthetic data
test_data.pop('Year') # Does not serve as an input variale to the classification models
test_data.pop('COUNTRY') # Does not serve as an input variale to the classification models

test_data.head()

Unnamed: 0,Film,Pan,Tilt,Pan w. Tilt,Track,Track w. Pan & tilt,Crane,Zoom,Zoom w. Pan & Tilt,BCU,...,MS,MLS,LS,VLS,INS,POV,ASL,RA,CU,BERGMAN
0,Gaslight,7,34,27,29,20,3,0,0,25,...,105,97,70,0,9.0,8.0,10.5,30,83,0.0
1,"Frisco Kid, The",29,5,19,11,3,0,2,1,81,...,53,56,87,13,7.0,8.0,5.1,44,130,0.0
2,Hets,56,15,47,18,67,2,0,0,21,...,80,65,161,8,3.0,2.0,18.1,36,88,0.0
3,Madame Bovary,49,2,5,50,37,12,4,0,18,...,130,131,114,31,0.0,0.0,15.0,0,15,0.0
4,Rose of Washington Square,43,1,6,30,21,2,0,0,28,...,88,91,121,12,4.0,9.0,10.8,32,82,0.0


In [27]:
# Split the data set based on the class label (BERGMAN) attribute
test_data_bergman = test_data[test_data['BERGMAN'] == 1]
test_data_bergman

Unnamed: 0,Film,Pan,Tilt,Pan w. Tilt,Track,Track w. Pan & tilt,Crane,Zoom,Zoom w. Pan & Tilt,BCU,...,MS,MLS,LS,VLS,INS,POV,ASL,RA,CU,BERGMAN
7,Sommaren med Monika,49,17,82,25,72,0,0,0,40,...,91,68,135,59,28.0,0.0,23.9,3,49,1.0
12,Hamnstad,54,15,44,19,61,1,0,0,41,...,94,64,82,7,7.0,1.0,16.6,25,113,1.0
14,Tystnaden,61,12,58,12,42,0,0,0,63,...,65,56,83,2,9.0,10.0,19.8,34,150,1.0
19,För att inte tala om alla dessa kvinnor,35,5,9,18,9,0,2,0,34,...,72,51,178,5,9.0,4.0,16.0,18,85,1.0
27,Fängelse,32,8,16,24,83,0,0,0,103,...,60,71,88,12,13.0,2.0,18.9,20,97,1.0
35,Skepp till Indialand,40,1,16,6,13,3,0,0,16,...,95,79,150,17,11.0,3.0,16.1,32,72,1.0
40,Fanny och Alexander,47,11,46,17,53,1,5,3,185,...,51,38,81,3,10.0,5.0,15.1,38,88,1.0
42,Såsom i en spegel,55,6,31,19,12,0,0,0,37,...,72,58,97,14,4.0,2.0,16.4,54,106,1.0
45,Nara livet,45,7,18,24,25,0,0,0,113,...,100,46,17,0,4.0,5.0,14.4,44,140,1.0
47,The Serpent's Egg,32,7,41,16,50,10,14,22,167,...,48,34,73,5,13.0,7.0,13.7,41,111,1.0


In [28]:
test_data_non_bergman = test_data[test_data['BERGMAN'] == 0]
test_data_non_bergman.head()

Unnamed: 0,Film,Pan,Tilt,Pan w. Tilt,Track,Track w. Pan & tilt,Crane,Zoom,Zoom w. Pan & Tilt,BCU,...,MS,MLS,LS,VLS,INS,POV,ASL,RA,CU,BERGMAN
0,Gaslight,7,34,27,29,20,3,0,0,25,...,105,97,70,0,9.0,8.0,10.5,30,83,0.0
1,"Frisco Kid, The",29,5,19,11,3,0,2,1,81,...,53,56,87,13,7.0,8.0,5.1,44,130,0.0
2,Hets,56,15,47,18,67,2,0,0,21,...,80,65,161,8,3.0,2.0,18.1,36,88,0.0
3,Madame Bovary,49,2,5,50,37,12,4,0,18,...,130,131,114,31,0.0,0.0,15.0,0,15,0.0
4,Rose of Washington Square,43,1,6,30,21,2,0,0,28,...,88,91,121,12,4.0,9.0,10.8,32,82,0.0


## Generate the test instance of the Gaussian model

In [29]:
from sdv.tabular import GaussianCopula

# BERGMAN test data
model_test_bergman = GaussianCopula(
    primary_key='Film' # ensures no duplicate key values
    )
model_test_bergman.fit(test_data_bergman)

# NON-BERGMAN test data
model_test_non_bergman = GaussianCopula(
    primary_key='Film', # ensures no duplicate key values
    )
model_test_non_bergman.fit(test_data_non_bergman)

  a = (self.min - loc) / scale
  b = (self.max - loc) / scale
  a = (self.min - loc) / scale
  b = (self.max - loc) / scale


# Sample synthetic test Bergman data

In [30]:
test_synth_data_gc_bergman = model_test_bergman.sample(num_rows=len(test_data_bergman)) # Generate a synthetic data set with the same number of observations as the original data

#Calculate the 'Total' number of camera moves per film and add it to the dataframe
test_sum_camera_moves = test_synth_data_gc_bergman['Pan'] + test_synth_data_gc_bergman['Tilt'] + test_synth_data_gc_bergman['Pan w. Tilt'] + test_synth_data_gc_bergman['Track'] + test_synth_data_gc_bergman['Track w. Pan & tilt'] + test_synth_data_gc_bergman['Crane'] + test_synth_data_gc_bergman['Zoom'] + test_synth_data_gc_bergman['Zoom w. Pan & Tilt']
test_synth_data_gc_bergman.insert(10, 'Total', test_sum_camera_moves)

# Add the 'Total' column back into the original Bergman data
test_total_camera_moves = test_data_bergman['Pan'] + test_data_bergman['Tilt'] + test_data_bergman['Pan w. Tilt'] + test_data_bergman['Track'] + test_data_bergman['Track w. Pan & tilt'] + test_data_bergman['Crane'] + test_data_bergman['Zoom'] + test_data_bergman['Zoom w. Pan & Tilt']
test_data_bergman.insert(10, 'Total', test_total_camera_moves)

# Rename the films in the synthetic data to indicate that they are BERGMAN films
test_synth_data_gc_bergman['Film'] = 'Test_BERGMAN_' + test_synth_data_gc_bergman['Film'].astype(str)

test_synth_data_gc_bergman.head()

Unnamed: 0,Film,Pan,Tilt,Pan w. Tilt,Track,Track w. Pan & tilt,Crane,Zoom,Zoom w. Pan & Tilt,BCU,...,MS,MLS,LS,VLS,INS,POV,ASL,RA,CU,BERGMAN
0,Test_BERGMAN_a,55,7,59,17,42,3,0,1,38,...,100,78,159,24,10.0,6.0,16.3,17,52,1.0
1,Test_BERGMAN_b,46,13,65,13,42,4,9,10,95,...,42,54,147,39,25.0,2.0,22.7,13,57,1.0
2,Test_BERGMAN_c,44,10,56,10,60,1,0,0,23,...,98,75,65,21,9.0,1.0,18.7,21,68,1.0
3,Test_BERGMAN_d,37,2,9,7,11,2,8,6,40,...,62,49,85,1,7.0,4.0,14.8,49,155,1.0
4,Test_BERGMAN_e,39,9,31,30,23,1,4,4,170,...,52,20,93,13,5.0,5.0,16.0,54,110,1.0


In [31]:
test_data_bergman.head()

Unnamed: 0,Film,Pan,Tilt,Pan w. Tilt,Track,Track w. Pan & tilt,Crane,Zoom,Zoom w. Pan & Tilt,BCU,...,MS,MLS,LS,VLS,INS,POV,ASL,RA,CU,BERGMAN
7,Sommaren med Monika,49,17,82,25,72,0,0,0,40,...,91,68,135,59,28.0,0.0,23.9,3,49,1.0
12,Hamnstad,54,15,44,19,61,1,0,0,41,...,94,64,82,7,7.0,1.0,16.6,25,113,1.0
14,Tystnaden,61,12,58,12,42,0,0,0,63,...,65,56,83,2,9.0,10.0,19.8,34,150,1.0
19,För att inte tala om alla dessa kvinnor,35,5,9,18,9,0,2,0,34,...,72,51,178,5,9.0,4.0,16.0,18,85,1.0
27,Fängelse,32,8,16,24,83,0,0,0,103,...,60,71,88,12,13.0,2.0,18.9,20,97,1.0


## Evaluate test Bergman data

In [32]:
# KS-Test and Chi-2 Test evaluation scores
test_eval_gs_KS_bergman=evaluate(test_synth_data_gc_bergman, test_data_bergman, metrics=['KSTest'])
test_eval_gs_Chi_bergman=evaluate(test_synth_data_gc_bergman, test_data_bergman, metrics=['CSTest'])



In [33]:
print("Gaussian Copula KS Test evaluation score =", test_eval_gs_KS_bergman)
print("Gaussian Copula Chi-Squared Test evaluation score =", test_eval_gs_Chi_bergman)

Gaussian Copula KS Test evaluation score = 0.7414965986394557
Gaussian Copula Chi-Squared Test evaluation score = 0.0


In [34]:
# Calculate the KS-Test D-statistic
test_bergman_D = 1-test_eval_gs_KS_bergman

# Calculate KS-Test D_critical value
alpha = 0.05 # significance level
c_alpha = np.sqrt(-0.5*ln(alpha))
n = len(test_data_bergman)
m = len(test_synth_data_gc_bergman)
test_D_critical = c_alpha*np.sqrt((n+m)/(n*m))

print("The Gaussian Copula KS Test evaluation delivers a D-value of", test_bergman_D, 'with a critical D-value of', test_D_critical)

if test_bergman_D < test_D_critical:
  print('At a significance level of', alpha, 'there is insufficient evidence to reject the null hypothesis.')
else:
  print('At a significance level of', alpha, 'there is sufficient evidence to reject the null hypothesis.')

The Gaussian Copula KS Test evaluation delivers a D-value of 0.2585034013605443 with a critical D-value of 0.4625806704591405
At a significance level of 0.05 there is insufficient evidence to reject the null hypothesis.


# Sample synthetic test non-Bergman data

In [35]:
test_synth_data_gc_non_bergman = model_test_non_bergman.sample(num_rows=len(test_data_non_bergman)) # Generate a synthetic data set with the same number of observations as the original data

#Calculate the 'Total' number of camera moves per film and add it to the dataframe
test_sum_camera_moves_non_berg = test_synth_data_gc_non_bergman['Pan'] + test_synth_data_gc_non_bergman['Tilt'] + test_synth_data_gc_non_bergman['Pan w. Tilt'] + test_synth_data_gc_non_bergman['Track'] + test_synth_data_gc_non_bergman['Track w. Pan & tilt'] + test_synth_data_gc_non_bergman['Crane'] + test_synth_data_gc_non_bergman['Zoom'] + test_synth_data_gc_non_bergman['Zoom w. Pan & Tilt']
test_synth_data_gc_non_bergman.insert(10, 'Total', test_sum_camera_moves_non_berg)

# Add the 'Total' column back into the original Bergman data
test_total_camera_moves_non_berg = test_data_non_bergman['Pan'] + test_data_non_bergman['Tilt'] + test_data_non_bergman['Pan w. Tilt'] + test_data_non_bergman['Track'] + test_data_non_bergman['Track w. Pan & tilt'] + test_data_non_bergman['Crane'] + test_data_non_bergman['Zoom'] + test_data_non_bergman['Zoom w. Pan & Tilt']
test_data_non_bergman.insert(10, 'Total', test_total_camera_moves_non_berg)

# Rename the films in the synthetic data to indicate that they are BERGMAN films
test_synth_data_gc_non_bergman['Film'] = 'Test_NON_BERGMAN_' + test_synth_data_gc_non_bergman['Film'].astype(str)

test_synth_data_gc_non_bergman.head()

Unnamed: 0,Film,Pan,Tilt,Pan w. Tilt,Track,Track w. Pan & tilt,Crane,Zoom,Zoom w. Pan & Tilt,BCU,...,MS,MLS,LS,VLS,INS,POV,ASL,RA,CU,BERGMAN
0,Test_NON_BERGMAN_a,9,10,26,20,19,21,5,7,38,...,81,136,70,19,14.1,7.2,4.8,34,55,0.0
1,Test_NON_BERGMAN_b,35,20,34,6,30,28,0,2,21,...,145,137,89,4,14.6,7.8,9.6,44,110,0.0
2,Test_NON_BERGMAN_c,15,2,10,45,60,57,5,4,6,...,124,118,105,29,1.8,0.2,12.9,9,37,0.0
3,Test_NON_BERGMAN_d,13,9,7,9,20,20,8,6,8,...,81,106,132,57,11.6,3.9,5.0,54,79,0.0
4,Test_NON_BERGMAN_e,44,0,4,4,26,2,5,0,22,...,85,135,202,47,0.0,0.0,6.7,9,45,0.0


In [36]:
test_data_non_bergman.head()

Unnamed: 0,Film,Pan,Tilt,Pan w. Tilt,Track,Track w. Pan & tilt,Crane,Zoom,Zoom w. Pan & Tilt,BCU,...,MS,MLS,LS,VLS,INS,POV,ASL,RA,CU,BERGMAN
0,Gaslight,7,34,27,29,20,3,0,0,25,...,105,97,70,0,9.0,8.0,10.5,30,83,0.0
1,"Frisco Kid, The",29,5,19,11,3,0,2,1,81,...,53,56,87,13,7.0,8.0,5.1,44,130,0.0
2,Hets,56,15,47,18,67,2,0,0,21,...,80,65,161,8,3.0,2.0,18.1,36,88,0.0
3,Madame Bovary,49,2,5,50,37,12,4,0,18,...,130,131,114,31,0.0,0.0,15.0,0,15,0.0
4,Rose of Washington Square,43,1,6,30,21,2,0,0,28,...,88,91,121,12,4.0,9.0,10.8,32,82,0.0


## Ealuate test non-Bergman data

In [37]:
# KS-Test and Chi-2 Test evaluation scores
test_eval_gs_KS_non_bergman=evaluate(test_synth_data_gc_non_bergman, test_data_non_bergman, metrics=['KSTest'])
test_eval_gs_Chi_non_bergman=evaluate(test_synth_data_gc_non_bergman, test_data_non_bergman, metrics=['CSTest'])



In [38]:
print("Non-Bergman data Gaussian Copula KS Test evaluation score =", test_eval_gs_KS_non_bergman)
print("Non-Bergman data Gaussian Copula Chi-Squared Test evaluation score =", test_eval_gs_Chi_non_bergman)

Non-Bergman data Gaussian Copula KS Test evaluation score = 0.7900874635568513
Non-Bergman data Gaussian Copula Chi-Squared Test evaluation score = 0.0


In [39]:
# Calculate the KS-Test D-statistic
test_non_bergman_D = 1-test_eval_gs_KS_non_bergman

# Calculate KS-Test D_critical value
alpha_0 = 0.05 # significance level
c_alpha_0 = np.sqrt(-0.5*ln(alpha_0))
n_0 = len(test_data_non_bergman)
m_0 = len(test_synth_data_gc_non_bergman)
test_D_critical_0 = c_alpha_0*np.sqrt((n_0+m_0)/(n_0*m_0))

print("The non-Bergman data Gaussian Copula KS Test evaluation delivers a D-value of", test_non_bergman_D, 'with a critical D-value of', test_D_critical_0)

if test_non_bergman_D < test_D_critical_0:
  print('At a significance level of', alpha_0, 'there is insufficient evidence to reject the null hypothesis.')
else:
  print('At a significance level of', alpha_0, 'there is sufficient evidence to reject the null hypothesis.')

The non-Bergman data Gaussian Copula KS Test evaluation delivers a D-value of 0.20991253644314867 with a critical D-value of 0.2472597689431836
At a significance level of 0.05 there is insufficient evidence to reject the null hypothesis.


# Save the synthetic test data

In [40]:
# Label the real and synthetic data
test_data['Synth']='0'
test_synth_data_gc_bergman['Synth']='1'
test_synth_data_gc_non_bergman['Synth']='1'

# Add the 'Total' column back into the original data
test_sum_camera_moves_data = test_data['Pan'] + test_data['Tilt'] + test_data['Pan w. Tilt'] + test_data['Track'] + test_data['Track w. Pan & tilt'] + test_data['Crane'] + test_data['Zoom'] + test_data['Zoom w. Pan & Tilt']
test_data.insert(10, 'Total', test_sum_camera_moves_data)

In [41]:
# Save the Bergman data
test_synth_data_gc_bergman.to_csv('test_synthetic_train_bergman.csv', encoding = 'utf-8-sig') 
# files.download('synthetic_train_bergman.csv') # download data set

# Save the non-Bergman data
test_synth_data_gc_non_bergman.to_csv('test_synthetic_train_non_bergman.csv', encoding = 'utf-8-sig') 
# files.download('synthetic_train_non_bergman.csv') # download data set

# Combine the data sets and save
test_synth_train_combined = test_synth_data_gc_non_bergman.append(test_synth_data_gc_bergman)
test_synth_train_combined.to_csv('test_synth_train_combined.csv', encoding = 'utf-8-sig') 

# Combine real and synthetic test data

In [42]:
test_synth_train_combined.head()

Unnamed: 0,Film,Pan,Tilt,Pan w. Tilt,Track,Track w. Pan & tilt,Crane,Zoom,Zoom w. Pan & Tilt,BCU,...,MLS,LS,VLS,INS,POV,ASL,RA,CU,BERGMAN,Synth
0,Test_NON_BERGMAN_a,9,10,26,20,19,21,5,7,38,...,136,70,19,14.1,7.2,4.8,34,55,0.0,1
1,Test_NON_BERGMAN_b,35,20,34,6,30,28,0,2,21,...,137,89,4,14.6,7.8,9.6,44,110,0.0,1
2,Test_NON_BERGMAN_c,15,2,10,45,60,57,5,4,6,...,118,105,29,1.8,0.2,12.9,9,37,0.0,1
3,Test_NON_BERGMAN_d,13,9,7,9,20,20,8,6,8,...,106,132,57,11.6,3.9,5.0,54,79,0.0,1
4,Test_NON_BERGMAN_e,44,0,4,4,26,2,5,0,22,...,135,202,47,0.0,0.0,6.7,9,45,0.0,1


In [43]:
test_data.head()

Unnamed: 0,Film,Pan,Tilt,Pan w. Tilt,Track,Track w. Pan & tilt,Crane,Zoom,Zoom w. Pan & Tilt,BCU,...,MLS,LS,VLS,INS,POV,ASL,RA,CU,BERGMAN,Synth
0,Gaslight,7,34,27,29,20,3,0,0,25,...,97,70,0,9.0,8.0,10.5,30,83,0.0,0
1,"Frisco Kid, The",29,5,19,11,3,0,2,1,81,...,56,87,13,7.0,8.0,5.1,44,130,0.0,0
2,Hets,56,15,47,18,67,2,0,0,21,...,65,161,8,3.0,2.0,18.1,36,88,0.0,0
3,Madame Bovary,49,2,5,50,37,12,4,0,18,...,131,114,31,0.0,0.0,15.0,0,15,0.0,0
4,Rose of Washington Square,43,1,6,30,21,2,0,0,28,...,91,121,12,4.0,9.0,10.8,32,82,0.0,0


In [44]:
test_synth_train_real_synth_combined = test_synth_train_combined.append(test_data)
test_synth_train_real_synth_combined
test_synth_train_real_synth_combined.to_csv('test_synth_train_real_synth_combined.csv', encoding = 'utf-8-sig') 

In [45]:
test_synth_train_real_synth_combined

Unnamed: 0,Film,Pan,Tilt,Pan w. Tilt,Track,Track w. Pan & tilt,Crane,Zoom,Zoom w. Pan & Tilt,BCU,...,MLS,LS,VLS,INS,POV,ASL,RA,CU,BERGMAN,Synth
0,Test_NON_BERGMAN_a,9,10,26,20,19,21,5,7,38,...,136,70,19,14.1,7.2,4.8,34,55,0.0,1
1,Test_NON_BERGMAN_b,35,20,34,6,30,28,0,2,21,...,137,89,4,14.6,7.8,9.6,44,110,0.0,1
2,Test_NON_BERGMAN_c,15,2,10,45,60,57,5,4,6,...,118,105,29,1.8,0.2,12.9,9,37,0.0,1
3,Test_NON_BERGMAN_d,13,9,7,9,20,20,8,6,8,...,106,132,57,11.6,3.9,5.0,54,79,0.0,1
4,Test_NON_BERGMAN_e,44,0,4,4,26,2,5,0,22,...,135,202,47,0.0,0.0,6.7,9,45,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58,Each Dawn I Die,50,7,44,28,31,0,0,0,27,...,70,88,1,6.0,8.0,7.9,27,165,0.0,0
59,Meteor,10,5,30,29,50,5,1,3,34,...,53,107,97,34.0,12.0,5.7,29,68,0.0,0
60,Jungfrukällan,31,11,38,30,23,0,0,0,63,...,65,107,5,4.0,4.0,13.3,42,62,1.0,0
61,Saraband,12,4,10,12,19,0,16,5,232,...,7,44,6,5.0,3.0,15.7,61,100,1.0,0
