In [1]:
import pandas as pd

In [3]:
import pandas as pd
import numpy as np
import random
from faker import Faker

# Load the base IMDB table
base_df = pd.read_csv(r"D:\uni\AdvancedTopics\project2\PossibileDB\IMDB\IMDB_Ver_0.csv")

# Initialize Faker for realistic fake data
fake = Faker()

# Set random seed for reproducibility
np.random.seed(42)
random.seed(42)

## Table 1: Movie Financials (could join on Series_Title)
num_movies = len(base_df)
movie_financials = pd.DataFrame({
    'Series_Title': base_df['Series_Title'],
    'Production_Budget': [random.randint(1, 200) * 1000000 for _ in range(num_movies)],
    'Box_Office_Gross': lambda x: [int(b * random.uniform(0.5, 10.0) for b in x['Production_Budget'])],
    'Profitability_Ratio': lambda x: x['Box_Office_Gross'] / x['Production_Budget'],
    'Primary_Production_Company': [fake.company() for _ in range(num_movies)],
    'Release_Season': [random.choice(['Spring', 'Summer', 'Fall', 'Winter']) for _ in range(num_movies)]
})

## Table 2: Director Information (could join on Series_Title)
directors = pd.DataFrame({
    'Series_Title': base_df['Series_Title'],
    'Director_Name': [fake.name() for _ in range(num_movies)],
    'Director_Gender': [random.choice(['Male', 'Female', 'Non-binary']) for _ in range(num_movies)],
    'Director_Nationality': [fake.country() for _ in range(num_movies)],
    'Director_Birth_Year': [random.randint(1940, 1990) for _ in range(num_movies)],
    'Director_Awards': [random.randint(0, 20) for _ in range(num_movies)]
})

## Table 3: Actor Information (many-to-many relationship, needs bridge table)
actors = pd.DataFrame({
    'Actor_ID': range(1, 201),
    'Actor_Name': [fake.name() for _ in range(200)],
    'Actor_Gender': [random.choice(['Male', 'Female']) for _ in range(200)],
    'Actor_Nationality': [fake.country() for _ in range(200)],
    'Actor_Debut_Year': [random.randint(1970, 2020) for _ in range(200)],
    'Actor_Awards': [random.randint(0, 15) for _ in range(200)]
})

# Bridge table for movie-actor relationships
movie_actor_bridge = pd.DataFrame({
    'Series_Title': random.choices(base_df['Series_Title'], k=500),
    'Actor_ID': random.choices(actors['Actor_ID'], k=500),
    'Role_Type': random.choices(['Lead', 'Supporting', 'Cameo'], weights=[0.5, 0.45, 0.05], k=500),
    'Salary': [random.randint(50000, 5000000) for _ in range(500)]
}).drop_duplicates()

## Table 4: Awards Information (could join on Series_Title)
awards = pd.DataFrame({
    'Series_Title': random.choices(base_df['Series_Title'], k=150),
    'Award_Name': random.choices(['Oscar', 'Golden Globe', 'BAFTA', 'Cannes', 'Sundance'], k=150),
    'Award_Category': random.choices(['Best Picture', 'Best Director', 'Best Actor', 'Best Actress', 
                                     'Best Screenplay', 'Best Cinematography'], k=150),
    'Award_Year': [random.randint(2000, 2023) for _ in range(150)],
    'Won': random.choices([True, False], weights=[0.3, 0.7], k=150)
}).drop_duplicates()

## Table 5: Streaming Availability (could join on Series_Title)
streaming = pd.DataFrame({
    'Series_Title': base_df['Series_Title'],
    'Available_On': [random.choice(['Netflix', 'Amazon Prime', 'HBO Max', 'Disney+', 'Hulu', 'None']) 
                     for _ in range(num_movies)],
    'Subscription_Required': [random.choice([True, False]) for _ in range(num_movies)],
    'Release_Date': [fake.date_between(start_date='-10y', end_date='today') for _ in range(num_movies)],
    'Price': [round(random.uniform(0, 19.99), 2) if random.random() > 0.7 else 0 for _ in range(num_movies)]
})

## Table 6: User Ratings (could join on Series_Title)
user_ratings = pd.DataFrame({
    'Series_Title': random.choices(base_df['Series_Title'], k=1000),
    'User_ID': [fake.uuid4() for _ in range(1000)],
    'User_Rating': [round(random.uniform(1, 10), 1) for _ in range(1000)],
    'Review_Date': [fake.date_between(start_date='-5y', end_date='today') for _ in range(1000)],
    'Review_Text': [fake.text(max_nb_chars=200) for _ in range(1000)]
})

## Table 7: Country-Specific Data (could join on Series_Title)
country_data = pd.DataFrame({
    'Series_Title': random.choices(base_df['Series_Title'], k=300),
    'Country': [fake.country() for _ in range(300)],
    'Release_Date': [fake.date_between(start_date='-30y', end_date='today') for _ in range(300)],
    'Localized_Title': [fake.catch_phrase() if random.random() > 0.7 else None for _ in range(300)],
    'Censorship_Rating': random.choices(['G', 'PG', 'PG-13', 'R', 'NC-17', 'Unrated'], k=300)
}).drop_duplicates()

# Save all tables to CSV files
tables = {
    'movie_financials': movie_financials,
    'directors': directors,
    'actors': actors,
    'movie_actor_bridge': movie_actor_bridge,
    'awards': awards,
    'streaming': streaming,
    'user_ratings': user_ratings,
    'country_data': country_data
}

for name, table in tables.items():
    table.to_csv(f'{name}.csv', index=False)

print("Generated synthetic tables:")
print("\n".join(tables.keys()))

Generated synthetic tables:
movie_financials
directors
actors
movie_actor_bridge
awards
streaming
user_ratings
country_data


In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

# Set random seed for reproducibility
np.random.seed(42)

# Initialize ground truth dataframe
ground_truth = pd.DataFrame(columns=[
    'version_name',
    'version_number',
    'base_table',
    'joined_tables',
    'join_type',
    'missing_data_introduced',
    'missing_data_percentage',
    'description',
    'rows_in_output',
    'columns_in_output',
    'created_at'
])

# Function to record version metadata
def record_version(name, number, base, joins, join_type, missing, missing_pct, desc, df):
    new_record = pd.DataFrame([{
        'version_name': name,
        'version_number': number,
        'base_table': base,
        'joined_tables': joins,
        'join_type': join_type,
        'missing_data_introduced': missing,
        'missing_data_percentage': missing_pct,
        'description': desc,
        'rows_in_output': len(df),
        'columns_in_output': len(df.columns),
        'created_at': datetime.now()
    }])
    return new_record
# Function to introduce missing values
def add_missing_values(df, pct=0.1, exclude_cols=['Series_Title', 'Released_Year']):
    df_missing = df.copy()
    for col in df_missing.columns:
        if col not in exclude_cols:
            mask = np.random.random(len(df_missing)) < pct
            df_missing.loc[mask, col] = np.nan
    return df_missing

# Load all tables
base_df = pd.read_csv('sinteticDB/IMDB/IMDB_Base.csv')
movie_financials = pd.read_csv('sinteticDB/IMDB/externalTables/movie_financials.csv')
directors = pd.read_csv('sinteticDB/IMDB/externalTables/directors.csv')
streaming = pd.read_csv('sinteticDB/IMDB/externalTables/streaming.csv')
country_data = pd.read_csv('sinteticDB/IMDB/externalTables/country_data.csv')
awards = pd.read_csv('sinteticDB/IMDB/externalTables/awards.csv')
user_ratings = pd.read_csv('sinteticDB/IMDB/externalTables/user_ratings.csv')
movie_actor_bridge = pd.read_csv('sinteticDB/IMDB/externalTables/movie_actor_bridge.csv')
actors = pd.read_csv('sinteticDB/IMDB/externalTables/actors.csv')

# Define missing value percentages to simulate
missing_percentages = [0.05, 0.15]  # 5% and 15% missing data

# Version counter
version_counter = 1

# Original versions without missing data
versions = [
    # Version 1 left: Base + Financials
    ('imdb_with_financials_l', 
     lambda: pd.merge(base_df, movie_financials, on='Series_Title', how='left'),
     "Base table with financial information added","left"),

     # Version 1 inner: Base + Financials
    ('imdb_with_financials_i', 
     lambda: pd.merge(base_df, movie_financials, on='Series_Title', how='inner'),
     "Base table with financial information added","inner"),

     # Version 1 right: Base + Financials
    ('imdb_with_financials_r', 
     lambda: pd.merge(base_df, movie_financials, on='Series_Title', how='right'),
     "Base table with financial information added","right"),
    
    # Version 2 left: Base + Directors
    ('imdb_with_directors_l', 
     lambda: pd.merge(base_df, directors, on='Series_Title', how='left'),
     "Base table with director information added","left"),

    # Version 2 inner: Base + Directors
    ('imdb_with_directors_i', 
     lambda: pd.merge(base_df, directors, on='Series_Title', how='inner'),
     "Base table with director information added","inner"),
    
    # Version 2 right: Base + Directors
    ('imdb_with_directors_r', 
     lambda: pd.merge(base_df, directors, on='Series_Title', how='right'),
     "Base table with director information added","right"),

    # Version 3 left: Base + Streaming
    ('imdb_with_streaming_l', 
     lambda: pd.merge(base_df, streaming, on='Series_Title', how='left'),
     "Base table with streaming availability added","left"),

    # Version 3 inner: Base + Streaming
    ('imdb_with_streaming_i', 
     lambda: pd.merge(base_df, streaming, on='Series_Title', how='inner'),
     "Base table with streaming availability added","inner"),

    # Version 3 right: Base + Streaming
    ('imdb_with_streaming_r', 
     lambda: pd.merge(base_df, streaming, on='Series_Title', how='right'),
     "Base table with streaming availability added","right"),
    
    # Version 4 left: Base + Country Data
    ('imdb_with_country_data_l', 
     lambda: pd.merge(base_df, country_data, on='Series_Title', how='left'),
     "Base table with country-specific release data added","left"),

    # Version 4 inner: Base + Country Data
    ('imdb_with_country_data_i', 
     lambda: pd.merge(base_df, country_data, on='Series_Title', how='inner'),
     "Base table with country-specific release data added","inner"),

    # Version 4 right: Base + Country Data
    ('imdb_with_country_data_r', 
     lambda: pd.merge(base_df, country_data, on='Series_Title', how='right'),
     "Base table with country-specific release data added","right"),
    
    # Version 5 left: Base + Awards
    ('imdb_with_awards_l', 
     lambda: pd.merge(base_df, awards, on='Series_Title', how='left'),
     "Base table with awards information (may have duplicate movie rows)","left"),

    # Version 5 inner: Base + Awards
    ('imdb_with_awards_i', 
     lambda: pd.merge(base_df, awards, on='Series_Title', how='inner'),
     "Base table with awards information (may have duplicate movie rows)","inner"),

    # Version 5 right: Base + Awards
    ('imdb_with_awards_r', 
     lambda: pd.merge(base_df, awards, on='Series_Title', how='right'),
     "Base table with awards information (may have duplicate movie rows)","right"),
    
    
    # Version 6: Base + Lead Actors
    ('imdb_with_lead_actor', 
     lambda: pd.merge(
         base_df,
         pd.merge(
             movie_actor_bridge[movie_actor_bridge['Role_Type'] == 'Lead'].drop_duplicates('Series_Title', keep='first'),
             actors,
             on='Actor_ID'
         )[['Series_Title', 'Actor_Name']],
         on='Series_Title', how='left'),
     "Base table with name of lead actor added","left"),
    
    # Version 7: Base + Financials + Directors
    ('imdb_with_financials_and_directors', 
     lambda: pd.merge(
         pd.merge(base_df, movie_financials, on='Series_Title', how='left'),
         directors,
         on='Series_Title', how='left'),
     "Base table with both financial and director information","left"),
    
    # Version 10: Comprehensive (will be handled separately)
]

# Create original versions and missing data variants
for version_name, join_func, desc, join_type in versions:
    # Create original version
    df = join_func()
    df.to_csv(f'{version_name}.csv', index=False)
    ground_truth = pd.concat([
        ground_truth,
        record_version(
            version_name, version_counter,
            'IMDB_Base.csv', version_name.split('_with_')[1],
            join_type, False, 0,
            desc, df
        )
    ], ignore_index=True)
    version_counter += 1
    
    # Create versions with missing values
    for pct in missing_percentages:
        missing_version_name = f"{version_name}_missing_{int(pct*100)}pct"
        df_missing = add_missing_values(df, pct)
        df_missing.to_csv(f'{missing_version_name}.csv', index=False)
        ground_truth = pd.concat([
            ground_truth,
            record_version(
                missing_version_name, version_counter,
                'IMDB_Base.csv', version_name.split('_with_')[1],
                'left', True, pct,
                f"{desc} with {int(pct*100)}% missing values",
                df_missing
            )
        ], ignore_index=True)
        version_counter += 1

# Special case for comprehensive version
comp_df = pd.merge(base_df, movie_financials, on='Series_Title', how='left')
comp_df = pd.merge(comp_df, streaming, on='Series_Title', how='left')
comp_df = pd.merge(comp_df, 
                  user_ratings.groupby('Series_Title')['User_Rating'].mean().reset_index(), 
                  on='Series_Title', how='left')
comp_df.to_csv('imdb_comprehensive.csv', index=False)
ground_truth = pd.concat([
    ground_truth,
    record_version(
        'imdb_comprehensive', version_counter,
        'IMDB_Base.csv', 'movie_financials + streaming + user_ratings',
        'left', False, 0,
        "Base table with financials, streaming, and ratings combined",
        comp_df
    )
], ignore_index=True)
version_counter += 1

# Create missing versions for comprehensive
for pct in missing_percentages:
    missing_version_name = f"imdb_comprehensive_missing_{int(pct*100)}pct"
    comp_missing = add_missing_values(comp_df, pct)
    comp_missing.to_csv(f'{missing_version_name}.csv', index=False)
    ground_truth = pd.concat([
        ground_truth,
        record_version(
            missing_version_name, version_counter,
            'IMDB_Base.csv', 'movie_financials + streaming + user_ratings',
            'left', True, pct,
            f"Comprehensive version with {int(pct*100)}% missing values",
            comp_missing
        )
    ], ignore_index=True)
    version_counter += 1

# Save ground truth
ground_truth.to_csv('ground_truth_versions.csv', index=False)

print(f"Created {len(ground_truth)} total versions")
print(f"Saved ground truth to: ground_truth_versions.csv")

In [2]:
import pandas as pd
pd.read_csv(r"sinteticDB\IMDB\ground_truth_versions.csv").head()


Unnamed: 0,version_name,version_number,base_table,joined_tables,join_type,missing_data_introduced,missing_data_percentage,description,rows_in_output,columns_in_output,created_at
0,imdb_with_financials_l,1,IMDB_Base.csv,financials_l,left,False,0.0,Base table with financial information added,800,11,2025-04-07 19:00:56.962902
1,imdb_with_financials_l_missing_5pct,2,IMDB_Base.csv,financials_l,left,True,0.05,Base table with financial information added wi...,800,11,2025-04-07 19:00:56.972452
2,imdb_with_financials_l_missing_15pct,3,IMDB_Base.csv,financials_l,left,True,0.15,Base table with financial information added wi...,800,11,2025-04-07 19:00:56.981968
3,imdb_with_financials_i,4,IMDB_Base.csv,financials_i,inner,False,0.0,Base table with financial information added,800,11,2025-04-07 19:00:56.995157
4,imdb_with_financials_i_missing_5pct,5,IMDB_Base.csv,financials_i,left,True,0.05,Base table with financial information added wi...,800,11,2025-04-07 19:00:57.006195
