In [1]:
import os

import pandas as pd

In [2]:
# Location of the cleaned flight dataset. The default is a machine-specific
# absolute path; set the FLIGHT_DATA_CSV environment variable to point at the
# file on another machine instead of editing this cell.
file_path = os.environ.get(
    'FLIGHT_DATA_CSV',
    '/Users/jatinkumarparmar/Documents/GitHub/dsp-g6-s1-25-tfd/data/Clean_Dataset.csv',
)

df = pd.read_csv(file_path)

In [3]:
import pandas as pd
import numpy as np
import random

def introduce_specific_errors(df, n_errors=500, seed=None):
    """Return a copy of ``df`` with seven kinds of data-quality errors injected.

    The input frame is not modified. For each error type, up to ``n_errors``
    rows are sampled without replacement (fewer when not enough candidate rows
    exist). Samples for different error types are drawn independently, so a
    row may receive several errors, and step 7 may overwrite a 'Yes'/'No'
    duration written by step 4.

    Injected errors:
        1. ``price`` set to NaN on rows where ``airline == 'SpiceJet'``.
        2. ``days_left`` multiplied by -1.
        3. ``destination_city`` overwritten with ``source_city``.
        4. ``duration`` replaced by the strings 'Yes' / 'No'.
        5. ``travel_class`` set to 'Premium'.
        6. ``flight`` set to 'UK-<100..999>' on rows where
           ``airline == 'Air_India'``.
        7. ``duration`` set to a float in [20.1, 25.0) on rows where
           ``stops == 'zero'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``airline``, ``price``, ``days_left``,
        ``source_city``, ``destination_city``, ``duration``, ``travel_class``,
        ``flight`` and ``stops``.
    n_errors : int, default 500
        Maximum number of rows to corrupt per error type.
    seed : int or None, default None
        When given, all random draws come from a private generator seeded with
        this value, making the result reproducible. When None, NumPy's global
        generator is used, so a prior ``np.random.seed(...)`` call still
        controls the outcome.

    Returns
    -------
    pandas.DataFrame
        The corrupted copy. ``duration`` always has object dtype in the result.
    """
    # One random source for every draw: the global NumPy generator when no
    # seed is supplied, otherwise a private seeded generator that leaves the
    # global state untouched.
    rng = np.random if seed is None else np.random.RandomState(seed)

    df_error = df.copy()

    def _pick(candidates):
        """Sample up to ``n_errors`` distinct index labels from ``candidates``."""
        count = min(n_errors, len(candidates))
        if count <= 0:
            return candidates[:0]
        return rng.choice(candidates, size=count, replace=False)

    # 1. Blank value in price column for SpiceJet airline
    spicejet_rows = _pick(df_error.index[(df_error['airline'] == 'SpiceJet').to_numpy()])
    if len(spicejet_rows) > 0:
        # Cast explicitly instead of relying on implicit int -> float
        # upcasting when NaN is assigned (deprecated in recent pandas).
        if pd.api.types.is_integer_dtype(df_error['price']):
            df_error['price'] = df_error['price'].astype('float64')
        df_error.loc[spicejet_rows, 'price'] = np.nan

    # 2. Negative value in days_left column
    days_rows = _pick(df_error.index)
    if len(days_rows) > 0:
        df_error.loc[days_rows, 'days_left'] *= -1

    # 3. Same source and destination cities
    city_rows = _pick(df_error.index)
    if len(city_rows) > 0:
        df_error.loc[city_rows, 'destination_city'] = df_error.loc[city_rows, 'source_city']

    # 4. Yes and No values in duration column
    # The column must be object dtype to hold strings next to numbers.
    df_error['duration'] = df_error['duration'].astype('object')
    duration_rows = _pick(df_error.index)
    if len(duration_rows) > 0:
        df_error.loc[duration_rows, 'duration'] = rng.choice(['Yes', 'No'], size=len(duration_rows))

    # 5. Invalid class types (add "Premium" besides Economy/Business)
    class_rows = _pick(df_error.index)
    if len(class_rows) > 0:
        df_error.loc[class_rows, 'travel_class'] = 'Premium'

    # 6. Air India airline has flights with Vistara flight numbers
    air_india_rows = _pick(df_error.index[(df_error['airline'] == 'Air_India').to_numpy()])
    if len(air_india_rows) > 0:
        # randint's upper bound is exclusive, so this yields 100..999.
        numbers = rng.randint(100, 1000, size=len(air_india_rows))
        df_error.loc[air_india_rows, 'flight'] = ['UK-' + str(number) for number in numbers]

    # 7. Assign random durations > 20 hours to some "zero stops" flights
    zero_stop_rows = _pick(df_error.index[(df_error['stops'] == 'zero').to_numpy()])
    if len(zero_stop_rows) > 0:
        df_error.loc[zero_stop_rows, 'duration'] = rng.uniform(20.1, 25.0, size=len(zero_stop_rows))

    return df_error

def verify_errors(original_df, error_df):
    """Print a heading and up to five matching rows for each injected error type.

    Parameters
    ----------
    original_df : pandas.DataFrame
        Accepted for interface compatibility; not read by this function.
    error_df : pandas.DataFrame
        Frame to inspect. Reads the columns ``airline``, ``price``,
        ``days_left``, ``source_city``, ``destination_city``, ``duration``,
        ``travel_class``, ``flight`` and ``stops``.

    Returns
    -------
    None
        Results are written to standard output only.
    """

    def _long_nonstop():
        # 'duration' may hold 'Yes'/'No' strings; coerce those to NaN so the
        # numeric comparison simply skips them.
        hours = pd.to_numeric(error_df['duration'], errors='coerce')
        return (error_df['stops'] == 'zero') & (hours > 20)

    # Each entry pairs a heading with a callable building the row filter. The
    # filters are built lazily so each one is evaluated only after its heading
    # has been printed.
    report = (
        ("1. SpiceJet flights with blank prices:",
         lambda: (error_df['airline'] == 'SpiceJet') & (error_df['price'].isna())),
        ("\n2. Negative days_left values:",
         lambda: error_df['days_left'] < 0),
        ("\n3. Same source and destination cities:",
         lambda: error_df['source_city'] == error_df['destination_city']),
        ("\n4. Yes/No values in duration (before numeric conversion):",
         lambda: error_df['duration'].astype(str).isin(['Yes', 'No'])),
        ("\n5. Premium class entries:",
         lambda: error_df['travel_class'] == 'Premium'),
        ("\n6. Air India with Vistara flight numbers:",
         lambda: (error_df['airline'] == 'Air_India') & (error_df['flight'].str.startswith('UK-'))),
        ("\n7. Zero stops with duration > 20 hours:",
         _long_nonstop),
    )

    for heading, make_filter in report:
        print(heading)
        print(error_df[make_filter()].head())


# Seed the global NumPy and `random` generators before injecting errors so a
# fresh "Restart & Run All" corrupts the same rows and writes the same CSV.
ERROR_SEED = 42
np.random.seed(ERROR_SEED)
random.seed(ERROR_SEED)

# Introduce errors
df_with_errors = introduce_specific_errors(df)

# Verify the introduced errors
verify_errors(df, df_with_errors)

# Save the dataset with errors (written relative to the current working
# directory, without the DataFrame index)
df_with_errors.to_csv('flight_dataset_with_errors.csv', index=False)

1. SpiceJet flights with blank prices:
     Unnamed: 0   airline   flight source_city departure_time stops  \
1             1  SpiceJet  SG-8157       Delhi  Early_Morning  zero   
400         400  SpiceJet  SG-1061       Delhi  Early_Morning  zero   
449         449  SpiceJet  SG-2277       Delhi      Afternoon   one   
620         620  SpiceJet  SG-2976       Delhi        Evening   one   
739         739  SpiceJet  SG-8709       Delhi        Evening  zero   

    arrival_time destination_city travel_class duration  days_left  price  
1        Morning           Mumbai      Economy     2.33          1    NaN  
400      Morning           Mumbai      Economy     2.33          4    NaN  
449      Evening           Mumbai      Economy    25.75          4    NaN  
620      Morning           Mumbai      Economy    15.25          5    NaN  
739        Night           Mumbai      Economy     2.17          6    NaN  

2. Negative days_left values:
      Unnamed: 0    airline   flight source_cit