In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
# Location of the cleaned flight dataset that the rest of the notebook works on.
# NOTE(review): this is an absolute, user-specific path, so the cell only runs
# on a machine where this exact directory exists.
file_path = (
    '/Users/jatinkumarparmar/Documents/GitHub/dsp_project/dsp-g6-s1-25-tfd/data/Clean_Dataset.csv'
)

# Read the CSV into the module-level frame `df` consumed by later cells.
df = pd.read_csv(filepath_or_buffer=file_path)

In [3]:
def introduce_specific_errors(df, n_errors=500, seed=None):
    """Return a copy of ``df`` with seven kinds of synthetic data errors injected.

    The input frame is never modified. Each error type is applied to at most
    ``n_errors`` randomly chosen rows (fewer when there are not enough
    candidate rows), in this order:

    1. ``airline`` is set to ``None``.
    2. ``duration`` is multiplied by -1.
    3. ``destination_city`` is overwritten with the row's ``source_city``.
    4. ``days_left`` (column cast to ``object``) is replaced by ``'Yes'`` or
       ``'No'``.
    5. ``travel_class`` is set to ``'Premium'``.
    6. ``flight`` is replaced by ``'UK-<n>'`` (``n`` in 100..999) on rows whose
       airline is still ``'Air_India'`` after step 1.
    7. ``duration`` is replaced by a value drawn uniformly from [20.1, 25.0) on
       rows with ``stops == 'zero'``.

    Rows are sampled independently for each step, so one row may carry several
    errors. The single exception is step 7, which skips rows whose duration was
    negated in step 2; otherwise it would overwrite those negative values and
    silently reduce the number of step-2 errors.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``airline``, ``duration``, ``source_city``,
        ``destination_city``, ``days_left``, ``travel_class``, ``flight`` and
        ``stops``.
    n_errors : int, default 500
        Maximum number of rows affected per error type. Expected to be
        non-negative.
    seed : int or None, default None
        When given, all sampling uses private generators seeded with this
        value, so the result is reproducible and the global ``numpy.random`` /
        ``random`` state is left untouched. When ``None``, the module-level
        generators are used, exactly as before this parameter existed.

    Returns
    -------
    pandas.DataFrame
        The corrupted copy of ``df``.
    """
    # `np.random`/`random` (modules) and `RandomState`/`Random` (instances)
    # expose the same `choice`/`uniform`/`randint` calls, so either can be used
    # interchangeably below.
    np_rng = np.random if seed is None else np.random.RandomState(seed)
    py_rng = random if seed is None else random.Random(seed)

    def pick(candidates):
        """Sample up to ``n_errors`` distinct labels from ``candidates``."""
        return np_rng.choice(candidates, size=min(n_errors, len(candidates)), replace=False)

    df_error = df.copy()

    # 1. Remove airline from the sampled rows
    missing_airline_rows = pick(df_error.index)
    df_error.loc[missing_airline_rows, 'airline'] = None

    # 2. Negative value in duration column
    negated_rows = pick(df_error.index)
    df_error.loc[negated_rows, 'duration'] *= -1

    # 3. Same source and destination cities
    same_city_rows = pick(df_error.index)
    df_error.loc[same_city_rows, 'destination_city'] = df_error.loc[same_city_rows, 'source_city']

    # 4. Yes and No values in days_left column
    # Cast first so string values can live next to the numeric ones.
    df_error['days_left'] = df_error['days_left'].astype('object')
    yes_no_rows = pick(df_error.index)
    df_error.loc[yes_no_rows, 'days_left'] = np_rng.choice(['Yes', 'No'], size=len(yes_no_rows))

    # 5. Invalid class types (add "Premium" besides Economy/Business)
    premium_rows = pick(df_error.index)
    df_error.loc[premium_rows, 'travel_class'] = 'Premium'

    # 6. Air India airline has flights with Vistara flight numbers
    air_india_rows = df_error[df_error['airline'] == 'Air_India'].index
    if len(air_india_rows) > 0:
        mislabeled_rows = pick(air_india_rows)
        vistara_flight_numbers = ['UK-' + str(py_rng.randint(100, 999)) for _ in range(len(mislabeled_rows))]
        df_error.loc[mislabeled_rows, 'flight'] = vistara_flight_numbers

    # 7. Assign random durations > 20 hours to some "zero stops" flights
    zero_stop_rows = df_error[df_error['stops'] == 'zero'].index
    # Leave the rows negated in step 2 alone so both error types survive.
    zero_stop_rows = zero_stop_rows[~zero_stop_rows.isin(negated_rows)]
    if len(zero_stop_rows) > 0:
        long_haul_rows = pick(zero_stop_rows)
        df_error.loc[long_haul_rows, 'duration'] = np_rng.uniform(20.1, 25.0, size=len(long_haul_rows))

    return df_error

def verify_errors(original_df, error_df):
    """Print a sample (first five matches) of each injected error type.

    Only ``error_df`` is inspected; ``original_df`` is accepted but not used.
    For every check a heading is printed, followed by ``.head()`` of the rows
    of ``error_df`` matching that check. Nothing is returned.
    """

    def long_nonstop(frame):
        # Coerce duration to numeric first so non-numeric entries become NaN
        # (and therefore never compare as > 20).
        hours = pd.to_numeric(frame['duration'], errors='coerce')
        return (frame['stops'] == 'zero') & (hours > 20)

    # (heading, mask builder) pairs. The builders are only called inside the
    # loop, so each mask is computed right after its heading is printed.
    checks = [
        ("1. Flights without Airline name:",
         lambda frame: (frame['airline'].isna())),
        ("\n2. Negative duration values:",
         lambda frame: frame['duration'] < 0),
        ("\n3. Same source and destination cities:",
         lambda frame: frame['source_city'] == frame['destination_city']),
        ("\n4. Yes/No values in days_left:",
         lambda frame: frame['days_left'].astype(str).isin(['Yes', 'No'])),
        ("\n5. Premium class entries:",
         lambda frame: frame['travel_class'] == 'Premium'),
        ("\n6. Air India with Vistara flight numbers:",
         lambda frame: (frame['airline'] == 'Air_India') & (frame['flight'].str.startswith('UK-'))),
        ("\n7. Zero stops with duration > 20 hours:",
         long_nonstop),
    ]

    for heading, build_mask in checks:
        print(heading)
        matching_rows = error_df[build_mask(error_df)]
        print(matching_rows.head())


# Seed both random sources that introduce_specific_errors draws from (NumPy's
# global generator and the stdlib `random` module) so that Restart & Run All
# regenerates the same corrupted dataset every time.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# Introduce errors
df_with_errors = introduce_specific_errors(df)

# Verify the introduced errors (prints a sample of each error type)
verify_errors(df, df_with_errors)

# Save the dataset with errors
# NOTE(review): absolute, user-specific output path — only valid on a machine
# where this directory exists.
df_with_errors.to_csv('/Users/jatinkumarparmar/Documents/GitHub/dsp_project/dsp-g6-s1-25-tfd/data/flight_dataset_with_errors.csv', index=False)

1. Flights without Airline name:
      Unnamed: 0 airline  flight source_city departure_time stops  \
713          713    None  AI-762       Delhi          Night   one   
1397        1397    None  UK-809       Delhi        Evening   one   
2268        2268    None  AI-502       Delhi      Afternoon   one   
2500        2500    None  AI-465       Delhi        Evening   one   
2722        2722    None  UK-951       Delhi      Afternoon  zero   

     arrival_time destination_city travel_class  duration days_left  price  
713     Afternoon           Mumbai      Economy     16.17         5  13410  
1397      Morning           Mumbai      Economy     12.42         9  12150  
2268      Morning           Mumbai      Economy     19.08        13  12990  
2500        Night           Mumbai      Economy      6.25        14  14670  
2722      Evening           Mumbai      Economy      2.17        16   2700  

2. Negative duration values:
      Unnamed: 0    airline   flight source_city departure_t