In [7]:
import pandas as pd

# Load the original dataset
file_path = 'C:/Users/charu/OneDrive/Desktop/Cpsc final exam/Credit Card Fraud Data/fraud_data.csv'
df = pd.read_csv(file_path)

# Display the initial column names for reference
print("Initial column names:")
print(df.columns)

# Check for problematic entries in the 'is_fraud' column
if 'is_fraud' in df.columns:
    problematic_is_fraud = df[df['is_fraud'].str.contains(r'\"', na=False)]
    print("Problematic 'is_fraud' entries:")
    print(problematic_is_fraud)

    # Clean the 'is_fraud' column by removing any appended date-time strings and keeping only numeric values
    df['is_fraud'] = df['is_fraud'].str.extract(r'(\d)').astype(int)
else:
    print("Column 'is_fraud' not found in the dataset.")

# Replace double quotes in 'merchant' and 'job' columns
df['merchant'] = df['merchant'].str.replace('"', '')
df['job'] = df['job'].str.replace('"', '')

# Convert 'trans_date_trans_time' and 'dob' to datetime, handling errors
df['trans_date_trans_time'] = pd.to_datetime(df['trans_date_trans_time'], errors='coerce')
df['dob'] = pd.to_datetime(df['dob'], errors='coerce')

# Standardize column names by removing spaces, special characters, and converting them to lowercase and single words
df.columns = df.columns.str.replace('[^A-Za-z0-9]+', '', regex=True).str.lower()

# Display the updated column names to check for any changes
print("Updated column names after cleaning:")
print(df.columns)

# Save the cleaned data to a new CSV file
cleaned_file_path = 'C:/Users/charu/OneDrive/Desktop/Cpsc final exam/Credit Card Fraud Data/fraud_data_clean.csv'
df.to_csv(cleaned_file_path, index=False)

print("\nData cleaning is complete. Cleaned data saved to 'fraud_data_clean.csv'.")

# Verify the changes
cleaned_df = pd.read_csv(cleaned_file_path)
print("First few rows of the cleaned dataset:")
print(cleaned_df.head())
print("\nData types after cleaning:")
print(cleaned_df.dtypes)

# Ensure that there are no more problematic 'is_fraud' entries, if it exists
if 'is_fraud' in cleaned_df.columns:
    problematic_is_fraud_final = cleaned_df[cleaned_df['is_fraud'].isnull()]
    print("Final problematic 'is_fraud' entries (should be empty):")
    print(problematic_is_fraud_final)
else:
    print("Column 'is_fraud' not found in the cleaned dataset.")


Initial column names:
Index(['trans_date_trans_time', 'merchant', 'category', 'amt', 'city', 'state',
       'lat', 'long', 'city_pop', 'job', 'dob', 'trans_num', 'merch_lat',
       'merch_long', 'is_fraud'],
      dtype='object')
Problematic 'is_fraud' entries:
     trans_date_trans_time          merchant        category    amt      city  \
1781      11-12-2020 23:19  Thompson-Gleason  health_fitness  19.45  Lakeport   
7780      31-12-2020 23:59   Breitenberg LLC          travel   7.99      Mesa   

     state      lat      long  city_pop           job         dob  \
1781    CA  39.0470 -122.9328     11256    Podiatrist  18-10-1972   
7780    ID  44.6255 -116.4493       129  Cartographer  15-12-1965   

                             trans_num  merch_lat  merch_long  \
1781  bfde75d978bb9905a4a8c87440692a4c  39.251880 -122.490946   
7780  14392d723bb7737606b2700ac791b7aa  44.470525 -117.080888   

                    is_fraud  
1781  1"2020-12-24 16:56:24"  
7780  0"2019-01-01 00:00:4

  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listlike)
  cache_array = _maybe_cache(arg, format, cache, convert_listl