Cleaning the data

In [13]:
import pandas as pd

# Load your data
# Replace 'your_file.csv' with the path to your actual file
file_name = 'RT.IRS_Data_2023'
file_path = file_name + '.csv'
data = pd.read_csv(file_path, dtype=str)

# Drop 'disseminationTimestamp' and its derived columns
data = data.drop(columns=['disseminationTimestamp','sDRreceiptTimestamp','disseminationIdentifier'])

# Convert any empty strings to NaN
data = data.applymap(lambda x: None if x == '' else x)

# Set your threshold
threshold = 0.3 * len(data)

# Collect removed columns and percentages
removed_columns = []
nan_percentages = []

# Identify columns to be removed and collect their names and NaN percentages
for column in data.columns:
    nan_count = data[column].isna().sum()
    nan_percentage = nan_count / len(data)
    
    if nan_percentage >= 0.3:
        removed_columns.append(column)
        nan_percentages.append(f"{nan_percentage:.2%}")

# Display removed columns and percentages in a table
removed_data = pd.DataFrame({
    'Removed Column': removed_columns,
    '% NaN Values': nan_percentages
})
print("Removed Columns:")
print(removed_data)

# Remove identified columns
cleaned_data = data.drop(columns=removed_columns)

# Display remaining columns
remaining_columns = list(cleaned_data.columns)
print("\nRemaining Columns:")
print(remaining_columns)

# Display the number of remaining columns
print(f"\nNumber of Remaining Columns: {len(remaining_columns)}")

print("\nRemoved Columns:")
print(removed_columns)



# Write cleaned data back to CSV
# Replace 'cleaned_file.csv' with the desired name/path for your output file
cleaned_file_path = file_name + '_Cleaned.csv'
cleaned_data.to_csv(cleaned_file_path, index=False)


Removed Columns:
              Removed Column % NaN Values
0         amendmentIndicator       99.72%
1        blockTradeIndicator       76.33%
2         embeddedOptionType      100.00%
3          firstExerciseDate      100.00%
4             leg1CallAmount      100.00%
..                       ...          ...
61        leg2SpreadNotation      100.00%
62    multiCurrencyIndicator       99.74%
63  leg1ResetFrequencyPeriod       97.62%
64     optionExerciseEndDate      100.00%
65             leg2FixedRate       98.23%

[66 rows x 2 columns]

Remaining Columns:
['action', 'assetClass', 'cleared', 'customBasketIndicator', 'deliveryType', 'effectiveDate', 'event', 'eventDateTime', 'executionDateTime', 'expirationDate', 'instrumentType', 'leg1FixedRateDayCount', 'leg1FloatingRateDayCount', 'leg1NotionalAmount', 'leg1NotionalCurrency', 'leg1NotionalScheduleType', 'leg1SettlementCurrency', 'leg1UnderlyingAssetOrContractType', 'leg2FloatingRateDayCount', 'leg2NotionalAmount', 'leg2NotionalCurren

  data = data.applymap(lambda x: None if x == '' else x)
