In [7]:
import os
import pandas as pd
import numpy as np
import json
from sklearn.preprocessing import LabelEncoder

In [8]:
data_folder = "/Users/abhishekkumar/Desktop/data_quality_project/data1"
cleaned_folder = "/Users/abhishekkumar/Desktop/data_quality_project/cleaned_data"
os.makedirs(cleaned_folder, exist_ok=True)

In [9]:
import json
all_files = [f for f in os.listdir(data_folder) if f.endswith(('.json', '.jsonl'))]
dfs = {}

In [14]:
dfs[f] = df

In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 25641 entries, 0 to 25653
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   data      25641 non-null  object
 1   included  25641 non-null  object
dtypes: object(2)
memory usage: 1.6+ MB


In [16]:
df.describe(include="all")

Unnamed: 0,data,included
count,25641,25641
unique,25641,24820
top,[{'id': '45bbf6ee-348a-4ea6-8f1d-e453c1ecfe96'...,[{'id': 'd911f2fa-646c-5d15-bd61-eaf59ef68379'...
freq,1,5


In [10]:
for f in all_files:
    file_path = os.path.join(data_folder, f)
    with open(file_path, 'r', encoding='utf-8') as file:
        data = [json.loads(line) for line in file]
    df = pd.json_normalize(data)
    df_dedup = df.astype(str).drop_duplicates()  # initial deduplication
    dfs[f] = df_dedup

print(f"Loaded {len(dfs)} datasets.")


Loaded 24 datasets.


In [11]:
cleaned_dfs = {}
cleaning_summary = []

for file_name, df in dfs.items():
    df_clean = df.copy()
    
    # --- Remove rows with missing critical values ---
    if 'headline' in df_clean.columns:
        df_clean = df_clean[df_clean['headline'].notnull()]
    
    # --- Remove duplicate rows ---
    df_clean = df_clean.drop_duplicates()
    
    # --- Standardize text columns ---
    for col in df_clean.select_dtypes(include='object').columns:
        df_clean[col] = df_clean[col].str.strip().str.title()  # normalize capitalization
    
    # --- Standardize dates ---
    if 'event_date' in df_clean.columns:
        df_clean['event_date'] = pd.to_datetime(df_clean['event_date'], errors='coerce')
    
    # --- Correct numeric outliers using IQR ---
    numeric_cols = df_clean.select_dtypes(include=['int64','float64']).columns
    for col in numeric_cols:
        Q1 = df_clean[col].quantile(0.25)
        Q3 = df_clean[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        df_clean[col] = df_clean[col].clip(lower=lower_bound, upper=upper_bound)
    
    # --- Categorical consistency: encode and normalize categories ---
    cat_cols = df_clean.select_dtypes(include='object').columns
    for col in cat_cols:
        df_clean[col] = df_clean[col].astype(str)
        df_clean[col] = df_clean[col].str.replace(r'[^A-Za-z0-9\s]', '', regex=True)  # remove special chars
        df_clean[col] = df_clean[col].str.replace(r'\s+', ' ', regex=True)  # unify spaces
    
    cleaned_dfs[file_name] = df_clean
    
    # --- Document before/after metrics ---
    summary = {
        'file': file_name,
        'raw_rows': df.shape[0],
        'cleaned_rows': df_clean.shape[0],
        'removed_rows': df.shape[0] - df_clean.shape[0],
        'raw_duplicates': df.duplicated().sum(),
        'cleaned_duplicates': df_clean.duplicated().sum(),
        'raw_missing_headline': df['headline'].isnull().sum() if 'headline' in df.columns else 0,
        'clean_missing_headline': df_clean['headline'].isnull().sum() if 'headline' in df_clean.columns else 0
    }
    cleaning_summary.append(summary)
    
    # --- Save cleaned dataset ---
    save_path = os.path.join(cleaned_folder, file_name.replace('.jsonl', '_cleaned.jsonl'))
    df_clean.to_json(save_path, orient='records', lines=True, force_ascii=False)

In [12]:
# ---  Create cleaning summary DataFrame ---
summary_df = pd.DataFrame(cleaning_summary)
summary_excel_path = os.path.join(cleaned_folder, "Phase4_Cleaning_Summary_Enhanced.xlsx")
summary_df.to_excel(summary_excel_path, index=False)

print("Phase 4 Enhanced completed. Cleaned files saved.")
print(f"Cleaning summary saved at: {summary_excel_path}")
summary_df.head()

Phase 4 Enhanced completed. Cleaned files saved.
Cleaning summary saved at: /Users/abhishekkumar/Desktop/data_quality_project/cleaned_data/Phase4_Cleaning_Summary_Enhanced.xlsx


Unnamed: 0,file,raw_rows,cleaned_rows,removed_rows,raw_duplicates,cleaned_duplicates,raw_missing_headline,clean_missing_headline
0,news_events_2025_07_07.00086.jsonl,26131,26131,0,0,0,0,0
1,news_events_2025_07_07.00003.jsonl,25400,25400,0,0,0,0,0
2,news_events_2025_07_07.00001.jsonl,25922,25922,0,0,0,0,0
3,news_events_2025_07_07.00084.jsonl,25644,25644,0,0,0,0,0
4,news_events_2025_07_07.00005.jsonl,26185,26185,0,0,0,0,0


In [13]:
summary_df.head()  # Review metrics
cleaned_dfs['news_events_2025_07_07.00086.jsonl'].head()  # Check cleaned data sample

Unnamed: 0,data,included
0,Id 49C2548F7A5B45E39Bbc84477E7Fd0D0 Type NewsE...,Id 5C5Eebf036415525B95AEa2B75E0E508 Type Compa...
1,Id 49C25F6FEf44406EBa5437B4530E1642 Type NewsE...,Id C9B92FfdA2Ed57878B06F4Bf8D291E57 Type Compa...
2,Id 49C9867F4C204Be9B11AE7B8C9A71Ccb Type NewsE...,Id Af50948AE2B355B885C46A9881A55Ec2 Type Compa...
3,Id 49Cd397D1Fa64C20Beb5622D9Ec185C3 Type NewsE...,Id Caa07793A6E65E1284C8C6F88E68A965 Type Compa...
4,Id 49D417D9279845CbA20852Ed65329797 Type NewsE...,Id 3Bdf4A7591D854B39C8AD626B1A12864 Type Compa...
