In [29]:
!pip install XlsxWriter



In [30]:
import os
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import textwrap
from collections import Counter
import numpy as np

In [31]:
data_folder = "/Users/abhishekkumar/Desktop/data_quality_project/data1"
profile_folder = os.path.join("..", "docs", "phase2_profiles")
os.makedirs(profile_folder, exist_ok=True)

In [32]:
all_files = [f for f in os.listdir(data_folder) if f.endswith(('.json', '.jsonl'))]
print(f"Files found ({len(all_files)}): {all_files}")

Files found (24): ['news_events_2025_07_07.00086.jsonl', 'news_events_2025_07_07.00003.jsonl', 'news_events_2025_07_07.00001.jsonl', 'news_events_2025_07_07.00084.jsonl', 'news_events_2025_07_07.00005.jsonl', 'news_events_2025_07_07.00080.jsonl', 'news_events_2025_07_07.00103.jsonl', 'news_events_2025_07_07.00078.jsonl', 'news_events_2025_07_07.00085.jsonl', 'news_events_2025_07_07.00000.jsonl', 'news_events_2025_07_07.00104.jsonl', 'news_events_2025_07_07.00002.jsonl', 'news_events_2025_07_07.00083.jsonl', 'news_events_2025_07_07.00102.jsonl', 'news_events_2025_07_07.00098.jsonl', 'news_events_2025_07_07.00058.jsonl', 'news_events_2025_07_07.00004.jsonl', 'news_events_2025_07_07.00076.jsonl', 'news_events_2025_07_07.00052.jsonl', 'news_events_2025_07_07.00050.jsonl', 'news_events_2025_07_07.00090.jsonl', 'news_events_2025_07_07.00077.jsonl', 'news_events_2025_07_07.00053.jsonl', 'news_events_2025_07_07.00093.jsonl']


In [33]:
dfs = {}
for f in all_files:
    file_path = os.path.join(data_folder, f)
    with open(file_path, 'r', encoding='utf-8') as file:
        data = [json.loads(line) for line in file]
    df = pd.json_normalize(data)
    dfs[f] = df.astype(str).drop_duplicates()  # deduplicate safely

print(f"Loaded {len(dfs)} datasets.")


Loaded 24 datasets.


In [34]:
# -----------------------------
# Profiling function
# -----------------------------
profiling_summary_list = []

for file_name, df in dfs.items():
    profile = {}
    profile['file'] = file_name
    profile['rows'] = df.shape[0]
    profile['cols'] = df.shape[1]
    
    # -------------------------
    # Sample records
    # -------------------------
    profile['sample_records'] = df.head(5).to_dict(orient='records')
    
    # -------------------------
    # Missing values
    # -------------------------
    missing_per_col = df.isnull().sum()
    profile['total_missing'] = missing_per_col.sum()
    profile['missing_per_column'] = missing_per_col.to_dict()
    
    # -------------------------
    # Unique values
    # -------------------------
    unique_per_col = df.nunique()
    profile['unique_per_column'] = unique_per_col.to_dict()
    
    # -------------------------
    # Numeric stats
    # -------------------------
    numeric_cols = df.select_dtypes(include=['int64','float64']).columns
    if len(numeric_cols) > 0:
        stats = df[numeric_cols].describe().to_dict()
        profile['numeric_stats'] = stats
        
        # Outlier detection using IQR
        outliers = {}
        for col in numeric_cols:
            Q1 = df[col].quantile(0.25)
            Q3 = df[col].quantile(0.75)
            IQR = Q3 - Q1
            outliers[col] = df[(df[col] < Q1 - 1.5*IQR) | (df[col] > Q3 + 1.5*IQR)][col].count()
        profile['numeric_outliers_count'] = outliers
    
    # -------------------------
    # Text length stats
    # -------------------------
    text_cols = df.select_dtypes(include='object').columns
    text_length_stats = {}
    categorical_anomalies = {}
    for col in text_cols:
        lengths = df[col].dropna().apply(lambda x: len(str(x)))
        text_length_stats[col] = {
            'min_length': lengths.min(),
            'max_length': lengths.max(),
            'mean_length': lengths.mean(),
            'std_length': lengths.std()
        }
        
        # Detect anomalies: inconsistent capitalization or likely typos
        value_counts = df[col].value_counts()
        if len(value_counts) > 0:
            anomalies = [v for v in value_counts.index if sum(1 for c in v if c.isupper()) > 3 or len(v) > 100]
            if anomalies:
                categorical_anomalies[col] = anomalies[:10]  # show top 10
    profile['text_length_stats'] = text_length_stats
    profile['categorical_anomalies'] = categorical_anomalies
    
    profiling_summary_list.append(profile)
    
    # -------------------------
    # Plots
    # -------------------------
    for col in df.columns[:2]:  # first 2 columns only
        plt.figure(figsize=(8,5))
        if df[col].dtype in ['int64','float64']:
            sns.histplot(df[col], bins=30, kde=True)
        else:
            top_counts = df[col].value_counts().nlargest(10)
            top_labels = [str(x)[:50] + '...' if len(str(x)) > 50 else str(x) for x in top_counts.index]
            sns.barplot(x=top_counts.values, y=top_labels)
        
        title_text = f"{file_name} - {col} Distribution"
        title_wrapped = "\n".join(textwrap.wrap(title_text, width=70))
        plt.title(title_wrapped, usetex=False)
        try:
            plt.tight_layout()
        except:
            plt.subplots_adjust(left=0.2, right=0.9, top=0.9, bottom=0.2)
        
        safe_file_name = f"{file_name}_{col}_dist.png".replace('/', '_').replace('\\', '_')
        plt.savefig(os.path.join(profile_folder, safe_file_name))
        plt.close()

# -----------------------------
# Consolidate profiling into Excel
# -----------------------------
try:
    import xlsxwriter
    report_path = os.path.join(profile_folder, "Data_Profiling_Report.xlsx")
    
    # Convert each profile into a sheet
    with pd.ExcelWriter(report_path, engine='xlsxwriter') as writer:
        for p in profiling_summary_list:
            df_sheet = pd.DataFrame.from_dict({
                'column': list(p['missing_per_column'].keys()),
                'missing_count': list(p['missing_per_column'].values()),
                'unique_count': [p['unique_per_column'][c] for c in p['missing_per_column'].keys()]
            })
            df_sheet.to_excel(writer, sheet_name=p['file'][:31], index=False)
    print(f"Profiling report saved: {report_path}")
except ModuleNotFoundError:
    print("Please install xlsxwriter: pip install xlsxwriter")


Profiling report saved: ../docs/phase2_profiles/Data_Profiling_Report.xlsx


In [41]:
with pd.ExcelWriter(report_path, engine='xlsxwriter') as writer:
    for sheet_name, df_sheet in excel_sheets.items():
        df_sheet.to_excel(writer, sheet_name=sheet_name[:31], index=False)


In [42]:
import os
print(os.path.exists(report_path))  # Should return True if the file exists

True


In [81]:
dq_df = pd.DataFrame(profiling_summary_list)
#print(dq_df.info())
print(dq_df.describe(include="all"))

                                      file          rows  cols  \
count                                   24     24.000000  24.0   
unique                                  24           NaN   NaN   
top     news_events_2025_07_07.00086.jsonl           NaN   NaN   
freq                                     1           NaN   NaN   
mean                                   NaN  25853.583333   2.0   
std                                    NaN    386.277431   0.0   
min                                    NaN  25374.000000   2.0   
25%                                    NaN  25537.000000   2.0   
50%                                    NaN  25761.500000   2.0   
75%                                    NaN  26140.750000   2.0   
max                                    NaN  26837.000000   2.0   

                                           sample_records  total_missing  \
count                                                  24           24.0   
unique                                                 

In [43]:
print(len(profiling_summary_list))  # should match number of files
print(profiling_summary_list[0].keys())  # should include rows, cols, missing_per_column, unique_per_column, etc.

24
dict_keys(['file', 'rows', 'cols', 'sample_records', 'total_missing', 'missing_per_column', 'unique_per_column', 'text_length_stats', 'categorical_anomalies'])


In [44]:
os.listdir(profile_folder)

['news_events_2025_07_07.00058.jsonl_data_dist.png',
 'news_events_2025_07_07.00001.jsonl_included_dist.png',
 'news_events_2025_07_07.00052.jsonl_included_dist.png',
 'news_events_2025_07_07.00005.jsonl_included_dist.png',
 'news_events_2025_07_07.00085.jsonl_included_dist.png',
 'news_events_2025_07_07.00103.jsonl_included_dist.png',
 'news_events_2025_07_07.00086.jsonl_included_dist.png',
 'news_events_2025_07_07.00002.jsonl_data_dist.png',
 'news_events_2025_07_07.00053.jsonl_data_dist.png',
 'news_events_2025_07_07.00003.jsonl_data_dist.png',
 'news_events_2025_07_07.00052.jsonl_data_dist.png',
 'news_events_2025_07_07.00000.jsonl_data_dist.png',
 'news_events_2025_07_07.00077.jsonl_included_dist.png',
 'news_events_2025_07_07.00050.jsonl_data_dist.png',
 'news_events_2025_07_07.00001.jsonl_data_dist.png',
 'news_events_2025_07_07.00078.jsonl_included_dist.png',
 'news_events_2025_07_07.00104.jsonl_included_dist.png',
 'news_events_2025_07_07.00005.jsonl_data_dist.png',
 'news_eve

In [45]:
report_path = os.path.join(profile_folder, "Phase2_Profiling_Summary.xlsx")
print(os.path.exists(report_path))  # should be True
pd.read_excel(report_path).head()


True


Unnamed: 0,Column,Missing Values,Unique Values,Min Text Length,Max Text Length,Mean Text Length,Std Text Length,Plot Path
0,data,0,26131,1059,2122,1356.621675,115.384072,../docs/phase2_profiles/news_events_2025_07_07...
1,included,0,25525,535,67162,2741.469098,2259.563537,../docs/phase2_profiles/news_events_2025_07_07...


In [47]:
print(list(dfs.keys()))

['news_events_2025_07_07.00086.jsonl', 'news_events_2025_07_07.00003.jsonl', 'news_events_2025_07_07.00001.jsonl', 'news_events_2025_07_07.00084.jsonl', 'news_events_2025_07_07.00005.jsonl', 'news_events_2025_07_07.00080.jsonl', 'news_events_2025_07_07.00103.jsonl', 'news_events_2025_07_07.00078.jsonl', 'news_events_2025_07_07.00085.jsonl', 'news_events_2025_07_07.00000.jsonl', 'news_events_2025_07_07.00104.jsonl', 'news_events_2025_07_07.00002.jsonl', 'news_events_2025_07_07.00083.jsonl', 'news_events_2025_07_07.00102.jsonl', 'news_events_2025_07_07.00098.jsonl', 'news_events_2025_07_07.00058.jsonl', 'news_events_2025_07_07.00004.jsonl', 'news_events_2025_07_07.00076.jsonl', 'news_events_2025_07_07.00052.jsonl', 'news_events_2025_07_07.00050.jsonl', 'news_events_2025_07_07.00090.jsonl', 'news_events_2025_07_07.00077.jsonl', 'news_events_2025_07_07.00053.jsonl', 'news_events_2025_07_07.00093.jsonl']


In [48]:
import os
print(os.listdir(data_folder))

['news_events_2025_07_07.00086.jsonl', 'news_events_2025_07_07.00003.jsonl', 'news_events_2025_07_07.00001.jsonl', 'news_events_2025_07_07.00084.jsonl', 'news_events_2025_07_07.00005.jsonl', 'news_events_2025_07_07.00080.jsonl', 'news_events_2025_07_07.00103.jsonl', 'news_events_2025_07_07.00078.jsonl', 'news_events_2025_07_07.00085.jsonl', 'news_events_2025_07_07.00000.jsonl', 'news_events_2025_07_07.00104.jsonl', 'news_events_2025_07_07.00002.jsonl', 'news_events_2025_07_07.00083.jsonl', 'news_events_2025_07_07.00102.jsonl', 'dataset_summary.ipynb', 'news_events_2025_07_07.00098.jsonl', 'news_events_2025_07_07.00058.jsonl', 'news_events_2025_07_07.00004.jsonl', 'news_events_2025_07_07.00076.jsonl', 'news_events_2025_07_07.00052.jsonl', 'news_events_2025_07_07.00050.jsonl', 'news_events_2025_07_07.00090.jsonl', '.ipynb_checkpoints', 'news_events_2025_07_07.00077.jsonl', 'news_events_2025_07_07.00053.jsonl', 'news_events_2025_07_07.00093.jsonl']


In [49]:
print(list(dfs.keys()))

['news_events_2025_07_07.00086.jsonl', 'news_events_2025_07_07.00003.jsonl', 'news_events_2025_07_07.00001.jsonl', 'news_events_2025_07_07.00084.jsonl', 'news_events_2025_07_07.00005.jsonl', 'news_events_2025_07_07.00080.jsonl', 'news_events_2025_07_07.00103.jsonl', 'news_events_2025_07_07.00078.jsonl', 'news_events_2025_07_07.00085.jsonl', 'news_events_2025_07_07.00000.jsonl', 'news_events_2025_07_07.00104.jsonl', 'news_events_2025_07_07.00002.jsonl', 'news_events_2025_07_07.00083.jsonl', 'news_events_2025_07_07.00102.jsonl', 'news_events_2025_07_07.00098.jsonl', 'news_events_2025_07_07.00058.jsonl', 'news_events_2025_07_07.00004.jsonl', 'news_events_2025_07_07.00076.jsonl', 'news_events_2025_07_07.00052.jsonl', 'news_events_2025_07_07.00050.jsonl', 'news_events_2025_07_07.00090.jsonl', 'news_events_2025_07_07.00077.jsonl', 'news_events_2025_07_07.00053.jsonl', 'news_events_2025_07_07.00093.jsonl']
