In [0]:
# Render matplotlib figures inline in the cell output.
%matplotlib inline
# Load the autoreload extension and enable mode 2, so edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [0]:
# !uv sync

In [0]:
# Read the transactions CSV (first row is the header, column types inferred)
# and discard the "Unnamed: 0" column in the same expression.
# NOTE(review): "Unnamed: 0" is presumably a leftover pandas index column - confirm.
df = spark.read.csv(
    "/Volumes/aidetic_databricks/default/credit_card_transactions/credit_card_transactions.csv",
    header=True,
    inferSchema=True,
).drop("Unnamed: 0")

# Display the data
display(df)

In [0]:
from backend.core.utils import process_col_names

# NOTE(review): presumably normalises the column names - confirm in
# backend.core.utils.
df = process_col_names(df)

# count() runs a Spark job over the data; the column total is read from the
# DataFrame's column list.
n_rows = df.count()
n_cols = len(df.columns)
print(f"Dataset shape: {n_rows:,} rows x {n_cols} columns")
df.printSchema()

---
## 2. 📋 Data Quality Checking (NEW)

The `DataQualityChecker` performs PySpark-native quality analysis without converting to Pandas,
making it suitable for very large datasets.

In [0]:
from backend.core.profiling.data_quality import DataQualityChecker

# Run data quality checks once; `quality_checker` and `quality_report` are
# reused by the cells that follow.
quality_checker = DataQualityChecker(df)
quality_report = quality_checker.run_all_checks()

banner = "=" * 60
print(banner)
print("DATA QUALITY REPORT")
print(banner)
print(f"\n📊 Quality Score: {quality_report.quality_score}/100")
print(f"📈 Row Count: {quality_report.row_count:,}")
print(f"📋 Column Count: {quality_report.column_count}")
print(f"🔄 Duplicate Rows: {quality_report.duplicate_count:,}")

In [0]:
# Display quality issues (at most the first 10, to keep the output short)
SEVERITY_ICONS = {"high": "🔴", "medium": "🟡"}

print("\n⚠️ DATA QUALITY ISSUES:")
print("-" * 40)
if quality_report.issues:
    for issue in quality_report.issues[:10]:
        severity = issue['severity']
        # Any severity other than high/medium is shown with the green marker.
        severity_icon = SEVERITY_ICONS.get(severity, "🟢")
        print(f"{severity_icon} [{severity.upper()}] {issue['column']}: {issue['issue']}")
else:
    print("✅ No major quality issues detected!")

In [0]:
# Display recommendations (at most the first 10, to keep the output short)
PRIORITY_ICONS = {"high": "🔴", "medium": "🟡"}

print("\n💡 PREPROCESSING RECOMMENDATIONS:")
print("-" * 40)
if quality_report.recommendations:
    for rec in quality_report.recommendations[:10]:
        priority = rec['priority']
        # Any priority other than high/medium is shown with the green marker.
        priority_icon = PRIORITY_ICONS.get(priority, "🟢")
        print(f"{priority_icon} [{priority.upper()}] {rec['column']}: {rec['action']}")
else:
    print("✅ No preprocessing recommendations needed!")

In [0]:
# Detect outliers
print("\n📊 OUTLIER DETECTION (IQR Method):")
print("-" * 40)
# NOTE(review): threshold=1.5 is presumably the IQR multiplier used for the
# lower/upper bounds - confirm against DataQualityChecker.detect_outliers.
outliers = quality_checker.detect_outliers(method='iqr', threshold=1.5)

# Report only the columns that actually contain outliers. The loop variable is
# named `column_name` so it cannot shadow a `col` imported from
# pyspark.sql.functions.
for column_name, stats in outliers.items():
    if stats['outlier_pct'] > 0:
        print(f"  {column_name}: {stats['outlier_count']:,} outliers ({stats['outlier_pct']:.2f}%)")
        print(f"    Bounds: [{stats['lower_bound']:.2f}, {stats['upper_bound']:.2f}]")

---
## 3. 📊 YData Profiling (NEW)

The `DataProfiler` generates comprehensive data profiles using ydata-profiling,
with automatic sampling for large datasets.

In [0]:
from ydata_profiling import ProfileReport

# NOTE(review): toPandas() collects the entire Spark DataFrame onto the driver;
# confirm the dataset fits in driver memory before running this cell.
pandas_df = df.toPandas()
profile = ProfileReport(pandas_df, title="Profiling Report")

# The report is written relative to the current working directory.
profile.to_file("data_profiling_report.html")

In [0]:
# Import from the public IPython.display module. The import is aliased so the
# notebook's built-in `display` (used earlier on the Spark DataFrame) is not
# rebound to IPython's version.
from IPython.display import HTML, display as ipy_display

REPORT_HTML_PATH = '/Workspace/Users/yadvendra@aidetic.in/spark_beyond/backend/notebooks/data_profiling_report.html'

# HTML() treats a positional string as raw markup, so the path has to be passed
# via `filename=` for the report file's contents to be loaded and rendered.
ipy_display(HTML(filename=REPORT_HTML_PATH))

In [0]:
from backend.core.profiling.ydata_profiler import DataProfiler, quick_profile

# Quick profile (faster, minimal report)
print("Generating quick profile...")
quick_stats = quick_profile(df, max_rows=10000)

print("\n📈 QUICK PROFILE SUMMARY:")
print("-" * 40)
summary = quick_stats['summary']
# Each metric falls back to 0 when its key is absent from the summary.
print(f"  Rows: {summary.get('n_rows', 0):,}")
print(f"  Columns: {summary.get('n_columns', 0)}")
print(f"  Missing Cells: {summary.get('missing_cells_pct', 0):.2f}%")
print(f"  Duplicate Rows: {summary.get('duplicate_rows_pct', 0):.2f}%")

In [0]:
# Display alerts (at most the first 10, to keep the output short)
print("\n⚠️ DATA ALERTS:")
print("-" * 40)
if quick_stats['alerts']:
    for alert in quick_stats['alerts'][:10]:
        print(f"  - {alert['column']}: {alert['type']}")
else:
    print("  ✅ No alerts!")

In [0]:
# Display profiling recommendations (at most the first 10)
PROFILE_PRIORITY_ICONS = {"high": "🔴", "medium": "🟡"}

print("\n💡 PROFILING RECOMMENDATIONS:")
print("-" * 40)
if quick_stats['recommendations']:
    for rec in quick_stats['recommendations'][:10]:
        # Any priority other than high/medium is shown with the green marker.
        priority_icon = PROFILE_PRIORITY_ICONS.get(rec['priority'], "🟢")
        print(f"{priority_icon} {rec['column']}: {rec['action']}")
else:
    # Say so explicitly instead of leaving a bare header, matching the other
    # report cells in this notebook.
    print("  ✅ No profiling recommendations!")

---
## 4. 🎯 Problem Definition & Schema Validation

In [0]:
# drop_column = ["lat", "long", "zip"]



In [0]:
from backend.core.discovery import Problem, SchemaChecks

# Define the ML problem: classify `is_fraud`, where 1 is the desired result,
# with `trans_date_trans_time` as the date column.
problem = Problem(target="is_fraud", type="classification", desired_result=1, date_column="trans_date_trans_time")

# Echo the configured problem, one field per line.
print(
    f"Problem Type: {problem.type}",
    f"Target Column: {problem.target}",
    f"Desired Result: {problem.desired_result}",
    sep="\n",
)

In [0]:
# Validate schema against the problem definition; `schema_checker` is reused
# by the time-series detection cell.
schema_checker = SchemaChecks(dataframe=df, problem=problem)
schema_info = schema_checker.check()

# Summarise how many columns fall into each group returned by check().
print("\n📋 SCHEMA SUMMARY:")
print(f"  Categorical columns: {len(schema_info['categorical'])}")
print(f"  Numerical columns: {len(schema_info['numerical'])}")
print(f"  Boolean columns: {len(schema_info['boolean'])}")

---
## 5. ⏱️ Time Series Detection (NEW)

The `detect_time_series_structure` function automatically identifies temporal patterns
and recommends appropriate time-series features.

In [0]:
from backend.core.utils.time_series_detector import detect_time_series_structure, TimeSeriesFrequency

# Detect time-series structure
ts_info = detect_time_series_structure(df, schema_checker)

print("\n⏱️ TIME SERIES DETECTION RESULTS:")
print("-" * 40)
print(f"  Is Time Series: {ts_info.is_time_series}")
# Falsy fields (None / empty) are rendered as 'N/A'.
print(f"  Time Column: {ts_info.time_column or 'N/A'}")
print(f"  Frequency: {ts_info.frequency.value if ts_info.frequency else 'N/A'}")
print(f"  Entity Columns: {ts_info.entity_columns or 'N/A'}")

# The two optional sections below are printed only when they have entries.
if ts_info.warnings:
    print("\n⚠️ Warnings:")
    for warning in ts_info.warnings:
        print(f"    - {warning}")

if ts_info.recommended_features:
    print("\n💡 Recommended Time-Series Features:")
    for feature in ts_info.recommended_features:
        print(f"    - {feature}")