# Data Collection and Preprocessing
This notebook combines **data scraping** and **data preprocessing** steps.  

In [None]:
import sys
from pathlib import Path
import importlib.util
import matplotlib.pyplot as plt
import seaborn as sns

## Locate Project Root and `src` Directory
cwd = Path.cwd()
if cwd.name != "notebooks":
    # Not inside a "notebooks" folder: use the closest ancestor that contains
    # a "src" directory, or stay on the working directory if none does.
    project_root = next(
        (ancestor for ancestor in cwd.parents if ancestor.joinpath("src").exists()),
        cwd,
    )
else:
    # Working directory is named "notebooks": its parent is taken as the root.
    project_root = cwd.parent
SRC_DIR = project_root.joinpath("src")
print(f"Project root: {project_root}")
print(f"Adding src to sys.path: {SRC_DIR}")
# Inserted at the front so modules under src are found before anything else.
sys.path.insert(0, str(SRC_DIR))

# Validate `preprocessor.py` and Import It

# Import the preprocessing module stored at src/preprocessor.py and expose it
# as `preprocess_mod`.
preprocess_file = SRC_DIR / "preprocessor.py"
if not preprocess_file.exists():
    raise FileNotFoundError(f"Expected file not found: {preprocess_file}")
# The import name has to match the file stem ("preprocessor"); the previous
# value "preprocess" could never resolve to this file through a normal import.
module_name = "preprocessor"
try:
    preprocess_mod = importlib.import_module(module_name)
    print(f"Imported module '{module_name}' normally.")
except ImportError as e:
    # Fallback: load the module straight from its file path.
    print(f"Normal import failed ({e}). Loading from file.")
    spec = importlib.util.spec_from_file_location(module_name, str(preprocess_file))
    if spec is None or spec.loader is None:
        raise ImportError(f"Cannot create an import spec for {preprocess_file}") from e
    preprocess_mod = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(preprocess_mod)
    # Register only after successful execution so later imports reuse it.
    sys.modules[module_name] = preprocess_mod
    print(f"Loaded module '{module_name}' from {preprocess_file}")

## Load the `ReviewPreprocessor` Class

# Expose the class under a notebook-level name; fail early with a message that
# names the actual source file (preprocessor.py) when the class is missing.
if not hasattr(preprocess_mod, "ReviewPreprocessor"):
    raise ImportError("'ReviewPreprocessor' not found in preprocessor.py. Check class name inside the file.")
ReviewPreprocessor = preprocess_mod.ReviewPreprocessor
print("ReviewPreprocessor is ready to use.")

# Import Scraper and call `run_scraper`
# Import src/scraper.py (if present) and pick its entry point.
# `run_scraper` stays None when the file or a suitable callable is missing.
scrap_file = SRC_DIR / "scraper.py"
run_scraper = None

if scrap_file.exists():
    try:
        scrap_mod = importlib.import_module("scraper")
    except ImportError as e:
        # Report the failure instead of hiding it, then load from the file path.
        print(f"Normal import of 'scraper' failed ({e}). Loading from file.")
        spec = importlib.util.spec_from_file_location("scraper", str(scrap_file))
        if spec is None or spec.loader is None:
            raise ImportError(f"Cannot create an import spec for {scrap_file}") from e
        scrap_mod = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(scrap_mod)
        sys.modules["scraper"] = scrap_mod

    # Prefer `run_scraper`, then `main`; ignore attributes that are not callable.
    run_scraper = next(
        (
            entry_point
            for entry_point in (
                getattr(scrap_mod, "run_scraper", None),
                getattr(scrap_mod, "main", None),
            )
            if callable(entry_point)
        ),
        None,
    )
    if run_scraper is None:
        print("No callable 'run_scraper' or 'main' found in scraper.py. Define one to run the scraper.")
else:
    print(f"Warning: expected scraper file not found: {scrap_file}")

# Run Scraper (If Available)
# Run the scraper when an entry point was resolved and preview its result.
if run_scraper is not None:
    raw_df = run_scraper()
    # `display` is provided by the notebook environment.
    # NOTE(review): assumes the scraper returns an object with `.head()`
    # (presumably a pandas DataFrame) - confirm against src/scraper.py.
    display(raw_df.head())
else:
    print("run_scraper is not available. Check src/scraper.py for a callable 'main' or 'run_scraper'.")

## Run Preprocessor
# Build the preprocessor and run it; `process()` is treated as returning a
# truthy value on success.
preprocessor = ReviewPreprocessor()
success = preprocessor.process()

if not success:
    print("Preprocessing failed.")
else:
    # The processed data is read from the `df` attribute of the preprocessor.
    df = preprocessor.df
    print(f"Preprocessing finished. Rows: {len(df)}")


# Basic Visualizations 
Explore the cleaned dataset using plots:
- **Review Length Distribution by Bank**
- **Rating Distribution by Bank** 
- **Correlation Heatmap grouped by Bank** 


In [None]:

import seaborn as sns
import matplotlib.pyplot as plt

# `set_theme` is the preferred name; `sns.set` is only kept as an alias.
sns.set_theme(style="whitegrid")

# 1) Review Length Distribution by Bank
# Guard the required columns so a missing one is reported with a message
# (as the rating plot does) instead of failing inside seaborn.
missing_hist_columns = [
    column for column in ("text_length", "bank_code") if column not in df.columns
]
if missing_hist_columns:
    print(f"Column(s) {missing_hist_columns} not found. Skipping review length visualization.")
else:
    plt.figure(figsize=(12, 6))
    sns.histplot(data=df, x='text_length', bins=50, kde=True, hue='bank_code', palette='Set2')
    plt.title('Distribution of Review Lengths by Bank')
    plt.xlabel('Review Length (characters)')
    plt.ylabel('Count')
    plt.tight_layout()
    plt.show()

# 2) Rating Distribution by Bank
# One bar per rating value, split by bank; skipped when there is no rating column.
if 'rating' not in df.columns:
    print("Column 'rating' not found. Skipping rating distribution visualization.")
else:
    plt.figure(figsize=(12, 6))
    sns.countplot(x='rating', hue='bank_code', palette='Set2', data=df)
    plt.title('Rating Distribution by Bank')
    plt.xlabel('Rating')
    plt.ylabel('Count')
    plt.tight_layout()
    plt.show()

# 3) Correlation Heatmap grouped by Bank
# Draws one heatmap of pairwise correlations between numeric columns per bank.
# `bank_code` itself is left out: within a single bank it is constant, so it
# would only add a NaN row/column to the matrix.
numeric_cols = (
    df.select_dtypes(include=['int64', 'float64'])
    .columns
    .drop('bank_code', errors='ignore')
)

if len(numeric_cols) > 1 and 'bank_code' in df.columns:
    # Missing bank codes never match an equality filter, so drop them up front.
    unique_banks = df['bank_code'].dropna().unique()
    for bank in unique_banks:
        subset = df[df['bank_code'] == bank]
        if len(subset) < 2:
            # A correlation needs at least two observations; otherwise the
            # matrix is all NaN and the heatmap carries no information.
            print(f"Skipping correlation heatmap for bank {bank}: fewer than two rows.")
            continue
        corr = subset[numeric_cols].corr()

        plt.figure(figsize=(10, 6))
        sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f')
        plt.title(f'Correlation Heatmap for Bank: {bank}')
        plt.tight_layout()
        plt.show()
else:
    print("Not enough numeric columns or 'bank_code' missing for correlation heatmap.")

