In [6]:
# 02_preprocessing_checks.ipynb

# Markdown Cell
# # 02 Preprocessing & Data Checks
# Clean the raw reviews for analysis and ensure data quality.

# Code Cell
import pandas as pd
import sys
import re

# Add project path to import config
sys.path.append(r"E:\10 academy\weak2\fintech-review-analytics")
from config.config import RAW_CSV_PATH, CLEAN_CSV_PATH

# Load raw reviews
df = pd.read_csv(RAW_CSV_PATH)

# Markdown Cell
# ## Remove Duplicates

# Code Cell
print(f"Original rows: {len(df)}")
df = df.drop_duplicates(subset=['review'])
print(f"After removing duplicates: {len(df)}")

# Markdown Cell
# ## Handle Missing Data

# Code Cell
# Drop rows with missing review text or bank
df = df.dropna(subset=['review', 'bank'])
print(f"After dropping missing values: {len(df)}")

# Markdown Cell
# ## Clean Review Text

# Code Cell
def clean_text(text):
    text = str(text)
    text = text.lower()
    text = re.sub(r"http\S+|www\S+|https\S+", '', text)  # remove URLs
    text = re.sub(r"[^a-zA-Z0-9\s]", '', text)  # remove punctuation
    text = re.sub(r"\s+", ' ', text).strip()  # remove extra spaces
    return text

df['cleaned'] = df['review'].apply(clean_text)

# Check first rows
df[['review','cleaned']].head()

# Markdown Cell
# ## Normalize Dates

# Code Cell
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df.dropna(subset=['date'])  # drop rows with invalid dates
df['date'] = df['date'].dt.date

# Markdown Cell
# ## Save Cleaned CSV

# Code Cell
df.to_csv(CLEAN_CSV_PATH, index=False)
print(f"Saved cleaned reviews to {CLEAN_CSV_PATH}")

# Markdown Cell
# ## Quick Validation

# Code Cell
print(df.info())
print(df.describe())


Original rows: 1266
After removing duplicates: 1210
After dropping missing values: 1210
Saved cleaned reviews to E:\10 academy\weak2\fintech-review-analytics\data\processed\reviews_clean.csv
<class 'pandas.core.frame.DataFrame'>
Index: 1210 entries, 0 to 1265
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   bank     1210 non-null   object
 1   review   1210 non-null   object
 2   rating   1210 non-null   int64 
 3   date     1210 non-null   object
 4   source   1210 non-null   object
 5   cleaned  1210 non-null   object
dtypes: int64(1), object(5)
memory usage: 66.2+ KB
None
            rating
count  1210.000000
mean      3.638843
std       1.745886
min       1.000000
25%       1.000000
50%       5.000000
75%       5.000000
max       5.000000
