In [3]:
import pandas as pd
import numpy as np
from scipy.stats import chisquare
# Folder holding the crawler outputs.
# NOTE(review): this is an absolute, machine-specific Windows path; the notebook
# will not run elsewhere until it is pointed at a local copy of the data.
BASE_DIR = r"D:\1909term-thesis\thesis\ReviewsCrawler"

# Clause-level table with one predicted domain label per clause.
df = pd.read_csv("\\".join([BASE_DIR, "clauses_pred_label.csv"]))

In [7]:
# Preview the first 20 clause-level rows of `df`.
# NOTE(review): this cell carries execution count 7 while the review_id cell
# further down carries 5, so the saved output was produced out of order;
# re-run with Restart & Run All before trusting the displayed columns.
df.head(20)

Unnamed: 0,gender,category,file_name,rateDate,review_text,rateDate_parsed,review_length,clause,domain_pred,review_id
0,UNIQLO_female,UNIQLO_blouse_female,blouse_611817827239.csv,2021-01-08 07:59:09,一直喜欢这件衣服，一直不降价，终于逮住了，特卖会，79买的，真香，留着开春穿，优衣库的衣服真...,2021-01-08 07:59:09,65,一直喜欢这件衣服,Other,0
1,UNIQLO_female,UNIQLO_blouse_female,blouse_611817827239.csv,2021-01-08 07:59:09,一直喜欢这件衣服，一直不降价，终于逮住了，特卖会，79买的，真香，留着开春穿，优衣库的衣服真...,2021-01-08 07:59:09,65,一直不降价,Other,0
2,UNIQLO_female,UNIQLO_blouse_female,blouse_611817827239.csv,2021-01-08 07:59:09,一直喜欢这件衣服，一直不降价，终于逮住了，特卖会，79买的，真香，留着开春穿，优衣库的衣服真...,2021-01-08 07:59:09,65,终于逮住了,Other,0
3,UNIQLO_female,UNIQLO_blouse_female,blouse_611817827239.csv,2021-01-08 07:59:09,一直喜欢这件衣服，一直不降价，终于逮住了，特卖会，79买的，真香，留着开春穿，优衣库的衣服真...,2021-01-08 07:59:09,65,特卖会,Other,0
4,UNIQLO_female,UNIQLO_blouse_female,blouse_611817827239.csv,2021-01-08 07:59:09,一直喜欢这件衣服，一直不降价，终于逮住了，特卖会，79买的，真香，留着开春穿，优衣库的衣服真...,2021-01-08 07:59:09,65,79买的,Other,0
5,UNIQLO_female,UNIQLO_blouse_female,blouse_611817827239.csv,2021-01-08 07:59:09,一直喜欢这件衣服，一直不降价，终于逮住了，特卖会，79买的，真香，留着开春穿，优衣库的衣服真...,2021-01-08 07:59:09,65,真香,Other,0
6,UNIQLO_female,UNIQLO_blouse_female,blouse_611817827239.csv,2021-01-08 07:59:09,一直喜欢这件衣服，一直不降价，终于逮住了，特卖会，79买的，真香，留着开春穿，优衣库的衣服真...,2021-01-08 07:59:09,65,留着开春穿,Other,0
7,UNIQLO_female,UNIQLO_blouse_female,blouse_611817827239.csv,2021-01-08 07:59:09,一直喜欢这件衣服，一直不降价，终于逮住了，特卖会，79买的，真香，留着开春穿，优衣库的衣服真...,2021-01-08 07:59:09,65,优衣库的衣服真心不错,Other,0
8,UNIQLO_female,UNIQLO_blouse_female,blouse_611817827239.csv,2021-01-08 07:59:09,一直喜欢这件衣服，一直不降价，终于逮住了，特卖会，79买的，真香，留着开春穿，优衣库的衣服真...,2021-01-08 07:59:09,65,值得够买呀,Value Perception,0
9,UNIQLO_female,UNIQLO_blouse_female,blouse_611817827239.csv,2021-01-08 07:59:09,一直喜欢这件衣服，一直不降价，终于逮住了，特卖会，79买的，真香，留着开春穿，优衣库的衣服真...,2021-01-08 07:59:09,65,浪里个浪,Other,0


In [5]:
# ============================================================
# 1. Create a review_id
# ============================================================
# Composite text key per clause row: source file, parsed timestamp and full
# review text, joined with "||". Rows sharing a key belong to the same review.
_review_key = (
    df["file_name"].astype(str)
    + "||"
    + df["rateDate_parsed"].astype(str)
    + "||"
    + df["review_text"].astype(str)
)

# Integer codes are assigned in order of first appearance of each distinct key.
df["review_id"] = pd.factorize(_review_key)[0]

# Product domains tracked by the indices below; every other clause is "Other".
DOMAINS = [
    "Product Quality",
    "Aesthetic Attributes",
    "Functionality",
    "Size & Fit",
    "Value Perception",
]
OTHER_LABEL = "Other"

In [13]:
# ============================================================
# 2. Remove reviews that contain only 'other' domains
# ============================================================

# Determine domain set for each review
# One row per review with the set of domain labels predicted for its clauses.
review_domain_set = (
    df.groupby("review_id")["domain_pred"]
      .apply(set)
      .reset_index(name="domain_set")
)

_label_sets = review_domain_set["domain_set"]

# A review is "only other" when its label set has nothing outside {OTHER_LABEL}.
_only_other_mask = _label_sets.apply(lambda labels: labels <= {OTHER_LABEL})
reviews_only_other = review_domain_set.loc[_only_other_mask, "review_id"]
num_reviews_only_other = len(reviews_only_other)

# A review is kept when at least one label differs from OTHER_LABEL.
_has_product_mask = _label_sets.apply(lambda labels: len(labels - {OTHER_LABEL}) > 0)
valid_review_ids = review_domain_set.loc[_has_product_mask, "review_id"]

# Keep every clause (including 'Other' clauses) of the retained reviews.
df_filtered = df.loc[df["review_id"].isin(valid_review_ids)].copy()

# ------------------------------------------------------------
# Summary of what the filter removed
# ------------------------------------------------------------
_n_clauses_before = len(df)
_n_clauses_after = len(df_filtered)
_n_reviews_before = df["review_id"].nunique()
_n_reviews_after = df_filtered["review_id"].nunique()

print("=== Filtering Summary ===")
print(f"Total clauses before filtering reviews-only-'other': {_n_clauses_before}")
print(f"Total clauses after filtering: {_n_clauses_after}")

print(f"Total reviews before filtering: {_n_reviews_before}")
print(f"Reviews containing ONLY 'other' domain: {num_reviews_only_other}")
print(f"Total reviews retained (at least one product domain): {_n_reviews_after}")

=== Filtering Summary ===
Total clauses before filtering reviews-only-'other': 135252
Total clauses after filtering: 112204
Total reviews before filtering: 42107
Reviews containing ONLY 'other' domain: 9830
Total reviews retained (at least one product domain): 32277


In [14]:
# ============================================================
# 3. Data Summary
# ============================================================

# Clause counts per predicted domain (including 'Other'), ordered by label.
domain_counts = df_filtered["domain_pred"].value_counts().sort_index()

# Share of all retained clauses per domain, rounded to 4 decimals for display.
domain_proportions = domain_counts.div(domain_counts.sum()).round(4)

domain_summary = domain_counts.to_frame("count").assign(proportion=domain_proportions)

print("\nDomain Summary Table:")
print(domain_summary)


Domain Summary Table:
                      count  proportion
domain_pred                            
Aesthetic Attributes  14997      0.1337
Functionality          4446      0.0396
Other                 49039      0.4371
Product Quality       23146      0.2063
Size & Fit            15227      0.1357
Value Perception       5349      0.0477


In [16]:
# ============================================================
# 4. Construct Coverage Index (CCI)
# ============================================================
# Clauses of retained reviews whose predicted label is not OTHER_LABEL.
df_non_other = df_filtered.loc[df_filtered["domain_pred"].ne(OTHER_LABEL)].copy()

# --- 4.1 OCR-derived domain proportions (excluding 'Other') ---
def compute_ocr_proportions(df_non_other, domains):
    """
    Count clauses per domain and convert the counts to shares.

    Parameters
    ----------
    df_non_other : DataFrame with a "domain_pred" column.
    domains : labels to report, in the order they should appear.

    Returns
    -------
    counts : Series indexed by `domains`; labels absent from the data get 0,
             labels present in the data but not in `domains` are dropped.
    p_ocr : dict mapping each domain to counts[d] / counts.sum().
    """
    label_freq = df_non_other["domain_pred"].value_counts()
    counts = label_freq.reindex(domains, fill_value=0)

    # Shares are taken over the listed domains only.
    grand_total = counts.sum()
    p_ocr = counts.div(grand_total).to_dict()
    return counts, p_ocr

ocr_counts, p_ocr = compute_ocr_proportions(df_non_other, DOMAINS)

# Side-by-side table of counts and shares over the product domains.
ocr_summary = pd.DataFrame({
    "count": ocr_counts,
    "proportion": p_ocr
})
print("\nDomain Summary Table:")
print(ocr_summary)

# --- 4.2 Survey-derived domain proportions (to be filled in by the user) ---
# Keys must match the entries of DOMAINS exactly.
# NOTE(review): every value is 0.20, which looks like a uniform placeholder;
# the CCI and chi-square results depend on these numbers - confirm before use.
p_survey = {
    "Product Quality":      0.20,
    "Aesthetic Attributes": 0.20,
    "Functionality":        0.20,
    "Size & Fit":           0.20,
    "Value Perception":     0.20,
}

# The survey shares over DOMAINS have to form a distribution.
_survey_total = sum(p_survey[d] for d in DOMAINS)
assert abs(_survey_total - 1.0) < 1e-6, "Survey proportions must sum to 1."

# --- 4.3 CCI-domain and global CCI ---
def compute_cci(p_ocr, p_survey, domains):
    """
    Construct Coverage Index between review-derived and survey-derived shares.

    Domain level: CCI(d) = p_ocr[d] / p_survey[d], or NaN when p_survey[d] <= 0.
    Global level: 1 - 0.5 * sum_d |p_ocr[d] - p_survey[d]|, which lies in [0, 1]
    when both inputs are probability distributions over `domains`.

    Returns (cci_domain dict, cci_global, l1_distance).
    """
    # Ratio per domain; a non-positive survey share has no defined ratio.
    cci_domain = {
        d: (p_ocr[d] / p_survey[d] if p_survey[d] > 0 else np.nan)
        for d in domains
    }

    # Absolute gap per domain, summed in domain order.
    abs_gaps = [abs(p_ocr[d] - p_survey[d]) for d in domains]
    l1_distance = sum(abs_gaps)
    cci_global = 1 - 0.5 * l1_distance

    return cci_domain, cci_global, l1_distance

cci_domain, cci_global, l1_distance = compute_cci(
    p_ocr=p_ocr,
    p_survey=p_survey,
    domains=DOMAINS,
)

# Report the per-domain ratios first, then the two global figures.
print("\n=== Construct Coverage Index (CCI) ===")
print("Domain-level CCI(d):")
for d in DOMAINS:
    _ratio = cci_domain[d]
    print(f"  {d}: {_ratio:.3f}")
print(f"\nL1 distance between OCR and survey distributions: {l1_distance:.4f}")
print(f"Global CCI (similarity index): {cci_global:.3f}")


Domain Summary Table:
                      count  proportion
Product Quality       23146    0.366437
Aesthetic Attributes  14997    0.237426
Functionality          4446    0.070387
Size & Fit            15227    0.241067
Value Perception       5349    0.084683

=== Construct Coverage Index (CCI) ===
Domain-level CCI(d):
  Product Quality: 1.832
  Aesthetic Attributes: 1.187
  Functionality: 0.352
  Size & Fit: 1.205
  Value Perception: 0.423

L1 distance between OCR and survey distributions: 0.4899
Global CCI (similarity index): 0.755


In [18]:
# ============================================================
# 6. Variance Gap Index (VGI)
# ============================================================

def compute_vgi(df_non_other, domains):
    """
    Variance Gap Index: normalized across-review variance of each domain's share.

    For each review i and domain d:
        p_{i,d} = (# clauses of review i labelled d)
                  / (# clauses of review i labelled with any entry of `domains`)

    Var_d  = population variance (ddof=0) of p_{i,d} across reviews.
    VGI(d) = Var_d / 0.25, where 0.25 is the largest population variance a
             variable confined to [0, 1] can have, so VGI(d) lies in [0, 1]
             (survey ideal variance is taken as ~0).
    Global VGI = unweighted mean of VGI(d) over `domains`.

    Parameters
    ----------
    df_non_other : DataFrame with "review_id" and "domain_pred" columns.
    domains : domain labels to evaluate.

    Returns
    -------
    (var_d dict, vgi_domain dict, vgi_global)
    """
    max_variance = 0.25

    # Clause counts per (review, domain); absent combinations become 0 and
    # labels outside `domains` are dropped by the column reindex.
    counts = (
        df_non_other.groupby(["review_id", "domain_pred"])
        .size()
        .unstack(fill_value=0)
        .reindex(columns=domains, fill_value=0)
    )

    # Number of clauses per review that fall in the listed domains.
    total_per_review = counts.sum(axis=1)

    # Reviews with no clause in any listed domain would divide by zero.
    has_clauses = total_per_review > 0
    counts = counts[has_clauses]
    total_per_review = total_per_review[has_clauses]

    # Share of each domain within each review.
    p_id = counts.div(total_per_review, axis=0)

    # Population variance: the pandas default (ddof=1) is the sample variance,
    # which is not bounded by 0.25 and is NaN when only one review is present.
    var_d = p_id.var(axis=0, ddof=0)

    vgi_domain = (var_d / max_variance).to_dict()

    # Global VGI as the simple average over domains.
    vgi_global = np.mean(list(vgi_domain.values()))

    return var_d.to_dict(), vgi_domain, vgi_global

var_d, vgi_domain, vgi_global = compute_vgi(df_non_other, DOMAINS)

print("\n=== Variance Gap Index (VGI) ===")
# Two per-domain listings: raw variances (4 decimals), then normalized (3 decimals).
for _heading, _per_domain, _digits in (
    ("Raw variances per domain:", var_d, 4),
    ("Normalized VGI per domain:", vgi_domain, 3),
):
    print(_heading)
    for d in DOMAINS:
        print(f"  {d}: {_per_domain[d]:.{_digits}f}")
print(f"Global VGI: {vgi_global:.3f}")


=== Variance Gap Index (VGI) ===
Raw variances per domain:
  Product Quality: 0.1692
  Aesthetic Attributes: 0.1305
  Functionality: 0.0414
  Size & Fit: 0.1331
  Value Perception: 0.0614
Normalized VGI per domain:
  Product Quality: 0.677
  Aesthetic Attributes: 0.522
  Functionality: 0.165
  Size & Fit: 0.532
  Value Perception: 0.245
Global VGI: 0.428


In [19]:
# ============================================================
# 7. Review Sparsity Index (RSI)
# ============================================================

def compute_rsi(df_non_other, domains):
    """
    Review Sparsity Index: share of the listed domains a review does NOT touch.

    With D = len(domains), for each review i:
        k_i   = number of distinct labels from `domains` among its clauses
        RSI_i = 1 - k_i / D
    The global RSI is the mean of RSI_i over reviews.

    Returns (rsi_global, summary statistics of RSI_i from Series.describe()).
    """
    D = len(domains)

    # Blank out labels outside `domains` so they are ignored by nunique();
    # a review with no listed label still appears, with k_i = 0.
    labels = df_non_other["domain_pred"]
    tracked_only = labels.where(labels.isin(set(domains)))

    k_i = tracked_only.groupby(df_non_other["review_id"]).nunique()
    rsi_i = 1 - (k_i / D)

    return rsi_i.mean(), rsi_i.describe()

rsi_global, rsi_desc = compute_rsi(df_non_other=df_non_other, domains=DOMAINS)

# Headline figure first, then the describe() summary of the per-review values.
print("\n=== Review Sparsity Index (RSI) ===")
print(f"Global RSI: {rsi_global:.3f}")
print("RSI per-review distribution (summary):")
print(rsi_desc)


=== Review Sparsity Index (RSI) ===
Global RSI: 0.701
RSI per-review distribution (summary):
count    32277.000000
mean         0.701478
std          0.142206
min          0.000000
25%          0.600000
50%          0.800000
75%          0.800000
max          0.800000
Name: domain_pred, dtype: float64


In [21]:
# ============================================================
# 8. Chi-square goodness-of-fit test
# ============================================================

def compute_chi_square(ocr_counts, p_survey, domains):
    """
    Chi-square goodness-of-fit of observed domain counts against survey shares.

    Observed: ocr_counts[d] for each d in `domains`.
    Expected: p_survey[d] * T, with T the sum of the observed counts.

    Returns (chi2_stat, p_value, observed array, expected array).
    """
    observed = np.array([ocr_counts[d] for d in domains], dtype=float)

    # Scale the survey shares to the observed total so both vectors are counts.
    survey_share = np.array([p_survey[d] for d in domains], dtype=float)
    expected = survey_share * observed.sum()

    result = chisquare(f_obs=observed, f_exp=expected)
    chi2_stat, p_value = result
    return chi2_stat, p_value, observed, expected

chi2_stat, p_value, observed, expected = compute_chi_square(
    ocr_counts=ocr_counts,
    p_survey=p_survey,
    domains=DOMAINS,
)

# ------------------------------------------------------------
# Per-domain observed vs. expected counts, then the test result
# ------------------------------------------------------------
print("\n=== Chi-square Goodness-of-Fit Test ===")
print("Domain-wise Observed vs. Expected Counts:")
for d, obs, exp in zip(DOMAINS, observed, expected):
    _obs_count = int(obs)
    print(f"  {d}: Observed = {_obs_count:>6}, Expected = {exp:.2f}")

print(f"\nChi-square statistic: {chi2_stat:.3f}")
print(f"p-value: {p_value:.5f}")


=== Chi-square Goodness-of-Fit Test ===
Domain-wise Observed vs. Expected Counts:
  Product Quality: Observed =  23146, Expected = 12633.00
  Aesthetic Attributes: Observed =  14997, Expected = 12633.00
  Functionality: Observed =   4446, Expected = 12633.00
  Size & Fit: Observed =  15227, Expected = 12633.00
  Value Perception: Observed =   5349, Expected = 12633.00

Chi-square statistic: 19229.330
p-value: 0.00000
