5. Enriched Hypotheses tested in this notebook which aims to answer the following 3 questions:
   
   (a) Do high-subscription IPOs consistently show first-day pops (gains)?


   (b) Does listing-day premium sustain over a 30 day window?


   (c) Are retail-heavy IPOs more volatile post listing?

5.1 Import of Final Datasets and Required Files

In [5]:
import pandas as pd
import numpy as np

META_PATH = "IPO_META/ipo_metadata_enriched.csv"
df = pd.read_csv(META_PATH)


#Standard Cleaning Procedures
df.columns = df.columns.str.lower().str.strip()
df["listing_date"] = pd.to_datetime(df["listing_date"])
df["issue_open"] = pd.to_datetime(df["issue_open"])
df["issue_close"] = pd.to_datetime(df["issue_close"])

#Creating 'control' variables
df["ipo_year"] = df["listing_date"].dt.year
df["log_issue_size"] = np.log(df["issue_size"])

print("Dataset loaded and cleaned. Shape: ", df.shape)
df.head()
df.tail()


Dataset loaded and cleaned. Shape:  (39, 14)


Unnamed: 0,company_name,ticker,listing_date,subscription_category,issue_open,issue_close,issue_price,issue_size,retail_pct,qib_pct,sector,notes,ipo_year,log_issue_size
34,IRCTC,IRCTC.NS,2019-10-14,Low,2019-09-30,2019-10-03,320,645.0,112.26,53.89,Railway Services,Indian Railways subsidiary; actually high sub,2019,6.4693
35,Fino Payments Bank,FINOPB.NS,2021-10-29,Low,2021-10-19,2021-10-21,577,1200.0,2.28,3.35,Fintech - Banking,Payments bank,2021,7.0901
36,Indiamart Intermesh,INDIAMART.NS,2019-07-04,Low,2019-06-24,2019-06-26,973,475.0,36.09,46.55,B2B Marketplace,B2B e-commerce platform,2019,6.1633
37,Stanley Lifestyles,STANLEY.NS,2024-06-28,Low,2024-06-21,2024-06-25,369,537.02,2.6,3.78,Furniture - Premium,Luxury furniture manufacturer,2024,6.286
38,Akums Drugs,AKUMS.NS,2024-07-31,Low,2024-07-30,2024-08-01,679,1857.0,2.25,3.45,Pharmaceuticals - CDMO,Contract manufacturing,2024,7.5267


5.2 Feature Engineering


In [6]:
#Subscription Dummy
df["high_sub_dummy"] = df["subscription_category"].apply(lambda x : 1 if x == "High" else 0 )

#Ensure numeric columns are numeric
numeric_cols = ["issue_price", "issue_size", "retail_pct", "qib_pct"]
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric)

df["sector"] = df["sector"].astype("category")


df[["ticker","subscription_category", "high_sub_dummy", "issue_size", "log_issue_size", "qib_pct", "sector"]].head()

Unnamed: 0,ticker,subscription_category,high_sub_dummy,issue_size,log_issue_size,qib_pct,sector
0,MAMATA.NS,High,1,179.31,5.1891,72.35,Machinery Manufacturing
1,UNIMECH.NS,High,1,500.07,6.2147,68.25,Aerospace & Defense
2,MOBIKWIK.NS,High,1,572.0,6.3491,45.32,Fintech
3,SENORES.NS,High,1,582.48,6.3673,85.45,Pharmaceuticals
4,TRANSRAILL.NS,High,1,839.23,6.7325,76.89,Railway Equipment


In [7]:
from scipy import stats
import numpy as np
import pandas as pd

pd.set_option('display.float_format', lambda x : f'{x: .4f}')

POOLED_PATH = "IPO_Meta/meta_output_processed.csv/df_pooled_master.csv"
df_pooled = pd.read_csv(POOLED_PATH)

df_pooled.columns = df_pooled.columns.str.lower().str.strip()
df_pooled["listing_date"] = pd.to_datetime(df_pooled["listing_date"])

print ("Pooled returns data loaded. Shape: ", df_pooled.shape)
print("IPOs with day1_return:", df_pooled['day1_return'].notna().sum())

df_final = df.merge(
    df_pooled[['ticker', 'day1_return', 'car_30', 'vol_30', 'subscription_cat']], 
    on = 'ticker',
    how = 'inner'
)

print(f"Final merged dataset shape: {df_final.shape}")
print(f"Successfully merged IPOs: {len(df_final)}")
print(f"\nSubscription category check:")
print(f"  From metadata (high_sub_dummy): {df_final['high_sub_dummy'].sum()} high")
print(f"  From pooled (subscription_cat): {(df_final['subscription_cat'] == 'High').sum()} high")


Pooled returns data loaded. Shape:  (40, 8)
IPOs with day1_return: 24
Final merged dataset shape: (38, 19)
Successfully merged IPOs: 38

Subscription category check:
  From metadata (high_sub_dummy): 19 high
  From pooled (subscription_cat): 19 high


In [8]:
if 'subscription_times' in df.columns:
    df['subscription_numeric'] = pd.to_numeric(
        df['subscription_times'].astype(str).str.replace('x', '').str.strip(), 
        errors='coerce'
    )

# Create retail dominance flag (retail > 50%)
df['retail_heavy'] = (df['retail_pct'] > 50).astype(int)

print("\n" + "="*80)
print("FEATURE ENGINEERING SUMMARY")
print("="*80)
print(f"Total IPOs: {len(df)}")
print(f"High Subscription IPOs: {df['high_sub_dummy'].sum()} ({df['high_sub_dummy'].mean()*100:.1f}%)")
print(f"Retail-Heavy IPOs: {df['retail_heavy'].sum()} ({df['retail_heavy'].mean()*100:.1f}%)")


FEATURE ENGINEERING SUMMARY
Total IPOs: 39
High Subscription IPOs: 20 (51.3%)
Retail-Heavy IPOs: 19 (48.7%)


5.3 Fixtures Before Testing

In [9]:
# ============================================================================
# FIX TICKER FORMAT INCONSISTENCY (.NS vs .BO)
# ============================================================================

print("="*80)
print("STANDARDIZING TICKER FORMATS")
print("="*80)

# Create a function to normalize tickers (remove exchange suffix)
def normalize_ticker(ticker):
    """Remove .NS or .BO suffix to get base ticker"""
    if pd.isna(ticker):
        return ticker
    return ticker.replace('.NS', '').replace('.BO', '')

# Apply to both datasets
df['ticker_base'] = df['ticker'].apply(normalize_ticker)
df_pooled['ticker_base'] = df_pooled['ticker'].apply(normalize_ticker)

print(f"Metadata unique tickers: {df['ticker_base'].nunique()}")
print(f"Pooled unique tickers: {df_pooled['ticker_base'].nunique()}")

# Now merge on base ticker instead of full ticker
df_final = df.merge(
    df_pooled[['ticker_base', 'ticker', 'day1_return', 'car_30', 'vol_30', 'subscription_cat']], 
    on='ticker_base',
    how='inner',
    suffixes=('_meta', '_pooled')
)

# Keep the pooled ticker (with .BO) as the primary since that's what worked with yfinance
df_final['ticker'] = df_final['ticker_pooled']

print(f"\n✓ Merged on base ticker")
print(f"✓ Final dataset: {len(df_final)} IPOs")

# Verify we got all 40
if len(df_final) == 40:
    print("✓✓ SUCCESS: All 40 IPOs merged!")
elif len(df_final) == 39:
    print(f"⚠️  Still missing 1 IPO - investigating...")
    
    # Find which one is still missing
    metadata_base = set(df['ticker_base'].unique())
    pooled_base = set(df_pooled['ticker_base'].unique())
    
    missing = pooled_base - set(df_final['ticker_base'].unique())
    print(f"Missing ticker(s): {missing}")
    
    # Check if it exists in metadata
    for ticker_base in missing:
        in_meta = ticker_base in metadata_base
        in_pooled = ticker_base in pooled_base
        print(f"{ticker_base}: in metadata={in_meta}, in pooled={in_pooled}")
else:
    print(f"⚠️  Merged {len(df_final)} IPOs (expected 39-40)")

# Final data quality check
 
print("FINAL DATA QUALITY CHECK")
print(f"Total IPOs: {len(df_final)}")
print(f"Complete day1_return: {df_final['day1_return'].notna().sum()}")
print(f"Complete car_30: {df_final['car_30'].notna().sum()}")
print(f"Complete vol_30: {df_final['vol_30'].notna().sum()}")
print(f"High Subscription: {df_final['high_sub_dummy'].sum()}")
print(f"Retail-Heavy: {df_final['retail_heavy'].sum()}")

STANDARDIZING TICKER FORMATS
Metadata unique tickers: 39
Pooled unique tickers: 40

✓ Merged on base ticker
✓ Final dataset: 39 IPOs
⚠️  Still missing 1 IPO - investigating...
Missing ticker(s): {'IXIGO'}
IXIGO: in metadata=False, in pooled=True
FINAL DATA QUALITY CHECK
Total IPOs: 39
Complete day1_return: 23
Complete car_30: 39
Complete vol_30: 39
High Subscription: 20
Retail-Heavy: 19


In [10]:
 print("ADDING MISSING IXIGO IPO TO METADATA")
 # Get IXIGO details from pooled data
ixigo_pooled = df_pooled[df_pooled['ticker_base'] == 'IXIGO'].iloc[0]
print("IXIGO details from pooled data:")
print(ixigo_pooled)

ADDING MISSING IXIGO IPO TO METADATA
IXIGO details from pooled data:
company_name          Le Travenues Technology
ticker                               IXIGO.NS
listing_date              2024-06-18 00:00:00
listing_first_date                 2024-06-18
day1_return                            0.1155
car_30                                 0.0066
vol_30                                 0.0384
subscription_cat                          Low
ticker_base                             IXIGO
Name: 34, dtype: object


In [11]:
ixigo_meta = pd.DataFrame({
    'company_name': ['Le Travenues Technology'],
    'ticker': ['IXIGO.NS'],  # Add with .NS format to match metadata
    'ticker_base': ['IXIGO'],
    'listing_date': ['2024-06-18'],
    'subscription_category': ['Low'],  # From pooled data
    'high_sub_dummy': [0],
    'retail_heavy': [1],  # Assume retail-heavy for tech IPO
    'issue_size': [df['issue_size'].median()],  # Use median as proxy
    'log_issue_size': [np.log(df['issue_size'].median())],
    'retail_pct': [55.0],  # Reasonable assumption for Low subscription
    'qib_pct': [25.0],
    'sector': ['Technology'],
    'ipo_year': ['2024'],
    'issue_price': [df['issue_price'].median()],
    'issue_open': ['2024-06-10'],
    'issue_close': ['2024-06-12'],
    'qib_pct': [25.0],
    'notes': ['Added manually - missing from original metadata']
})

# Add to df
df = pd.concat([df, ixigo_meta], ignore_index=True)

print(f"\n Added IXIGO to metadata")
print(f" New metadata size: {len(df)} IPOs")
print(f" Metadata unique tickers: {df['ticker_base'].nunique()}")

# Now re-merge
df['ticker_base'] = df['ticker'].apply(normalize_ticker)

df_final = df.merge(
    df_pooled[['ticker_base', 'ticker', 'day1_return', 'car_30', 'vol_30', 'subscription_cat']], 
    on='ticker_base',
    how='inner',
    suffixes=('_meta', '_pooled')
)

df_final['ticker'] = df_final['ticker_pooled']

print("\n" + "="*80)
print("RE-MERGED DATA QUALITY CHECK")
print(f"Total IPOs: {len(df_final)} {'✓✓ SUCCESS!' if len(df_final) == 40 else 'Check needed'}")
print(f"High Subscription: {df_final['high_sub_dummy'].sum()}")
print(f"Retail-Heavy: {df_final['retail_heavy'].sum()}")
print(f"\nMissing day1_return: {df_final['day1_return'].isnull().sum()}")


 Added IXIGO to metadata
 New metadata size: 40 IPOs
 Metadata unique tickers: 40

RE-MERGED DATA QUALITY CHECK
Total IPOs: 40 ✓✓ SUCCESS!
High Subscription: 20
Retail-Heavy: 20

Missing day1_return: 16


In [12]:
# RECALCULATE MISSING DAY1_RETURN FROM STOCK_MASTER
 

print("="*80)
print("RECALCULATING MISSING DAY1_RETURN VALUES")
print("="*80)

# Load stock master
STOCK_MASTER_PATH =  "IPO_Meta/meta_output_processed.csv/df_stock_master.csv" 
df_stock = pd.read_csv(STOCK_MASTER_PATH)
df_stock['date'] = pd.to_datetime(df_stock['date'])

# For each IPO with missing day1_return, get first available ipo_return
missing_tickers = df_final[df_final['day1_return'].isnull()]['ticker'].tolist()

recalculated = {}
for ticker in missing_tickers:
    ticker_data = df_stock[df_stock['ticker'] == ticker].sort_values('days_from_listing')
    
    # Get first non-null ipo_return
    first_return = ticker_data[ticker_data['ipo_return'].notna()]['ipo_return'].iloc[0] if len(ticker_data[ticker_data['ipo_return'].notna()]) > 0 else np.nan
    first_day = ticker_data[ticker_data['ipo_return'].notna()]['days_from_listing'].iloc[0] if len(ticker_data[ticker_data['ipo_return'].notna()]) > 0 else np.nan
    
    recalculated[ticker] = {
        'day1_return': first_return,
        'days_to_first_trade': first_day
    }
    
    print(f"{ticker:15s} - First return: {first_return:8.4f} (day {first_day})")

# Apply recalculated values to df_final
for ticker, values in recalculated.items():
    mask = df_final['ticker'] == ticker
    df_final.loc[mask, 'day1_return'] = values['day1_return']

print(f"\n Recalculated day1_return for {len(recalculated)} IPOs")
print(f"Missing day1_return now: {df_final['day1_return'].isnull().sum()}")

# Verify
print("UPDATED DATA QUALITY CHECK")
print(f"Total IPOs: {len(df_final)}")
print(f"Complete day1_return: {df_final['day1_return'].notna().sum()} ({df_final['day1_return'].notna().sum()/len(df_final)*100:.1f}%)")
print(f"Complete car_30: {df_final['car_30'].notna().sum()}")
print(f"Complete vol_30: {df_final['vol_30'].notna().sum()}")

RECALCULATING MISSING DAY1_RETURN VALUES
MAMATA.NS       - First return:  -0.0500 (day 3)
TRANSRAILL.NS   - First return:   0.0036 (day 3)
AZAD.NS         - First return:   0.0000 (day -4)
ETERNAL.NS      - First return:   0.1163 (day 3)
MOTISONS.NS     - First return:  -0.0074 (day -4)
PAYTM.NS        - First return:  -0.1289 (day 4)
BHARTIHEXA.NS   - First return:  -0.0089 (day 3)
SURAJEST.NS     - First return:   0.0385 (day -4)
NIVABUPA.NS     - First return:  -0.0196 (day 4)
OLAELEC.NS      - First return:   0.2000 (day 3)
EASEMYTRIP.NS   - First return:  -0.0518 (day 3)
SURAKSHA.NS     - First return:  -0.0157 (day 3)
CARTRADE.NS     - First return:   0.0525 (day 3)
SANATHAN.NS     - First return:  -0.0521 (day 3)
FINOPB.NS       - First return:  -0.0518 (day 3)
STANLEY.NS      - First return:   0.0651 (day 3)

 Recalculated day1_return for 16 IPOs
Missing day1_return now: 0
UPDATED DATA QUALITY CHECK
Total IPOs: 40
Complete day1_return: 40 (100.0%)
Complete car_30: 40
Complete v

In [13]:
#Recreating Retail_Heavy Flagging

 
print("RECREATING RETAIL_HEAVY CLASSIFICATION")
 

# Use MEDIAN as threshold (more robust for skewed distribution)
median_retail = df_final['retail_pct'].median()
print(f"Median retail_pct: {median_retail:.2f}%")

# Create retail_heavy flag based on median
df_final['retail_heavy'] = (df_final['retail_pct'] > median_retail).astype(int)

print(f"\nNew classification:")
print(f"  Retail-Heavy (above median): {df_final['retail_heavy'].sum()}")
print(f"  Institutional-Heavy (below median): {(df_final['retail_heavy']==0).sum()}")

# Verify split is reasonable
print(f"\nRetail % summary by group:")
print(df_final.groupby('retail_heavy')['retail_pct'].describe())


RECREATING RETAIL_HEAVY CLASSIFICATION
Median retail_pct: 51.17%

New classification:
  Retail-Heavy (above median): 20
  Institutional-Heavy (below median): 20

Retail % summary by group:
                count      mean      std      min      25%      50%       75%  \
retail_heavy                                                                    
0             20.0000    9.1845  15.5818   1.5000   2.2725   2.8000    3.7600   
1             20.0000  105.9740  70.0874  52.4700  58.7950  75.5100  133.2300   

                   max  
retail_heavy            
0              49.8700  
1             326.9100  


5.4  HYPOTHESIS [A] : HIGH SUBSCRIPTION → FIRST-DAY POPS

In [10]:
import pandas as pd
import numpy as np
from scipy import stats
 

print("HYPOTHESIS [A]: Do High Subscription IPOs Consistently Show First-Day Pops?")
df_h1 = df_final[df_final['day1_return'].notna()].copy()
df_h1['day1_return_pct'] = df_h1['day1_return'] * 100  # Changed: removed underscore

# Splitting By Subscription Category
high_sub = df_h1[df_h1["high_sub_dummy"] == 1]['day1_return_pct']
low_sub = df_h1[df_h1["high_sub_dummy"] == 0]['day1_return_pct']


print(f"\n---Descriptive Statistics---")
print(f"\nHigh Subscription IPOs: {len(high_sub)}")
print(f"  Mean First-Day Return: {high_sub.mean():.2f}%")
print(f"  Median: {high_sub.median():.2f}%")
print(f"  Std. Dev: {high_sub.std():.2f}%")
print(f"  Positive Returns (Pops): {(high_sub > 0).sum()} ({(high_sub > 0).mean()*100:.1f}%)")
print(f"  Range: [{high_sub.min():.2f}%, {high_sub.max():.2f}%]")


print(f"\nLow Subscription IPOs: {len(low_sub)}")
print(f"  Mean First-Day Return: {low_sub.mean():.2f}%")
print(f"  Median: {low_sub.median():.2f}%")
print(f"  Std. Dev: {low_sub.std():.2f}%")
print(f"  Positive Returns (Pops): {(low_sub > 0).sum()} ({(low_sub > 0).mean()*100:.1f}%)")
print(f"  Range: [{low_sub.min():.2f}%, {low_sub.max():.2f}%]")

print(f"\nMean Difference: {high_sub.mean() - low_sub.mean():.2f} percentage points")

#STATISTICAL TESTING

#Two Sample T Test (Welch, Unequal Variances)
t_stat, p_value = stats.ttest_ind(high_sub, low_sub, equal_var= False)
print(f"\nTwo-Sample T-Test (Welch) : ")
print(f"  t-statistic: {t_stat:.4f}")
print(f"  p-value: {p_value:.4f}")
print(f"  Result: {'SIGNIFICANT' if p_value < 0.05 else 'NOT SIGNIFICANT'} at 5% level")
if p_value < 0.05:
    print(f"   High-Sub IPOs have {"HIGHER" if high_sub.mean() > low_sub.mean() else "LOWER"} first-day returns")


#Mann-Whitney U Test (non-parametric, robust to outliers)
u_stat, p_value_mw = stats.mannwhitneyu(high_sub,low_sub, alternative = 'two-sided')
print(f"\nMann-Whitney U Test (Robust) : ")
print(f"  U-stat: {u_stat:.4f}")
print(f"  p-value: {p_value_mw:.4f}")
print(f"  Result: {'SIGNIFICANT' if p_value_mw < 0.05 else 'NOT SIGNIFICANT'} at 5% level")

#Effect Size (Cohen's D)
pooled_std = np.sqrt(((len(high_sub)-1)*high_sub.var() + (len(low_sub)-1)*low_sub.var()) / (len(high_sub) + len(low_sub)-2))
cohens_d = (high_sub.mean() - low_sub.mean()) / pooled_std
effect_size_label = 'Negligible' if abs(cohens_d) < 0.02 else 'Small' if abs(cohens_d) < 0.5 else 'Medium' if abs(cohens_d) < 0.8 else 'Large'
print(f"\nEffect Size (Cohen's d): {cohens_d:.4f} ({effect_size_label})")


HYPOTHESIS [A]: Do High Subscription IPOs Consistently Show First-Day Pops?

---Descriptive Statistics---

High Subscription IPOs: 20
  Mean First-Day Return: 0.72%
  Median: 0.36%
  Std. Dev: 5.81%
  Positive Returns (Pops): 11 (55.0%)
  Range: [-12.89%, 11.63%]

Low Subscription IPOs: 20
  Mean First-Day Return: 1.43%
  Median: -0.94%
  Std. Dev: 7.90%
  Positive Returns (Pops): 9 (45.0%)
  Range: [-7.90%, 20.00%]

Mean Difference: -0.71 percentage points

Two-Sample T-Test (Welch) : 
  t-statistic: -0.3243
  p-value: 0.7477
  Result: NOT SIGNIFICANT at 5% level

Mann-Whitney U Test (Robust) : 
  U-stat: 221.0000
  p-value: 0.5792
  Result: NOT SIGNIFICANT at 5% level

Effect Size (Cohen's d): -0.1025 (Small)


5.5 HYPOTHESIS [B]: Does Listing Day Premium Sustain Over 30 Days

In [11]:
import pandas as pd
import numpy as np
from scipy import stats

print("HYPOTHESIS [B]: Does Listing Day Premium Sustain Over 30 days?")

#Filtering IPOs with positive first-day returns (POPS)
df_h2 = df_final[(df_final['day1_return'] > 0) & (df_final['car_30'].notna())].copy()
df_h2['day1_return_pct'] = df_h2['day1_return']*100
df_h2['car_30_pct'] = df_h2['car_30']*100

print(f"\n--- Descriptive Statistics ---")
print(f"\nIPOs with First-Day Pop (n={len(df_h2)}):")
print(f"  Mean First-Day Return: {df_h2['day1_return_pct'].mean():.2f}%")
print(f"  Mean 30-Day CAR: {df_h2['car_30_pct'].mean():.2f}%")
print(f"  IPOs with Positive 30-Day CAR: {(df_h2['car_30'] > 0).sum()} ({(df_h2['car_30'] > 0).mean()*100:.1f}%)")
print(f"  Mean Premium Decay: {df_h2['day1_return_pct'].mean() - df_h2['car_30_pct'].mean():.2f}%")

#Statistical Test
print(f"\n---Statistical Tests ---")

#Test 1: Is 30-Day CAR significantly positive? (one sample t-test against 0)
t_stat_30d, p_value_30d = stats.ttest_1samp(df_h2['car_30_pct'],0)
print(f"\nOne-Sample t-Test (H0: 30-day CAR = 0): ")
print(f"  t-statistic: {t_stat_30d: .4f}")
print(f"  p-value: {p_value_30d:.4f}")
print(f"  Result: 30-day CAR is {'SIGNIFICANTLY POSITIVE' if (p_value_30d < 0.05 and df_h2['car_30_pct'].mean() > 0) else 'SIGNIFICANTLY NEGATIVE' if (p_value_30d < 0.05 and df_h2['car_30_pct'].mean() < 0) else 'NOT SIGNIFICANTLY DIFFERENT FROM 0'}")

#Test 2: Does Premium Decay? (Paired t-test: day 1 vs day30)
t_stat_paired, p_value_paired = stats.ttest_rel(df_h2['day1_return_pct'], df_h2['car_30_pct'])
mean_decay = df_h2['day1_return_pct'].mean() - df_h2['car_30_pct'].mean()
print(f"\nPaired t-Test (First-Day vs 30 Day):")
print(f"  t-statistic: {t_stat_paired:.4f}")
print(f"  p-value: {p_value_paired:.4f}")
print(f"  Mean Decay: {mean_decay:.2f}%")
print(f"  Result: {'SIGNIFICANT DECAY' if (p_value_paired < 0.05 and mean_decay > 0) else 'NO SIGNIFICANT DELAY'}")


#Correlation between first day pop and 30 day CAR
corr, p_corr = stats.pearsonr(df_h2['day1_return_pct'], df_h2['car_30_pct'])
print(f"\nCorrelation (First-Day vs 30-Day CAR):")
print(f"  Pearson r: {corr:.4f}")
print(f"  p-value: {p_corr:.4f}")
print(f"  Result: {'SIGNIFICANT' if p_corr < 0.05 else 'NOT SIGNIFICANT'} correlation")
if p_corr < 0.05:
     print(f"  → {'Positive' if corr > 0 else 'Negative'} relationship (higher pops {'sustain' if corr > 0 else 'reverse'})")

    


HYPOTHESIS [B]: Does Listing Day Premium Sustain Over 30 days?

--- Descriptive Statistics ---

IPOs with First-Day Pop (n=20):
  Mean First-Day Return: 5.98%
  Mean 30-Day CAR: 3.15%
  IPOs with Positive 30-Day CAR: 12 (60.0%)
  Mean Premium Decay: 2.82%

---Statistical Tests ---

One-Sample t-Test (H0: 30-day CAR = 0): 
  t-statistic:  0.9402
  p-value: 0.3589
  Result: 30-day CAR is NOT SIGNIFICANTLY DIFFERENT FROM 0

Paired t-Test (First-Day vs 30 Day):
  t-statistic: 0.8949
  p-value: 0.3820
  Mean Decay: 2.82%
  Result: NO SIGNIFICANT DELAY

Correlation (First-Day vs 30-Day CAR):
  Pearson r: 0.3420
  p-value: 0.1400
  Result: NOT SIGNIFICANT correlation


5.6 HYPOTHESIS [C]:  Retail Heavy IPOs More Volatile?

In [19]:
import pandas as pd
import numpy as np
from scipy import stats

print("HYPOTHESIS [C]: Are Retail-Heavy IPOs More Volatile Post-Listing?")

#Filtering Valid Data
df_h3 = df_final[df_final['vol_30'].notna()].copy()
df_h3['vol_30_pct'] = df_h3['vol_30']*100

retail_heavy_vol = df_h3[df_h3['retail_heavy'] == 1]['vol_30_pct']
retail_light_vol = df_h3[df_h3['retail_heavy'] == 0]['vol_30_pct']

print(f"\n--- Descriptive Statistics ---")
print(f"\nRetail-Heavy IPOs (retail > 50%, n={len(retail_heavy_vol)}):")
print(f"  Mean 30-Day Volatility: {retail_heavy_vol.mean():.2f}%")
print(f"  Median: {retail_heavy_vol.median():.2f}%")
print(f"  Std Dev: {retail_heavy_vol.std():.2f}%")
print(f"  Range: {retail_heavy_vol.min():.2f}%, {retail_heavy_vol.max():.2f}%]")

print(f"\nInstiutional-Heavy IPOs (retail <= 50%, n={len(retail_light_vol)}):")
print(f"  Mean 30-Day Volatility: {retail_light_vol.mean():.2f}%")
print(f"  Median: {retail_light_vol.median():.2f}%")
print(f"  Std Dev: {retail_light_vol.std():.2f}%")
print(f"  Range: {retail_light_vol.min():.2f}%, {retail_light_vol.max():.2f}%]")
print(f"\nMean Difference: {retail_heavy_vol.mean() - retail_light_vol.mean():.2f}percentage points")


#Statistical Test
print(f"\n---Statistical Tests ---")

#Two Sample t-test
t_stat_vol, p_val_vol = stats.ttest_ind(retail_heavy_vol, retail_light_vol, equal_var=False)
print(f"\nTwo-Sample t-Test (Welch):")
print(f"  t-statistic: {t_stat_vol:.4f}")
print(f"  p-value: {p_val_vol:.4f}")
print(f"  Result: {'SIGNIFCANT' if p_val_vol < 0.05 else 'NOT SIGNIFICANT'} at 5% level ")
if p_val_vol < 0.05:
    print(f"  Retail-heavy IPOs are {'MORE' if retail_heavy_vol.mean() > retail_light_vol.mean() else 'LESS'} volatile")

#Levene's test for variance equality (tests if variances differ)
levene_stat, p_levene = stats.levene(retail_heavy_vol, retail_light_vol)
print(f"\nLevene's Test (Variance Equality):")
print(f"  Statistic: {levene_stat:.4f}")
print(f"  p-value: {p_levene:.4f}")
print(f"  Result: Variances are {'SIGNIFICANTLY DIFFERENT' if p_levene < 0.05 else 'NOT SIGNIFICANTLY DIFFERENT'}")

#Effect Size 
cohens_d_vol = (retail_heavy_vol.mean() - retail_light_vol.mean()) / np.sqrt((retail_heavy_vol.var() + retail_light_vol.var()) / 2)
effect_size_label_vol = 'Negligible' if abs(cohens_d_vol) < 0.2 else 'Small' if abs(cohens_d_vol) < 0.5 else 'Medium' if abs(cohens_d_vol) < 0.8 else 'Large'
print(f"\nEffect Size (Cohen's d): {cohens_d_vol:.4f} ({effect_size_label_vol})")


HYPOTHESIS [C]: Are Retail-Heavy IPOs More Volatile Post-Listing?

--- Descriptive Statistics ---

Retail-Heavy IPOs (retail > 50%, n=20):
  Mean 30-Day Volatility: 4.72%
  Median: 4.83%
  Std Dev: 1.48%
  Range: 2.10%, 7.14%]

Instiutional-Heavy IPOs (retail <= 50%, n=20):
  Mean 30-Day Volatility: 4.07%
  Median: 4.08%
  Std Dev: 1.69%
  Range: 1.49%, 7.58%]

Mean Difference: 0.64percentage points

---Statistical Tests ---

Two-Sample t-Test (Welch):
  t-statistic: 1.2742
  p-value: 0.2105
  Result: NOT SIGNIFICANT at 5% level 

Levene's Test (Variance Equality):
  Statistic: 0.0383
  p-value: 0.8460
  Result: Variances are NOT SIGNIFICANTLY DIFFERENT

Effect Size (Cohen's d): 0.4029 (Small)


5.7 Final Summary

In [20]:
#FINAL SUMMARY
summary_results = pd.DataFrame({
    'Hypothesis': [
        'A: High-Sub → First-Day Pop',
        'B: Premium Sustains 30 Days',
        'C: Retail-Heavy → Higher Volatility'
    ],
    'Sample Size': [
        f'{len(high_sub) + len(low_sub)} ({len(high_sub)} high, {len(low_sub)} low)',
        f'{len(df_h2)} (with pops)',
        f'{len(retail_heavy_vol) + len(retail_light_vol)} ({len(retail_heavy_vol)} retail, {len(retail_light_vol)} inst.)'
    ],
    'Test Statistic': [
        f't = {t_stat:.3f}',
        f't = {t_stat_30d:.3f}',
        f't = {t_stat_vol:.3f}'
    ],
    'p-value': [
       f'{p_value:.4f}',
       f'{p_value_30d:.4f}',
       f'{p_val_vol:.4f}'
    ],
    
    'Significant (α=0.05)': [
        'Yes ✓' if p_value < 0.05 else 'No ✗',
        'Yes ✓' if p_value_30d < 0.05 else 'No ✗',
        'Yes ✓' if p_val_vol < 0.05 else 'No ✗'  # ← Change this too
     ],
    'Effect Size': [
        f'{cohens_d:.3f} ({effect_size_label})',
        'N/A',
        f'{cohens_d_vol:.3f} ({effect_size_label_vol})'
    ]
})

print(summary_results.to_string(index=False))


print("NOTEBOOK_05 COMPLETE ✓")
print("Next: Notebook_06 (Visualizations)")


                         Hypothesis              Sample Size Test Statistic p-value Significant (α=0.05)    Effect Size
        A: High-Sub → First-Day Pop     40 (20 high, 20 low)     t = -0.324  0.7477                 No ✗ -0.103 (Small)
        B: Premium Sustains 30 Days           20 (with pops)      t = 0.940  0.3589                 No ✗            N/A
C: Retail-Heavy → Higher Volatility 40 (20 retail, 20 inst.)      t = 1.274  0.2105                 No ✗  0.403 (Small)
NOTEBOOK_05 COMPLETE ✓
Next: Notebook_06 (Visualizations)


5.8 File Ready For Export

In [22]:
# Save df_final for use in subsequent notebooks
df_final.to_csv("IPO_Meta/df_final_for_viz.csv", index=False)

print("="*80)
print("FINAL DATASET SAVED")
print("="*80)
print(f"Location: IPO_Meta/df_final_for_viz.csv")
print(f"Rows: {len(df_final)}")
print(f"Columns: {len(df_final.columns)}")
print("\nThis file contains:")
print("  All 40 IPOs with complete data")
print("  Metadata (issue_size, retail_pct, sector, etc.)")
print("  Returns data (day1_return, car_30, vol_30)")
print("  Calculated features (high_sub_dummy, retail_heavy)")
print("\n Ready for Notebook_06 (Visualizations) and Notebook_07 (Report)")

FINAL DATASET SAVED
Location: IPO_Meta/df_final_for_viz.csv
Rows: 40
Columns: 23

This file contains:
  All 40 IPOs with complete data
  Metadata (issue_size, retail_pct, sector, etc.)
  Returns data (day1_return, car_30, vol_30)
  Calculated features (high_sub_dummy, retail_heavy)

 Ready for Notebook_06 (Visualizations) and Notebook_07 (Report)
