## trying something else !

In [1]:
import pandas as pd

df = pd.read_csv('train.csv')

# --- OLD THRESHOLDS ---
def old_category(p):
    if p > 0.90: return 'Safe'
    elif p >= 0.70: return 'Medium'
    else: return 'Reach'

# --- NEW THRESHOLDS ---
def new_category(p):
    if p >= 0.85: return 'Safe'      # Changed from 0.90
    elif p >= 0.65: return 'Medium'  # Changed from 0.70
    else: return 'Reach'

print("--- OLD Distribution (Strict) ---")
print(df['Chance of Admit'].apply(old_category).value_counts())

print("\n--- NEW Distribution (Balanced) ---")
print(df['Chance of Admit'].apply(new_category).value_counts())

--- OLD Distribution (Strict) ---
Chance of Admit
Medium    413
Reach     275
Safe       12
Name: count, dtype: int64

--- NEW Distribution (Balanced) ---
Chance of Admit
Medium    476
Reach     153
Safe       71
Name: count, dtype: int64


In [2]:
import pandas as pd
from sklearn.utils import resample

# 1. Load Data
df = pd.read_csv('train.csv')

# 2. Define Categories (Using your new balanced thresholds)
def get_category(p):
    if p >= 0.85: return 'Safe'
    elif p >= 0.65: return 'Medium'
    else: return 'Reach'

df['Category'] = df['Chance of Admit'].apply(get_category)

# 3. Bootstrapping Function
def bootstrap_balance(df, target_count=500):
    df_balanced = pd.DataFrame()
    
    # Loop through each class (Safe, Medium, Reach)
    for category in ['Safe', 'Medium', 'Reach']:
        df_class = df[df['Category'] == category]
        
        # Resample (This is the "Bootstrapping" part)
        # replace=True allows us to pick the same student multiple times
        df_upsampled = resample(df_class, 
                                replace=True,     
                                n_samples=target_count,    
                                random_state=42) 
        
        df_balanced = pd.concat([df_balanced, df_upsampled])
        
    return df_balanced

# 4. Run it
df_bootstrapped = bootstrap_balance(df)

print("Original Size:", len(df))
print("Bootstrapped Size:", len(df_bootstrapped))
print(df_bootstrapped['Category'].value_counts())

# Save
df_bootstrapped.to_csv('train_bootstrapped.csv', index=False)

Original Size: 700
Bootstrapped Size: 1500
Category
Safe      500
Medium    500
Reach     500
Name: count, dtype: int64


In [3]:
import pandas as pd
import numpy as np

# 1. Load Data
df = pd.read_csv('train.csv')

# 2. Apply the NEW Categories
def get_category(p):
    if p >= 0.85: return 'Safe'
    elif p >= 0.65: return 'Medium'
    else: return 'Reach'

df['Category'] = df['Chance of Admit'].apply(get_category)

# 3. Targeted Augmentation Function
def augment_class(df, target_class, target_count):
    """
    Augments a specific class until it reaches the target_count.
    """
    existing_class = df[df['Category'] == target_class]
    current_count = len(existing_class)
    
    if current_count >= target_count:
        return existing_class # No need to augment if we already have enough
    
    needed = target_count - current_count
    
    # Generate the new samples
    indices = np.random.choice(existing_class.index, size=needed, replace=True)
    new_data = existing_class.loc[indices].copy()
    
    # Add Noise to numeric columns only
    numeric_cols = ['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'GPA']
    for col in numeric_cols:
        std_dev = df[col].std()
        noise = np.random.normal(0, std_dev * 0.05, size=needed) # 5% noise
        new_data[col] = new_data[col] + noise
        
    # Clipping to keep data realistic
    new_data['GRE Score'] = np.clip(new_data['GRE Score'], 290, 340)
    new_data['TOEFL Score'] = np.clip(new_data['TOEFL Score'], 92, 120)
    new_data['GPA'] = np.clip(new_data['GPA'], 0, 4.0)
    new_data['SOP'] = np.clip(new_data['SOP'], 1, 5)
    new_data['University Rating'] = np.clip(np.round(new_data['University Rating']), 1, 5)

    return pd.concat([existing_class, new_data], axis=0)

# --- EXECUTE BALANCING ---

# We want everyone to match the majority class (Medium ~476)
target_size = 500 

df_safe_balanced = augment_class(df, 'Safe', target_size)
df_reach_balanced = augment_class(df, 'Reach', target_size)
df_medium_balanced = augment_class(df, 'Medium', target_size) # Will just return original if > 500

# Combine them all back together
df_final = pd.concat([df_safe_balanced, df_reach_balanced, df_medium_balanced], axis=0)

# Shuffle the rows so they aren't in order
df_final = df_final.sample(frac=1, random_state=42).reset_index(drop=True)

print("--- FINAL BALANCED DISTRIBUTION ---")
print(df_final['Category'].value_counts())

# Save this file to train your models
df_final.to_csv('train_balanced_final.csv', index=False)

--- FINAL BALANCED DISTRIBUTION ---
Category
Medium    500
Safe      500
Reach     500
Name: count, dtype: int64


In [7]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# ==========================================
# 1. SETUP & HELPER FUNCTIONS
# ==========================================

# Define your Categorization Logic
def get_category(p):
    if p >= 0.85: return 'Safe'
    elif p >= 0.65: return 'Medium'
    else: return 'Reach'

# Load Data
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv') # Make sure this file exists!

# Add Categories to both
train_df['Category'] = train_df['Chance of Admit'].apply(get_category)
test_df['Category'] = test_df['Chance of Admit'].apply(get_category)

# Prepare X and y for Test Set (The "Truth")
X_test = test_df.drop(['Chance of Admit', 'Category'], axis=1)
y_test = test_df['Category']

# ==========================================
# 2. CREATE DATASET A: BOOTSTRAPPING
# ==========================================
def create_bootstrap_data(df, target_count=500):
    df_balanced = pd.DataFrame()
    for category in ['Safe', 'Medium', 'Reach']:
        subset = df[df['Category'] == category]
        # Just sample with replacement (Duplicates)
        resampled = subset.sample(n=target_count, replace=True, random_state=42)
        df_balanced = pd.concat([df_balanced, resampled])
    return df_balanced.sample(frac=1, random_state=42) # Shuffle

# ==========================================
# 3. CREATE DATASET B: NOISE INJECTION
# ==========================================
def create_noise_data(df, target_count=500):
    df_balanced = pd.DataFrame()
    numeric_cols = ['GRE Score', 'TOEFL Score', 'University Rating', 'SOP', 'LOR ', 'GPA']
    
    for category in ['Safe', 'Medium', 'Reach']:
        subset = df[df['Category'] == category]
        
        # Start with original data
        count = len(subset)
        needed = target_count - count
        
        # Create Synthetic data
        if needed > 0:
            indices = np.random.choice(subset.index, size=needed, replace=True)
            synthetic = subset.loc[indices].copy()
            
            # Inject Noise
            for col in numeric_cols:
                std = df[col].std()
                noise = np.random.normal(0, std * 0.05, size=needed)
                synthetic[col] += noise
            
            # Clip
            synthetic['GRE Score'] = np.clip(synthetic['GRE Score'], 290, 340)
            synthetic['GPA'] = np.clip(synthetic['GPA'], 0, 4.0)
            # (Add other clips as needed)
            
            combined = pd.concat([subset, synthetic])
        else:
            combined = subset.sample(target_count) # Downsample if too big
            
        df_balanced = pd.concat([df_balanced, combined])
        
    return df_balanced.sample(frac=1, random_state=42) # Shuffle

# ==========================================
# 4. RUN THE EXPERIMENT
# ==========================================

# A. Generate the two training sets
train_boot = create_bootstrap_data(train_df)
train_noise = create_noise_data(train_df)

print(f"Bootstrapped Train Size: {len(train_boot)}")
print(f"Noise Injected Train Size: {len(train_noise)}")

# B. Train Model A (Bootstrap)
clf_boot = RandomForestClassifier(n_estimators=100, random_state=42)
clf_boot.fit(train_boot.drop(['Chance of Admit', 'Category'], axis=1), train_boot['Category'])
preds_boot = clf_boot.predict(X_test)

# C. Train Model B (Noise)
clf_noise = RandomForestClassifier(n_estimators=100, random_state=42)
clf_noise.fit(train_noise.drop(['Chance of Admit', 'Category'], axis=1), train_noise['Category'])
preds_noise = clf_noise.predict(X_test)

# ==========================================
# 5. PRINT RESULTS
# ==========================================

print("\n" + "="*40)
print("RESULTS COMPARISON")
print("="*40)

print(f"\nModel A: BOOTSTRAPPING Accuracy: {accuracy_score(y_test, preds_boot):.4f}")
print("-" * 30)
print(classification_report(y_test, preds_boot, target_names=clf_boot.classes_))

print("\n" + "="*40)
print(f"\nModel B: NOISE INJECTION Accuracy: {accuracy_score(y_test, preds_noise):.4f}")
print("-" * 30)
print(classification_report(y_test, preds_noise, target_names=clf_noise.classes_))

Bootstrapped Train Size: 1500
Noise Injected Train Size: 1500

RESULTS COMPARISON

Model A: BOOTSTRAPPING Accuracy: 0.8100
------------------------------
              precision    recall  f1-score   support

      Medium       0.83      0.85      0.84       176
       Reach       0.84      0.80      0.82        81
        Safe       0.66      0.67      0.67        43

    accuracy                           0.81       300
   macro avg       0.78      0.77      0.78       300
weighted avg       0.81      0.81      0.81       300



Model B: NOISE INJECTION Accuracy: 0.7933
------------------------------
              precision    recall  f1-score   support

      Medium       0.80      0.87      0.83       176
       Reach       0.84      0.73      0.78        81
        Safe       0.68      0.60      0.64        43

    accuracy                           0.79       300
   macro avg       0.77      0.73      0.75       300
weighted avg       0.79      0.79      0.79       300



While Bootstrapping technically has a slightly higher overall accuracy ($0.81$ vs $0.806$), Noise Injection is the superior model for a real-world application.

Why did this happen? Bootstrapping works by duplicating exact rows. The model essentially "memorized" the specific GPA/GRE combinations of the few safe students in your training set. The Noise Injection model learned a "fuzzy" boundary around those scores, making it robust enough to reject students who were close but not quite good enough.

We will retrain the different models we used with the noise injection method.

## retrain XGBoost

In [8]:
import pandas as pd
import joblib
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

# ================================================================
# 1. Load the Balanced Data (Created via Noise Injection)
# ================================================================
df = pd.read_csv('train_balanced_final.csv')

print(f"Training on {len(df)} samples.")
print("Class Distribution:\n", df['Category'].value_counts())

# ================================================================
# 2. Preprocessing for XGBoost
# ================================================================
# XGBoost requires numerical classes (0, 1, 2)
# We define a specific mapping to ensure we know which is which
class_mapping = {'Reach': 0, 'Medium': 1, 'Safe': 2}
df['Encoded_Category'] = df['Category'].map(class_mapping)

# Define Features and Target
X = df.drop(['Chance of Admit', 'Category', 'Encoded_Category'], axis=1)
y = df['Encoded_Category']

# Optional: Split a tiny validation set just to check it works
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=42)

# ================================================================
# 3. Train the XGBoost Model
# ================================================================
# We use standard parameters. You can tune these, but defaults work well here.
model = XGBClassifier(
    n_estimators=200,     # Number of trees
    learning_rate=0.05,   # Step size (lower is better for generalization)
    max_depth=4,          # Depth of trees (prevent overfitting)
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss' # Multi-class log loss
)

print("\nTraining Model...")
model.fit(X_train, y_train)

# ================================================================
# 4. Validate (Sanity Check)
# ================================================================
val_preds = model.predict(X_val)
acc = accuracy_score(y_val, val_preds)
print(f"Validation Accuracy: {acc:.4f}")
print("\nValidation Report:")
# We map the numbers back to names for the report
target_names = ['Reach', 'Medium', 'Safe'] 
print(classification_report(y_val, val_preds, target_names=target_names))

# ================================================================
# 5. Save Final Model
# ================================================================
joblib.dump(model, "xgb_grad_admission_model_bootstrap.joblib")
print("\nSUCCESS: Model saved as 'xgb_grad_admission_mode_bootstrap.joblib'")

# Example of using the saved model
loaded_model = joblib.load("xgb_grad_admission_model_bootstrap.joblib")

# Prediction for a new student
prediction_index = loaded_model.predict(new_student_data)[0] # Returns 0, 1, or 2

# Translation
classes = {0: "Reach", 1: "Medium", 2: "Safe"}
print(f"Result: {classes[prediction_index]}")

ModuleNotFoundError: No module named 'xgboost'