# In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.preprocessing import RobustScaler
import warnings
warnings.filterwarnings('ignore')

banner = "=" * 70
print(banner)
print("TRY 13 - Current Best: 0.61408")
print(banner)

# Read the raw event logs for the labelled and unlabelled splits.
train = pd.read_parquet('/Users/houlitong/Desktop/x coursework/python/train.parquet')
test = pd.read_parquet('/Users/houlitong/Desktop/x coursework/python/test.parquet')

# Every user with at least one 'Cancellation Confirmation' event is a churn positive.
cancel_mask = train['page'] == 'Cancellation Confirmation'
churned = set(train.loc[cancel_mask, 'userId'].unique())

# Feature extraction
def fast_features(df, is_train=True):
    """Aggregate an event log into one feature row per user.

    Args:
        df: Event-level DataFrame. The columns read here are ``userId``,
            ``page``, ``sessionId``, ``time``, ``registration``, ``song``,
            ``artist``, ``gender`` and ``level``. The frame is NOT modified.
        is_train: When true, a binary ``churn`` column is added by checking
            each user id against the module-level ``churned`` set.

    Returns:
        A DataFrame with ``userId`` as a regular column followed by the raw
        per-user counts, the derived rate/ratio/log features and, for training
        data, the ``churn`` label.
    """
    # Maps each indicator column to the page name it flags.
    page_flags = {
        'is_nextsong': 'NextSong',
        'is_thumbsup': 'Thumbs Up',
        'is_thumbsdown': 'Thumbs Down',
        'is_error': 'Error',
        'is_downgrade': 'Downgrade',
        'is_playlist': 'Add to Playlist',
        'is_home': 'Home',
    }
    page = df['page']

    # Build the helper columns on a slim working frame so the caller's
    # DataFrame is left untouched.
    # NOTE(review): dividing the int64 view by 1e9 yields seconds only if the
    # timestamps have nanosecond resolution - confirm for the parquet source.
    events = df[['userId', 'page', 'sessionId', 'song', 'artist', 'gender', 'level']].assign(
        time_numeric=df['time'].astype(np.int64) / 1e9,
        reg_numeric=df['registration'].astype(np.int64) / 1e9,
        **{flag: (page == name).astype(int) for flag, name in page_flags.items()},
    )

    def most_common(values):
        # mode() ignores missing values, so it is empty for an all-missing group.
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else 'Unknown'

    # Named aggregation produces the final column names directly, avoiding a
    # rename step keyed on the auto-generated '<lambda>' label.
    feat = events.groupby('userId').agg(
        total_events=('page', 'count'),
        sessions=('sessionId', 'nunique'),
        time_min=('time_numeric', 'min'),
        time_max=('time_numeric', 'max'),
        reg_time=('reg_numeric', 'first'),
        songs=('is_nextsong', 'sum'),
        thumbs_up=('is_thumbsup', 'sum'),
        thumbs_down=('is_thumbsdown', 'sum'),
        errors=('is_error', 'sum'),
        downgrades=('is_downgrade', 'sum'),
        add_playlist=('is_playlist', 'sum'),
        home=('is_home', 'sum'),
        unique_songs=('song', 'nunique'),
        unique_artists=('artist', 'nunique'),
        gender=('gender', most_common),
        level=('level', most_common),
    )

    seconds_per_day = 86400
    # Floor the activity span at 0.1 day so the per-day rates stay finite for
    # users whose events share a single timestamp.
    feat['days_active'] = ((feat['time_max'] - feat['time_min']) / seconds_per_day).clip(lower=0.1)
    feat['account_age'] = (feat['time_max'] - feat['reg_time']) / seconds_per_day
    feat['events_per_day'] = feat['total_events'] / feat['days_active']
    feat['sessions_per_day'] = feat['sessions'] / feat['days_active']

    # The +1 in each denominator guards against division by zero.
    events_plus_one = feat['total_events'] + 1
    songs_plus_one = feat['songs'] + 1
    feat['events_per_session'] = feat['total_events'] / (feat['sessions'] + 1)
    feat['music_ratio'] = feat['songs'] / events_plus_one
    feat['thumbs_up_rate'] = feat['thumbs_up'] / events_plus_one
    feat['thumbs_down_rate'] = feat['thumbs_down'] / events_plus_one
    feat['error_rate'] = feat['errors'] / events_plus_one
    feat['playlist_rate'] = feat['add_playlist'] / songs_plus_one
    feat['home_rate'] = feat['home'] / events_plus_one
    feat['song_diversity'] = feat['unique_songs'] / songs_plus_one
    feat['artist_diversity'] = feat['unique_artists'] / songs_plus_one
    feat['thumbs_ratio'] = feat['thumbs_up'] / (feat['thumbs_down'] + 1)

    feat['log_events'] = np.log1p(feat['total_events'])
    feat['log_days'] = np.log1p(feat['days_active'])
    feat['log_sessions'] = np.log1p(feat['sessions'])
    feat['log_songs'] = np.log1p(feat['songs'])

    feat['engagement'] = feat['thumbs_up_rate'] - feat['thumbs_down_rate'] + feat['playlist_rate']
    feat['negative'] = feat['error_rate'] + feat['thumbs_down_rate']
    feat['has_downgrade'] = (feat['downgrades'] > 0).astype(int)

    if is_train:
        # `churned` is the module-level set of user ids that cancelled.
        feat['churn'] = feat.index.isin(churned).astype(int)

    return feat.reset_index()

# Build one feature row per user for each split.
train_f = fast_features(train, True)
test_f = fast_features(test, False)

# Outlier capping
# NOTE(review): this only clips the raw count columns of the test frame. None
# of these columns is in `features`, and the log_* features were already
# computed inside fast_features, so the capping does not reach the models.
for col in ['total_events', 'sessions', 'songs']:
    cap = train_f[col].quantile(0.98)
    test_f[col] = test_f[col].clip(upper=cap * 1.5)

# Features (22 total)
features = [
    'log_events', 'log_days', 'log_sessions', 'log_songs',
    'events_per_day', 'sessions_per_day', 'events_per_session',
    'music_ratio', 'thumbs_up_rate', 'thumbs_down_rate', 'error_rate',
    'playlist_rate', 'home_rate', 'song_diversity', 'artist_diversity',
    'thumbs_ratio', 'engagement', 'negative', 'has_downgrade',
    'thumbs_up', 'thumbs_down', 'downgrades'
]
categorical = ['gender', 'level']

X = train_f[features + categorical].copy()
y = train_f['churn']
X_test = test_f[features + categorical].copy()

# Pin both splits to the category levels seen in training. Encoding each split
# independently with drop_first=True can drop a different reference level in
# the test split when it lacks a category, silently misaligning the dummies.
for col in categorical:
    levels = sorted(X[col].dropna().unique())
    X[col] = pd.Categorical(X[col], categories=levels)
    X_test[col] = pd.Categorical(X_test[col], categories=levels)

X = pd.get_dummies(X, columns=categorical, drop_first=True).fillna(0)
X_test = pd.get_dummies(X_test, columns=categorical, drop_first=True).fillna(0)

# Safety net: force the test matrix to the exact training column layout.
X_test = X_test.reindex(columns=X.columns, fill_value=0)

# NOTE(review): the scaled matrices are computed but the models below are fit
# on the unscaled X / X_test. They are kept in case other cells use them.
scaler = RobustScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

# Models
gb = GradientBoostingClassifier(
    n_estimators=400,
    max_depth=7,
    learning_rate=0.03,
    subsample=0.85,
    min_samples_leaf=3,
    random_state=42
)

rf = RandomForestClassifier(
    n_estimators=250,
    max_depth=12,
    min_samples_leaf=6,
    class_weight='balanced',
    random_state=43,
    n_jobs=-1
)

gb.fit(X, y)
rf.fit(X, y)

# Weighted blend of the two models' positive-class probabilities.
proba = 0.65 * gb.predict_proba(X_test)[:, 1] + 0.35 * rf.predict_proba(X_test)[:, 1]

# Top 46%
# Size the positive set from the actual number of scored users rather than a
# hard-coded test-set size, so the fraction holds if the test data changes.
churn_fraction = 0.46
n_churn = int(len(proba) * churn_fraction)
predictions = np.zeros(len(proba), dtype=int)
if n_churn > 0:
    # Guarded because a slice of [-0:] would select every user.
    top_idx = np.argsort(proba)[-n_churn:]
    predictions[top_idx] = 1

print(f"Top 46%: {n_churn} users")

save_path = '/Users/houlitong/Desktop/x coursework/python/untitled folder 2/try——0.614.csv'
submission = pd.DataFrame({
    'id': test_f['userId'],
    'target': predictions
})
submission.to_csv(save_path, index=False)



# Output:
# TRY 13 - Current Best: 0.61408
# Top 46%: 1335 users
