### Step 1: Loading & Preparing Data

*We’re loading the saved CSVs for users, content, and training labels, then converting date strings into numeric timestamps for modeling.*


In [None]:
# Basic utilities
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Modeling
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from lightgbm import LGBMClassifier
import glob



In [None]:
# Mount Google Drive, unpack the dataset archive once, then work from the extracted folder.
import os
import zipfile

from google.colab import drive

drive.mount('/content/drive')

# Archive location, project root, and the folder that is checked for / entered after extraction.
zip_path = '/content/drive/MyDrive/test_data (1).zip'
base_path = '/content/drive/MyDrive/projects/news-recommendation-systems'
extract_path = os.path.join(base_path, 'test_data')

# Extraction is skipped whenever the target folder already exists.
if os.path.exists(extract_path):
    print(" Zip already extracted.")
else:
    os.makedirs(base_path, exist_ok=True)
    with zipfile.ZipFile(zip_path, 'r') as archive:
        archive.extractall(base_path)
        print(" Extracted zip to:", base_path)

# Relative paths used after this point resolve against the extracted folder.
os.chdir(extract_path)
print(" Working directory set to:", extract_path)
print(" Files found:", os.listdir())


Mounted at /content/drive
📦 Zip already extracted.
✅ Working directory set to: /content/drive/MyDrive/projects/inshorts-data-scientist-task/test_data
📂 Files found: ['.DS_Store', 'training_content', 'testing_content', 'event', 'devices']


In [None]:
# Display the entries of the current working directory as the cell output.
os.listdir()


['.DS_Store', 'training_content', 'testing_content', 'event', 'devices']

In [None]:
# Point the session at the dataset folder on Google Drive.
import os

from google.colab import drive

project_path = '/content/drive/MyDrive/test_data'

drive.mount('/content/drive')
os.chdir(project_path)  # relative file names below resolve against project_path

print(" Google Drive mounted and path set.")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Google Drive mounted and path set.


In [None]:
#  Loading user & content features
user_df = pd.read_csv('user_features.csv')
content_df = pd.read_csv('content_features.csv')

#  Loading training interactions from event/ folder (malformed rows are skipped)
event_files = glob.glob('event/part-*.csv')
train_df = pd.concat([pd.read_csv(f, on_bad_lines='skip') for f in event_files], ignore_index=True)

#  Loading test candidates from testing_content (malformed rows are skipped)
test_files = glob.glob('testing_content/part-*.csv')
test_candidates = pd.concat([pd.read_csv(f, on_bad_lines='skip') for f in test_files], ignore_index=True)

#  Converting datetime strings to epoch seconds.
# Casting a datetime column that contains NaT directly to int64 turns every
# missing value into the int64 minimum, which shows up as -9223372037 after
# the // 10**9. Subtracting the epoch and floor-dividing by one second keeps
# unparseable / missing values as NaN instead.
# NOTE(review): only train_df and test_candidates are converted here; confirm
# whether the first_seen / last_seen columns of user_df were meant to be
# converted as well.
epoch_start = pd.Timestamp('1970-01-01', tz='UTC')
one_second = pd.Timedelta(seconds=1)
for col in ['first_seen', 'last_seen']:
    for df in [train_df, test_candidates]:
        if col in df.columns:
            parsed = pd.to_datetime(df[col], errors='coerce', utc=True)
            df[col] = (parsed - epoch_start) // one_second

#  Standardizing column names for merging
train_df = train_df.rename(columns={'deviceid': 'deviceId', 'hashid': 'hashId'})
test_candidates = test_candidates.rename(columns={'deviceid': 'deviceId', 'hashid': 'hashId'})
user_df = user_df.rename(columns={'deviceid': 'deviceId'})
content_df = content_df.rename(columns={'hashid': 'hashId'})

#  Filtering positive events only
event_types_we_want = ['CONTENT_CLICK']
available_event_types = train_df['event_type'].dropna().unique().tolist()
train_df = train_df[train_df['event_type'].isin(event_types_we_want)].copy()
train_df['label'] = 1

# An event-type filter that matches nothing would otherwise pass silently and
# leave an empty training frame, so report it together with the values that exist.
if train_df.empty:
    print(f"WARNING: no events matched {event_types_we_want}; event types present: {available_event_types}")

#  Sanity check
print(f"Users: {user_df.shape}, Content: {content_df.shape}, Training Events: {train_df.shape}, Test: {test_candidates.shape}")

Users: (8977, 7), Content: (14622, 4), Training Events: (0, 14), Test: (970, 12)


In [None]:
import glob

import pandas as pd


def _read_part_files(paths):
    """Read every CSV in ``paths`` (skipping malformed rows) and stack them into one frame."""
    frames = []
    for path in paths:
        frames.append(pd.read_csv(path, on_bad_lines='skip'))
    return pd.concat(frames, ignore_index=True)


# Engineered per-user and per-content feature tables.
user_df = pd.read_csv('user_features.csv')
content_df = pd.read_csv('content_features.csv')

# Part files for the training and testing content folders.
train_files = glob.glob('training_content/part-*.csv')
test_files = glob.glob('testing_content/part-*.csv')

train_df = _read_part_files(train_files)
test_candidates = _read_part_files(test_files)

# Raw event logs.
event_files = glob.glob('event/part-*.csv')
event_df = _read_part_files(event_files)

# Shape summary of the main frames.
print("DataFrames loaded:")
for section_name, loaded_frame in (
    ("Users", user_df),
    ("Content", content_df),
    ("Training", train_df),
    ("Testing", test_candidates),
):
    print(f" {section_name}: {loaded_frame.shape}")


DataFrames loaded:
 Users: (8977, 7)
 Content: (14622, 4)
 Training: (8170, 12)
 Testing: (970, 12)


In [None]:
# Show the column names of the three frames that get joined later.
for frame_name, inspected_frame in (
    ("train_df", train_df),
    ("user_df", user_df),
    ("event_df", event_df),
):
    print(f"{frame_name} columns:", inspected_frame.columns.tolist())


train_df columns: ['hashid', 'title', 'content', 'newsType', 'author', 'categories', 'hashtags', 'newsDistrict', 'createdAt', 'updatedAt', 'newsLanguage', 'sourceName']
user_df columns: ['deviceId', 'event_count', 'total_time_spent', 'unique_content_count', 'first_seen', 'last_seen', 'active_days']
event_df columns: ['deviceId', 'event_type', 'eventTimestamp', 'hashId', 'categoryWhenEventHappened', 'cardViewPosition', 'overallTimeSpent', 'searchTerm', 'relevancy_color', 'relevancy_topic', 'state', 'locality', 'district']


In [None]:
import pandas as pd
import numpy as np
import glob
import os
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from lightgbm import LGBMClassifier
import warnings
warnings.filterwarnings('ignore')

### Step 2: Feature Matrix Construction

*We’re merging user-level and content-level features into a single DataFrame, then splitting into X (features) and y (labels).*

In [None]:
# Print the column names of train_df as a plain Python list.
print(train_df.columns.tolist())


['hashid', 'title', 'content', 'newsType', 'author', 'categories', 'hashtags', 'newsDistrict', 'createdAt', 'updatedAt', 'newsLanguage', 'sourceName']


In [None]:
# Give the join keys one spelling (hashId / deviceId) in every frame.
# rename() ignores mapping entries whose source column is absent.
hash_key_map = {'hashid': 'hashId'}
device_key_map = {'deviceid': 'deviceId'}

train_df = train_df.rename(columns={**hash_key_map, **device_key_map})
content_df = content_df.rename(columns=hash_key_map)
user_df = user_df.rename(columns=device_key_map)


In [None]:
# Make sure both frames spell the content key as hashId.
hash_key_map = {'hashid': 'hashId'}
train_df = train_df.rename(columns=hash_key_map)
content_df = content_df.rename(columns=hash_key_map)

# Left-join content features onto the training rows and attach a placeholder
# label of 1 on every row (temporary; replace once real labels are available).
train = pd.merge(train_df, content_df, on='hashId', how='left').assign(label=1)

# Target vector and numeric-only feature matrix (no deviceId in this frame).
y = train['label']
X = (
    train
    .drop(columns=['hashId', 'label'])
    .select_dtypes(include=['int64', 'float64'])
)

# Persist the merged frame for later cells.
train.to_csv('train.csv', index=False)

label_share = y.value_counts(normalize=True)
print(f" X: {X.shape}, y distribution: \n{label_share}")


✅ X: (8170, 3), y distribution: 
label
1    1.0
Name: proportion, dtype: float64


In [None]:
import pandas as pd

#  Harmonizing key names (rename is a no-op when the column is already harmonized)
train_df = train_df.rename(columns={'hashid': 'hashId'})
content_df = content_df.rename(columns={'hashid': 'hashId'})
event_df = event_df.rename(columns={'hashid': 'hashId', 'deviceid': 'deviceId'})
user_df = user_df.rename(columns={'deviceid': 'deviceId'})

#  Attaching deviceId to the content rows via the (deviceId, hashId) pairs seen in events.
# The result goes into a NEW name instead of overwriting train_df: overwriting made this
# cell non-idempotent, because a second run would merge deviceId in again, produce
# deviceId_x / deviceId_y, and break the user-feature merge below.
# This is a one-to-many join: each content row is repeated once per device that has an
# event for it.
event_meta = event_df[['deviceId', 'hashId']].drop_duplicates()
if 'deviceId' in train_df.columns:
    train_with_device = train_df
else:
    train_with_device = train_df.merge(event_meta, on='hashId', how='left')

#  Merging content features
train = train_with_device.merge(content_df, on='hashId', how='left')

#  Merging user features (needs the deviceId attached above)
train = train.merge(user_df, on='deviceId', how='left')

#  Adding dummy label (if not already there)
if 'label' not in train.columns:
    train['label'] = 1

#  Feature matrix construction: drop identifiers, free text, and the target; keep numeric columns
drop_cols = ['deviceId', 'hashId', 'title', 'content', 'author', 'categories',
             'hashtags', 'newsDistrict', 'createdAt', 'updatedAt',
             'newsLanguage', 'sourceName']

X = train.drop(columns=drop_cols + ['label'], errors='ignore')
X = X.select_dtypes(include=['int64', 'float64'])
y = train['label']

#  Saving for reproducibility
train.to_csv('train.csv', index=False)

#  Output
print(f" X: {X.shape}, y distribution:\n{y.value_counts(normalize=True)}")


✅ X: (1420453, 7), y distribution:
label
1    1.0
Name: proportion, dtype: float64


In [None]:
# Report the training frame's shape and the column names available for joining.
merge_diagnostics = (
    ("Train DataFrame shape before feature merge:", train_df.shape),
    ("User features keys:", user_df.columns.tolist()),
    ("Content features keys:", content_df.columns.tolist()),
    ("Train DataFrame sample keys:", train_df.columns.tolist()),
)
for diagnostic_text, diagnostic_value in merge_diagnostics:
    print(diagnostic_text, diagnostic_value)


Train DataFrame shape before feature merge: (8170, 12)
User features keys: ['deviceId', 'event_count', 'total_time_spent', 'unique_content_count', 'first_seen', 'last_seen', 'active_days']
Content features keys: ['hashId', 'event_count', 'unique_viewers', 'avg_time_spent']
Train DataFrame sample keys: ['hashId', 'title', 'content', 'newsType', 'author', 'categories', 'hashtags', 'newsDistrict', 'createdAt', 'updatedAt', 'newsLanguage', 'sourceName']


### Step 3: Model Training & Validation

*We’re splitting the data, training a LightGBM classifier with early stopping, and evaluating via AUC.*

In [None]:
# Print the shapes of the positive / negative sample frames and of train_df, then preview train_df.
# NOTE(review): positive_df and negative_df are first assigned by cells that appear further
# down in this notebook, so this cell raises NameError on a fresh kernel run top to bottom.
print("positive_df shape:", positive_df.shape)
print("negative_df shape:", negative_df.shape)
print("train_df shape:", train_df.shape)
print(train_df.head())


positive_df shape: (0, 3)
negative_df shape: (0, 3)
train_df shape: (0, 3)
Empty DataFrame
Columns: [deviceId, hashId, label]
Index: []


In [None]:
#  Step 3: Rebuilding training set from event logs
import numpy as np

# Step 1: Getting all positive samples — clicks only
# NOTE(review): confirm that 'click' is an event_type that actually occurs in event_df;
# if it does not, this selects zero rows and the guard below reports it.
positive_df = event_df.loc[event_df['event_type'] == 'click', ['deviceId', 'hashId']].drop_duplicates()
positive_df['label'] = 1

if positive_df.empty:
    present_event_types = event_df['event_type'].dropna().unique().tolist()
    print(f"WARNING: no 'click' events found; event types present: {present_event_types}")

# Step 2: Creating negative samples (user-content pairs not clicked)
all_users = positive_df['deviceId'].unique()
all_contents = event_df['hashId'].dropna().unique()

# The candidate pool is sorted so its order is fixed. Building it from a bare set gave an
# order that can change between interpreter sessions, which made the seeded sampling
# below non-reproducible.
content_pool = sorted(set(all_contents), key=str)

# One pass over the positives instead of a full boolean scan of positive_df per user.
clicked_by_user = {
    user: set(hashes)
    for user, hashes in positive_df.groupby('deviceId')['hashId']
}

negatives = []
np.random.seed(42)

for user in all_users:
    clicked = clicked_by_user.get(user, set())
    available = [content for content in content_pool if content not in clicked]

    # Skipping if no options to sample from
    if len(available) == 0:
        continue

    # Sample up to 3 negative examples per user, without repeats
    sample_size = min(3, len(available))
    sampled = np.random.choice(available, size=sample_size, replace=False)

    for c in sampled:
        negatives.append([user, c, 0])

negative_df = pd.DataFrame(negatives, columns=['deviceId', 'hashId', 'label'])

# Step 3: Combining and shuffling
train_df = pd.concat([positive_df, negative_df], ignore_index=True)
train_df = train_df.drop_duplicates().sample(frac=1, random_state=42).reset_index(drop=True)

# Checking class balance
print(" Label distribution AFTER rebuild:")
print(train_df['label'].value_counts(normalize=True))
print(train_df.head())


✅ Label distribution AFTER rebuild:
Series([], Name: proportion, dtype: float64)
Empty DataFrame
Columns: [deviceId, hashId, label]
Index: []


In [None]:
# Count rows per event_type; dropna=False also counts missing event types as their own bucket.
print(event_df['event_type'].value_counts(dropna=False))


event_type
TimeSpent-Front              3480131
TimeSpent-Back                 44933
News Bookmarked                10870
News Shared                     3517
News Unbookmarked               2275
Relevancy Option Selected       1312
Search                          1123
Name: count, dtype: int64


In [None]:
import numpy as np
import pandas as pd

pair_columns = ['deviceId', 'hashId']

# Positives: unique (device, article) pairs that have a 'News Bookmarked' event.
is_bookmark = event_df['event_type'] == 'News Bookmarked'
positive_df = event_df.loc[is_bookmark, pair_columns].drop_duplicates().assign(label=1)

# Negatives: as many random (device, article) draws as there are positives,
# taken from the user and content feature tables.
all_users = user_df['deviceId'].unique()
all_content = content_df['hashId'].unique()
num_negatives = len(positive_df)

np.random.seed(42)
neg_users = np.random.choice(all_users, size=num_negatives)
neg_content = np.random.choice(all_content, size=num_negatives)

# Anti-join: keep only the random pairs that do not appear among the positives.
negative_df = (
    pd.DataFrame({'deviceId': neg_users, 'hashId': neg_content})
    .merge(positive_df[pair_columns], on=pair_columns, how='left', indicator=True)
    .loc[lambda pairs: pairs['_merge'] == 'left_only']
    .drop(columns=['_merge'])
    .assign(label=0)
)

# Stack both classes and shuffle with a fixed seed.
train_df = (
    pd.concat([positive_df, negative_df], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Class balance and a preview.
print("✅ Label distribution AFTER rebuild:")
print(train_df['label'].value_counts(normalize=True))
print(train_df.head())


✅ Label distribution AFTER rebuild:
label
1    0.500049
0    0.499951
Name: proportion, dtype: float64
                               deviceId      hashId  label
0  7cf72e69-0e42-47b7-9085-b3f46ca8be85  k7epp86p-1      0
1  81957c0d-44a4-45e8-b0e7-438f9aa1b9fc  d6ghkpsd-1      0
2  12b6f877-193b-456f-af55-994a98ab5123  hr1rbbai-1      0
3  049b40db-bfda-4f61-9118-708e831e9823  ysu3593v-1      1
4  7e5c2cb6-4761-48f4-b258-9f30c98ead45  zc2frrhg-1      0


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
from lightgbm import LGBMClassifier

#  Final merge to enrich the (deviceId, hashId, label) rows with user and content features
train = train_df.merge(user_df, on='deviceId', how='left')
train = train.merge(content_df, on='hashId', how='left')

#  Feature prep
drop_cols = ['deviceId', 'hashId', 'title', 'content', 'author', 'categories', 'hashtags',
             'newsDistrict', 'createdAt', 'updatedAt', 'newsLanguage', 'sourceName']
y = train['label']

# The target must be removed from the feature matrix. drop_cols does not contain 'label',
# so dropping only drop_cols left the label in X and let the model read the answer
# directly (a perfect validation AUC is the symptom of that leak).
X = train.drop(columns=drop_cols + ['label'], errors='ignore')
X = X.select_dtypes(include=['int64', 'float64'])
assert 'label' not in X.columns, "target column leaked into the feature matrix"

#  Sanity check
print("Label distribution:\n", y.value_counts(normalize=True))

#  Train-validation split (stratified so both splits keep the class ratio)
X_train, X_val, y_train, y_val = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

#  Model training; stops once the validation metric has not improved for 20 rounds
model = LGBMClassifier(n_estimators=200, random_state=42)
model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[lgb.early_stopping(20)]
)

#  Evaluation on the held-out split
val_preds = model.predict_proba(X_val)[:, 1]
auc_score = roc_auc_score(y_val, val_preds)
print(f"✅ Final Validation AUC: {auc_score:.4f}")


Label distribution:
 label
1    0.500049
0    0.499951
Name: proportion, dtype: float64
[LightGBM] [Info] Number of positive: 8109, number of negative: 8107
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000723 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1561
[LightGBM] [Info] Number of data points in the train set: 16216, number of used features: 8
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500062 -> initscore=0.000247
[LightGBM] [Info] Start training from score 0.000247
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[152]	valid_0's binary_logloss: 1.20822e-07
✅ Final Validation AUC: 1.0000


### Step 4: Prediction & Submission

*We’re loading (or regenerating) test candidates, merging features, scoring with our trained model, and exporting the top-50 predictions.*


In [None]:
# Harmonize the join-key spelling in test_candidates; rename ignores keys that are absent.
test_candidates = test_candidates.rename(columns={'deviceid': 'deviceId', 'hashid': 'hashId'})


In [None]:
# Show the column names and the first rows of test_candidates.
print(" Columns in test_candidates:\n", test_candidates.columns.tolist())
print(" First few rows:\n", test_candidates.head())


📂 Columns in test_candidates:
 ['hashId', 'title', 'content', 'newsType', 'author', 'categories', 'hashtags', 'newsDistrict', 'createdAt', 'updatedAt', 'newsLanguage', 'sourceName']
🔢 First few rows:
        hashId                                              title  \
0  zdw0jrig-1  Redmi 12 5G will be a game-changer for 5G conn...   
1  y5pfnbmp-1  Limited seats left for Hero Vired & MIT’s Prog...   
3  fknyydal-1  Which 14 teams have qualified for 20-team T20 ...   
4  61ogen4w-1  42-year-old woman shot dead near her house in ...   

                                             content newsType  \
0  Xiaomi will debut Redmi 12 5G alongside Redmi ...     NEWS   
1  Hero Group's EdTech company Hero Vired & MIT l...     NEWS   
2  IMD has issued heavy to very heavy rainfall wa...     NEWS   
3  Ireland and Scotland have qualified for the 20...     NEWS   
4  A 42-year-old woman was shot dead near her hou...     NEWS   

                     author  categories hashtags newsDistrict  \
0 

In [None]:
import os

import pandas as pd
from google.colab import drive

# Mount Drive and switch to the dataset folder so the relative path below resolves.
drive.mount('/content/drive')

project_path = '/content/drive/MyDrive/test_data'
os.chdir(project_path)

# Device table: a single part file, relative to project_path.
device_path = 'devices/part-00000-cdb2cdd7-9d14-4000-b947-4d0475444217-c000.csv'

# Load it and harmonize the id column name in one step.
devices = pd.read_csv(device_path).rename(columns={'deviceid': 'deviceId'})

# Quick look at what was loaded.
print("✅ Devices loaded:", devices.shape)
print(devices[['deviceId']].head())


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ Devices loaded: (10400, 11)
                               deviceId
0  197b123e-eb9e-4fc1-a32d-aa86aaea425e
1  3c33c537-7c6c-40f5-835c-f997e883cae2
2  6c7be5d0-d4d8-469f-91be-8055021ceef9
3  0801af66-0a6f-4fdd-82a9-c2b15757b8f5
4  78b3c7a7-5881-42dc-9f8e-b4fc27f94360


In [None]:
#  STEP 4: Predicting and Generate Submission

# Step 1: Preparing test candidates → every unique device paired with every content row.
# Device ids are de-duplicated first so a repeated device cannot produce repeated pairs
# in its top-50 list.
# NOTE(review): this materializes the full devices × content_df cross product in memory;
# with the frame shapes printed elsewhere in this notebook that is on the order of
# 10^8 rows, so consider scoring in user batches instead.
test_candidates = devices[['deviceId']].drop_duplicates().merge(
    content_df[['hashId']], how='cross'
)

# Step 2: Merging user & content features into test set
X_test = test_candidates.merge(user_df, on='deviceId', how='left')
X_test = X_test.merge(content_df, on='hashId', how='left')

# Step 3: Converting datetime columns (if they exist) to epoch seconds.
# Users without a row in user_df get NaN here after the left merge. Casting the parsed
# column straight to int64 would turn those NaT values into the int64 minimum
# (-9223372037 after // 10**9); subtracting the epoch keeps them as NaN.
epoch_start = pd.Timestamp('1970-01-01', tz='UTC')
one_second = pd.Timedelta(seconds=1)
for col in ['first_seen', 'last_seen']:
    if col in X_test.columns:
        parsed = pd.to_datetime(X_test[col], errors='coerce', utc=True)
        X_test[col] = (parsed - epoch_start) // one_second

# Step 4: Aligning columns with training data
X_test_model = X_test[X.columns]  # X is the training feature set from Step 3

# Step 5: Predicting probabilities (column 1 = probability of the positive class)
X_test['label'] = model.predict_proba(X_test_model)[:, 1]

# Step 6: Sorting and select top 50 content per user
top_50 = (
    X_test.sort_values(['deviceId', 'label'], ascending=[True, False])
          .groupby('deviceId')
          .head(50)
          .reset_index(drop=True)
)

# Step 7: Exporting final submission
top_50[['deviceId', 'hashId', 'label']].to_csv('submission.csv', index=False)
print(" Submission file 'submission.csv' created with shape:", top_50.shape)
top_50[['deviceId', 'hashId', 'label']].head()


In [None]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from lightgbm import LGBMClassifier

# Read the three tables from Drive and harmonize their key column names on load.
user_df = (
    pd.read_csv('/content/drive/MyDrive/test_data/user_features.csv')
    .rename(columns={'deviceid': 'deviceId'})
)
content_df = (
    pd.read_csv('/content/drive/MyDrive/test_data/content_features.csv')
    .rename(columns={'hashid': 'hashId'})
)
devices = (
    pd.read_csv('/content/drive/MyDrive/test_data/devices/part-00000-cdb2cdd7-9d14-4000-b947-4d0475444217-c000.csv')
    .rename(columns={'deviceid': 'deviceId'})
)

# One row per distinct device id: the users to score.
test_users = devices.loc[:, ['deviceId']].drop_duplicates()


In [None]:
# Identifier and free-text columns that are never used as model features.
drop_cols = ['deviceId', 'hashId', 'title', 'content', 'author', 'categories', 'hashtags',
             'newsDistrict', 'createdAt', 'updatedAt', 'newsLanguage', 'sourceName']
train = pd.read_csv('/content/drive/MyDrive/test_data/train.csv')  # If you saved it before

# Rebuild the feature matrix from the saved frame. 'label' is the target, so it is dropped
# together with drop_cols; leaving it in would put the target into X and into any column
# list later derived from X.columns. errors='ignore' tolerates columns that are absent.
X = train.drop(columns=drop_cols + ['label'], errors='ignore')
X = X.select_dtypes(include=['int64', 'float64'])


  train = pd.read_csv('/content/drive/MyDrive/test_data/train.csv')  # If you saved it before


In [None]:
import pandas as pd
import lightgbm as lgb
from lightgbm import LGBMClassifier
import joblib  # if you had saved model


In [None]:
import glob

import pandas as pd

# Every event part file on Drive.
event_files = glob.glob('/content/drive/MyDrive/test_data/event/part-*.csv')
print(f"📦 Found {len(event_files)} event files.")

# Only these three columns are read, which keeps memory use down.
use_columns = ['deviceId', 'hashId', 'event_type']

# Read each part file (malformed rows are skipped) and stack the parts into one frame.
event_parts = []
for event_path in event_files:
    event_parts.append(
        pd.read_csv(event_path, usecols=use_columns, on_bad_lines='skip', low_memory=False)
    )
event_df = pd.concat(event_parts, ignore_index=True)

# Shape, event-type histogram, and a preview.
print("✅ Loaded event_df shape:", event_df.shape)
print("🧾 Event types distribution:\n", event_df['event_type'].value_counts(dropna=False))
print("🔎 Sample:\n", event_df.head())


📦 Found 4 event files.
✅ Loaded event_df shape: (3544161, 3)
🧾 Event types distribution:
 event_type
TimeSpent-Front              3480131
TimeSpent-Back                 44933
News Bookmarked                10870
News Shared                     3517
News Unbookmarked               2275
Relevancy Option Selected       1312
Search                          1123
Name: count, dtype: int64
🔎 Sample:
                                deviceId                 event_type      hashId
0  1c53a149-303d-486e-ac62-0b9c9e469cda                     Search     Unknown
1  68738cd7-ae73-49c7-90d0-9829516f434e                     Search     Unknown
2  afe21f00-2d68-4a73-9bc6-f19be7f2226a  Relevancy Option Selected     Unknown
3  42b52a10-0a8e-4d25-a886-a5f7323ea8c0                     Search     Unknown
4  fb242092-49e6-45dd-b764-1f1bf34a2e7d            TimeSpent-Front  07ov7cef-1


In [None]:
import numpy as np

pair_columns = ['deviceId', 'hashId']

# Positives: unique (device, article) pairs that have a 'TimeSpent-Front' event.
front_events = event_df[event_df['event_type'] == 'TimeSpent-Front']
positive_df = front_events[pair_columns].drop_duplicates().assign(label=1)

print(" Positive samples:", positive_df.shape)

# Negatives: random (device, article) draws, one per positive, taken from the ids
# that occur in the event log (missing ids excluded).
unique_users = event_df['deviceId'].dropna().unique()
unique_contents = event_df['hashId'].dropna().unique()

num_negatives = len(positive_df)
np.random.seed(42)

neg_users = np.random.choice(unique_users, size=num_negatives, replace=True)
neg_contents = np.random.choice(unique_contents, size=num_negatives, replace=True)

# Anti-join against the positives so no sampled pair is also a positive pair.
sampled_pairs = pd.DataFrame({'deviceId': neg_users, 'hashId': neg_contents})
flagged_pairs = sampled_pairs.merge(
    positive_df[pair_columns], on=pair_columns, how='left', indicator=True
)
negative_df = (
    flagged_pairs
    .loc[flagged_pairs['_merge'] == 'left_only']
    .drop(columns=['_merge'])
    .assign(label=0)
)

print(" Negative samples:", negative_df.shape)

# Stack both classes, drop exact duplicate rows, and shuffle with a fixed seed.
train_df = (
    pd.concat([positive_df, negative_df], ignore_index=True)
    .drop_duplicates()
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Shape, class balance, and a preview.
print(" Final train_df shape:", train_df.shape)
print(train_df['label'].value_counts(normalize=True))
print(train_df.head())


✅ Positive samples: (2765385, 3)
✅ Negative samples: (2707319, 3)
✅ Final train_df shape: (5444510, 3)
label
1    0.507922
0    0.492078
Name: proportion, dtype: float64
                               deviceId      hashId  label
0  b4bebd7f-0729-4e0e-a88b-b51853d0b543  hhwllgvi-1      0
1  c0cec7eb-b4be-4e9b-9524-a2f40d4c4275  jlfeeiqj-1      1
2  1b51faf2-4b7a-4d77-9125-6ce066cd2d70  eelh3eej-1      0
3  d733445b-3a0e-4c67-b84c-d305605cd62c  tn2tzgfw-1      0
4  4f10065e-faa1-49f1-b8fc-93a6e186fb26  zn5jq7m9-1      0


In [None]:
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import lightgbm as lgb
from lightgbm import LGBMClassifier

# Feature tables, with key columns harmonized on load.
user_df = (
    pd.read_csv('/content/drive/MyDrive/test_data/user_features.csv')
    .rename(columns={'deviceid': 'deviceId'})
)
content_df = (
    pd.read_csv('/content/drive/MyDrive/test_data/content_features.csv')
    .rename(columns={'hashid': 'hashId'})
)

# Enrich every (deviceId, hashId, label) row with its user and content features.
train = (
    train_df
    .merge(user_df, on='deviceId', how='left')
    .merge(content_df, on='hashId', how='left')
)

# Target, and a numeric-only feature matrix without the ids and the target.
y = train['label']
X = (
    train
    .drop(columns=['deviceId', 'hashId', 'label'], errors='ignore')
    .select_dtypes(include=['int64', 'float64'])
)

# Stratified 80/20 split.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, stratify=y, test_size=0.2, random_state=42
)

# LightGBM with early stopping after 20 rounds without validation improvement.
model = LGBMClassifier(n_estimators=200, random_state=42)
model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[lgb.early_stopping(20)],
)

# Validation AUC from the positive-class probabilities.
val_preds = model.predict_proba(X_val)[:, 1]
auc_score = roc_auc_score(y_val, val_preds)

print(f" Final Validation AUC: {auc_score:.4f}")


[LightGBM] [Info] Number of positive: 2212308, number of negative: 2143300
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.511745 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1558
[LightGBM] [Info] Number of data points in the train set: 4355608, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.507922 -> initscore=0.031690
[LightGBM] [Info] Start training from score 0.031690
Training until validation scores don't improve for 20 rounds
Early stopping, best iteration is:
[114]	valid_0's binary_logloss: 0.236029
✅ Final Validation AUC: 0.9653


In [None]:
# Freeze the training feature names, in training order, for use at inference time.
feature_cols = X.columns.tolist()


In [None]:
#  RAM-safe dry run with just 50 users
test_users_sample = test_users.iloc[:50].copy()

#  Using test_candidates instead of undefined `test_articles`
content_ids = test_candidates[['hashId']].drop_duplicates()

#  Creating all combinations of users × content
test_batch = test_users_sample.merge(content_ids, how='cross')

#  Merging features
test_batch = test_batch.merge(user_df, on='deviceId', how='left')
test_batch = test_batch.merge(content_df, on='hashId', how='left')

#  Converting datetime columns if present to epoch seconds.
# Users without a row in user_df carry NaN after the left merge. Casting the parsed
# column straight to int64 turned those NaT values into the int64 minimum, which
# appeared as -9223372037 in the preview; subtracting the epoch keeps them as NaN.
epoch_start = pd.Timestamp('1970-01-01', tz='UTC')
one_second = pd.Timedelta(seconds=1)
for col in ['first_seen', 'last_seen']:
    if col in test_batch.columns:
        parsed = pd.to_datetime(test_batch[col], errors='coerce', utc=True)
        test_batch[col] = (parsed - epoch_start) // one_second

#  Matching feature columns (feature_cols must already hold the training feature names);
#  missing feature values are filled with 0 before scoring
test_batch_X = test_batch[feature_cols].select_dtypes(include=['int64', 'float64']).fillna(0)

#  Predicting (column 1 = probability of the positive class)
test_batch['label'] = model.predict_proba(test_batch_X)[:, 1]

#  Selecting top 50 per user
top_sample = (
    test_batch.sort_values(['deviceId', 'label'], ascending=[True, False])
              .groupby('deviceId')
              .head(50)
              .reset_index(drop=True)
)

print(" Dry run passed. Sample shape:", top_sample.shape)
top_sample.head()


✅ Dry run passed. Sample shape: (2500, 12)


Unnamed: 0,deviceId,hashId,event_count_x,total_time_spent,unique_content_count,first_seen,last_seen,active_days,event_count_y,unique_viewers,avg_time_spent,label
0,07d18cd7-e320-4f68-9198-4bc78f177e7d,zdw0jrig-1,,,,-9223372037,-9223372037,,,,,0.000126
1,07d18cd7-e320-4f68-9198-4bc78f177e7d,y5pfnbmp-1,,,,-9223372037,-9223372037,,,,,0.000126
2,07d18cd7-e320-4f68-9198-4bc78f177e7d,eo2eyhgk-1,,,,-9223372037,-9223372037,,,,,0.000126
3,07d18cd7-e320-4f68-9198-4bc78f177e7d,fknyydal-1,,,,-9223372037,-9223372037,,,,,0.000126
4,07d18cd7-e320-4f68-9198-4bc78f177e7d,61ogen4w-1,,,,-9223372037,-9223372037,,,,,0.000126


In [None]:
import numpy as np
import pandas as pd

# Test articles: one part file, with the key column harmonized on load.
test_articles_path = '/content/drive/MyDrive/test_data/testing_content/part-00000-8be13c58-b74d-4e30-8877-c8b5e168035a-c000.csv'
test_articles = (
    pd.read_csv(test_articles_path, on_bad_lines='skip')
    .rename(columns={'hashid': 'hashId'})
)

# First 50 test users only, so the cartesian product below stays small.
test_users_sample = test_users.iloc[:50].copy()

# users × articles via a constant join key, then user and content features attached.
test_candidates = (
    pd.merge(
        test_users_sample.assign(key=1),
        test_articles[['hashId']].assign(key=1),
        on='key',
    )
    .drop(columns='key')
    .merge(user_df, on='deviceId', how='left')
    .merge(content_df, on='hashId', how='left')
)

# Score with the training feature columns that are present in the candidate frame.
shared_feature_columns = [col for col in X.columns if col in test_candidates.columns]
X_test_model = test_candidates[shared_feature_columns]
test_candidates['label'] = model.predict_proba(X_test_model)[:, 1]

# Highest-scoring 50 articles for each user.
ranked_candidates = test_candidates.sort_values(['deviceId', 'label'], ascending=[True, False])
top_50_sample = ranked_candidates.groupby('deviceId').head(50).reset_index(drop=True)

# Write the sample submission.
top_50_sample[['deviceId', 'hashId', 'label']].to_csv('sample_submission.csv', index=False)

# Preview
print(" Dry run passed. Sample shape:", top_50_sample.shape)
top_50_sample.head()


✅ Dry run passed. Sample shape: (2500, 12)


Unnamed: 0,deviceId,hashId,event_count_x,total_time_spent,unique_content_count,first_seen,last_seen,active_days,event_count_y,unique_viewers,avg_time_spent,label
0,07d18cd7-e320-4f68-9198-4bc78f177e7d,zdw0jrig-1,,,,,,,,,,0.000126
1,07d18cd7-e320-4f68-9198-4bc78f177e7d,y5pfnbmp-1,,,,,,,,,,0.000126
2,07d18cd7-e320-4f68-9198-4bc78f177e7d,eo2eyhgk-1,,,,,,,,,,0.000126
3,07d18cd7-e320-4f68-9198-4bc78f177e7d,fknyydal-1,,,,,,,,,,0.000126
4,07d18cd7-e320-4f68-9198-4bc78f177e7d,61ogen4w-1,,,,,,,,,,0.000126


In [None]:
import gc

import numpy as np
import pandas as pd

# Test articles: one part file, with the key column harmonized on load.
test_articles_path = '/content/drive/MyDrive/test_data/testing_content/part-00000-8be13c58-b74d-4e30-8877-c8b5e168035a-c000.csv'
test_articles = (
    pd.read_csv(test_articles_path, on_bad_lines='skip')
    .rename(columns={'hashid': 'hashId'})
)

# Distinct users to score and distinct articles to rank.
test_users_unique = test_users[['deviceId']].drop_duplicates().reset_index(drop=True)
print(f" Total test users: {len(test_users_unique)}")

content_ids = test_articles[['hashId']].drop_duplicates()

# Per-batch top-50 frames are collected here and concatenated at the end.
final_submissions = []

# Users are scored batch_size at a time so the user × article product stays bounded.
batch_size = 250
for start in range(0, len(test_users_unique), batch_size):
    end = start + batch_size
    batch_users = test_users_unique.iloc[start:end].copy()

    print(f" Processing users {start} to {end}...")

    # users × articles through a constant join key
    batch_candidates = pd.merge(
        batch_users.assign(key=1), content_ids.assign(key=1), on='key'
    ).drop(columns='key')

    # attach user and content features
    batch = (
        batch_candidates
        .merge(user_df, on='deviceId', how='left')
        .merge(content_df, on='hashId', how='left')
    )

    # score with the training feature columns present in this batch
    batch_X = batch.loc[:, [col for col in X.columns if col in batch.columns]]
    batch['label'] = model.predict_proba(batch_X)[:, 1]

    # keep each user's 50 highest-scoring articles
    top_50 = (
        batch
        .sort_values(['deviceId', 'label'], ascending=[True, False])
        .groupby('deviceId')
        .head(50)
        .reset_index(drop=True)
    )
    final_submissions.append(top_50.loc[:, ['deviceId', 'hashId', 'label']])

    # release the per-batch frames before the next iteration
    del batch, batch_X, top_50, batch_candidates
    gc.collect()

# Stitch the batches together and write the submission file.
submission_df = pd.concat(final_submissions, ignore_index=True)

submission_df.to_csv('submission.csv', index=False)
print(" Submission saved as submission.csv")
print(submission_df.head())


🔢 Total test users: 10400
🔄 Processing users 0 to 250...
🔄 Processing users 250 to 500...
🔄 Processing users 500 to 750...
🔄 Processing users 750 to 1000...
🔄 Processing users 1000 to 1250...
🔄 Processing users 1250 to 1500...
🔄 Processing users 1500 to 1750...
🔄 Processing users 1750 to 2000...
🔄 Processing users 2000 to 2250...
🔄 Processing users 2250 to 2500...
🔄 Processing users 2500 to 2750...
🔄 Processing users 2750 to 3000...
🔄 Processing users 3000 to 3250...
🔄 Processing users 3250 to 3500...
🔄 Processing users 3500 to 3750...
🔄 Processing users 3750 to 4000...
🔄 Processing users 4000 to 4250...
🔄 Processing users 4250 to 4500...
🔄 Processing users 4500 to 4750...
🔄 Processing users 4750 to 5000...
🔄 Processing users 5000 to 5250...
🔄 Processing users 5250 to 5500...
🔄 Processing users 5500 to 5750...
🔄 Processing users 5750 to 6000...
🔄 Processing users 6000 to 6250...
🔄 Processing users 6250 to 6500...
🔄 Processing users 6500 to 6750...
🔄 Processing users 6750 to 7000...
🔄 P

## Rule-Based Location-Aware Recommendations

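*A lightweight rule-based fallback: every user with a known location receives the globally most-viewed articles. The location is carried into the output file but does not yet influence the ranking.*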

In [None]:
# Global popularity table: number of 'TimeSpent-Front' events per article, most viewed first.
front_view_events = event_df.loc[event_df['event_type'] == 'TimeSpent-Front']
views_per_article = front_view_events.groupby('hashId').size().reset_index(name='view_count')
top_content = (
    views_per_article
    .sort_values('view_count', ascending=False)
    .reset_index(drop=True)
)


In [None]:
# Preview the first rows of the popularity table.
print(top_content.head())  # Should have columns: hashId, view_count (at least)


       hashId  view_count
0  q4dqaz8m-1        5391
1  im5bxn3a-1        4962
2  7q98ag4y-1        4549
3  4hvxw5is-1        4523
4  b406vqmy-1        4294


In [None]:
import pandas as pd

# Device table on Drive (absolute path, so the working directory does not matter).
device_path = '/content/drive/MyDrive/test_data/devices/part-00000-cdb2cdd7-9d14-4000-b947-4d0475444217-c000.csv'

# Keep (device, location) rows where neither value is missing, give the columns
# friendlier names, and add the constant join key used for the cartesian product.
devices = pd.read_csv(device_path)
user_loc = (
    devices.loc[:, ['deviceid', 'lastknownsubadminarea']]
    .dropna()
    .rename(columns={'deviceid': 'deviceId', 'lastknownsubadminarea': 'location'})
    .assign(key=1)
)

# First 500 rows of the existing top_content table, with the same join key.
top_pop = top_content.head(500).copy().assign(key=1)

# Every user row paired with every one of those articles.
cand = pd.merge(user_loc, top_pop, on='key').drop(columns='key')

# Rank articles within each user by view_count (highest first; ties broken by row order).
cand['rank'] = cand.groupby('deviceId')['view_count'].rank(ascending=False, method='first')

# Keep ranks 1..50 per user and write them out.
top50_rule = cand.loc[cand['rank'] <= 50, ['deviceId', 'hashId', 'rank', 'location']]
top50_rule.to_csv('top50_rule_based.csv', index=False)

print(" Rule-based fallback ready:")
print(top50_rule.head())


✅ Rule-based fallback ready:
                               deviceId      hashId  rank location
0  8d0972fb-4f2f-4907-9629-188ccda4846a  q4dqaz8m-1   1.0    Noida
1  8d0972fb-4f2f-4907-9629-188ccda4846a  im5bxn3a-1   2.0    Noida
2  8d0972fb-4f2f-4907-9629-188ccda4846a  7q98ag4y-1   3.0    Noida
3  8d0972fb-4f2f-4907-9629-188ccda4846a  4hvxw5is-1   4.0    Noida
4  8d0972fb-4f2f-4907-9629-188ccda4846a  b406vqmy-1   5.0    Noida


In [None]:
from google.colab import files

# Hand both result files to the browser for download, in this order.
for export_name in ('submission.csv', 'top50_rule_based.csv'):
    files.download(export_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>