# Data Merging - Scopus and SciVal

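This notebook merges Scopus and SciVal datasets using the EID column. The processing cells are provided as commented-out templates: set the file paths in Section 1, then uncomment each cell before running it.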

## Objectives:
1. Load both datasets
2. Merge using EID as key
3. Add abstracts from Scopus to SciVal entries
4. Resolve duplicate columns
5. Save merged dataset

In [None]:
import sys

# Make the parent directory importable so the `src` package used below can be
# resolved when the notebook runs from its own folder. The membership check
# keeps the cell idempotent: re-running it no longer appends a duplicate
# '../' entry to sys.path each time.
if '../' not in sys.path:
    sys.path.append('../')

import pandas as pd
import numpy as np
from pathlib import Path

# Project helpers for loading the two sources and combining them.
from src.data.load_data import load_scopus_data, load_scival_data
from src.data.merge_data import (
    merge_datasets,
    add_abstracts_to_scival,
    resolve_duplicate_columns
)
from src.utils.config import config

# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

## 1. Load Data

In [None]:
# Paths to the raw Scopus and SciVal CSV files. They are relative, so they
# resolve against the directory the notebook is run from.
# Update with actual file paths
SCOPUS_FILE = '../data/raw/scopus_data.csv'
SCIVAL_FILE = '../data/raw/scival_data.csv'

# Load datasets
# Template: uncomment once the paths above point at real files. The later
# cells refer to the loaded frames by the names scopus_df and scival_df.
# scopus_df = load_scopus_data(SCOPUS_FILE)
# scival_df = load_scival_data(SCIVAL_FILE)

# Quick sanity check of the (rows, columns) shape of each frame.
# print(f"Scopus: {scopus_df.shape}")
# print(f"SciVal: {scival_df.shape}")

## 2. Add Abstracts to SciVal Data

As mentioned in the README, we need to add abstracts from Scopus to SciVal entries using the EID column.

In [None]:
# Add abstracts from Scopus to SciVal
# Template: uncomment after scopus_df and scival_df have been loaded in
# Section 1. The result is stored as merged_df, which every later cell uses.
# merged_df = add_abstracts_to_scival(
#     scival_df=scival_df,
#     scopus_df=scopus_df,
#     eid_column='EID',
#     abstract_column='Abstract'  # Update with actual column name
# )

# Report the (rows, columns) shape of the result.
# print(f"Merged dataset shape: {merged_df.shape}")

## 3. Full Merge (Optional)

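If you want to merge all columns from both datasets, use this cell instead of the abstract-only merge in Section 2. It assigns to the same `merged_df` variable, so if both cells are run, only the result of whichever ran last is kept: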

In [None]:
# Full merge of both datasets
# Template: uncomment to join the two frames on the EID column.
# NOTE: this assigns to merged_df, replacing any value produced by the
# abstract-only merge cell if that one was also run.
# merged_df = merge_datasets(
#     scopus_df=scopus_df,
#     scival_df=scival_df,
#     on='EID',
#     how='inner'  # or 'outer' to keep all records
# )

# Report the resulting shape and the number of columns.
# print(f"Merged dataset shape: {merged_df.shape}")
# print(f"Columns: {len(merged_df.columns)}")

## 4. Resolve Duplicate Columns

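If some columns exist in both datasets, they need to be reconciled. The cells below first list the columns carrying a `_scopus` or `_scival` suffix, then resolve them in favour of the source you choose as the priority: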

In [None]:
# Check for duplicate columns
# Template: collects every column whose name contains '_scopus' or '_scival'
# and prints how many were found, followed by at most the first ten names.
# NOTE(review): `in` is a substring test, so a name such as 'x_scopus_id'
# would also be counted; col.endswith(('_scopus', '_scival')) would restrict
# the match to true suffixes.
# duplicate_cols = [col for col in merged_df.columns if '_scopus' in col or '_scival' in col]
# print(f"Duplicate columns found: {len(duplicate_cols)}")
# if duplicate_cols:
#     print(duplicate_cols[:10])  # Show first 10

In [None]:
# Resolve duplicates (prioritize Scopus or SciVal)
# Template: uncomment after merged_df exists. The suffixes passed here are the
# same two strings the duplicate-column check searches for.
# NOTE(review): presumably `priority` selects which source's values are kept
# when both suffixed columns exist; confirm against resolve_duplicate_columns.
# merged_df = resolve_duplicate_columns(
#     df=merged_df,
#     priority='scival',  # Choose 'scopus' or 'scival'
#     suffixes=('_scopus', '_scival')
# )

# Report the shape after resolution.
# print(f"Final dataset shape: {merged_df.shape}")

## 5. Examine Merged Data

In [None]:
# Check merged data
# Template: uncomment to display the first rows of merged_df as the cell output.
# merged_df.head()

In [None]:
# Check if abstracts were added successfully
# Template: prints how many rows have a non-null 'Abstract' value and how many
# are missing one.
# NOTE: if merged_df has no column named exactly 'Abstract', this cell prints
# nothing at all. Keep the name in sync with the abstract_column value used
# when the abstracts were added.
# if 'Abstract' in merged_df.columns:
#     print(f"Abstracts present: {merged_df['Abstract'].notna().sum()} / {len(merged_df)}")
#     print(f"Missing abstracts: {merged_df['Abstract'].isna().sum()}")

In [None]:
# Data info
# Template: uncomment to print the column names, dtypes and non-null counts
# of merged_df.
# merged_df.info()

## 6. Save Merged Dataset

In [None]:
# Save to processed data directory
# Template: uncomment once merged_df is final. The target directory is created
# if it does not exist yet (parents=True, exist_ok=True).
# output_path = Path('../data/processed/merged_data.csv')
# output_path.parent.mkdir(parents=True, exist_ok=True)

# Write the CSV without the DataFrame index.
# merged_df.to_csv(output_path, index=False)
# print(f"Merged data saved to: {output_path}")

# Also save as pickle for faster loading
# The pickle is written next to the CSV, using the same file name with a
# '.pkl' extension.
# NOTE: only unpickle files from a trusted source; loading a pickle can
# execute arbitrary code.
# pickle_path = output_path.with_suffix('.pkl')
# merged_df.to_pickle(pickle_path)
# print(f"Also saved as pickle: {pickle_path}")

## Summary Statistics

In [None]:
# Summary
# Template: uncomment to print the row count, column count and memory
# footprint of merged_df, followed by a presence check for a few key columns.
# print("=" * 50)
# print("MERGE SUMMARY")
# print("=" * 50)
# print(f"Total records in merged dataset: {len(merged_df)}")
# print(f"Total columns: {len(merged_df.columns)}")
# The byte total is divided by 1024**2, so the figure labelled "MB" is in
# units of 1,048,576 bytes.
# print(f"Memory usage: {merged_df.memory_usage(deep=True).sum() / 1024**2:.2f} MB")
# print(f"\nKey columns present:")
# key_cols = ['EID', 'Abstract', 'Title', 'Citation Count', 'Authors', 'Year']
# Matching is a case-insensitive substring test, so one key can list several
# columns (any column whose name merely contains the key text is included).
# for col in key_cols:
#     matching = [c for c in merged_df.columns if col.lower() in c.lower()]
#     if matching:
#         print(f"  ✓ {col}: {matching}")
#     else:
#         print(f"  ✗ {col}: Not found")

## Next Steps

1. Proceed to `03_data_cleaning.ipynb` for data cleaning
2. Handle missing values
3. Remove invalid or corrupted entries
4. Prepare data for feature engineering