# Data Exploration - Scopus and SciVal

This notebook performs initial exploration of the Scopus and SciVal data files.

## Objectives:
1. Load Scopus and SciVal data files
2. Examine data structure and schema
3. Check for the EID column in both datasets and measure how many EIDs they share
4. Identify available features
5. Assess data quality and missing values

In [None]:
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from src.data.load_data import load_scopus_data, load_scival_data
from src.utils.config import config

# Set display options
# None removes the column limit, so wide frames are shown without elided columns
pd.set_option('display.max_columns', None)
# Frames longer than 100 rows are truncated when displayed
pd.set_option('display.max_rows', 100)

# Set plotting style
# 'seaborn-v0_8' is the versioned name of matplotlib's bundled seaborn style sheet
plt.style.use('seaborn-v0_8')
# Make seaborn's "husl" palette the default colour cycle for later plots
sns.set_palette("husl")

%matplotlib inline

## 1. Load Data Files

In [None]:
# Scopus export: relative path, resolved against the kernel's working directory
SCOPUS_FILE = '../data/raw/scopus.csv'
scopus_df = load_scopus_data(SCOPUS_FILE)

# SciVal export: same layout, loaded with its own project loader
SCIVAL_FILE = '../data/raw/scival.csv'
scival_df = load_scival_data(SCIVAL_FILE)

# Only reached when both loaders returned without raising
print("Data files loaded successfully!")

## 2. Examine Scopus Data

In [None]:
# Summarise the Scopus frame: dimensions, column names, then per-column dtypes/non-null counts
scopus_shape = scopus_df.shape
scopus_columns = list(scopus_df.columns)

print(f"Scopus data shape: {scopus_shape}")
print(f"\nColumns: {scopus_columns}")
scopus_df.info()

In [None]:
# Preview the first rows of the Scopus frame; as the cell's last expression the
# result is rendered as the cell output.
scopus_df.head()

In [None]:
# Report whether the Scopus frame has an EID column and, if so, how unique it is
if 'EID' not in scopus_df.columns:
    print("✗ EID column NOT found in Scopus data")
else:
    scopus_eid_col = scopus_df['EID']
    print("✓ EID column found in Scopus data")
    # nunique() ignores missing values; duplicated() flags every repeat after the first
    print(f"Unique EIDs: {scopus_eid_col.nunique()}")
    print(f"Total rows: {len(scopus_df)}")
    print(f"Duplicate EIDs: {scopus_eid_col.duplicated().sum()}")

## 3. Examine SciVal Data

In [None]:
# Summarise the SciVal frame: dimensions, column names, then per-column dtypes/non-null counts
scival_shape = scival_df.shape
scival_columns = list(scival_df.columns)

print(f"SciVal data shape: {scival_shape}")
print(f"\nColumns: {scival_columns}")
scival_df.info()

In [None]:
# Preview the first rows of the SciVal frame; as the cell's last expression the
# result is rendered as the cell output.
scival_df.head()

In [None]:
# Report whether the SciVal frame has an EID column and, if so, how unique it is
if 'EID' not in scival_df.columns:
    print("✗ EID column NOT found in SciVal data")
else:
    scival_eid_col = scival_df['EID']
    print("✓ EID column found in SciVal data")
    # nunique() ignores missing values; duplicated() flags every repeat after the first
    print(f"Unique EIDs: {scival_eid_col.nunique()}")
    print(f"Total rows: {len(scival_df)}")
    print(f"Duplicate EIDs: {scival_eid_col.duplicated().sum()}")

## 4. Identify Key Features

In [None]:
# Keywords to look for in the column names of both datasets
important_features = [
    'Abstract', 'Title', 'Citation Count', 'Citations', 'Author', 'Authors',
    'h-index', 'Venue', 'Journal', 'Year', 'Publication Year',
]

# Run the same case-insensitive substring search over each frame's columns;
# keywords with no matching column are skipped silently.
for header, frame in (
    ("Scopus columns matching key features:", scopus_df),
    ("\nSciVal columns matching key features:", scival_df),
):
    print(header)
    for keyword in important_features:
        needle = keyword.lower()
        hits = [name for name in frame.columns if needle in name.lower()]
        if hits:
            print(f"  {keyword}: {hits}")

## 5. Missing Values Analysis

In [None]:
print("Scopus Missing Values:")

# Per-column count of missing cells, and the same expressed as a percentage of all rows
scopus_missing = scopus_df.isna().sum()
scopus_missing_pct = scopus_missing.div(len(scopus_df)).mul(100).round(2)

missing_df = pd.DataFrame(
    {'Missing Count': scopus_missing, 'Percentage': scopus_missing_pct}
)

# Show only columns that actually have gaps, worst first (last expression = cell output)
missing_df.loc[missing_df['Missing Count'].gt(0)].sort_values(by='Missing Count', ascending=False)

In [None]:
print("SciVal Missing Values:")

# Per-column count of missing cells, and the same expressed as a percentage of all rows
scival_missing = scival_df.isna().sum()
scival_missing_pct = scival_missing.div(len(scival_df)).mul(100).round(2)

missing_df = pd.DataFrame(
    {'Missing Count': scival_missing, 'Percentage': scival_missing_pct}
)

# Show only columns that actually have gaps, worst first (last expression = cell output)
missing_df.loc[missing_df['Missing Count'].gt(0)].sort_values(by='Missing Count', ascending=False)

## 6. Check EID Overlap

In [None]:
# Compare the sets of non-null EIDs in the two datasets.
# The per-dataset EID checks in this notebook allow for the column being absent,
# so guard here too instead of failing with a KeyError.
frames_without_eid = [
    label
    for label, frame in (("Scopus", scopus_df), ("SciVal", scival_df))
    if 'EID' not in frame.columns
]

if frames_without_eid:
    print(f"Cannot compute EID overlap: EID column missing in {', '.join(frames_without_eid)} data")
else:
    scopus_eids = set(scopus_df['EID'].dropna())
    scival_eids = set(scival_df['EID'].dropna())

    common_eids = scopus_eids.intersection(scival_eids)
    only_scopus = scopus_eids - scival_eids
    only_scival = scival_eids - scopus_eids

    print(f"Common EIDs: {len(common_eids)}")
    print(f"Only in Scopus: {len(only_scopus)}")
    print(f"Only in SciVal: {len(only_scival)}")

    # The overlap is measured against the smaller dataset; if that dataset has no
    # EIDs at all the ratio is undefined, so report that instead of dividing by zero.
    smaller_set_size = min(len(scopus_eids), len(scival_eids))
    if smaller_set_size:
        print(f"\nOverlap percentage: {len(common_eids) / smaller_set_size * 100:.2f}%")
    else:
        print("\nOverlap percentage: n/a (at least one dataset has no non-null EIDs)")

## Next Steps

Based on this exploration:
1. Proceed to `02_data_merging.ipynb` to merge the datasets
2. Identify which columns to keep/drop
3. Plan data cleaning strategy