# Feature Engineering: Title Text Features

Extract TF-IDF features from paper titles:
1. Load cleaned data
2. Text preprocessing of titles
3. TF-IDF vectorization
4. Analyze top title features
5. Convert to DataFrame
6. Save title features and the fitted vectorizer

In [None]:
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from pathlib import Path
import pickle

pd.set_option('display.max_columns', None)

## 1. Load Data

In [None]:
# Load the cleaned dataset from the processed-data directory.
# NOTE(review): unpickling can execute arbitrary code, so only read this file
# from a trusted location.
cleaned_path = '../data/processed/cleaned_data.pkl'
df = pd.read_pickle(cleaned_path)

# Count the rows whose 'Title' value is not missing.
n_titles = df['Title'].notna().sum()
print(f"Dataset: {df.shape}")
print(f"Titles available: {n_titles}")

## 2. Title Preprocessing

In [None]:
def preprocess_text(text):
    """Normalise a title for TF-IDF vectorization.

    Missing values (anything ``pd.isna`` reports as missing) become the
    empty string. Everything else is converted to ``str``, lowercased,
    stripped of leading/trailing whitespace, and has internal whitespace
    runs collapsed to a single space, so a whitespace-only title is
    reported as empty.

    Args:
        text: Raw title value; may be missing or a non-string scalar.

    Returns:
        The normalised title, or ``""`` when there is nothing to vectorize.
    """
    if pd.isna(text):
        return ""
    # split() + join drops padding and collapses whitespace runs in one pass.
    return " ".join(str(text).lower().split())

# Normalise every title into a new column; missing titles become "".
df['title_processed'] = df['Title'].apply(preprocess_text)

# Check for empty titles
empty_titles = (df['title_processed'] == "").sum()
print(f"\nEmpty titles: {empty_titles}")

# head(5) instead of iloc[0..4]: shows up to five samples and does not raise
# IndexError when the dataset has fewer than five rows.
print("\nSample processed titles:")
for i, title in enumerate(df['title_processed'].head(5), start=1):
    print(f"{i}. {title}")

## 3. TF-IDF Vectorization on Titles

Titles are much shorter than abstracts, so we use fewer TF-IDF features for them (1,000 versus 5,000 for abstracts).

In [None]:
# Vectorizer settings for titles: 1000 features (vs 5000 for abstracts).
title_tfidf_params = {
    'max_features': 1000,
    'ngram_range': (1, 2),     # unigrams and bigrams
    'min_df': 3,               # term must appear in at least 3 papers
    'max_df': 0.8,             # term can't appear in more than 80% of papers
    'stop_words': 'english',
}
tfidf_title = TfidfVectorizer(**title_tfidf_params)

# Fit the vectorizer on the processed titles and vectorize them in one call.
tfidf_matrix_title = tfidf_title.fit_transform(df['title_processed'])

vocab_size = len(tfidf_title.vocabulary_)
print(f"Title TF-IDF matrix shape: {tfidf_matrix_title.shape}")
print(f"Vocabulary size: {vocab_size}")

## 4. Analyze Top Title Features

In [None]:
print("Top 30 title features by average TF-IDF:")

# Mean TF-IDF weight of each vocabulary term across all titles.
feature_names_title = tfidf_title.get_feature_names_out()
column_means = tfidf_matrix_title.mean(axis=0)
mean_tfidf_title = np.asarray(column_means).ravel()

# argsort is ascending: keep the last 30 positions, then reverse them so the
# highest-scoring term comes first.
top_indices_title = np.argsort(mean_tfidf_title)[-30:][::-1]

top_names = feature_names_title[top_indices_title]
top_scores = mean_tfidf_title[top_indices_title]
for name, score in zip(top_names, top_scores):
    print(f"  {name}: {score:.4f}")

## 5. Convert to DataFrame

In [None]:
# Label each column with a 'title_tfidf_' prefix followed by its vocabulary term.
title_columns = ['title_tfidf_' + str(term) for term in feature_names_title]

# Densify the sparse matrix and reuse df's index for the feature rows.
tfidf_title_df = pd.DataFrame(
    tfidf_matrix_title.toarray(),
    index=df.index,
    columns=title_columns,
)

print(f"Title TF-IDF features DataFrame: {tfidf_title_df.shape}")
print("\nSample of features:")
print(tfidf_title_df.head())

## 6. Save Features

In [None]:
# Make sure the feature directory exists before writing anything into it.
output_dir = Path('../data/features')
output_dir.mkdir(parents=True, exist_ok=True)

features_path = output_dir / 'title_features.pkl'
vectorizer_path = output_dir / 'tfidf_title_vectorizer.pkl'

# Persist the dense title feature table.
tfidf_title_df.to_pickle(features_path)
print(f"✓ Title features saved to: {features_path}")

# Save vectorizer for deployment
with vectorizer_path.open('wb') as fh:
    pickle.dump(tfidf_title, fh)
print(f"✓ Title TF-IDF vectorizer saved to: {vectorizer_path}")

## Summary

In [None]:
# Collect the report lines first, then print them in order.
banner = "=" * 60
summary_lines = [
    banner,
    "TITLE TEXT FEATURES SUMMARY",
    banner,
    f"Total papers: {len(df)}",
    f"Title TF-IDF features: {tfidf_title_df.shape[1]}",
    f"Vocabulary size: {len(tfidf_title.vocabulary_)}",
    f"N-gram range: {tfidf_title.ngram_range}",
    f"Min doc frequency: {tfidf_title.min_df}",
    f"Max doc frequency: {tfidf_title.max_df}",
    "\n✓ Ready to combine with other features!",
]
for line in summary_lines:
    print(line)