# Feature Engineering: Text Features

Extract TF-IDF features from abstracts:
1. Load cleaned data
2. Text preprocessing
3. TF-IDF vectorization
4. Convert to DataFrame
5. Save text features

In [None]:
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from pathlib import Path
import pickle

# Show every column when a DataFrame is displayed (None removes the column limit),
# so wide frames are not truncated in cell output.
pd.set_option('display.max_columns', None)

## 1. Load Data

In [None]:
# Load the cleaned dataset and report how many rows carry a non-null abstract.
# NOTE: read_pickle executes arbitrary pickled code; only load files this
# project produced itself.
cleaned_data_path = '../data/processed/cleaned_data.pkl'
df = pd.read_pickle(cleaned_data_path)

abstracts_present = df['Abstract'].notna().sum()
print(f"Dataset: {df.shape}")
print(f"Abstracts available: {abstracts_present}")

## 2. Text Preprocessing

In [None]:
def preprocess_text(text):
    """Normalize one abstract for vectorization.

    Missing values (anything ``pd.isna`` reports as NA, e.g. ``None`` or NaN)
    become an empty string; every other value is converted with ``str`` and
    lower-cased.
    """
    return "" if pd.isna(text) else str(text).lower()

# Normalize every abstract and store the result as a new column.
processed_abstracts = df['Abstract'].apply(preprocess_text)
df['abstract_processed'] = processed_abstracts

# Preview the first 200 characters of the first processed abstract.
first_abstract_preview = df['abstract_processed'].iloc[0][:200]
print(f"Sample processed abstract:\n{first_abstract_preview}...")

## 3. TF-IDF Vectorization

In [None]:
# Vectorizer settings:
#   - unigrams and bigrams, with scikit-learn's built-in English stop words removed
#   - terms must occur in at least 5 documents and in at most 80% of documents
#   - the vocabulary is capped at 5000 terms
tfidf = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=5,
    max_df=0.8,
    max_features=5000,
)

# Learn the vocabulary and build the sparse document-term matrix in one pass.
tfidf_matrix = tfidf.fit_transform(df['abstract_processed'])
feature_names = tfidf.get_feature_names_out()

# Average each term's weight over all documents, then rank terms by that mean.
mean_tfidf = np.asarray(tfidf_matrix.mean(axis=0)).ravel()
top_indices = np.argsort(mean_tfidf)[::-1][:20]

print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")
print(f"Vocabulary size: {len(tfidf.vocabulary_)}")
print(f"\nTop 20 features by average TF-IDF:")
for term, score in zip(feature_names[top_indices], mean_tfidf[top_indices]):
    print(f"  {term}: {score:.4f}")

## 4. Convert to DataFrame

In [None]:
# Prefix each term so the text features are recognizable as TF-IDF columns.
tfidf_columns = [f'tfidf_{term}' for term in feature_names]

# Densify the sparse matrix and reuse df's index so rows stay aligned with
# the source papers.
# NOTE(review): toarray() allocates a full documents x terms array; confirm
# this fits in memory for the dataset size in use.
dense_tfidf = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(dense_tfidf, index=df.index, columns=tfidf_columns)

print(f"TF-IDF features DataFrame: {tfidf_df.shape}")
tfidf_df.head()

## 5. Save Features

In [None]:
# Make sure the feature directory exists before writing anything into it.
output_dir = Path('../data/features')
output_dir.mkdir(parents=True, exist_ok=True)

features_path = output_dir / 'text_features.pkl'
vectorizer_path = output_dir / 'tfidf_vectorizer.pkl'

# Save the feature table.
tfidf_df.to_pickle(features_path)
print(f"Text features saved to: {features_path}")

# Save the fitted vectorizer alongside the features.
with vectorizer_path.open('wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)
print(f"TF-IDF vectorizer saved to: {vectorizer_path}")

## Summary

In [None]:
# Recap of the run: corpus size, feature count and the vectorizer settings used.
divider = "=" * 50
print(divider, "TEXT FEATURES SUMMARY", divider, sep="\n")
print(
    f"Total papers: {len(df)}",
    f"TF-IDF features: {tfidf_df.shape[1]}",
    f"Vocabulary size: {len(tfidf.vocabulary_)}",
    f"N-gram range: {tfidf.ngram_range}",
    f"Min doc frequency: {tfidf.min_df}",
    f"Max doc frequency: {tfidf.max_df}",
    sep="\n",
)