## 1. Setup and Dependencies

In [66]:
import pandas as pd
from pathlib import Path
import sys
sys.path.append(str(Path('../src').resolve()))
from data_preprocessing import clean_text, tokenize, vectorize_text
import joblib

## 2. Load your cleaned data

In [67]:
df = pd.read_csv('../data/processed/cleaned_data.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 674 entries, 0 to 673
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0.1  674 non-null    int64 
 1   Unnamed: 0    674 non-null    int64 
 2   text          674 non-null    object
 3   label         674 non-null    int64 
 4   cleaned_text  674 non-null    object
dtypes: int64(3), object(2)
memory usage: 26.5+ KB


## 3. Clean Data

In [68]:
df['cleaned_text'] = df['text'].apply(clean_text)
df.to_csv('../data/processed/cleaned_data.csv', index=False)
df['cleaned_text'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 674 entries, 0 to 673
Series name: cleaned_text
Non-Null Count  Dtype 
--------------  ----- 
674 non-null    object
dtypes: object(1)
memory usage: 5.4+ KB


# 4. Vectorize

In [69]:
X, vectorizer = vectorize_text(df['cleaned_text'])
{X.shape}

{(674, 2505)}

## 5. Save TF-IDF matrix and vectorizer

In [70]:
joblib.dump(X, '../data/processed/tfidf_matrix.pkl')
joblib.dump(vectorizer, '../models/tfidf_vectorizer.pkl')

['../models/tfidf_vectorizer.pkl']