# In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import cosine_similarity
import joblib

# Load the dataset from CSV into a DataFrame; its 'notes' column is the
# input for the text vectorization below.
df = pd.read_csv('data/dataset_tratado.csv')

def parse_notes(notes):
    """Turn a comma-separated notes string into a space-separated one.

    Each comma-delimited item is stripped of surrounding whitespace and the
    items are joined back together with single spaces, producing text that
    can be fed to a text vectorizer.

    Args:
        notes: The raw notes value. Normally a string such as
            ``"rose, jasmine ,musk"``. Missing values (``None`` or float
            NaN, which is how pandas represents empty CSV cells) are
            accepted; any other non-string value is converted with ``str``.

    Returns:
        The items joined by single spaces, or an empty string when the
        value is missing.
    """
    # Empty CSV cells arrive as float NaN (NaN is the only float that is not
    # equal to itself). Without this guard, `.split` raises AttributeError
    # and aborts the whole `.apply` call.
    if notes is None or (isinstance(notes, float) and notes != notes):
        return ''

    stripped_items = (item.strip() for item in str(notes).split(','))
    return ' '.join(stripped_items)

# Build one space-separated text per row from the comma-separated 'notes'
# column and store it in a new 'notes_combined' column.
df['notes_combined'] = df['notes'].apply(parse_notes)

# Fit a TF-IDF vectorizer (default settings) on the combined notes and
# transform them in the same step; X_full has one row per row of df.
vectorizer = TfidfVectorizer()
X_full = vectorizer.fit_transform(df['notes_combined'])

# Persist the fitted vectorizer so it can be reloaded later with joblib.load.
# NOTE(review): the 'model/' directory is not created here - presumably it
# already exists; confirm.
joblib.dump(vectorizer, 'model/vectorizer.pkl')

# Save the notes vectors as a dense table, one row per row of df.
# toarray() materializes the full matrix in memory. Because the CSV is
# written with index=False, rows match df by position only, and the columns
# carry default integer labels rather than vocabulary terms.
df_vectors = pd.DataFrame(X_full.toarray(), index=df.index)
df_vectors.to_csv('data/notes_vectors.csv', index=False)

# Write the dataset back, now including the 'notes_combined' column.
# NOTE: this overwrites the same CSV file that was loaded as input. The
# TF-IDF vectors are not part of df; they are only in 'data/notes_vectors.csv'.
df.to_csv('data/dataset_tratado.csv', index=False)