In [1]:
import sys
import os

# Make the parent of the current working directory importable so that the
# `src` package (e.g. `src.preprocessing`) can be imported from this notebook.
# NOTE(review): assumes the kernel's working directory sits one level below the
# project root — confirm if the notebook is launched from elsewhere.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))

# Guard against duplicates: re-running this cell must not keep growing sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [3]:
import pandas as pd
# Load the CSV produced by the EDA step and preview its first five rows.
eda_output_path = "../data/processed/eda_output.csv"
data = pd.read_csv(eda_output_path)
print(data.head(5))

  label                                            message  message_length
0   ham  Go until jurong point, crazy.. Available only ...             111
1   ham                      Ok lar... Joking wif u oni...              29
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...             155
3   ham  U dun say so early hor... U c already then say...              49
4   ham  Nah I don't think he goes to usf, he lives aro...              61


In [12]:
from src.preprocessing import preprocess_text

# Run every raw message through `preprocess_text` and store the result in a
# new `cleaned_message` column next to the original text.
cleaned_messages = data['message'].apply(preprocess_text)
data['cleaned_message'] = cleaned_messages

# Compare the raw and cleaned text side by side for the first five rows.
comparison_columns = ['message', 'cleaned_message']
print(data[comparison_columns].head(5))



                                             message  \
0  Go until jurong point, crazy.. Available only ...   
1                      Ok lar... Joking wif u oni...   
2  Free entry in 2 a wkly comp to win FA Cup fina...   
3  U dun say so early hor... U c already then say...   
4  Nah I don't think he goes to usf, he lives aro...   

                                     cleaned_message  
0  go jurong point crazy available bugis n great ...  
1                            ok lar joking wif u oni  
2  free entry wkly comp win fa cup final tkts st ...  
3                u dun say early hor u c already say  
4           nah dont think go usf life around though  


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [42]:
# Build a TF-IDF vectorizer whose vocabulary is capped at the 1000 terms with
# the highest frequency across the corpus.
tf_idf = TfidfVectorizer(max_features=1000)

# Learn the vocabulary and IDF weights from the cleaned messages, then encode
# each message as one row of a sparse document-term matrix.
# NOTE(review): the vectorizer is fit on the full dataset; if a train/test
# split happens downstream, confirm this does not leak test-set statistics.
cleaned_corpus = data['cleaned_message']
X = tf_idf.fit_transform(cleaned_corpus)

# Show the stored (row, column) -> weight entries for the first message only.
first_row_preview = X[0:1]
print(first_row_preview)

  (0, 927)	0.29139783035410916
  (0, 334)	0.2395913846995883
  (0, 428)	0.41898667089873387
  (0, 972)	0.34731064288019076
  (0, 338)	0.2844542469263201
  (0, 49)	0.3886271165799766
  (0, 168)	0.3957882453544288
  (0, 637)	0.34896914617041114
  (0, 325)	0.22613205283344187


In [62]:
# Column indices of the entries stored for the first message; each index is a
# position in the fitted TF-IDF vocabulary.
first_message_row = X[0]
first_message_row.indices

array([927, 334, 428, 972, 338,  49, 168, 637, 325])

In [64]:
# TF-IDF weights stored for the first message, in the same order as its
# column indices.
first_message_weights = X[0].data
first_message_weights

array([0.29139783, 0.23959138, 0.41898667, 0.34731064, 0.28445425,
       0.38862712, 0.39578825, 0.34896915, 0.22613205])

In [58]:
# Vocabulary learned by the vectorizer: position i holds the word that
# corresponds to column i of X.
feature_names = tf_idf.get_feature_names_out()

# Decode the first message by pairing each stored column index with its weight.
first_message = X[0]
for column, weight in zip(first_message.indices, first_message.data):
    word = feature_names[column]
    print(f"Word: {word},\t TF-IDF value: {weight}")

Word: wat,	 TF-IDF value: 0.29139783035410916
Word: got,	 TF-IDF value: 0.2395913846995883
Word: la,	 TF-IDF value: 0.41898667089873387
Word: world,	 TF-IDF value: 0.34731064288019076
Word: great,	 TF-IDF value: 0.2844542469263201
Word: available,	 TF-IDF value: 0.3886271165799766
Word: crazy,	 TF-IDF value: 0.3957882453544288
Word: point,	 TF-IDF value: 0.34896914617041114
Word: go,	 TF-IDF value: 0.22613205283344187


In [60]:
# Peek at the first ten vocabulary entries (in column-index order, not ranked
# by frequency), then report the matrix dimensions as (rows, columns).
vocabulary_preview = feature_names[:10]
print("The first ten words selected:", vocabulary_preview)

matrix_shape = X.shape
print("Matrix size:", matrix_shape)

The first ten words selected: ['abiola' 'able' 'abt' 'accept' 'access' 'account' 'across' 'actually'
 'add' 'address']
Matrix size: (5572, 1000)


In [72]:
# Target vector: the `label` column of the same frame the TF-IDF matrix was
# built from, so its rows line up with the rows of X.
y = data.loc[:, 'label']
y

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: label, Length: 5572, dtype: object

In [74]:
from scipy.sparse import save_npz

# Persist the features and the targets as two separate artifacts: the sparse
# TF-IDF matrix in SciPy's .npz format, and the labels as a CSV without the
# row index.
# NOTE(review): the fitted `tf_idf` vectorizer itself is not saved here; if new
# text must be encoded later, confirm it is persisted elsewhere.
save_npz(file="../data/processed/tfidf_matrix.npz", matrix=X)

y.to_csv(path_or_buf="../data/processed/labels.csv", index=False)