In [9]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

# Read the spam dataset (ISO-8859-1 encoded CSV) and, in the same expression,
# discard every column whose header starts with "Unnamed".
csv_data = (
    pd.read_csv('../assets/spam.csv', encoding='ISO-8859-1')
    .loc[:, lambda frame: ~frame.columns.str.contains('^Unnamed')]
)

# Give the two remaining columns descriptive names.
csv_data.columns = ['label', 'text']
# Normalise case and whitespace.
def clearSpace(text):
  """Lower-case ``text`` and normalise its whitespace.

  Every run of whitespace characters is collapsed into a single space, and
  leading/trailing whitespace is removed.

  Args:
    text: The string to normalise.

  Returns:
    The lower-cased string with single spaces between words.
  """
  lowered = text.lower()
  collapsed = re.sub(r'\s+', ' ', lowered)
  return collapsed.strip()

# Keep only alphabetic, non-stop-word tokens.
# NOTE: despite its name, this function does not remove English text; it
# removes everything that is NOT a plain run of ASCII letters, plus stop words.
def removeEnglish(text):
  """Tokenise ``text`` and keep only letter-only tokens that are not stop words.

  The text is split with NLTK's ``word_tokenize``. A token is kept when it is
  not in NLTK's English stop-word list (exact, case-sensitive membership test)
  and matches ``^[a-zA-Z]+$``. Numbers, punctuation and tokens with mixed
  characters are discarded.

  NOTE(review): the stop-word set is rebuilt on every call.

  Args:
    text: The string to filter.

  Returns:
    The surviving tokens joined by single spaces.
  """
  letters_only = re.compile(r'^[a-zA-Z]+$')
  english_stop_words = set(stopwords.words('english'))

  kept_tokens = []
  for token in word_tokenize(text):
    if token in english_stop_words:
      continue
    if letters_only.match(token):
      kept_tokens.append(token)
  return ' '.join(kept_tokens)

def clearSentence(text):
  """Run the full cleaning pipeline on one piece of text.

  The text is first passed through ``clearSpace`` and the result is then
  passed through ``removeEnglish``.

  Args:
    text: The raw string to clean.

  Returns:
    The string returned by ``removeEnglish``.
  """
  return removeEnglish(clearSpace(text))

# Clean every entry of the 'text' column with clearSentence, element by element.
docs = csv_data['text'].map(clearSentence)

2. Use TF-IDF

In [10]:
# Fit a TF-IDF vectorizer (default settings) on the cleaned documents and
# transform them into a document-term matrix in one step.
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(docs)

# Names of the features (terms) produced by the fitted vectorizer.
feature_list = tfidf_vectorizer.get_feature_names_out()

# Report how many features the vectorizer produced.
print('----feature_list counts:', len(feature_list))

----feature_list counts: 7105


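3. Apply feature selection with a variance threshold
(Note: with threshold = 0.1 the selector throws an error — presumably because no feature's variance reaches 0.1 — so a much smaller threshold is used below.)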

In [11]:
# Drop low-variance TF-IDF features: the selector is fitted on X_tfidf with
# a variance threshold of 0.001 and returns the reduced matrix.
threshold = 0.001
selector = VarianceThreshold(threshold=threshold)
X_selected = selector.fit_transform(X_tfidf)

4. How many features have been removed

In [12]:
# Number of removed features = column count before selection minus
# column count after selection.
features_removed = X_tfidf.shape[1] - X_selected.shape[1]

print(f"features removed counts: {features_removed}")

features removed counts: 6950


5. Apply stratified hold-out with a 70:30 ratio, with no shuffle, random state = 1234

In [13]:
# Hold-out split: with shuffle=False the rows are split sequentially, so the
# first 70% become the training set and the last 30% the test set.
#
# The target passed as `y` is the 'label' column, so y_train / y_test hold the
# class labels aligned row-for-row with X_train / X_test. (Previously the
# preprocessed text `docs` was passed here, which made the "targets" a copy of
# the feature text instead of the labels.)
#
# NOTE: scikit-learn's train_test_split does not support `stratify` together
# with shuffle=False (it raises a ValueError), so a truly stratified split is
# not possible while keeping the "no shuffle" requirement. random_state only
# controls shuffling and therefore has no effect here; it is kept to match
# the exercise statement.
X_train, X_test, y_train, y_test = train_test_split(
    X_selected,
    csv_data['label'],
    test_size=0.3,
    random_state=1234,
    shuffle=False
)

6. Report the shape of the matrices for the train and test sets

In [14]:
# Record the (rows, columns) shape of each feature matrix and print both.
train_shape, test_shape = X_train.shape, X_test.shape

print('train_shape:', train_shape)
print('test_shape:', test_shape)

train_shape: (3900, 155)
test_shape: (1672, 155)


7. Report the top 10 and bottom 10 rows.

In [15]:
# First ten rows of the training matrix (all columns). Printing the sparse
# matrix lists its stored entries as "(row, column) value".
top_10_rows = X_train[:10, :]
print('top_10_rows\n', top_10_rows)

top_10_rows
   (0, 142)	0.1934414974362614
  (0, 40)	0.16081723256683003
  (0, 41)	0.19160426362908617
  (0, 36)	0.1574042918397475
  (1, 89)	0.278979726023622
  (2, 134)	0.1399333473376727
  (2, 125)	0.13770940734010412
  (2, 32)	0.1291773438749615
  (3, 1)	0.3090166793888741
  (3, 109)	0.6207429378111018
  (3, 28)	0.34208730492608386
  (4, 4)	0.31997466807742914
  (4, 128)	0.27357082071110356
  (5, 111)	0.1738082424362937
  (5, 119)	0.18174488701480665
  (5, 61)	0.16596785147278276
  (5, 7)	0.18305898101099072
  (5, 145)	0.1962599074872638
  (5, 48)	0.1962599074872638
  (5, 89)	0.16312366637893552
  (6, 61)	0.4259234768966834
  (8, 11)	0.13766930526014834
  (8, 15)	0.40658158269686484
  (8, 99)	0.2135368880049105
  (9, 74)	0.36602529673707374
  (9, 11)	0.12747528710093337
  (9, 32)	0.3205037418850104


In [16]:
# Last ten rows of the training matrix (all columns). Row indices in the
# printed output are relative to this slice, i.e. they restart at 0.
bottom_10_rows = X_train[-10:, :]
print('bottom_10_rows\n', bottom_10_rows)

bottom_10_rows
   (0, 101)	0.3816270673437317
  (0, 129)	0.26890856746636066
  (1, 19)	0.30157613531188215
  (1, 11)	0.18690860628718514
  (1, 15)	0.27600051013661603
  (1, 99)	0.28991126273558676
  (1, 119)	0.2570336358191113
  (2, 145)	0.5749958598344096
  (3, 24)	0.21084247864828234
  (3, 14)	0.22696698004105995
  (3, 11)	0.14251713495294333
  (4, 151)	0.4676683902144561
  (5, 135)	0.18418278910419777
  (5, 74)	0.22681908078461524
  (5, 11)	0.15798813743607587
  (5, 15)	0.23329480324138707
  (5, 99)	0.24505313763326958
  (7, 55)	0.4881963796330494
  (7, 129)	0.35918460409570163
  (8, 109)	0.29589942088809196
  (8, 89)	0.23704995804744491
  (9, 90)	1.0
