In [1]:
from nltk.corpus import stopwords
from  nltk.tokenize import  word_tokenize
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import train_test_split

### 1.Use spam dataset

In [2]:
# Read the spam corpus from the CSV file into a DataFrame.
df = pd.read_csv('spamUtf8.csv')
# Display how many rows (messages) were loaded.
df.shape[0]

5572

### Pre-process  
Import the nltk package;  
Use regular expressions to clean the text;  
Tokenize the cleaned sentence into words;  
Remove stopwords from the list of words

In [3]:
# Patterns are compiled once here instead of on every call.
# Matches every character that is neither an ASCII letter nor whitespace.
# (The previous class '[^A-Za-z\s+]' contained a literal '+', so plus signs
# were unintentionally kept.)
_NON_LETTER_RE = re.compile(r'[^A-Za-z\s]')
# Matches a run of one or more whitespace characters.
_WHITESPACE_RE = re.compile(r'\s+')
# Cache for the English stop-word set; filled on first use.
_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop words as a set, loading them only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words('english'))
    return _STOP_WORDS


def pre_process(sentence):
    """Clean a sentence and return its tokens with English stop words removed.

    Steps:
      1. Strip surrounding whitespace and lower-case the text.
      2. Delete every character that is not an ASCII letter or whitespace.
      3. Collapse each run of whitespace into a single space.
      4. Tokenize with ``word_tokenize`` and drop NLTK English stop words.

    Parameters
    ----------
    sentence : str
        Raw text to clean.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    stop_words = _english_stop_words()
    # Keep letters and whitespace only.
    cleaned = _NON_LETTER_RE.sub('', sentence.strip().lower())
    # Replace multiple whitespace characters with one space.
    cleaned = _WHITESPACE_RE.sub(' ', cleaned)
    return [w for w in word_tokenize(cleaned) if w not in stop_words]

Delete unused columns;  
Rename the remaining columns to "label" and "text";  
Pre-process the content of the text column and store each result back in the text column as a single string

In [4]:
# Keep only the first two columns; every column from the third onward is dropped.
conlumn = df.columns
df = df.drop(columns=conlumn[2:])
# Give the two remaining columns descriptive names.
new_column = ['label', 'text']
df.columns = new_column
# Clean every message and store it back as one space-joined string.
# The whole column is assigned at once: writing to the rows yielded by
# DataFrame.iterrows() is not guaranteed to modify the DataFrame, because
# those rows may be copies.
df['text'] = df['text'].apply(lambda sentence: ' '.join(pre_process(sentence)))
df.head()

Unnamed: 0,label,text
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts st ...
3,ham,u dun say early hor u c already say
4,ham,nah dont think goes usf lives around though


In [5]:
# Inspect the cleaned text of the row labelled 0.
df.loc[0, 'text']

'go jurong point crazy available bugis n great world la e buffet cine got amore wat'

### 2.Use TF-IDF

In [6]:
# Build a TF-IDF vectorizer with its default settings.
tfidf_vectorizer = TfidfVectorizer()
# Fit the vectorizer on the cleaned messages and encode them as a TF-IDF
# feature matrix in one step.
tfidf_matrix = tfidf_vectorizer.fit_transform(df['text'])
# Names of the features produced by the fitted vectorizer.
tfidf_features = tfidf_vectorizer.get_feature_names_out()
# Display the total number of TF-IDF features.
tfidf_features.shape[0]

8388

### 3. Use feature selection with variance threshold  
(threshold level = 0.001) When the threshold is 0.01, no feature in the matrix has a variance greater than or equal to the threshold, so every feature would be removed. I therefore set the threshold level to 0.001.

In [7]:
# Variance cut-off passed to the selector.
threshold = 0.001
# Selector that removes features whose variance is below the cut-off.
variance_threshold = VarianceThreshold(threshold=threshold)
# Learn the per-feature variances, then keep only the surviving columns.
variance_threshold.fit(tfidf_matrix)
tfidf_matrix_variance = variance_threshold.transform(tfidf_matrix)
# Boolean mask over the original features: True where a feature was kept.
selected_features = variance_threshold.get_support()
# Summing the boolean mask counts the True entries, i.e. the kept features.
selected_features.sum()

141

### 4. Removed features:  
tfidf_features - selected_features = removed features

In [8]:
# Count the features rejected by the variance threshold: invert the boolean
# selection mask and sum the True entries.
(~selected_features).sum()

8247

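### 5. Apply stratified hold-out  
(use 70:30 ratio, shuffle = False, random_state = 1234)  
Note: `train_test_split` cannot stratify when `shuffle = False`, so the split below is a sequential (non-stratified) 70:30 hold-out and `random_state` has no effect.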

In [9]:
# Hold out 30% of the rows as the test set; the remaining 70% form the
# training set.
# NOTE(review): with shuffle=False the rows are split in their original
# order, so random_state has no effect and the split is not stratified by
# label.
x_train, x_test, y_train, y_test = train_test_split(
    tfidf_matrix_variance,
    df['label'].values,
    test_size=0.3,
    shuffle=False,
    random_state=1234,
)

### 6.1 train matrix shape

In [10]:
# Display the (rows, columns) shape of the training feature matrix.
x_train.shape

(3900, 141)

### 6.2 test matrix shape

In [11]:
# Display the (rows, columns) shape of the test feature matrix.
x_test.shape

(1672, 141)

### 7.1 top 10 rows of train set

In [12]:
# Show the first 10 rows of the training matrix, converted to a dense array
# for display.
x_train[:10].toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### 7.2 bottom 10 rows of test set

In [13]:
# Show the last 10 rows of the test matrix, converted to a dense array for
# display. (The section is about the test set, so x_test is sliced here
# rather than x_train.)
x_test[-10:].toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])