In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression




data = pd.read_csv('./data/SMSSpam_processed.csv', delimiter=",").dropna()

# convert the text data into vectors
vectorizer = TfidfVectorizer()

X = vectorizer.fit_transform(data['text'])
Y = data['label'].values.ravel()

In [None]:
print(X)
print(Y)

In [None]:
#write to file to see

# Convert the sparse matrix X into a dense array
X_dense = X.toarray()

# Create a DataFrame with X and Y
data_export = pd.DataFrame(X_dense, columns=vectorizer.get_feature_names_out())
data_export['label'] = Y

# Write the DataFrame to a CSV file
data_export.to_csv('X_Y_data.csv', index=False)

The first part of each task is to load the data and extract the features and assign them to the variable X and Y using the code snippet on the previous page.
You then need to implement the train_test_split method from the sklearn (taking into account the random_state attribute), where necessary.
You will then print the accuracy of the LogisticRegression model alone or based on the results of resampling.
To recap, your answer for each exercise will be printed to the terminal by your code, and represented by a number to 3 decimal places (e.g. 87.040).

In [7]:
#simple test_train_split method
#with linear regression model

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

test_size = 0.33
seed = 7

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)

model = LogisticRegression(solver = 'liblinear')
model.fit(X_train, Y_train)

tts_result = model.score(X_test, Y_test)

print("%.3f" % (tts_result * 100.0))

94.070


In this exercise you will implement the k-fold Cross Validation resampling approach. This exercise assumes you have the data loaded and you have your features X, and target variableY (you can keep your current implementation for the previous exercise and adapt it.
You will need to implement the KFold class from the sklearn.model_selection with k=10 folds (you will need to specify the value of the random_state attribute to ensure you obtain the same results).
You will also need to use the cross_val_score class to implement the approach and obtain the scores for each fold.
You will then print the average accuracy of a LogisticRegression model to 3 decimal places (e.g. 87.040) in order to answer the question to this exercise.

In [8]:
#k-fold cross validation


from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

num_folds = 10
seed = 7

kfold = KFold(n_splits=num_folds, random_state=seed ,shuffle=True)

model = LogisticRegression(solver = 'liblinear')

kf_results = cross_val_score(model, X, Y, cv=kfold)

print("%.3f" % (kf_results.mean() * 100.0))

95.437


In [9]:
#LeaveOneOut

from sklearn.model_selection import LeaveOneOut

loocv = LeaveOneOut()

model = LogisticRegression(solver = 'liblinear')

loo_results = cross_val_score(model, X, Y, cv=loocv)

print("%.3f" % (loo_results.mean() * 100.0))

95.725


In [10]:
from sklearn.model_selection import LeaveOneOut
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

loocv = LeaveOneOut()
model = LogisticRegression(solver='liblinear')

scores = []
for i, (train_index, test_index) in enumerate(loocv.split(X)):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]
    model.fit(X_train, y_train)
    score = model.score(X_test, y_test)
    scores.append(score)
    print(f"Finished iteration {i+1}/{len(Y)} with score: {score}")

print("Mean accuracy:", sum(scores) / len(scores))


Finished iteration 1/5567 with score: 1.0
Finished iteration 2/5567 with score: 1.0
Finished iteration 3/5567 with score: 1.0
Finished iteration 4/5567 with score: 1.0
Finished iteration 5/5567 with score: 1.0
Finished iteration 6/5567 with score: 0.0
Finished iteration 7/5567 with score: 1.0
Finished iteration 8/5567 with score: 1.0
Finished iteration 9/5567 with score: 1.0
Finished iteration 10/5567 with score: 1.0
Finished iteration 11/5567 with score: 1.0
Finished iteration 12/5567 with score: 1.0
Finished iteration 13/5567 with score: 1.0
Finished iteration 14/5567 with score: 1.0
Finished iteration 15/5567 with score: 1.0
Finished iteration 16/5567 with score: 0.0
Finished iteration 17/5567 with score: 1.0
Finished iteration 18/5567 with score: 1.0
Finished iteration 19/5567 with score: 1.0
Finished iteration 20/5567 with score: 0.0
Finished iteration 21/5567 with score: 1.0
Finished iteration 22/5567 with score: 1.0
Finished iteration 23/5567 with score: 1.0
Finished iteration 2

In [14]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import ShuffleSplit
import time

n_splits = 10
test_size = 0.33
seed = 7

kfold = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=seed)

model = LogisticRegression(solver = 'liblinear')

start_time = time.time()  # Record the start time

results = cross_val_score(model, X, Y, cv=kfold)

end_time = time.time()  # Record the end time

print("%.3f" % (results.mean() * 100.0))
execution_time = end_time - start_time  # Calculate the execution time
print("Execution time: %.2f seconds" % execution_time)

94.385
Execution time: 0.06 seconds


In [15]:

from sklearn.utils import resample
from sklearn.linear_model import LogisticRegression
import numpy as np

# Number of bootstrap samples
n_bootstraps = 100

# List to store the accuracy scores of each bootstrap sample
bootstrap_scores = []

# Fit the model to each bootstrap sample and calculate accuracy
for i in range(n_bootstraps):
    # Resample with replacement
    X_bootstrap, Y_bootstrap = resample(X, Y, random_state=i)
    
    # Create and fit the logistic regression model
    model = LogisticRegression(solver='liblinear')
    model.fit(X_bootstrap, Y_bootstrap)
    
    # Calculate accuracy on the original dataset
    score = model.score(X, Y)
    bootstrap_scores.append(score)

# Calculate the mean accuracy and its standard deviation
mean_accuracy = np.mean(bootstrap_scores) * 100.0
std_accuracy = np.std(bootstrap_scores) * 100.0

print("Mean accuracy: %.3f%%" % mean_accuracy)
print("Standard deviation of accuracy: %.3f%%" % std_accuracy)





Mean accuracy: 96.338%
Standard deviation of accuracy: 0.230%
