In [5]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
import joblib

# Custom tokenizer: isolate selected punctuation as separate tokens, then split on whitespace
def custom_tokenizer(text):
    """Return the lower-cased, whitespace-separated tokens of *text*.

    Single quotes, double quotes, semicolons, equals signs and the ``--``
    sequence are padded with spaces before splitting, so each of them comes
    out as its own token instead of staying glued to a neighbouring word.
    No characters are removed.
    """
    # Pad the single-character markers first, then the two-character "--".
    padded = re.sub(r"--", " -- ", re.sub(r"(['\";=])", r" \1 ", text))
    # Collapse whitespace runs to a single space and normalise case.
    normalised = re.sub(r"\s+", " ", padded).lower()
    tokens = normalised.strip().split()
    return tokens

# Preprocessing pipeline: load the CSV, clean it, split into train/test,
# TF-IDF vectorize with the custom tokenizer, oversample the training set
# with SMOTE, and persist every artifact with joblib.

# Load the dataset (the 'Sentence' and 'Label' columns are used below)
df = pd.read_csv('balanced.csv')
print(f"Dataset shape: {df.shape}")
print(df.head())

# Drop rows where either the text or its label is missing. A NaN label would
# be silently skipped by value_counts() below and cannot serve as a class
# target, so it is removed together with NaN sentences.
df.dropna(subset=['Sentence', 'Label'], inplace=True)
print(f"After dropping missing values: {df.shape}")

# Ensure all 'Sentence' entries are strings, since the tokenizer applies
# regex substitutions to each entry
df['Sentence'] = df['Sentence'].astype(str)

# Define the feature (X) and label (y)
X = df['Sentence']
y = df['Label']

# Split the data into training (70%) and testing (30%) sets; the fixed seed
# keeps the split reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)
# Count the number of samples in training and test sets
print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# Count labels in the training set
train_label_counts = y_train.value_counts()
print("\nTraining label distribution:")
print(f"Safe (0): {train_label_counts.get(0, 0)}")
print(f"Malicious (1): {train_label_counts.get(1, 0)}")

# Count labels in the test set
test_label_counts = y_test.value_counts()
print("\nTest label distribution:")
print(f"Safe (0): {test_label_counts.get(0, 0)}")
print(f"Malicious (1): {test_label_counts.get(1, 0)}")

# Initialize the TfidfVectorizer with the custom tokenizer.
# token_pattern=None states explicitly that the default regex pattern is not
# used when a tokenizer is supplied, which avoids scikit-learn's
# "token_pattern will not be used" warning.
vectorizer = TfidfVectorizer(
    tokenizer=custom_tokenizer,
    token_pattern=None,
    max_features=20000,
)

# Fit on the training data only, then reuse the fitted vocabulary for the
# test data so no test information enters the vocabulary or IDF weights
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Print the shape of the resulting feature matrices
print(f"After vectorization: X_train_vect shape: {X_train_vect.shape} | X_test_vect shape: {X_test_vect.shape}")

# Apply SMOTE to the training set only (handling class imbalance); the test
# set is left untouched
smote = SMOTE(sampling_strategy='auto', random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train_vect, y_train)

# Print the size of the resampled training set
print(f"After SMOTE: Resampled training set size: {X_train_resampled.shape[0]} | Test set size remains: {X_test_vect.shape[0]}")

# Save the preprocessed data and the vectorizer for later use.
# NOTE: the pickled vectorizer stores custom_tokenizer by reference, so the
# function must be defined/importable under the same module name wherever
# 'vectorizer.pkl' is loaded.
joblib.dump(vectorizer, 'vectorizer.pkl')
joblib.dump(X_train_resampled, 'X_train_resampled.pkl')
joblib.dump(y_train_resampled, 'y_train_resampled.pkl')
joblib.dump(X_test_vect, 'X_test_vect.pkl')
joblib.dump(y_test, 'y_test.pkl')

# Print information about the vectorizer
print("Preprocessing complete and vectorizer saved!")
print(f"Vectorizer Vocabulary Size: {len(vectorizer.vocabulary_)}")



Dataset shape: (23322, 2)
                                            Sentence  Label
0  Geisingen Kirchen-Hausen first documented toge...      0
1      The victim later died result serious injuries      0
2  Aircraft electronic device rules stay force Au...      0
3  The problem known administrative department in...      0
4  The specific figures must calculated municipal...      0
After dropping missing values: (23322, 2)

Training set size: 16325
Test set size: 6997

Training label distribution:
Safe (0): 8568
Malicious (1): 7757

Test label distribution:
Safe (0): 3655
Malicious (1): 3342




After vectorization: X_train_vect shape: (16325, 20000) | X_test_vect shape: (6997, 20000)
After SMOTE: Resampled training set size: 17136 | Test set size remains: 6997
Preprocessing complete and vectorizer saved!
Vectorizer Vocabulary Size: 20000
