In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from joblib import dump


df1 = pd.read_csv('../backend/dataset1.tsv', delimiter='\t')

# Load the second dataset
df2 = pd.read_csv('../backend/dataset2.csv')
df3=pd.read_csv('../backend/dataset3.csv')

# Check column names
print("Original Dataset Columns:", df1.columns)
print("Second Dataset Columns:", df2.columns)

# Rename columns if necessary
df2 = df2.rename(columns={'Pattern String': 'text'})

# Concatenate the two datasets based on their common columns
df = pd.concat([df1[['text', 'Pattern Category']], df2[['text', 'Pattern Category']],df3[['text', 'Pattern Category']]], ignore_index=True)

# Drop rows with missing values
df = df.dropna(subset=['text', 'Pattern Category'])

# Display the count of rows after concatenation
print(df.count())
# Shuffle the dataset
df = shuffle(df, random_state=42)

# Data preprocessing
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]
    text = ' '.join(tokens)
    return text

df['text'] = df['text'].apply(preprocess_text)
label_encoder = LabelEncoder()
df['Pattern Category'] = label_encoder.fit_transform(df['Pattern Category'])

# Features (X) - 'text', Labels (y) - 'Pattern Category'
X = df['text']
y = df['Pattern Category']

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Linear Support Vector Machine (SVM) model
svm_model = LinearSVC()
svm_model.fit(X_train_tfidf, y_train)

# Predictions on the test set
y_pred = svm_model.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Display classification report for precision, recall, and F1-score
print("Classification Report:\n", classification_report(y_test, y_pred))

# Save the trained model to a file
model_filename = 'svm_model.joblib'
dump(svm_model, model_filename)

# Save the TF-IDF vectorizer as well for later use in the frontend
vectorizer_filename = 'tfidf_vectorizer.joblib'
dump(tfidf_vectorizer, vectorizer_filename)

# Save the LabelEncoder
label_encoder_filename = 'label_encoder.joblib'
dump(label_encoder, label_encoder_filename)


Original Dataset Columns: Index(['page_id', 'text', 'label', 'Pattern Category'], dtype='object')
Second Dataset Columns: Index(['Pattern String', 'Comment', 'Pattern Category', 'Pattern Type',
       'Where in website?', 'Deceptive?', 'Website Page'],
      dtype='object')
text                3919
Pattern Category    3919
dtype: int64




Accuracy: 0.97
Classification Report:
               precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           2       0.99      0.96      0.97        90
           3       0.95      0.97      0.96       230
           4       1.00      1.00      1.00        12
           5       0.97      0.99      0.98       235
           6       0.75      0.43      0.55         7
           7       0.98      0.98      0.98       129
           8       1.00      1.00      1.00         3
           9       0.99      0.94      0.96        77

    accuracy                           0.97       784
   macro avg       0.90      0.92      0.90       784
weighted avg       0.97      0.97      0.97       784



['label_encoder.joblib']