In [None]:
import pandas as pd

# Locations of the three CSV splits (relative to the notebook's working directory)
training_data_path, validation_data_path, test_data_path = (
    'training.csv',
    'validation.csv',
    'test.csv',
)

# Parse every split into its own DataFrame
training_data, validation_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (training_data_path, validation_data_path, test_data_path)
)

# Cell output: the leading rows of each split, to get a feel for the columns
(training_data.head(), validation_data.head(), test_data.head())

In [None]:
# Count how many training rows carry each distinct value of the 'label' column
training_data["label"].value_counts()

In [None]:
# Display the training DataFrame as this cell's output
training_data

In [None]:
# Display the validation DataFrame as this cell's output
validation_data

In [None]:
# Display the test DataFrame as this cell's output
test_data

In [None]:
# Remove fully duplicated rows from every split.
# NOTE(review): a later cell re-reads the CSV files into these same names,
# so the de-duplicated frames are replaced before TF-IDF features are built.
training_data, validation_data, test_data = (
    split.drop_duplicates()
    for split in (training_data, validation_data, test_data)
)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import re
import scipy.sparse as sp
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import os

lemmatizer = WordNetLemmatizer()

# Define the preprocessing function including stop words removal
def preprocess_text(text):
    """Normalise one document before TF-IDF vectorisation.

    The text is lowercased, tokenized with ``word_tokenize``, stripped of
    tokens found in ``ENGLISH_STOP_WORDS``, lemmatized, and cleaned of
    punctuation and digits. The token "not" is never emitted on its own;
    instead the next kept token is prefixed with "not_" (e.g. "not_good").

    Args:
        text: Raw document. Non-string values (such as the NaN that
            ``pd.read_csv`` yields for an empty cell) are treated as an
            empty document instead of raising on ``.lower()``.

    Returns:
        The kept tokens joined by single spaces; '' when nothing survives.
    """
    if not isinstance(text, str):
        return ''

    tokens = word_tokenize(text.lower())

    negate_next = False
    processed_tokens = []
    for word in tokens:
        # "not" is itself part of ENGLISH_STOP_WORDS, so it has to be
        # recognised BEFORE the stop-word filter or the negation handling
        # below can never trigger.
        if word == "not":
            negate_next = True
            continue
        if word in ENGLISH_STOP_WORDS:
            continue

        # Lemmatize the bare word first; a "not_"-prefixed token would not
        # be found in WordNet and would be left unlemmatized.
        word = lemmatizer.lemmatize(word)
        # Remove punctuation and numbers
        word = re.sub(r'[^\w\s]', '', word)
        word = re.sub(r'\d+', '', word)

        # Pure punctuation/number tokens are empty now; skip them so they
        # neither add stray spaces nor consume a pending negation.
        if not word:
            continue

        if negate_next:
            word = "not_" + word
            negate_next = False
        processed_tokens.append(word)
    return ' '.join(processed_tokens)

# Re-read the raw CSV splits.
# NOTE(review): this replaces the frames produced by the earlier
# drop_duplicates() cell, so the features below are built from the raw rows.
training_data, validation_data, test_data = (
    pd.read_csv(csv_name)
    for csv_name in ('training.csv', 'validation.csv', 'test.csv')
)

# Run preprocess_text over the 'text' column of every split
training_data = training_data.assign(text=training_data['text'].apply(preprocess_text))
validation_data = validation_data.assign(text=validation_data['text'].apply(preprocess_text))
test_data = test_data.assign(text=test_data['text'].apply(preprocess_text))

# TF-IDF over uni-, bi- and tri-grams; no max_features cap, but terms that
# occur in fewer than two documents are ignored (min_df=2).
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 3),
    min_df=2,
)

# Learn vocabulary and IDF weights from the training text only, then project
# all three splits into that feature space.
tfidf_vectorizer.fit(training_data['text'])
training_data_tfidf, validation_data_tfidf, test_data_tfidf = (
    tfidf_vectorizer.transform(frame['text'])
    for frame in (training_data, validation_data, test_data)
)

# The matrices are sparse, so they are persisted as .npz files
preprocessed_data_dir = 'Preprocessed Data/'
os.makedirs(preprocessed_data_dir, exist_ok=True)

# One output file per split inside the preprocessed-data directory
training_data_tfidf_file, validation_data_tfidf_file, test_data_tfidf_file = (
    os.path.join(preprocessed_data_dir, npz_name)
    for npz_name in ('training_tfidf.npz', 'validation_tfidf.npz', 'test_tfidf.npz')
)

# Write the sparse matrices to disk
sp.save_npz(training_data_tfidf_file, training_data_tfidf)
sp.save_npz(validation_data_tfidf_file, validation_data_tfidf)
sp.save_npz(test_data_tfidf_file, test_data_tfidf)

# Cell output: the three file paths, as confirmation of what was written
training_data_tfidf_file, validation_data_tfidf_file, test_data_tfidf_file

#### SVC Baseline Model

In [None]:
import numpy as np
import os
import scipy.sparse as sp
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# File paths for TF-IDF data.
# The preprocessing cell writes the matrices into 'Preprocessed Data/', so they
# have to be loaded from that same directory; loading the bare file names from
# the working directory fails on a fresh run.
preprocessed_data_dir = 'Preprocessed Data/'
training_data_tfidf_file = os.path.join(preprocessed_data_dir, 'training_tfidf.npz')
validation_data_tfidf_file = os.path.join(preprocessed_data_dir, 'validation_tfidf.npz')
test_data_tfidf_file = os.path.join(preprocessed_data_dir, 'test_tfidf.npz')

# Load the original data with labels
original_training_data = pd.read_csv('training.csv')
original_validation_data = pd.read_csv('validation.csv')
original_test_data = pd.read_csv('test.csv')

# Extract labels from the original data
training_labels = original_training_data['label']
validation_labels = original_validation_data['label']
test_labels = original_test_data['label']

# Load the TF-IDF data
training_data_tfidf = sp.load_npz(training_data_tfidf_file)
validation_data_tfidf = sp.load_npz(validation_data_tfidf_file)
test_data_tfidf = sp.load_npz(test_data_tfidf_file)

# The features and the labels come from separate files; make sure every split
# still has exactly one label per feature row before training on them.
assert training_data_tfidf.shape[0] == len(training_labels), "training rows/labels mismatch"
assert validation_data_tfidf.shape[0] == len(validation_labels), "validation rows/labels mismatch"
assert test_data_tfidf.shape[0] == len(test_labels), "test rows/labels mismatch"

# Initialize the Support Vector classifier (library defaults = baseline)
svc = SVC()

# Train the classifier
svc.fit(training_data_tfidf, training_labels)

# Predict on validation and test data
validation_predictions = svc.predict(validation_data_tfidf)
test_predictions = svc.predict(test_data_tfidf)

# Evaluate the classifier on the validation split
print("Validation Set Performance:")
print(classification_report(validation_labels, validation_predictions))
print("Accuracy:", accuracy_score(validation_labels, validation_predictions))
print("\nConfusion Matrix:")
print(confusion_matrix(validation_labels, validation_predictions))

#### SVC Hyperparameter Tuning using GridSearchCV

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Load your dataset
# NOTE(review): validation.csv and test.csv are read here but not used below;
# this cell evaluates on a split carved out of training.csv instead.
original_training_data = pd.read_csv('training.csv')
original_validation_data = pd.read_csv('validation.csv')
original_test_data = pd.read_csv('test.csv')

# Assuming your dataset has 'text' column for the input text and 'label' column for the labels
text_data = original_training_data['text']
labels = original_training_data['label']

# Split the dataset into training (80%), validation (10%) and test (10%) sets.
# The intermediate 20% hold-out gets its own names so it does not shadow the
# final test split.
train_texts, holdout_texts, train_labels, holdout_labels = train_test_split(
    text_data, labels, test_size=0.2, random_state=42
)
val_texts, test_texts, val_labels, test_labels = train_test_split(
    holdout_texts, holdout_labels, test_size=0.5, random_state=42
)

# Convert text data to TF-IDF features (vocabulary learned from the training part only)
vectorizer = TfidfVectorizer(max_features=5000)
train_tfidf = vectorizer.fit_transform(train_texts)
val_tfidf = vectorizer.transform(val_texts)
test_tfidf = vectorizer.transform(test_texts)

# Convert TF-IDF matrices to pandas DataFrames (for inspection only; the
# classifier below is trained on the sparse matrices)
feature_names = vectorizer.get_feature_names_out()
train_tfidf_df = pd.DataFrame(train_tfidf.toarray(), columns=feature_names)
val_tfidf_df = pd.DataFrame(val_tfidf.toarray(), columns=feature_names)
test_tfidf_df = pd.DataFrame(test_tfidf.toarray(), columns=feature_names)

# Display the resulting DataFrames
print("Train TF-IDF DataFrame")
display(train_tfidf_df)

print("\nValidation TF-IDF DataFrame")
display(val_tfidf_df)

# Display the Test TF-IDF DataFrame
print("\nTest TF-IDF DataFrame")
display(test_tfidf_df)

# Initialize the Support Vector classifier
svc = SVC()

# Define the parameter grid to search
param_grid = {
    'C' : [1, 10, 100],
    'kernel' : ['linear','rbf']
}

# Use GridSearchCV to find the best hyperparameters
grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, scoring='accuracy', cv=5)
grid_search.fit(train_tfidf, train_labels)

# Get the best hyperparameters
best_params = grid_search.best_params_
print("\nBest Hyperparameters:")
print(best_params)

# With the default refit=True, GridSearchCV has already re-trained an SVC with
# the best hyperparameters on the full training data, so reuse that model
# instead of paying for another identical fit.
best_svc = grid_search.best_estimator_

# Predict on validation data
val_predictions = best_svc.predict(val_tfidf)

# Evaluate the classifier on the validation set
print("\nValidation Set Performance")
print(classification_report(val_labels, val_predictions))
print("Accuracy:", accuracy_score(val_labels, val_predictions))
print("\nConfusion Matrix:")
print(confusion_matrix(val_labels, val_predictions))

# Predict on test data
test_predictions = best_svc.predict(test_tfidf)

# Evaluate the classifier on the test set
print("\nTest Set Performance")
print(classification_report(test_labels, test_predictions))
print("Accuracy:", accuracy_score(test_labels, test_predictions))
print("\nConfusion Matrix:")
print(confusion_matrix(test_labels, test_predictions))