# In [1]:  (notebook cell prompt)
# Import the necessary library
from google.colab import files

# Upload the zip file (this will prompt you to select the file from your local system)
uploaded = files.upload()

# Unzip the file
!unzip mini-project-1.zip

import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

def _digits_to_matrix(sequences):
    """Convert an iterable of digit strings into a 2-D integer array.

    Each string becomes one row and each character one integer column.
    NOTE(review): assumes every sequence has the same length, otherwise
    NumPy cannot build a rectangular array -- confirm against the dataset.
    """
    return np.array([[int(ch) for ch in seq] for seq in sequences])


# Load the training and validation data.
# `input_str` is pinned to `str` so pandas cannot infer a numeric dtype for
# the all-digit sequences; a numeric dtype would drop leading zeros and make
# the per-character conversion below fail.
train_data = pd.read_csv('train_text_seq.csv', dtype={'input_str': str})
valid_data = pd.read_csv('valid_text_seq.csv', dtype={'input_str': str})

# Convert input strings into numerical arrays (features) and integer labels.
X_train = _digits_to_matrix(train_data['input_str'])
y_train = train_data['label_str'].astype(int).to_numpy()

X_valid = _digits_to_matrix(valid_data['input_str'])
y_valid = valid_data['label_str'].astype(int).to_numpy()

# Soft-margin linear SVM: C is the penalty applied to margin violations and
# is the hyperparameter to tune.
C = 1.0

# fit() returns the estimator itself, so the model is built and trained on
# the full training data in a single expression.
model = SVC(kernel='linear', C=C).fit(X_train, y_train)

# Score the fitted model on the held-out validation set.
y_pred_valid = model.predict(X_valid)
accuracy = accuracy_score(y_valid, y_pred_valid)
print(f"Validation Accuracy: {accuracy:.4f}")

# Fractions of the training set to train on; one validation accuracy is
# recorded per fraction, in the same order.
train_sizes = [0.2, 0.4, 0.6, 0.8, 1.0]
accuracies = []

for size in train_sizes:
    if size >= 1.0:
        # train_test_split only accepts a float train_size strictly between
        # 0 and 1, so "100%" must bypass it and use the full training set.
        X_train_subset, y_train_subset = X_train, y_train
    else:
        # Draw a shuffled random subset holding `size` of the training data
        # (fixed seed so the subsets are reproducible between runs).
        X_train_subset, _, y_train_subset, _ = train_test_split(
            X_train, y_train, train_size=size, random_state=42
        )

    # Train the SVM model on this subset (fit() discards any previous fit)
    model.fit(X_train_subset, y_train_subset)

    # Predict on the validation set
    y_pred_valid = model.predict(X_valid)

    # Calculate accuracy
    accuracy = accuracy_score(y_valid, y_pred_valid)
    accuracies.append(accuracy)

    print(f"Training with {int(size * 100)}% data: Validation Accuracy = {accuracy:.4f}")

# Draw the learning curve: validation accuracy against the percentage of
# training data used, one marker per evaluated training size.
percentages = [int(fraction * 100) for fraction in train_sizes]

fig, ax = plt.subplots()
ax.plot(percentages, accuracies, marker='o')
ax.set_xlabel('% of Training Data')
ax.set_ylabel('Validation Accuracy')
ax.set_title('Validation Accuracy vs. Training Data Size')
ax.grid(True)
plt.show()


# Cell output: KeyboardInterrupt