# In [1]:
from google.colab import files
import zipfile
import os

# Upload the zip file. files.upload() returns a mapping keyed by the name of
# each uploaded file.
uploaded = files.upload()

# Extract the zip file. Use the name of the archive that was actually
# uploaded instead of assuming a fixed file name; fall back to the default
# name when this upload did not include a .zip file (e.g. the archive is
# already present from an earlier run).
zip_names = [name for name in uploaded if name.lower().endswith('.zip')]
zip_path = zip_names[0] if zip_names else 'mini-project-1.zip'
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall('/content/')

# List the extracted files
print(os.listdir('/content/'))

import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Read the per-sample embedding stacks for the training and validation splits.
# NOTE(review): the original notes give the array shape as
# (n_samples, 13, 786); that is not checked here - confirm against the data.
train_features = np.load('/content/train_feature.npy')
valid_features = np.load('/content/valid_feature.npy')

# Read the matching target column for each split.
train_labels = pd.read_csv('/content/train_emoticon.csv')['label']
valid_labels = pd.read_csv('/content/valid_emoticon.csv')['label']

# Feature transformation: collapse axis 1 by taking its mean, so every sample
# is represented by a single averaged vector instead of a stack of vectors.
X_train_avg = train_features.mean(axis=1)
X_val_avg = valid_features.mean(axis=1)

# Report the resulting shapes as a quick sanity check.
print(f"Train features shape: {X_train_avg.shape}")
print(f"Validation features shape: {X_val_avg.shape}")

# Training-set fractions (in percent) to evaluate, and the validation accuracy
# recorded for each one; `accuracies` is filled in the same order.
percentages = [20, 40, 60, 80, 100]
accuracies = []

# Fit one linear-kernel SVM per fraction and score it on the full validation
# set. The total row count does not change between iterations, so read it once.
n_train_total = X_train_avg.shape[0]
for pct in percentages:
    # Number of leading training rows that make up `pct` percent of the data.
    subset_size = int((pct / 100) * n_train_total)
    leading_rows = slice(subset_size)
    X_train_subset = X_train_avg[leading_rows]
    y_train_subset = train_labels[leading_rows]

    # fit() returns the estimator itself, so construction and training chain.
    model = SVC(C=1.0, kernel='linear').fit(X_train_subset, y_train_subset)

    # Score the fitted model against the held-out validation labels.
    y_val_pred = model.predict(X_val_avg)
    accuracy = accuracy_score(valid_labels, y_val_pred)
    accuracies.append(accuracy)

    print(f"Training set size: {subset_size}, Validation Accuracy: {accuracy:.4f}")

# Draw the learning curve: validation accuracy as a function of the share of
# training data used, one marked point per entry in `percentages`.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(percentages, accuracies, marker='o', linestyle='-', color='b')
ax.set_title('Validation Accuracy vs. Training Set Size (Averaging Embeddings)')
ax.set_xlabel('Training Set Size (%)')
ax.set_ylabel('Validation Accuracy')
ax.grid(True)
plt.show()


# Output: KeyboardInterrupt (execution of this cell was interrupted)