In [33]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
import os

In [None]:
# Load the feature matrix (X) and the ground-truth labels (y), then validate them.
#
# Failures raise exceptions instead of calling exit(): inside a notebook, exit()
# asks the kernel to shut down, which discards all state and hides the real error.
# NOTE(review): these absolute paths only resolve on the original author's machine;
# consider pointing them at a configurable, repo-relative data directory.
features_path = '/Users/yumin/Documents/GitHub/TikTok-TechJam-2025/feature_engineering_model/temp.csv'
labels_path = '/Users/yumin/Documents/GitHub/TikTok-TechJam-2025/data_gpt_labeler/final_data_labeled_1.csv'

required_features = ['A', 'B', 'D2', 'E', 'G']
required_label = 'policy_label'

print("\nLoading data from CSV files...")
try:
    # load the 5D feature data
    X = pd.read_csv(features_path)

    # load the binary ground truth labels
    y = pd.read_csv(labels_path)
except FileNotFoundError as err:
    # Re-raise with both expected locations so the reader knows what to fix.
    raise FileNotFoundError(
        f"Could not read the input CSV files. Expected features at "
        f"'{features_path}' and labels at '{labels_path}'. Original error: {err}"
    ) from err

# check that every required feature column exists, and report exactly which are missing
missing_features = [col for col in required_features if col not in X.columns]
if missing_features:
    raise ValueError(
        f"'{features_path}' is missing required feature columns: {missing_features}."
    )

if required_label not in y.columns:
    raise ValueError(
        f"'{labels_path}' must contain a '{required_label}' column."
    )

# Features and labels are later joined column-wise by row index, so they must
# describe the same number of rows.
if len(X) != len(y):
    raise ValueError(
        f"Row count mismatch: {len(X)} feature rows vs {len(y)} label rows."
    )

print("Data loaded successfully.")


Loading data from CSV files...
Data loaded successfully.


In [42]:
# Show the class balance of the labels, followed by the loaded feature frame.
print(y['policy_label'].value_counts(), X, sep='\n')

policy_label
1    8910
0    1090
Name: count, dtype: int64
             A         B        D2         E         G
0     0.126823  0.581653  0.660104  0.993421  0.999407
1     0.066522  0.565305  0.620909  0.360918  0.997275
2     0.158263  0.575038  0.551840  0.987322  0.999049
3     0.062633  0.575837  0.630236  0.859828  0.999422
4     0.005904  0.572793  0.792418  0.901759  0.999557
...        ...       ...       ...       ...       ...
9995  0.353107  0.584095  0.609373  0.986010  0.999477
9996  0.407681  0.572810  0.561827  0.803679  0.999491
9997  0.129818  0.583600  0.596096  0.905910  0.999499
9998  0.024677  0.561784  0.500000  0.767955  0.999611
9999  0.298661  0.572159  0.588138  0.982511  0.999352

[10000 rows x 5 columns]


In [None]:
# undersampling majority class
def undersample_data(X_data, y_data, label_col='policy_label', random_state=42):
    """Balance a binary (0/1) dataset by randomly undersampling the larger class.

    Rows labelled 0 and rows labelled 1 are separated, the larger group is
    randomly reduced (without replacement) to the size of the smaller group,
    and the result is shuffled. Rows whose label is neither 0 nor 1 are
    dropped from the balanced output.

    Parameters
    ----------
    X_data : pandas.DataFrame
        Feature columns. Joined to ``y_data`` column-wise, so both must share
        the same row index.
    y_data : pandas.Series or pandas.DataFrame
        Labels; must provide a column named ``label_col`` once concatenated
        (for a Series, its ``name`` must equal ``label_col``).
    label_col : str, default 'policy_label'
        Name of the label column.
    random_state : int, default 42
        Seed used for both the undersampling and the final shuffle.

    Returns
    -------
    (X_balanced, y_balanced) : tuple
        Balanced features (DataFrame without ``label_col``) and labels
        (Series), both with a fresh 0..n-1 index. If either class has no
        rows, a warning is printed and the inputs are returned unchanged.
    """
    # combine features and labels into a single DataFrame for easy manipulation
    combined_df = pd.concat([X_data, y_data], axis=1)

    positive_rows = combined_df[combined_df[label_col] == 1]
    negative_rows = combined_df[combined_df[label_col] == 0]

    # With an empty class there is nothing to balance against; sampling would
    # either raise or produce an empty frame, so return the inputs untouched.
    if positive_rows.empty or negative_rows.empty:
        print("Warning: Minority class is empty. Cannot perform undersampling.")
        return X_data, y_data

    # Decide the majority class from the actual counts instead of assuming it
    # is label 1. On a tie, label 1 is still treated as the majority so results
    # match the previous behaviour.
    if len(negative_rows) > len(positive_rows):
        majority_class, minority_class = negative_rows, positive_rows
    else:
        majority_class, minority_class = positive_rows, negative_rows

    # undersample the majority class to match the number of samples in the minority class
    undersampled_majority = majority_class.sample(
        n=len(minority_class),
        replace=False,
        random_state=random_state
    )

    balanced_df = pd.concat([undersampled_majority, minority_class])

    # shuffle the combined df to ensure samples are not ordered by class
    balanced_df = balanced_df.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # split the balanced df back into features (X_balanced) and labels (y_balanced)
    X_balanced = balanced_df.drop(label_col, axis=1)
    y_balanced = balanced_df[label_col]

    print(f"Original number of samples: {len(combined_df)}")
    print(f"Balanced number of samples: {len(balanced_df)}")
    print("Undersampling complete.")

    return X_balanced, y_balanced

# Balance the dataset using the 'policy_label' column of the loaded labels.
X_balanced, y_balanced = undersample_data(X_data=X, y_data=y['policy_label'])


Performing undersampling to balance the dataset...
Original number of samples: 10000
Balanced number of samples: 2180
Undersampling complete.


In [46]:
# Inspect the balanced label Series returned by undersample_data.
print(y_balanced)

0       1
1       1
2       1
3       0
4       1
       ..
2175    0
2176    0
2177    0
2178    0
2179    1
Name: policy_label, Length: 2180, dtype: int64


In [47]:
# Inspect the balanced feature frame returned by undersample_data.
print(X_balanced)

             A         B        D2         E         G
0     0.058882  0.558941  0.324697  0.990225  0.999323
1     0.006189  0.551747  0.644799  0.993508  0.999353
2     0.245700  0.607392  0.633163  0.969169  0.999559
3     0.010014  0.526322  0.000000  0.847382  0.989642
4     0.135020  0.577114  0.710586  0.982642  0.999343
...        ...       ...       ...       ...       ...
2175  0.008589  0.531942  0.000000  0.884126  0.991157
2176  0.009070  0.538217  0.500000  0.858099  0.999559
2177  0.011487  0.552040  0.688271  0.842453  0.999393
2178  0.012410  0.565927  0.597343  0.575527  0.999489
2179  0.158671  0.584328  0.668578  0.770834  0.999300

[2180 rows x 5 columns]


In [None]:

# Hold out 20% of the balanced data for testing; stratifying on the labels
# keeps the class proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X_balanced,
    y_balanced,
    stratify=y_balanced,
    test_size=0.2,
    random_state=42,
)


Splitting data into training and testing sets...


In [49]:
# Report how many rows ended up in each partition.
print(
    f"Training set size: {X_train.shape[0]} samples",
    f"Testing set size: {X_test.shape[0]} samples",
    sep="\n",
)

Training set size: 1744 samples
Testing set size: 436 samples


In [None]:
# Fit a linear-kernel support vector classifier on the training partition.
# fit() returns the estimator itself, so construction and training can be chained.
model = SVC(kernel='linear', random_state=42).fit(X_train, y_train)
print("SVM model training complete.")


Initializing and training the SVM classifier...
SVM model training complete.


In [None]:
# Predict labels for the held-out test partition.
y_pred = model.predict(X_test)

# Overall fraction of correct predictions.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Per-class precision / recall / F1 breakdown.
print(
    "\nClassification Report:",
    classification_report(y_true=y_test, y_pred=y_pred),
    sep="\n",
)


Evaluating the model on the test set...
Model Accuracy: 0.61

Classification Report:
              precision    recall  f1-score   support

           0       0.66      0.44      0.53       218
           1       0.58      0.77      0.66       218

    accuracy                           0.61       436
   macro avg       0.62      0.61      0.60       436
weighted avg       0.62      0.61      0.60       436

