In [1]:
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
import evaluation
import tqdm

In [2]:
# Step 1: Parse the training corpus (a UTF-8 encoded JSON file) into memory
with open('a1_data/train.json', mode='r', encoding='utf-8') as train_file:
    data = json.loads(train_file.read())

In [3]:
# Pull the raw sentence and its language label out of every record
sentences = [record['text'] for record in data]
languages = [record['langid'] for record in data]

# Step 2: Hold out 20% of the examples for testing; the fixed random_state
# makes the split reproducible from run to run
X_train, X_test, y_train, y_test = train_test_split(
    sentences,
    languages,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Step 3: Turn the raw sentences into TF-IDF feature matrices.
# The vectorizer is fitted on the training split only and then reused,
# unchanged, to transform the held-out test split.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf_vectorizer.transform(raw_documents=X_test)

In [5]:
# Step 4: Fit a linear-kernel SVM on the TF-IDF training features.
# verbose=True makes LibSVM print its optimisation progress while fitting.
svm_classifier = SVC(verbose=True, kernel='linear')
svm_classifier.fit(X=X_train_tfidf, y=y_train)

[LibSVM]........*...*
optimization finished, #iter = 11814
obj = -817.329138, rho = 0.721717
nSV = 6549, nBSV = 360
..........*.....*
optimization finished, #iter = 15465
obj = -882.694133, rho = 0.816958
nSV = 8029, nBSV = 444
......*..*
optimization finished, #iter = 8995
obj = -749.088770, rho = 0.570891
nSV = 4778, nBSV = 278
.....*...*
optimization finished, #iter = 8286
obj = -699.898748, rho = 0.523570
nSV = 4475, nBSV = 256
.*.*
optimization finished, #iter = 2667
obj = -260.481688, rho = -0.509467
nSV = 1549, nBSV = 92
.......*...*
optimization finished, #iter = 10855
obj = -759.766762, rho = 0.664426
nSV = 5687, nBSV = 302
...*.*
optimization finished, #iter = 4932
obj = -616.881002, rho = 0.343349
nSV = 2955, nBSV = 174
....*.*
optimization finished, #iter = 5462
obj = -669.843035, rho = 0.484662
nSV = 3306, nBSV = 228
...*.*
optimization finished, #iter = 4462
obj = -554.173106, rho = 0.124416
nSV = 2519, nBSV = 119
......*...*
optimization finished, #iter = 9966
obj = -761

In [None]:
# Step 5: Predict the language of every held-out sentence.
# The whole test matrix is classified in a single vectorised call:
#   * no per-row loop is needed, so nothing depends on len() of the TF-IDF
#     matrix (SciPy sparse matrices raise TypeError on len()) or on a
#     callable progress-bar helper;
#   * predict() returns one label per row, and .tolist() converts that array
#     into a plain Python list of labels aligned element-for-element with
#     y_test, rather than a list of length-1 arrays.
predictions_svm = svm_classifier.predict(X_test_tfidf).tolist()

In [None]:
# Report the micro-averaged F1 of the SVM predictions against the held-out labels
micro_f1_svm = evaluation.compute_micro_f1_score(predictions_svm, y_test)
print("Micro F1 score for SVM:", micro_f1_svm)

In [None]:
# Report the macro-averaged F1 of the SVM predictions against the held-out labels
macro_f1_svm = evaluation.compute_macro_f1_score(predictions_svm, y_test)
print("Macro F1 score for SVM:", macro_f1_svm)