In [1]:
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
import evaluation
# Configuration: values a reader may want to tune, gathered in one place.
TRAIN_PATH = 'a1_data/train.json'
TEST_SIZE = 0.2
SEED = 42

# Step 1: Load data from JSON file
# Decode explicitly as UTF-8: without `encoding`, open() falls back to the
# platform's locale encoding, which can fail on (or silently corrupt)
# non-ASCII text.
with open(TRAIN_PATH, 'r', encoding='utf-8') as f:
    data = json.load(f)

# Extract sentences and languages
# Each entry is indexed by its 'text' and 'langid' keys.
sentences = [entry['text'] for entry in data]
languages = [entry['langid'] for entry in data]

# Step 2: Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    sentences, languages, test_size=TEST_SIZE, random_state=SEED
)

# Step 3: Convert text data into numerical features using TF-IDF
# The vectorizer is fitted on the training split only and then reused for the
# test split, so no test-set vocabulary/statistics leak into training.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Step 4: Define the Logistic Regression model
# Left unfitted here: VotingClassifier.fit() trains clones of its estimators,
# so fitting this instance beforehand would be discarded work.
logistic_regression_classifier = LogisticRegression(max_iter=1000)

# Step 5: Define the SVM model
# probability=True is needed for soft voting (it provides predict_proba).
# random_state seeds the data shuffling used for the probability estimates so
# repeated runs give the same results.
svm_classifier = SVC(kernel='linear', probability=True, random_state=SEED)

# Step 6: Create a voting classifier combining Logistic Regression and SVM
# This single fit() call trains both base models.
voting_classifier = VotingClassifier(
    estimators=[('lr', logistic_regression_classifier), ('svm', svm_classifier)],
    voting='soft',
)
voting_classifier.fit(X_train_tfidf, y_train)

# Point the base-model names at the fitted clones held by the ensemble, so
# they still refer to trained models, as they did when each was fitted
# separately.
logistic_regression_classifier = voting_classifier.named_estimators_['lr']
svm_classifier = voting_classifier.named_estimators_['svm']

# Step 7: Evaluate the combined model
predictions = voting_classifier.predict(X_test_tfidf)


In [None]:
# Report micro- and macro-averaged F1 for the combined model using the
# project's `evaluation` helpers.
# NOTE(review): arguments are passed as (predictions, y_test), as before —
# confirm this matches the parameter order the helpers expect.
micro_f1 = evaluation.compute_micro_f1_score(predictions, y_test)
print("Micro F1 Score for combined model: ", micro_f1)

macro_f1 = evaluation.compute_macro_f1_score(predictions, y_test)
print("Macro F1 Score for combined model: ", macro_f1)