In [1]:
import json
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
import evaluation
import numpy as np

### Load Training and Testing Data

In [2]:
# Step 1: Read the training set and the validation set from their JSON files.
# Both files are decoded as UTF-8.
with open('a1_data/train.json', 'r', encoding='utf-8') as train_file:
    data_train = json.load(train_file)

with open('valid_new.json', 'r', encoding='utf-8') as valid_file:
    data_test = json.load(valid_file)

### Training Data

In [3]:
# Step 2: Pull the raw sentences and their language labels out of the training
# records. No train/test split happens here: the whole file is used for training.
X_train = [record['text'] for record in data_train]
Y_train = [record['langid'] for record in data_train]

### Predict on the training data and remove the misclassified samples (outliers)

In [4]:
# Self-training filter: fit a character n-gram Naive Bayes model on the training
# set, predict on that same set, and keep only the samples whose label the model
# reproduces. Samples the model gets wrong are treated as outliers and dropped.
CLEANING_PASSES = 1  # number of filter passes to run over the training data

for cleaning_pass in range(CLEANING_PASSES):
    count_vectorizer = CountVectorizer(ngram_range=(4,6),analyzer='char')
    X_train_counts = count_vectorizer.fit_transform(X_train)
    naive_bayes_classifier = MultinomialNB(alpha=0.01)
    naive_bayes_classifier.fit(X_train_counts, Y_train)
    # The data being scored is the training data itself, so reuse the matrix
    # returned by fit_transform instead of vectorizing the same corpus again.
    predictions_nb = naive_bayes_classifier.predict(X_train_counts)

    # Keep only the (sentence, label) pairs the model classified correctly.
    X_train_new = []
    Y_train_new = []
    for sentence, language, predicted in zip(X_train, Y_train, predictions_nb):
        if language == predicted:
            X_train_new.append(sentence)
            Y_train_new.append(language)
    X_train = X_train_new
    Y_train = Y_train_new

    # Drop the large intermediates right away to keep kernel memory down.
    del X_train_new, Y_train_new
    del count_vectorizer, X_train_counts
    del naive_bayes_classifier, predictions_nb

print("Data cleaned")

In [5]:
# Disabled shortcut: when uncommented, this replaces X_train / Y_train with the
# contents of X_train_clean.json / Y_train_clean.json and prints the number of
# training sentences.
# NOTE(review): none of the visible cells writes these two files - confirm where
# they are produced before re-enabling this cell.
# with open('X_train_clean.json', 'r',encoding='utf-8') as f:
#     data_clean = json.load(f)
# X_train=data_clean
# with open('Y_train_clean.json', 'r',encoding='utf-8') as f:
#     data_clean = json.load(f)
# Y_train=data_clean
# print(len(X_train))

788185


### Testing Data

In [6]:
# Pull the raw sentences and their language labels out of the validation records.
X_test = [record['text'] for record in data_test]
Y_test = [record['langid'] for record in data_test]

### N Gram Parameter Tuning

In [7]:
# Disabled grid search over the character n-gram range (i, j) with 1 <= i <= j <= 6.
# For each range it fits a CountVectorizer + MultinomialNB on the training data,
# predicts on X_test, prints the micro and macro F1 scores, and appends them to
# ngram_range.txt.
# NOTE(review): this search uses MultinomialNB() with its default smoothing, while
# the final model in this notebook uses alpha=0.01.
# for i in range (1,7):
#     for j in range (i,7):
#         # Step 3: Vectorize the data
#         vectorizer = CountVectorizer(ngram_range=(i,j),analyzer='char')
#         X_train_counts = vectorizer.fit_transform(X_train)
#         X_test_counts = vectorizer.transform(X_test)
#         # Step 4: Train a classifier
#         classifier = MultinomialNB()
#         classifier.fit(X_train_counts, Y_train)
#         # Step 5: Test the classifier
#         predictions = classifier.predict(X_test_counts)
#         # Step 6: Evaluate the classifier
#         print('Micro F1 for ngram_range=(',i,',',j,'):',evaluation.compute_micro_f1_score(predictions, Y_test))
#         print('Macro F1 for ngram_range=(',i,',',j,'):',evaluation.compute_macro_f1_score(predictions, Y_test))
#         print('=======================================================')
#         # Step 7: Write the micro and macro F1 scores to a file
#         with open('ngram_range.txt', 'a') as file:
#             file.write('Micro F1 for ngram_range=('+str(i)+','+str(j)+'):'+str(evaluation.compute_micro_f1_score(predictions, Y_test))+'\n')
#             file.write('Macro F1 for ngram_range=('+str(i)+','+str(j)+'):'+str(evaluation.compute_macro_f1_score(predictions, Y_test))+'\n')
#             file.write('=======================================================\n')


### UpSampling the Data
TODO: Adjust the upsampling factor (currently 17) to the amount of data available.

In [8]:
# Upsample selected languages by repeating each of their training sentences.
# Every other language keeps exactly one copy of each sentence.
UPSAMPLED_LANGS = {'ta', 'kn', 'ml', 'hi', 'bn', 'mr'}  # language ids to repeat
UPSAMPLE_FACTOR = 17  # copies kept per sentence of an upsampled language

X_train_new = []
Y_train_new = []

for sentence, language in zip(X_train, Y_train):
    copies = UPSAMPLE_FACTOR if language in UPSAMPLED_LANGS else 1
    # extend() with a repeated one-element list preserves the original ordering:
    # all copies of a sentence stay adjacent, in input order.
    X_train_new.extend([sentence] * copies)
    Y_train_new.extend([language] * copies)

X_train = X_train_new
Y_train = Y_train_new

del X_train_new, Y_train_new

1129593


### Vectorization

In [9]:
# Build character 4- to 6-gram count features. The vocabulary is learned from
# the training sentences only and then applied unchanged to the test sentences.
print("Start vectorize")
count_vectorizer = CountVectorizer(analyzer='char', ngram_range=(4, 6))
X_train_counts = count_vectorizer.fit_transform(X_train)
X_test_counts = count_vectorizer.transform(X_test)

# The fitted vectorizer is not needed once both matrices exist.
del count_vectorizer

(1129593, 6952776)


### Naive Bayes Model
TODO: Smoothing parameter tuning and set it according to the Capital Words
TODO: Hyperparameter tuning for the complete training data

In [10]:
# Fit a multinomial Naive Bayes model (smoothing alpha=0.01) on the training
# counts and predict a language for every test sentence.
naive_bayes_classifier = MultinomialNB(alpha=0.01).fit(X_train_counts, Y_train)
predictions_nb = naive_bayes_classifier.predict(X_test_counts)

# Only the predictions are needed from here on; free the model and matrices.
del naive_bayes_classifier, X_train_counts, X_test_counts

In [11]:
# Report micro- and macro-averaged F1 of the Naive Bayes predictions against
# the test labels, then release the remaining large objects.
micro_f1_nb = evaluation.compute_micro_f1_score(predictions_nb, Y_test)
print("Micro F1 score for NB:", micro_f1_nb)
macro_f1_nb = evaluation.compute_macro_f1_score(predictions_nb, Y_test)
print("Macro F1 score for NB:", macro_f1_nb)

del X_train, Y_train, predictions_nb

Micro F1 score for NB: 0.9164179104477613
Macro F1 score for NB: 0.9315237116887807
