<a href="https://colab.research.google.com/github/dinesh-saka/OLID/blob/main/Other_ML_Models_Malayalam.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive and make the project folder the working directory, so
# the data files opened by later cells can be referenced by bare file name.
import os

from google.colab import drive

drive.mount('/content/gdrive', force_remount = True)

# Absolute path: the previous relative 'gdrive/My Drive/OLID/' only resolved
# while the working directory was the parent of the mount point, so running
# this cell a second time (after the first chdir) raised FileNotFoundError.
root_path = '/content/gdrive/My Drive/OLID/'
os.chdir(root_path)

Mounted at /content/gdrive


In [None]:
!pip install transformers
!pip install demoji
!pip install nltk

Collecting demoji
  Downloading demoji-1.1.0-py3-none-any.whl (42 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.9/42.9 kB[0m [31m675.1 kB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: demoji
Successfully installed demoji-1.1.0


In [None]:
import numpy as np
import pandas as pd

import copy
import sklearn
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import f1_score
from tqdm import tqdm
import demoji
import nltk
import string
import pickle
import math
import numpy as np
import sys
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from sklearn.metrics import f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

# NLTK resources needed by the Tokenizer helpers (POS tagger, stop-word list,
# WordNet lemmatiser data, and the punkt sentence/word tokenizer).
for nltk_resource in ('averaged_perceptron_tagger', 'stopwords', 'wordnet', 'punkt'):
    nltk.download(nltk_resource)

# NOTE(review): the recorded cell output ends with a warning that points at
# this call -- confirm whether the installed demoji version still needs it.
demoji.download_codes()

# Default figure size and font size for every plot in the notebook.
plt.rcParams.update({'figure.figsize': [10, 8], 'font.size': 16})

# Seed NumPy's global RNG once so later cells start from a known state.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  demoji.download_codes()


In [None]:
class Tokenizer():
    """Text-cleaning helpers plus a small positional inverted index.

    The cleaning methods are independent of each other; callers chain them
    (remove_punc -> remove_stop -> lemmatize) as needed.
    """

    def __init__(self):
        # token -> {article_id -> [1-based positions of the token]}
        self.index = {}
        # Created here but not populated by any method of this class.
        self.tf_idf_index = {}
        self.wordnet_lemmatizer = WordNetLemmatizer()
        # English stop words only.
        self.stopwords = set(stopwords.words('english'))

    def remove_punc(self, text):
        """Return ``text`` keeping only letters, combining marks and spaces.

        ``str.isalpha()`` is False for Unicode combining marks (categories
        Mn/Mc/Me), so relying on it strips Malayalam vowel signs, anusvara
        and virama and leaves mangled words. Characters are therefore kept
        when their Unicode general category is a Letter (L*) or a Mark (M*).
        Digits, punctuation, emoji and whitespace other than ' ' are dropped.
        """
        import unicodedata

        def is_word_char(ch):
            if ch == ' ':
                return True
            chars = str(ch)
            # An empty element is dropped, as it was with isalpha().
            return bool(chars) and all(
                unicodedata.category(c)[0] in 'LM' for c in chars
            )

        return ''.join(ch for ch in text if is_word_char(ch))

    def remove_stop(self, text):
        """Lower-case ``text``, split on whitespace and drop stop words.

        Returns the surviving words joined by single spaces.
        """
        return ' '.join([word for word in text.lower().split() if word not in self.stopwords])

    def get_wordnet_pos(self, word):
        """Map POS tag to first character lemmatize() accepts"""
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"J": wordnet.ADJ,
                    "N": wordnet.NOUN,
                    "V": wordnet.VERB,
                    "R": wordnet.ADV}

        # Anything that is not an adjective/noun/verb/adverb is treated as a noun.
        return tag_dict.get(tag, wordnet.NOUN)

    def lemmatize(self, text):
        """Tokenize ``text`` with NLTK and return the list of lemmatized tokens.

        The lemmatizer is called without a POS hint; the POS-aware variant
        (via ``get_wordnet_pos``) is kept below for reference.
        """
        # return [self.wordnet_lemmatizer.lemmatize(w, self.get_wordnet_pos(w)) for w in nltk.word_tokenize(text)]
        return [self.wordnet_lemmatizer.lemmatize(w) for w in nltk.word_tokenize(text)]

    def build_index(self, article_id, tokenized):
        """Record the 1-based position of every token of one article in ``self.index``."""
        for position, token in enumerate(tokenized, start=1):
            self.index.setdefault(token, {}).setdefault(article_id, []).append(position)


In [None]:
class Dataset():
    """Turn tab-separated ``text<TAB>label`` lines into bag-of-words matrices.

    Parameters
    ----------
    train_data, val_data : iterable of str
        Raw lines; the last tab-separated field of each line is the label and
        the remaining fields are joined with spaces to form the text.
    tokenizer : object
        Must provide ``remove_punc``, ``remove_stop`` and ``lemmatize``.
    batch_size : int
        Stored on the instance; not used by this class.

    After construction ``sentences_train`` / ``sentences_test`` hold the
    cleaned sentences, ``y_train`` / ``y_test`` the raw string labels,
    ``vec`` the CountVectorizer fitted on the training sentences only, and
    ``X_train`` / ``X_test`` the corresponding count matrices.
    """

    def __init__(self, train_data, val_data, tokenizer, batch_size = 32):
        self.batch_size = batch_size
        self.tokenizer = tokenizer
        # Label -> integer id. Labels not listed here are appended while the
        # data is processed. 'Offensive_Untargetede' is spelled as it appears
        # in the labels shown by the classification reports.
        self.label_dict = {'Not_offensive': 0,
                    'Offensive_Targeted_Insult_Group': 3,
                    'Offensive_Targeted_Insult_Individual': 1,
                    'Offensive_Targeted_Insult_Other': 2,
                    'Offensive_Untargetede': 4,
                    }

        self.sentences_train = []
        self.sentences_test = []

        self.y_train = []
        self.y_test = []

        self.process_train(train_data)
        self.process_test(val_data)

        # Vocabulary is learned from the training sentences only, then applied
        # unchanged to the validation sentences.
        vectorizer = CountVectorizer()
        self.vec = vectorizer.fit(self.sentences_train)

        self.X_train = self.vec.transform(self.sentences_train)
        self.X_test = self.vec.transform(self.sentences_test)

    def _clean_sentence(self, sentence):
        """Spell out emoji, then strip punctuation, stop words and lemmatize.

        Uses the tokenizer passed to the constructor (previously a notebook
        global named ``tokenizer`` was used, which silently ignored the
        constructor argument).
        """
        for emoji, text in demoji.findall(sentence).items():
            sentence = sentence.replace(emoji, ' '+text+' ')
            # Collapse the whitespace introduced by the replacement.
            sentence = ' '.join(sentence.split())
        cleaned_text = self.tokenizer.remove_punc(sentence)
        removed_stop = self.tokenizer.remove_stop(cleaned_text)
        tokenized = self.tokenizer.lemmatize(removed_stop)
        return ' '.join(tokenized)

    def _process_lines(self, data, sentences, labels):
        """Parse every line of ``data`` and append to ``sentences`` / ``labels``."""
        for line in data:
            stripped = line.strip()
            if not stripped:
                # A blank line carries neither text nor label.
                continue
            # NOTE(review): a header row, if present in ``data``, is treated
            # like any other sample and its last field becomes a label.
            fields = stripped.split('\t')
            label = fields.pop()
            if label not in self.label_dict:
                self.label_dict[label] = len(self.label_dict)
            sentences.append(self._clean_sentence(' '.join(fields)))
            # The raw string label is stored, not its id from label_dict.
            labels.append(label)

    def process_train(self, data):
        """Append the cleaned sentences and labels of ``data`` to the training lists."""
        self._process_lines(data, self.sentences_train, self.y_train)

    def process_test(self, data):
        """Append the cleaned sentences and labels of ``data`` to the test lists."""
        self._process_lines(data, self.sentences_test, self.y_test)

In [None]:
def _read_tsv_lines(path, header_label = 'category'):
    """Read ``path`` as UTF-8 and return its lines without the header row.

    The first line is dropped only when its last tab-separated field equals
    ``header_label``. Without this the header is ingested as a sample, which
    is what produces the one-off 'category' class in the classification
    reports.
    """
    with open(path, 'r', encoding = 'utf-8') as f:
        lines = f.readlines()
    if lines and lines[0].strip().split('\t')[-1] == header_label:
        lines = lines[1:]
    return lines


# `tokenizer` stays a notebook-level name because later cells reuse it.
tokenizer = Tokenizer()
train_data = _read_tsv_lines('Malayalam_train_1-5_cleaned.csv')
val_data = _read_tsv_lines('Malayalam_dev_1-5_cleaned.csv')
data = Dataset(train_data, val_data, tokenizer)

In [None]:
# Weighted F1 on the dev split for each number of selected features.
mult_bayes_results = {}
ber_bayes_results = {}

X_train, y_train = data.X_train, np.array(data.y_train)
X_test, y_test = data.X_test, np.array(data.y_test)

# NOTE(review): the last entry is the number of training *rows* (shape[0])
# used as a feature count; shape[1] ("all features") may have been intended.
K = [1000, 5000, X_train.shape[0]]
n_features = X_train.shape[1]

print(X_train.shape)
for k in K:
    # SelectKBest raises when k exceeds the number of columns, which happens
    # whenever there are more rows than vocabulary entries.
    k = min(k, n_features)
    selector = SelectKBest(mutual_info_classif,k=k).fit(X_train,y_train)
    # These names are left in the notebook namespace on purpose: the report
    # cells that follow read X_train_new / X_test_new from the last iteration.
    X_train_new = selector.transform(X_train)
    X_test_new = selector.transform(X_test)
    print(f'Running Bayes Models on k = {k}............')

    # Multinomial first, Bernoulli second, so `clf` / `y_pred` end up holding
    # the Bernoulli model and its predictions.
    for model_cls, results in ((MultinomialNB, mult_bayes_results),
                               (BernoulliNB, ber_bayes_results)):
        clf = model_cls()
        clf.fit(X_train_new, y_train)
        y_pred = clf.predict(X_test_new)
        results[k] = f1_score(y_test, y_pred, average = 'weighted')
    print('Done')

print(mult_bayes_results)
print(ber_bayes_results)

(10804, 27814)
Running Bayes Models on k = 1000............
Done
Running Bayes Models on k = 5000............
Done
Running Bayes Models on k = 10804............
Done
{1000: 0.9682258188292738, 5000: 0.9762963540089951, 10804: 0.9741114374484324}
{1000: 0.9640723129003215, 5000: 0.9590079692847915, 10804: 0.9528829778596186}


## Report of Multinomial NB

In [None]:
from sklearn.metrics import classification_report

# NOTE: X_train_new / X_test_new are whatever the previously executed cell
# left behind, i.e. the features selected in the last iteration of its loop.
clf = MultinomialNB()
clf.fit(X_train_new, y_train)
y_pred = clf.predict(X_test_new)

# zero_division=0 reports undefined precision/recall as 0.00 (the same numbers
# as the default) without emitting an UndefinedMetricWarning for every class
# that is never predicted.
print(classification_report(y_test, y_pred, zero_division = 0))

                                      precision    recall  f1-score   support

                       Not_offensive       0.97      1.00      0.99      1709
     Offensive_Targeted_Insult_Group       1.00      0.23      0.38        13
Offensive_Targeted_Insult_Individual       1.00      0.13      0.23        23
               Offensive_Untargetede       0.50      0.20      0.29        20
                            category       0.00      0.00      0.00         1

                            accuracy                           0.97      1766
                           macro avg       0.69      0.31      0.38      1766
                        weighted avg       0.97      0.97      0.96      1766



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Report of Bernoulli NB

In [None]:
# NOTE: X_train_new / X_test_new are the features left behind by the last
# iteration of the Naive Bayes sweep loop.
clf = BernoulliNB()
clf.fit(X_train_new, y_train)
y_pred = clf.predict(X_test_new)

# zero_division=0 reports undefined precision/recall as 0.00 (the same numbers
# as the default) without emitting an UndefinedMetricWarning for every class
# that is never predicted.
print(classification_report(y_test, y_pred, zero_division = 0))

                                      precision    recall  f1-score   support

                       Not_offensive       0.97      1.00      0.98      1709
     Offensive_Targeted_Insult_Group       0.00      0.00      0.00        13
Offensive_Targeted_Insult_Individual       0.00      0.00      0.00        23
               Offensive_Untargetede       0.00      0.00      0.00        20
                            category       0.00      0.00      0.00         1

                            accuracy                           0.97      1766
                           macro avg       0.19      0.20      0.20      1766
                        weighted avg       0.94      0.97      0.95      1766



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Weighted F1 on the dev split for each forest size.
rf_results = {}

X_train, y_train = data.X_train, np.array(data.y_train)
X_test, y_test = data.X_test, np.array(data.y_test)
# Here K holds numbers of trees, not numbers of selected features.
K = [100, 200, 500]

# No feature selection for the forest: it is trained on the full matrix.
# The *_new aliases are kept because later cells read them.
X_train_new = X_train
X_test_new = X_test

print(X_train.shape)
for k in K:
    # The message previously said "Bayes Models" (copied from the sweep above).
    print(f'Running Random Forest with n_estimators = {k}............')

    # Explicit seed so the scores do not depend on how much of NumPy's global
    # random stream earlier cells have consumed.
    clf = RandomForestClassifier(n_estimators = k, random_state = RANDOM_SEED)
    clf.fit(X_train_new, y_train)
    y_pred = clf.predict(X_test_new)
    rf_results[k] = f1_score(y_test, y_pred, average = 'weighted')

    print('Done')

print(rf_results)

(10804, 27814)
Running Bayes Models on k = 100............
Done
Running Bayes Models on k = 200............
Done
Running Bayes Models on k = 500............
Done
{100: 0.9807775475751674, 200: 0.9815901237012549, 500: 0.9815901237012549}


In [None]:
# Refit the forest with 200 trees (in the recorded sweep, 200 and 500 trees
# reach the same weighted F1). Trained on the full count matrix: X_train /
# X_test are used directly instead of the X_*_new aliases, so the cell no
# longer depends on which loop ran last. The seed makes the fit repeatable.
# `clf` is reused afterwards to label the held-out test file.
clf = RandomForestClassifier(n_estimators = 200, random_state = RANDOM_SEED)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

                                      precision    recall  f1-score   support

                       Not_offensive       0.98      1.00      0.99      1709
     Offensive_Targeted_Insult_Group       0.88      0.54      0.67        13
Offensive_Targeted_Insult_Individual       1.00      0.48      0.65        23
               Offensive_Untargetede       1.00      0.50      0.67        20
                            category       1.00      1.00      1.00         1

                            accuracy                           0.98      1766
                           macro avg       0.97      0.70      0.79      1766
                        weighted avg       0.98      0.98      0.98      1766



In [None]:
import pandas as pd

# Load test data
with open('Malayalam_test_1-5_cleaned.csv', 'r', encoding = 'utf-8') as f:
    test_data = f.readlines()

# Process test data with the same cleaning steps applied to the training text
# NOTE(review): a header row, if the file has one, is processed like a sample.
test_sentences = []
test_text_ids = []
for line in test_data:
    parts = line.strip().split('\t')
    text_id = parts[0]  # Assuming the first part is the text ID
    text = ' '.join(parts[1:])  # Assuming the rest is the text
    test_text_ids.append(text_id)
    emoji_dict = demoji.findall(text)
    # The loop variable used to be called `text` as well, which overwrote the
    # sentence with the emoji's description for every line containing an emoji.
    for emoji, emoji_name in emoji_dict.items():
        text = text.replace(emoji, ' '+emoji_name+' ')
        text = ' '.join(text.split())
    cleaned_text = tokenizer.remove_punc(text)
    removed_stop = tokenizer.remove_stop(cleaned_text)
    tokenized = tokenizer.lemmatize(removed_stop)
    test_sentences.append(' '.join(tokenized))

# Vectorize test data with the vocabulary fitted on the training sentences
# (this rebinds X_test_new, previously the dev-split matrix)
X_test_new = data.vec.transform(test_sentences)

# Predict labels for test data with the most recently fitted classifier
y_pred = clf.predict(X_test_new)

# Create a DataFrame for predictions
predictions_df = pd.DataFrame({
    'Text ID': test_text_ids,
    'Text': test_sentences,
    'Predicted Label': y_pred
})

# Save predictions to a new CSV file
predictions_df.to_csv('predictions.csv', index=False)


In [None]:
# import pandas as pd
# import numpy as np

# # Load the previously saved predictions file
# df = pd.read_csv('prediciton.csv', sep='\t')

# # Select 100 random indices
# random_indices = np.random.choice(df.index, size=100, replace=False)

# # Update the Predicted Label for the selected indices
# df.loc[random_indices, 'Predicted Label'] = 'Offensive_Untargetede'

# # Save to CSV
# df.to_csv('predictions.csv', index=False)