In [None]:
!pip install sklearn-crfsuite
!pip install gradio
!pip install nltk

Collecting sklearn-crfsuite
  Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl.metadata (4.9 kB)
Collecting python-crfsuite>=0.9.7 (from sklearn-crfsuite)
  Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Downloading sklearn_crfsuite-0.5.0-py2.py3-none-any.whl (10 kB)
Downloading python_crfsuite-0.9.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-crfsuite, sklearn-crfsuite
Successfully installed python-crfsuite-0.9.10 sklearn-crfsuite-0.5.0
Collecting gradio
  Downloading gradio-4.44.1-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.115.0-py3-none-any.whl.metadata (27 kB)
Collecti

In [None]:
import nltk
nltk.download('brown')
nltk.download('punkt')  # Optional, if tokenizing
nltk.download('universal_tagset')  # For Universal POS tagging

from nltk import bigrams
from collections import defaultdict
from sklearn_crfsuite import CRF, metrics
from nltk.tokenize import word_tokenize
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
class CRFTagger:
    """Linear-chain CRF part-of-speech tagger.

    Sentences are sequences of ``(word, tag)`` pairs. The constructor extracts
    features for both splits and fits the CRF on ``train_data``; call
    ``evaluate`` afterwards to print metrics on ``test_data``.

    Args:
        train_data: Tagged sentences used to fit the model.
        test_data: Tagged sentences held out for ``evaluate``.
        tagset: Tag labels, in the order used for the confusion matrix and
            the per-tag report.
    """

    # Characters that count as punctuation for the ``is_punct`` feature.
    _PUNCT_CHARS = frozenset(".,;:!?")

    def __init__(self, train_data, test_data, tagset) -> None:
        self.train_data = train_data
        self.test_data = test_data
        self.tagset = tagset

        self.train_features = self.extract_features(self.train_data)
        self.test_features = self.extract_features(self.test_data)

        self.crf = CRF(algorithm='lbfgs',c1=0.01,c2=0.1,max_iterations=100,all_possible_transitions=True)
        self.crf.fit(self.train_features, self.get_labels(self.train_data))

    def extract_features(self, data):
        """Build one feature dict per token for every sentence in ``data``.

        Only the word forms are used. The tag half of each ``(word, tag)``
        pair is deliberately ignored: feeding neighbouring gold tags in as
        features leaks the labels into the evaluation and cannot be
        reproduced at inference time, where callers only have dummy tags.

        Args:
            data: Iterable of sentences, each a sequence of ``(word, tag)``
                pairs. The tag may be any placeholder.

        Returns:
            A list (one entry per sentence) of lists of feature dicts (one
            per token, in order).
        """
        features = []
        for sentence in data:
            words = [word for word, _ in sentence]
            last = len(words) - 1
            sentence_features = []
            for i, word in enumerate(words):
                sentence_features.append({
                    'word': word.lower(),
                    'is_upper': word.isupper(),
                    'is_title': word.istitle(),
                    'is_digit': word.isdigit(),
                    # Every character must be punctuation; a plain substring
                    # test would also accept the empty string.
                    'is_punct': bool(word) and all(ch in self._PUNCT_CHARS for ch in word),
                    'contains_digit': any(char.isdigit() for char in word),
                    'contains_hyphen': '-' in word,
                    'word_len': len(word),

                    # Prefixes and suffixes (case-sensitive on purpose).
                    'pref_1': word[:1],
                    'pref_2': word[:2],
                    'pref_3': word[:3],
                    'pref_4': word[:4],
                    'suff_1': word[-1:],
                    'suff_2': word[-2:],
                    'suff_3': word[-3:],
                    'suff_4': word[-4:],

                    # Lowercased context window of two words on each side;
                    # '' marks a position outside the sentence.
                    'prev_word': '' if i == 0 else words[i - 1].lower(),
                    'prev2_word': '' if i <= 1 else words[i - 2].lower(),
                    'next_word': '' if i == last else words[i + 1].lower(),
                    'next2_word': '' if i >= last - 1 else words[i + 2].lower(),
                })
            features.append(sentence_features)
        return features

    def get_labels(self, data):
        """Return the tag sequence of every sentence in ``data``."""
        return [[tag for _, tag in sentence] for sentence in data]

    def evaluate(self):
        """Predict tags for the test split and print token-level metrics.

        Prints overall accuracy, weighted precision/recall/F1, the confusion
        matrix (labels ordered as ``self.tagset``) and a per-tag
        classification report. Returns nothing.
        """
        y_pred = self.crf.predict(self.test_features)
        y_test = self.get_labels(self.test_data)

        # Flatten once; the sklearn metrics below work on flat label lists.
        flat_true = [tag for tags in y_test for tag in tags]
        flat_pred = [tag for tags in y_pred for tag in tags]

        # Overall metrics
        overall_accuracy = metrics.flat_accuracy_score(y_test, y_pred)
        overall_precision = precision_score(flat_true, flat_pred,
                                            average='weighted', zero_division=0)
        overall_recall = recall_score(flat_true, flat_pred,
                                      average='weighted', zero_division=0)
        overall_f1 = f1_score(flat_true, flat_pred,
                              average='weighted', zero_division=0)

        # Confusion matrix
        conf_matrix = confusion_matrix(flat_true, flat_pred, labels=self.tagset)

        # Print overall metrics
        print("Overall Accuracy:", overall_accuracy)
        print("Overall Precision:", overall_precision)
        print("Overall Recall:", overall_recall)
        print("Overall F1 Score:", overall_f1)
        print("Confusion Matrix:\n", conf_matrix)

        # Individual tag metrics
        print("\nMetrics for Individual Tags:")
        report = classification_report(flat_true, flat_pred, labels=self.tagset)
        print(report)



[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package universal_tagset to /root/nltk_data...
[nltk_data]   Package universal_tagset is already up-to-date!


In [None]:
if __name__ == "__main__":
    # Load the Brown corpus with Universal POS tags.
    from nltk.corpus import brown
    # Materialise the corpus once: the folds below index it sentence by
    # sentence, which is slow on NLTK's lazily-read corpus view.
    data = list(brown.tagged_sents(tagset='universal'))
    taglist=['NOUN', 'VERB', 'PRON', 'ADP', 'PRT', '.', 'CONJ', 'DET', 'ADJ', 'ADV', 'NUM', 'X']


    # 5-fold cross-validation: train and evaluate a fresh tagger on each fold.
    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(data):
        train_data = [data[i] for i in train_index]
        test_data = [data[i] for i in test_index]

        # Initialize and evaluate POSTagger
        tagger = CRFTagger(train_data, test_data, taglist)
        tagger.evaluate()

    # Function to predict POS tags for Gradio.
    # NOTE: `tagger` here is the model from the last cross-validation fold.
    def predict(sentence):
        """Tokenize ``sentence`` and return a list of ``(word, tag)`` pairs."""
        # Keep the original casing: the feature extractor lowercases where it
        # needs to and relies on case for is_upper/is_title/prefix features.
        words = word_tokenize(sentence)
        if not words:
            # Nothing to tag (empty or whitespace-only input).
            return []
        features = tagger.extract_features([[ (word, '') for word in words ]])  # Create dummy tags for prediction
        pos_tags = tagger.crf.predict(features)[0]  # Predict using the CRF model
        return list(zip(words, pos_tags))  # Pair words with their POS tags

    # Gradio Interface
    import gradio as gr
    interface = gr.Interface(
        fn=predict,
        inputs="text",
        outputs="text",
        title="POS Tagger",
        description="Input a sentence to get POS tags."
    )
    interface.launch()

Overall Accuracy: 0.9787905416200671
Overall Precision: 0.978771734625255
Overall Recall: 0.9787905416200671
Overall F1 Score: 0.9787538815745895
Confusion Matrix:
 [[65230   555     8    10    22     1     0     2   862    53    30    17]
 [  665 36333     0    17     3     0     0     0    93    17     0     0]
 [    2     0  8123    36     1     0     0    42     0     0     0     0]
 [    2     8    18 30602   190     0    22    55     9   142     0     0]
 [   32    12     0   197  5499     0     0     0     7    44     0     0]
 [    0     0     0     2     0 30113     0     0     0     0     0     0]
 [    0     0     0     2     0     0  7644     3     0    13     0     0]
 [    2     1    44    64     0     0     6 29318     1    18     0     1]
 [  639   151     0    22     6     0     0     0 17764   346    12     1]
 [   58    11     1   205    61     0    20    19   259 10336     0     1]
 [   53     0     0     0     0     0     0     0     7     1  3912     1]
 [   91   

In [None]:
import nltk
nltk.download('brown')
nltk.download('punkt')  # Optional, if tokenizing
nltk.download('universal_tagset')  # For Universal POS tagging

from nltk import bigrams
from collections import defaultdict
from sklearn_crfsuite import CRF, metrics
from nltk.tokenize import word_tokenize
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score, f1_score
from sklearn.model_selection import KFold
class CRFTagger:
    """Linear-chain CRF part-of-speech tagger.

    Sentences are sequences of ``(word, tag)`` pairs. The constructor extracts
    features for both splits and fits the CRF on ``train_data``; call
    ``evaluate`` afterwards to print metrics on ``test_data``.

    Args:
        train_data: Tagged sentences used to fit the model.
        test_data: Tagged sentences held out for ``evaluate``.
        tagset: Tag labels, in the order used for the confusion matrix and
            the per-tag report.
    """

    def __init__(self, train_data, test_data, tagset) -> None:
        self.train_data = train_data
        self.test_data = test_data
        self.tagset = tagset

        self.train_features = self.extract_features(self.train_data)
        self.test_features = self.extract_features(self.test_data)

        self.crf = CRF(algorithm='lbfgs',c1=0.01,c2=0.1,max_iterations=100,all_possible_transitions=True)
        self.crf.fit(self.train_features, self.get_labels(self.train_data))

    def extract_features(self, data):
        """Build one feature dict per token for every sentence in ``data``.

        Only the word forms are used. The tag half of each ``(word, tag)``
        pair is deliberately ignored: using the token's own gold tag (or its
        neighbours' gold tags) as a feature hands the answer to the model and
        cannot be reproduced at inference time, where callers only have
        dummy tags.

        Args:
            data: Iterable of sentences, each a sequence of ``(word, tag)``
                pairs. The tag may be any placeholder.

        Returns:
            A list (one entry per sentence) of lists of feature dicts (one
            per token, in order), so its shape matches ``get_labels(data)``.
        """
        features = []
        for sent in data:
            words = [word for word, _ in sent]
            last = len(words) - 1
            sentence_features = []
            for i, word in enumerate(words):
                # Sentinels mark context positions outside the sentence.
                prevw = words[i - 1] if i > 0 else '<START>'
                prev2 = words[i - 2] if i > 1 else '<START>'
                nextw = words[i + 1] if i < last else '<END>'
                # Append inside the token loop so every token contributes.
                sentence_features.append({
                    'word': word.lower(),  # Normalize to lowercase
                    'is_upper': word.isupper(),
                    'is_title': word.istitle(),
                    'is_digit': word.isdigit(),
                    'prevword': prevw,
                    'prev2word': prev2,
                    'nextword': nextw,
                    'pref_1': word[:1],
                    'pref_2': word[:2],
                    'pref_3': word[:3],
                    'pref_4': word[:4],
                    'suff_1': word[-1:],
                    'suff_2': word[-2:],
                    'suff_3': word[-3:],
                    'suff_4': word[-4:],
                })
            # Append inside the sentence loop so every sentence is kept.
            features.append(sentence_features)
        return features

    def get_labels(self, data):
        """Return the tag sequence of every sentence in ``data``."""
        return [[tag for _, tag in sentence] for sentence in data]

    def evaluate(self):
        """Predict tags for the test split and print token-level metrics.

        Prints overall accuracy, weighted precision/recall/F1, the confusion
        matrix (labels ordered as ``self.tagset``) and a per-tag
        classification report. Returns nothing.
        """
        y_pred = self.crf.predict(self.test_features)
        y_test = self.get_labels(self.test_data)

        # Flatten once; the sklearn metrics below work on flat label lists.
        flat_true = [tag for tags in y_test for tag in tags]
        flat_pred = [tag for tags in y_pred for tag in tags]

        # Overall metrics
        overall_accuracy = metrics.flat_accuracy_score(y_test, y_pred)
        overall_precision = precision_score(flat_true, flat_pred,
                                            average='weighted', zero_division=0)
        overall_recall = recall_score(flat_true, flat_pred,
                                      average='weighted', zero_division=0)
        overall_f1 = f1_score(flat_true, flat_pred,
                              average='weighted', zero_division=0)

        # Confusion matrix
        conf_matrix = confusion_matrix(flat_true, flat_pred, labels=self.tagset)

        # Print overall metrics
        print("Overall Accuracy:", overall_accuracy)
        print("Overall Precision:", overall_precision)
        print("Overall Recall:", overall_recall)
        print("Overall F1 Score:", overall_f1)
        print("Confusion Matrix:\n", conf_matrix)

        # Individual tag metrics
        print("\nMetrics for Individual Tags:")
        report = classification_report(flat_true, flat_pred, labels=self.tagset)
        print(report)



In [None]:
if __name__ == "__main__":
    # Load the Brown corpus with Universal POS tags.
    from nltk.corpus import brown
    # Materialise the corpus once: the folds below index it sentence by
    # sentence, which is slow on NLTK's lazily-read corpus view.
    data = list(brown.tagged_sents(tagset='universal'))
    taglist=['NOUN', 'VERB', 'PRON', 'ADP', 'PRT', '.', 'CONJ', 'DET', 'ADJ', 'ADV', 'NUM', 'X']


    # 5-fold cross-validation: train and evaluate a fresh tagger on each fold.
    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(data):
        train_data = [data[i] for i in train_index]
        test_data = [data[i] for i in test_index]

        # Initialize and evaluate POSTagger
        tagger = CRFTagger(train_data, test_data, taglist)
        tagger.evaluate()

    # Function to predict POS tags for Gradio.
    # NOTE: `tagger` here is the model from the last cross-validation fold.
    def predict(sentence):
        """Tokenize ``sentence`` and return a list of ``(word, tag)`` pairs."""
        # Keep the original casing: the feature extractor lowercases where it
        # needs to and relies on case for is_upper/is_title/prefix features.
        words = word_tokenize(sentence)
        if not words:
            # Nothing to tag (empty or whitespace-only input).
            return []
        features = tagger.extract_features([[ (word, '') for word in words ]])  # Create dummy tags for prediction
        pos_tags = tagger.crf.predict(features)[0]  # Predict using the CRF model
        return list(zip(words, pos_tags))  # Pair words with their POS tags

    # Gradio Interface
    import gradio as gr
    interface = gr.Interface(
        fn=predict,
        inputs="text",
        outputs="text",
        title="POS Tagger",
        description="Input a sentence to get POS tags."
    )
    interface.launch()