<a href="https://colab.research.google.com/github/diliprc96/POS_Tagging/blob/main/POS_Tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!unzip /content/drive/MyDrive/POS_Tagging/DATASET_POS.zip

In [36]:
import os
import matplotlib.pyplot as plt


In [37]:



train_data = load_data(train_path)
test_data = load_data(test_path)

In [38]:
import os
import matplotlib.pyplot as plt

class PreprocessData:
    def __init__(self, data_path):
        """
        Initializes the PreprocessData class with the given path.
        """
        self.data_path = data_path
        self.train_path = os.path.join(self.data_path, 'WSJ_train.pos')
        self.test_path = os.path.join(self.data_path, 'WSJ_test.pos')
        self.train_data = None
        self.test_data = None
        self.train_sentences = None
        self.test_sentences = None

    def load_data(self, file_path):
        """
        Loads the data from the given file path.
        """
        with open(file_path, 'r') as file:
            data = file.readlines()
        return data

    def preprocess_data(self, data):
        """
        Processes the raw data into sentences of word-tag tuples.
        """
        sentences = []
        sentence = []
        for line in data:
            line = line.strip()
            if line:
                word, tag = line.split()
                sentence.append((word, tag))
            else:
                if sentence:  # Non-empty sentence
                    sentences.append(sentence)
                    sentence = []  # Reset for the next sentence
        if sentence:  # Append the last sentence if it doesn't end with a blank line
            sentences.append(sentence)
        return sentences

    def plot_sentence_length_distribution(self, sentences):
        """
        Plots the distribution of sentence lengths in terms of token count.
        """
        sentence_lengths = [len(sentence) for sentence in sentences]
        plt.figure(figsize=(10, 6))
        plt.hist(sentence_lengths, bins=30, color='blue', alpha=0.7)
        plt.title('Sentence Length Distribution')
        plt.xlabel('Sentence Length (Number of Tokens)')
        plt.ylabel('Frequency')
        plt.show()

    def process(self):
        """
        This method loads and preprocesses the train and test datasets.
        """
        # Load data
        self.train_data = self.load_data(self.train_path)
        self.test_data = self.load_data(self.test_path)

        # Preprocess data
        self.train_sentences = self.preprocess_data(self.train_data)
        self.test_sentences = self.preprocess_data(self.test_data)

        # Plot sentence length distribution for the training set
        self.plot_sentence_length_distribution(self.train_sentences)

        # Output sample sentences for verification
        return self.train_sentences, self.test_sentences

data_path = '/content/Dataset_POS'
preprocessor = PreprocessData(data_path)
train_sentences, test_sentences = preprocessor.process()



In [48]:
# Create word-to-index and index-to-word mapping necessary for conversions for
# words before training and after prediction. From the list of tuples generated
# earlier, now we will build the independent and dependent variable structure.

def create_mappings(sentences):
  words = set()
  tags = set()
  for sentence in sentences :
    for word, tag in sentence :
      words.add(word)
      tags.add(tag)
  word_to_index = {word : index for index, word in enumerate(words)}
  index_to_word = {index : word for index, word in enumerate(words)}
  tag_to_index = {tag : index for index, tag in enumerate(tags)}
  index_to_tag = {index : tag for index, tag in enumerate(tags)}

  return word_to_index, index_to_word, tag_to_index, index_to_tag

word_to_index, index_to_word, tag_to_index, index_to_tag = create_mappings(train_sentences)


In [50]:
index_to_word

{0: 'inflict',
 1: 'Goldwater',
 2: '6,320',
 3: 'memorabilia',
 4: 'rank-and-file',
 5: '98',
 6: 'school-lunch',
 7: 'regions',
 8: 'missed',
 9: 'relates',
 10: 'multi-spired',
 11: 'mediator',
 12: 'LATE',
 13: 'fun',
 14: 'Electronic',
 15: 'wording',
 16: 'SCI',
 17: 'minimum-fee',
 18: 'Nearby',
 19: 'Watching',
 20: '3.125',
 21: 'languorous',
 22: 'Mintz',
 23: 'crept',
 24: 'outback',
 25: '2.51',
 26: 'peanut',
 27: 'premise',
 28: 'Hypotheekkas',
 29: 'PTL',
 30: '8.007',
 31: 'foreclosure',
 32: 'ASSOCIATION',
 33: 'swallowed',
 34: 'registered',
 35: 'psychobiology',
 36: 'aluminum-makers',
 37: 'tying',
 38: 'reliance',
 39: 'Chojnowski',
 40: 'Nope',
 41: 'flame',
 42: 'corresponded',
 43: 'TILT',
 44: 'factories',
 45: '535',
 46: 'Thought',
 47: '1998-2011',
 48: 'Doskocil',
 49: 'Widely',
 50: 'hard-disk',
 51: 'citizen',
 52: 'immersed',
 53: 'unrolls',
 54: 'Look',
 55: 'Stoneridge',
 56: 'instead',
 57: 'bookstore',
 58: 'sporting-goods',
 59: 'shout',
 60: 'criti