<a href="https://colab.research.google.com/github/gaixen/BCS_recruitment/blob/main/VeritasVigil%3A%20The%20truth%20Watchman/demo_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Decoding the sentiments

In [110]:
!pip install wolta



In [111]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import re
from typing import Text

In [112]:
# Load the genuine-news and fake-news CSV files (in that order) from the
# Colab working directory into two DataFrames.
df_true, df_fake = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/True.csv', '/content/Fake.csv')
)

# Custom Tokenizer Development

In [134]:
class customtokenizer:
  """Lightweight tokenizer for noisy, informal English text.

  ``tokenize`` applies, in order: lowercasing, contraction expansion,
  emoticon extraction, punctuation isolation, whitespace splitting and
  collapsing of elongated words ("sooooo" -> "so" plus a repeat marker).
  """

  # Three or more identical characters in a row.  Digits are excluded so that
  # numbers such as "1000" are not rewritten to "10".
  _ELONGATION = re.compile(r'([^\d\n])\1{2,}')
  # Punctuation marks that become stand-alone tokens.
  _PUNCTUATION = re.compile(r'([!?.,;:"(){}[\]])')

  def __init__(self):
    # eyes, optional nose, mouth.  The lookarounds stop the pattern from
    # firing inside ordinary text such as "http://", "8pm" or "(2018)".
    self.emoticon_pattern = re.compile(
        r"(?<!\w)"
        r"[:;=8]"
        r"[\-o\*']?"
        r"[\)\]\(\[dDpP/\:\}\{@\|\\]"
        r"(?!\w)"
    )
    # Frequently used short forms.  They are expanded before punctuation is
    # split so that the apostrophe does not break them apart.  Specific
    # entries ("who've") come before the generic "'ve" suffix, whose
    # expansion carries a leading space so "they've" becomes "they have".
    self.contractions = {
        "can't": "can not", "won't": "will not", "i'm": "i am",
        "he's": "he is", "she's": "she is", "it's": "it is",
        "that's": "that is", "there's": "there is", "what's": "what is",
        "who've": "who have", "'ve": " have", "didn't": "did not",
        "don't": "do not", "isn't": "is not", "shouldn't": "should not",
    }

  def expand_contractions(self, text: str) -> str:
    """Return ``text`` with every known contraction replaced by its long form.

    Matching is case-sensitive and the table keys are lowercase, so callers
    are expected to lowercase ``text`` first (``tokenize`` does).
    """
    for contraction, expanded in self.contractions.items():
      text = re.sub(r'\b' + re.escape(contraction) + r'\b', expanded, text)
    return text

  def normalize(self, word: str) -> list[str]:
    """Collapse elongated character runs in a single word.

    Returns ``[word]`` unchanged when no non-digit character repeats three
    or more times in a row.  Otherwise returns the word with each such run
    reduced to one occurrence of its own character, followed by a
    ``<REpEat:N>`` marker where N is the length of the first run found.
    """
    match = self._ELONGATION.search(word)
    if match is None:
      return [word]
    repeat_count = len(match.group(0))
    # r'\1' keeps each run's own character; a fixed replacement would turn
    # "yeeesss" into "yee" and would fail on a run of backslashes.
    normalized = self._ELONGATION.sub(r'\1', word)
    return [normalized, f"<REpEat:{repeat_count}>"]

  def tokenize(self, text: str) -> list[str]:
    """Split ``text`` into lowercase word, punctuation and marker tokens.

    Emoticons are removed from the running text and appended, in order of
    appearance, after all other tokens.
    """
    text = self.expand_contractions(text.lower())
    emoticons = self.emoticon_pattern.findall(text)
    text = self.emoticon_pattern.sub('', text)
    # Pad punctuation with spaces so each mark becomes its own token.
    text = self._PUNCTUATION.sub(r' \1 ', text)
    tokens = []
    for word in text.split():
      tokens.extend(self.normalize(word))
    return tokens + emoticons


In [135]:
if __name__ == "__main__":
    # Quick manual check of the tokenizer on an elongated, mixed-case sample.
    sample2 = "latttent"
    sample1 = "there are mannnnny of the PROTAGONISTS it's abhored IT...!!"
    tokenizer = customtokenizer()
    tokens = tokenizer.tokenize(sample1)
    print(tokens)

['there', 'are', 'many', '<REpEat:5>', 'of', 'the', 'protagonists', 'it', 'is', 'abhored', 'it', '.', '.', '.', '!', '!']


# Rule-Based POS (Part-of-Speech) Tagger

Distinguish between nouns, adjectives and verbs only


In [140]:
class POS_tagger:
  """Rule-based part-of-speech tagger built on word lists and suffix rules.

  Each token receives exactly one tag.  Closed-class word lists are checked
  first, then numbers and repeat markers, then suffix heuristics in the
  order noun -> adjective -> adverb -> verb; the first rule that matches
  wins.
  """

  def __init__(self):
    self.pronouns={"i", "you", "he", "she", "it", "we", "they", "me", "us", "them"}
    self.determiners = {"the", "a", "an", "this", "that", "these", "those","there"}
    self.adj_endings=['ous', 'ful', 'ive', 'al', 'ic', 'able', 'ible','ary']
    self.verb_endings=['ing','ed','en','es','s','ise','ize']
    self.be_verb={"is","am","are","was","were"}
    self.adv_endings=['ly','ily']
    self.adverbs_common={"very", "most" ,"so"}
    self.noun_endings=['ment', 'ness', 'ity', 'tion', 'sion', 'er', 'or']
    self.prepositions=["of","to","in","for","on","with","at","by","from","about","over","after","as"]
    self.pre={"REpEat"}

  def tagger(self,tokens:list[str])->list[tuple[str,str]]:
    """Tag every token and return ``(token, tag)`` pairs in input order.

    Possible tags: "punctuation", "pronoun", "determiners", "adverb",
    "verb", "preposition", "NUM", "OTHER" (repeat markers such as
    ``<REpEat:5>``), "noun", "adjective", "ignore" and "other".
    """
    # str.endswith / str.startswith accept a tuple of alternatives; build
    # the tuples once instead of looping over the suffix lists per token.
    noun_suffixes = tuple(self.noun_endings)
    adj_suffixes = tuple(self.adj_endings)
    adv_suffixes = tuple(self.adv_endings)
    verb_suffixes = tuple(self.verb_endings)
    ignored_prefixes = tuple(self.pre)

    tagging_done = []
    for token in tokens:
      if re.fullmatch(r'[.,!?;:\'\"()\[\]{}]', token):
        tag = "punctuation"
      elif token in self.pronouns:
        tag = "pronoun"
      elif token in self.determiners:
        tag = "determiners"
      elif token in self.adverbs_common:
        tag = "adverb"
      elif token in self.be_verb:
        tag = "verb"
      elif token in self.prepositions:
        tag = "preposition"
      elif re.fullmatch(r'\d+(\.\d+)?', token):
        tag = "NUM"
      elif token.lower().startswith("<repeat"):
        # Compared case-insensitively: the marker is spelled "<REpEat:N>",
        # which an exact "<REPEAT" prefix test never matches.
        tag = "OTHER"
      elif token.endswith(noun_suffixes):
        tag = "noun"
      elif token.endswith(adj_suffixes):
        tag = "adjective"
      elif token.endswith(adv_suffixes):
        tag = "adverb"
      elif token.endswith(verb_suffixes):
        tag = "verb"
      elif token.startswith(ignored_prefixes):
        tag = "ignore"
      else:
        tag = "other"
      tagging_done.append((token, tag))
    return tagging_done

In [141]:
if __name__ == "__main__":
    # Demo: tokenize a sample sentence, tag it, and print one line per token.
    tokenizer = customtokenizer()
    tagger = POS_tagger()
    sample_1 = "Sooooooo scary!!!IT's very arduous!!"
    sample_2="there are mannnnny of the PROTAGONISTS who've abhored IT...!!"
    tokens = tokenizer.tokenize(sample_1)
    tagging_done = tagger.tagger(tokens)
    print(tokens)
    # The loop stays inside the guard because it depends on tagging_done, and
    # the loop variable is named `tag` so it does not overwrite the `tagger`
    # instance created above.
    for token, tag in tagging_done:
        # Repeat markers carry no part of speech; leave them out of the table.
        if re.fullmatch(r'<REpEat:\d+>', token):
            continue
        print(f"{token:15} : {tag}")

['so', '<REpEat:7>', 'scary', '!', '!', '!', 'it', 'is', 'very', 'arduous', '!', '!']
so              : adverb
scary           : adjective
!               : punctuation
!               : punctuation
!               : punctuation
it              : pronoun
is              : verb
very            : adverb
arduous         : adjective
!               : punctuation
!               : punctuation


# Custom Stemmer or Lemmatizer

The goal of this stage is to reduce inflected forms such as "eaten", "ate" and "eating" to their base form, i.e. "eat". Care must be taken to avoid over-stemming: for example, "protagonists" must not be reduced to "protagon". I will therefore try to convert only those tokens that are tagged as verbs :)

In [None]:
class lemmatizer:
  def __init__(self):
    self.verb_endings=['ing','ed','en','es','s','ise','ize']
    self.be_verb={"is","am","are","was","were"}
    self.noun_endings=['ment', 'ness', 'ity', 'tion', 'sion', 'er', 'or']
    self.adj_endings=['ous', 'ful', 'ive', 'al', 'ic', 'able', 'ible','ary']

  def lemmatize(self,token:str,pos:str)->str:
    lemma=token
