<a href="https://colab.research.google.com/github/gaixen/BCS_recruitment/blob/main/VeritasVigil%3A%20The%20truth%20Watchman/demo_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Custom Tokenizer Development

In [1]:
import re
from typing import Text

In [2]:
class customtokenizer:
  def __init__(self):
    self.emoticon_pattern=re.compile(r'[:;=8][\-o\*]?[\\]dDpP/\:\}\{@\|\\}')
    self.contractions={"can't":"can not","won't":"will not","i'm":"i am","he's":"he is",
    "she's":"she is","it's":"it is","that's":"that is","there's":"there is","what's":"what is",
             "who've":"who have","'ve":"have","didn't":"did not","don't":"do not","isn't":"is not",
                       "shouldn't":"should not"}#some frequently used short-forms
  #they are identified early so that when in later functions we split punctuations etc, it doesn't provide a barrier
  def expand_contractions(self,text:str)->str:
    def expand_quotes(match):
        subject = match.group(1)
        token_0 = match.group(2)
        token_0 = token_0.lower()
        if len(token_0) >= 2 and (
            (token_0[-1] == 'e' and token_0[-2] == 'n') or
            (token_0[-1] == 'n' and token_0[-2] == 'e')
        ):
            return f"{subject} has {match.group(2)}"
        else:
            return f"{subject} is {match.group(2)}"
    text = re.sub(r"\b(he|she|it)'s\s+(\w+)", expand_quotes, text, flags=re.IGNORECASE)

    for contraction,expanded in self.contractions.items():
        text=re.sub(r'\b'+re.escape(contraction)+r'\b',expanded,text)

    return text


  def normalize(self,word:str)->list[str]:
    match=re.search(r'(.)\1{2,}',word)
    if match:
      char=match.group(1)
      repeat_count=len(match.group(0))
      normalized=re.sub(r'(.)\1{2,}',re.escape(char),word)
      return[normalized,f"<REpEat:{repeat_count}>"]
    else:
      return[word]

  def tokenize(self,text:str)->list[str]:
    text=text.lower()#lowercasing the words
    text=self.expand_contractions(text)#expand the contractions
    #text=self.emoticon_pattern(text)#identify emoticon patterns
    emoticons=self.emoticon_pattern.findall(text)
    text=self.emoticon_pattern.sub('',text)#substitute the emoticon patterns
    text=re.sub(r'([!?.,;:"(){}[\]])', r' \1 ',text)#splitting punctuations
    text=re.sub(r'\s{2,}',' ',text)
    tokens=[]#initialise a blank list of tokens
    for word in text.strip().split():
      tokens.extend(self.normalize(word))
    return tokens+emoticons


In [3]:
if __name__ == "__main__":
    tokenizer = customtokenizer()
    sample1 = "there are mannnnny of the PROTAGONISTS it's abhored IT...!!"
    sample2="latttent"
    tokens = tokenizer.tokenize(sample1)
    print(tokens)

['there', 'are', 'many', '<REpEat:5>', 'of', 'the', 'protagonists', 'it', 'is', 'abhored', 'it', '.', '.', '.', '!', '!']


# Rule‑Based POS(parts of speech) Tagger

Distinguish between nouns, adjectives and verbs only


In [4]:
class POS_tagger:
  def __init__(self):
    self.pronouns={"i", "you", "he", "she", "it", "we", "they", "me", "us", "them","their"}
    self.determiners = {"the", "a", "an", "this", "that", "these", "those","there"}
    self.adj_endings=['ous', 'ful', 'ive', 'al', 'ic', 'able', 'ible','ary']
    self.verb_endings=['ing','ed','en','es','s','ise','ize']
    self.be_verb={"is","am","are","was","were"}
    self.adv_endings=['ly','ily']
    self.adverbs_common={"very", "most" ,"so"}
    #self.noun_endings=['ment', 'ness', 'ity', 'tion', 'sion', 'er', 'or']
    self.prepositions=["of","to","in","for","on","with","at","by","from","about","over","after","as"]
    self.pre={"REpEat"}
    self.noun_endings = ['tion', 'ment', 'ness', 'ity', 'ist', 'ism', 'ance', 'ence', 'ship', 's']  # careful with 's'


  def tagger(self,tokens:list[str])->list[tuple[str,str]]:
    tagging_done=[]
    for token in tokens:
      if re.fullmatch(r'REPEAT:\d+',token):
        continue
      if re.fullmatch(r'[.,!?;:\'\"()\[\]{}]', token):
        tagger="punctuation"
      elif token in self.pronouns:
        tagger="pronoun"
      elif token in self.determiners:
        tagger="determiners"
      elif token in self.be_verb:
        tagger="verb"
      elif token in self.adverbs_common:
        tagger="adverb"
      elif re.fullmatch(r'\d+(\.\d+)?', token):
        tagger = "NUM"
      elif token in self.prepositions:
        tagger="preposition"
      elif token.startswith("<REPEAT"):
        tagger = "OTHER"
      elif any(token.endswith(suffix)for suffix in self.noun_endings) :
        tagger="noun"
      elif any(token.endswith(suffix)for suffix in self.verb_endings) :
        tagger="verb"
      elif any(token.endswith(suffix)for suffix in self.adj_endings) :
        tagger="adjective"
      elif any(token.endswith(suffix)for suffix in self.adv_endings) :
        tagger="adverb"

      elif any(token.startswith(pre)for pre in self.pre):
        tagger="ignore"
      else:
        tagger="other"
      tagging_done.append((token,tagger))
    return tagging_done

In [5]:
if __name__ == "__main__":
    tokenizer = customtokenizer()
    tagger = POS_tagger()
    sample_1 = "Sooooooo scary!!!IT's very arduous!!"
    sample_2="there he's taken are mannnnny of the PROTAGONISTS who've abhored IT...!!"
    tokens = tokenizer.tokenize(sample_2)
    tagging_done = tagger.tagger(tokens)
    tokens = tokenizer.tokenize(sample_1)
    print(tokens)
for token, tagger in tagging_done:
  if re.fullmatch(r'<REpEat:\d+>', token):
    continue
  else:
    print(f"{token:15} : {tagger}")

['so', '<REpEat:7>', 'scary', '!', '!', '!', 'it', 'is', 'very', 'arduous', '!', '!']
there           : determiners
he              : pronoun
has             : noun
taken           : verb
are             : verb
many            : other
of              : preposition
the             : determiners
protagonists    : noun
who             : other
have            : other
abhored         : verb
it              : pronoun
.               : punctuation
.               : punctuation
.               : punctuation
!               : punctuation
!               : punctuation


# Custom Stemmer or Lemmatizer

The motivation of this pipeline is to reduce similar tokens like "eaten","ate","eating" to their stem word i.e."eat". But it is to be taken care that over-stemming is avoided like "protagonists" isn't converted to "protagon". Basically I will try to convert only those tokens which are verb :)

In [6]:
class lemmatizer:
  def __init__(self):
    self.verb_endings=['ing','ed','en','es','ise','ize']
    self.be_verb={"is","am","are","was","were"}
    self.noun_endings=['ment', 'ness', 'ity', 'tion', 'sion', 'er', 'or']
    self.adj_endings=['ous', 'ful', 'ive', 'al', 'ic', 'able', 'ible','ary']

  def lemmatize(self,token:str,pos:str)->str:
    lemma=token
    if pos=='verb':
      for suffix in self.verb_endings:
        if token.endswith(suffix) and len(token)>len(suffix)+2:
          lemma=token[:-len(suffix)]
          if len(lemma) >= 2 and lemma[-1] == lemma[-2]:
             lemma = lemma[:-1]
          break
    elif pos=='noun':
      for suffix in self.noun_endings:
        if token.endswith(suffix) and len(token)>len(suffix)+2:
          lemma=token[:-len(suffix)]
          break
    elif pos=='adjective':
      for suffix in self.adj_endings:
        if token.endswith(suffix) and len(token)>len(suffix)+2:
          lemma=token[:-len(suffix)]
          break
    return lemma


In [7]:
if __name__ == "__main__":
    tokenizer = customtokenizer()
    tagger = POS_tagger()
    lemmatizer = lemmatizer()
    sample = "He's been running from the protagonists, fearing their abhored powers."
    tokens = tokenizer.tokenize(sample)
    print("tokens:", tokens)
    tagging_done = tagger.tagger(tokens)
    print("\nLemmatization:")
    for tok, tag in tagging_done:
        if re.fullmatch(r'<REPEAT:\d+>', tok):
            continue
        lemma = lemmatizer.lemmatize(tok, tag)
        print(f"{tok:15} ({tag}):{lemma}")

tokens: ['he', 'has', 'been', 'running', 'from', 'the', 'protagonists', ',', 'fearing', 'their', 'abhored', 'powers', '.']

Lemmatization:
he              (pronoun):he
has             (noun):has
been            (verb):been
running         (verb):run
from            (preposition):from
the             (determiners):the
protagonists    (noun):protagonists
,               (punctuation):,
fearing         (verb):fear
their           (pronoun):their
abhored         (verb):abhor
powers          (noun):powers
.               (punctuation):.


# Loading the dataset into the pipeline

In [8]:
!pip install wolta



In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import re

In [10]:
df_true=pd.read_csv('/content/True.csv')
df_fake=pd.read_csv('/content/Fake.csv')

In [11]:
df_true.head()

Unnamed: 0,title,text,subject,date
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017"
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017"
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017"
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017"
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017"


In [12]:
fake_texts=[]
for text in df_fake['text']:
  fake_texts.append(text)
true_texts=[]
for text in df_true['text']:
  true_texts.append(text)

In [13]:
texts = fake_texts + true_texts
labels = [0]*len(fake_texts) + [1]*len(true_texts)#0 for fake and 1 for true

# Integrating tokenizer,tagger and lemmatizer into the pipeline

In [14]:
tokenizer = customtokenizer()
tagger = POS_tagger()
lemmatizer = lemmatizer # This line was causing the error
def preprocess(text):
    tokens=tokenizer.tokenize(text)
    tagged=tagger.tagger(tokens)
    lemmas=[lemmatizer.lemmatize(tok, tag)
              for tok, tag in tagged if not re.fullmatch(r'<REpEat:\d+>', tok)] # Changed tagging_done to tagged
    return ' '.join(lemmas)  #join back for vectorization

In [15]:
preprocessed_texts = [preprocess(doc) for doc in texts]

# Decoding the sentiments(Bag of Words)

In [None]:
class bag_of_words:
  def __init__(self,min_freq=1):
    self.vocabulary={}
    self.min_freq=min_freq

  def build_vocabulary(self,documents):
    from collections import Counter
    word_counts=Counter()
    freq={}
    for doc in documents:
      tokens=re.findall(r'\b\w+\b',doc.lower())
      unique_tokens=set(tokens)
      for token in unique_tokens:
        freq[token]=freq.get(token,0)+1
        self.vocabulary = {}
    index = 0
    # Manually assign indices starting from 0
    for word, count in freq.items():
        if count >= self.min_freq:
            self.vocabulary[word] = index
            index += 1
    #verify the size and maximum index
    #print(f"Built vocabulary size: {len(self.vocabulary)}")
    #if self.vocabulary:
    #     max_index = max(self.vocabulary.values())
    #     print(f"Max index in vocabulary: {max_index}")
    #self.vocabulary={word: i for i, (word, count) in enumerate(freq.items()) if count >= self.min_freq}

  def transformation(self,documents):
    vectors=[]
    vocab_size = len(self.vocabulary)
    for doc in documents:
      tokens=re.findall(r'\b\w+\b',doc.lower())
      vector=np.zeros(len(self.vocabulary),dtype=int)
      for token in tokens:
        loc=self.vocabulary.get(token)
        if loc is not None: #and 0 <= loc < vocab_size:
          vector[loc]=vector[loc]+1
      vectors.append(vector)
    return np.array(vectors)

  def fitting(self,documents):
    self.build_vocabulary(documents)
    return self.transformation(documents)



In [None]:
vectorizer = bag_of_words(min_freq=2)
X = vectorizer.fitting(texts)
y = np.array(labels)

print(f"Vocabulary size: {len(vectorizer.build_vocabulary)}")
print(f"Feature vector shape: {X.shape}")


# TF-IDF implemenation

In [16]:
from collections import defaultdict, Counter
from scipy.sparse import lil_matrix
import numpy as np
import math

class TFIDFVectorizer:
    def __init__(self, min_freq=1):
        self.min_freq = min_freq
        self.vocab = {}
        self.idf = {}

    def fit(self, documents):
        doc_freq = defaultdict(int)
        total_docs = len(documents)
        for doc in documents:
            tokens = set(doc.split())  # unique tokens per doc
            for token in tokens:
                doc_freq[token] += 1
        self.vocab = {
            token: idx for idx, (token, freq) in enumerate(doc_freq.items())
            if freq >= self.min_freq
        }
        for token in self.vocab:
            df = doc_freq[token]
            self.idf[token] = math.log((1 + total_docs) / (1 + df)) + 1  # smooth IDF

    def transform(self, documents):
        rows = len(documents)
        cols = len(self.vocab)
        X = lil_matrix((rows, cols), dtype=np.float32)

        for i, doc in enumerate(documents):
            tf = Counter(doc.split())
            total_terms = sum(tf.values())
            for token, count in tf.items():
                if token in self.vocab:
                    tf_val = count / total_terms
                    idf_val = self.idf[token]
                    X[i, self.vocab[token]] = tf_val * idf_val
        return X.tocsr()

    def fit_transform(self, documents):
        self.fit(documents)
        return self.transform(documents)


In [17]:
updated_text=preprocessed_texts

In [18]:
if __name__ == "__main__":
    documents = [
        "the protagonist is brave",
        "the antagonist is cunning",
        "brave souls fight villains",
        "the brave and the bold"
    ]

    vectorizer = TFIDFVectorizer(min_freq=1)
    X = vectorizer.fit_transform(updated_text)
    y=np.array(labels)
    print(f"Vocabulary: {vectorizer.vocab}")
    print("TF-IDF matrix shape:", X.shape)
    print(X.toarray())


TF-IDF matrix shape: (44898, 159309)
[[0.00982121 0.01302487 0.00794298 ... 0.         0.         0.        ]
 [0.00567951 0.         0.         ... 0.         0.         0.        ]
 [0.02487674 0.         0.         ... 0.         0.         0.        ]
 ...
 [0.005505   0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.02115786 ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.04669078 0.04669078 0.04669078]]


# Training the model

In [19]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.svm import SVC

In [20]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

Using Naive-Bayes model

---



In [21]:
naive_bayes=MultinomialNB()
naive_bayes.fit(X_train,y_train)
y_pred_naive_bayes=naive_bayes.predict(X_test)

Using support vector machine

---



In [None]:
svm=SVC()
svm.fit(X_train,y_train)
y_pred_svm=svm.predict(X_test)