<a href="https://colab.research.google.com/github/gaixen/BCS_recruitment/blob/main/VeritasVigil%3A%20The%20truth%20Watchman/demo_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Custom Tokenizer Development

In [1]:
import re
from typing import Text

In [2]:
class customtokenizer:
    """Rule-based tokenizer for noisy, informal English text.

    ``tokenize`` lowercases the text, expands contractions, pulls out
    emoticons, splits punctuation into separate tokens and collapses
    elongated words ("mannnnny" -> "many"), recording the elongation as a
    ``<REpEat:n>`` marker token.
    """

    # A non-digit character occurring three or more times in a row.
    # Digits are excluded so numbers such as "1000" are left untouched.
    _REPEAT_PATTERN = re.compile(r'(\D)\1{2,}')
    # Punctuation marks that become stand-alone tokens.
    _PUNCTUATION_PATTERN = re.compile(r'([!?.,;:"(){}\[\]])')
    # "he's" / "she's" / "it's" followed by a word; the following word decides
    # whether the contraction stands for "has" or "is".
    _S_CONTRACTION_PATTERN = re.compile(r"\b(he|she|it)'s\s+(\w+)", flags=re.IGNORECASE)

    def __init__(self):
        # eyes, optional nose, mouth.  The look-arounds require the emoticon
        # to start a whitespace-delimited chunk and not run into a word, so
        # text such as "(2018)" or "http://" is not mistaken for an emoticon.
        self.emoticon_pattern = re.compile(
            r"(?<!\S)[:;=8][\-o\*]?[\)\]\(\[dDpP/:\}\{@\|\\](?!\w)"
        )
        # Some frequently used short forms.  They are expanded early so the
        # later punctuation splitting does not have to deal with apostrophes.
        # Keys are lowercase; `tokenize` lowercases before expanding.
        self.contractions = {
            "can't": "can not", "won't": "will not", "i'm": "i am",
            "he's": "he is", "she's": "she is", "it's": "it is",
            "that's": "that is", "there's": "there is", "what's": "what is",
            "who've": "who have",
            # Leading space: this suffix is always glued to the previous word
            # ("they've" -> "they have", not "theyhave").
            "'ve": " have",
            "didn't": "did not", "don't": "do not", "isn't": "is not",
            "shouldn't": "should not",
        }

    def expand_contractions(self, text: str) -> str:
        """Return *text* with the known contractions written out.

        "he's/she's/it's <word>" becomes "... has <word>" when <word> ends in
        "en" or "ne" (a rough past-participle check, e.g. "taken", "gone")
        and "... is <word>" otherwise.  Remaining contractions are replaced
        from ``self.contractions`` (case-sensitive, lowercase keys).
        """
        def expand_quotes(match):
            subject = match.group(1)
            following = match.group(2)
            auxiliary = "has" if following.lower().endswith(("en", "ne")) else "is"
            return f"{subject} {auxiliary} {following}"

        text = self._S_CONTRACTION_PATTERN.sub(expand_quotes, text)

        for contraction, expanded in self.contractions.items():
            text = re.sub(r'\b' + re.escape(contraction) + r'\b', expanded, text)

        return text

    def normalize(self, word: str) -> list[str]:
        """Collapse character runs of length >= 3 in *word*.

        Returns ``[word]`` when there is no such run.  Otherwise returns the
        word with every run reduced to a single character, followed by a
        ``<REpEat:n>`` marker where ``n`` is the length of the first run.
        """
        match = self._REPEAT_PATTERN.search(word)
        if match is None:
            return [word]
        repeat_count = len(match.group(0))
        # r'\1' keeps each run's own character (and needs no escaping).
        normalized = self._REPEAT_PATTERN.sub(r'\1', word)
        return [normalized, f"<REpEat:{repeat_count}>"]

    def tokenize(self, text: str) -> list[str]:
        """Split *text* into lowercase tokens.

        Emoticons are removed from the running text and appended at the end
        of the returned list, after all word/punctuation tokens.
        """
        text = text.lower()
        text = self.expand_contractions(text)
        emoticons = self.emoticon_pattern.findall(text)
        text = self.emoticon_pattern.sub('', text)
        # Surround punctuation with spaces so `split` isolates each mark.
        text = self._PUNCTUATION_PATTERN.sub(r' \1 ', text)
        tokens = []
        for word in text.split():
            tokens.extend(self.normalize(word))
        return tokens + emoticons


In [3]:
# Quick manual check of the tokenizer on a noisy sentence (elongated word,
# uppercase, contraction, repeated punctuation).
if __name__ == "__main__":
    tokenizer = customtokenizer()
    sample1 = "there are mannnnny of the PROTAGONISTS it's abhored IT...!!"
    sample2="latttent"  # defined but not tokenized in this cell
    tokens = tokenizer.tokenize(sample1)
    print(tokens)

['there', 'are', 'many', '<REpEat:5>', 'of', 'the', 'protagonists', 'it', 'is', 'abhored', 'it', '.', '.', '.', '!', '!']


# Rule-Based POS (Part-of-Speech) Tagger

Distinguish between nouns, adjectives and verbs only


In [4]:
class POS_tagger:
    """Rule-based part-of-speech tagger driven by word lists and suffixes.

    Rules are tried in a fixed order and the first match wins, so earlier
    rules shadow later ones (for example the noun suffix 's' is tested before
    every verb and adjective suffix).
    """

    # Elongation markers such as "<REpEat:5>" produced by the tokenizer.
    # Matching is case-insensitive and the angle brackets are optional so
    # every spelling used across this file is recognised.
    _REPEAT_TOKEN = re.compile(r'<?REPEAT:\d+>?', re.IGNORECASE)
    _PUNCTUATION = re.compile(r'[.,!?;:\'\"()\[\]{}]')
    _NUMBER = re.compile(r'\d+(\.\d+)?')

    def __init__(self):
        self.pronouns = {"i", "you", "he", "she", "it", "we", "they", "me", "us", "them", "their"}
        self.determiners = {"the", "a", "an", "this", "that", "these", "those", "there"}
        self.adj_endings = ['ous', 'ful', 'ive', 'al', 'ic', 'able', 'ible', 'ary']
        self.verb_endings = ['ing', 'ed', 'en', 'es', 's', 'ise', 'ize']
        self.be_verb = {"is", "am", "are", "was", "were"}
        self.adv_endings = ['ly', 'ily']
        self.adverbs_common = {"very", "most", "so"}
        self.prepositions = ["of", "to", "in", "for", "on", "with", "at", "by", "from", "about", "over", "after", "as"]
        self.pre = {"REpEat"}
        # 's' makes every word ending in "s" a noun (e.g. "has", "arduous"),
        # because noun suffixes are checked before verb/adjective suffixes.
        self.noun_endings = ['tion', 'ment', 'ness', 'ity', 'ist', 'ism', 'ance', 'ence', 'ship', 's']

    def tagger(self, tokens: list[str]) -> list[tuple[str, str]]:
        """Tag each token and return ``(token, tag)`` pairs in input order.

        Elongation markers (``<REpEat:n>``) carry no lexical content and are
        dropped from the result.  Possible tags: "punctuation", "pronoun",
        "determiners", "verb", "adverb", "NUM", "preposition", "OTHER",
        "noun", "adjective", "ignore" and "other".
        """
        noun_endings = tuple(self.noun_endings)
        verb_endings = tuple(self.verb_endings)
        adj_endings = tuple(self.adj_endings)
        adv_endings = tuple(self.adv_endings)
        prefixes = tuple(self.pre)

        tagging_done = []
        for token in tokens:
            if self._REPEAT_TOKEN.fullmatch(token):
                continue
            if self._PUNCTUATION.fullmatch(token):
                tag = "punctuation"
            elif token in self.pronouns:
                tag = "pronoun"
            elif token in self.determiners:
                tag = "determiners"
            elif token in self.be_verb:
                tag = "verb"
            elif token in self.adverbs_common:
                tag = "adverb"
            elif self._NUMBER.fullmatch(token):
                tag = "NUM"
            elif token in self.prepositions:
                tag = "preposition"
            elif token.startswith("<REPEAT"):
                # Malformed marker (well-formed ones were skipped above).
                tag = "OTHER"
            elif token.endswith(noun_endings):
                tag = "noun"
            elif token.endswith(verb_endings):
                tag = "verb"
            elif token.endswith(adj_endings):
                tag = "adjective"
            elif token.endswith(adv_endings):
                tag = "adverb"
            elif token.startswith(prefixes):
                tag = "ignore"
            else:
                tag = "other"
            tagging_done.append((token, tag))
        return tagging_done

In [5]:
# Manual check of tokenizer + tagger: prints the tokens of `sample_1` and the
# tags assigned to `sample_2`.
if __name__ == "__main__":
    tokenizer = customtokenizer()
    tagger = POS_tagger()
    sample_1 = "Sooooooo scary!!!IT's very arduous!!"
    sample_2 = "there he's taken are mannnnny of the PROTAGONISTS who've abhored IT...!!"
    tokens = tokenizer.tokenize(sample_2)
    tagging_done = tagger.tagger(tokens)
    tokens = tokenizer.tokenize(sample_1)
    print(tokens)
    # The loop variable is called `tag` (not `tagger`) so the POS_tagger
    # instance above is not overwritten by a tag string.
    for token, tag in tagging_done:
        if re.fullmatch(r'<REpEat:\d+>', token):
            continue
        print(f"{token:15} : {tag}")

['so', '<REpEat:7>', 'scary', '!', '!', '!', 'it', 'is', 'very', 'arduous', '!', '!']
there           : determiners
he              : pronoun
has             : noun
taken           : verb
are             : verb
many            : other
of              : preposition
the             : determiners
protagonists    : noun
who             : other
have            : other
abhored         : verb
it              : pronoun
.               : punctuation
.               : punctuation
.               : punctuation
!               : punctuation
!               : punctuation


# Custom Stemmer or Lemmatizer

The motivation for this stage is to reduce related tokens such as "eaten", "ate" and "eating" to their stem, i.e. "eat". Care must be taken to avoid over-stemming, e.g. "protagonists" must not be reduced to "protagon". In practice, I mainly try to convert tokens that are tagged as verbs :)

In [6]:
class lemmatizer:
    """Suffix-stripping lemmatizer keyed on the POS tag of each token."""

    # Doubled final consonants that belong to the stem and must be kept after
    # stripping a verb suffix ("passed" -> "pass", "calling" -> "call").
    # Same exception list as step 1b of the Porter stemmer.
    _KEEP_DOUBLED = frozenset("lsz")

    def __init__(self):
        self.verb_endings = ['ing', 'ed', 'en', 'es', 'ise', 'ize']
        self.be_verb = {"is", "am", "are", "was", "were"}
        self.noun_endings = ['ment', 'ness', 'ity', 'tion', 'sion', 'er', 'or']
        self.adj_endings = ['ous', 'ful', 'ive', 'al', 'ic', 'able', 'ible', 'ary']

    @staticmethod
    def _strip_suffix(token, suffixes):
        """Return *token* without the first listed suffix that applies, else None.

        A suffix applies when the token ends with it and more than two
        characters would remain after removing it.
        """
        for suffix in suffixes:
            if token.endswith(suffix) and len(token) > len(suffix) + 2:
                return token[:-len(suffix)]
        return None

    def lemmatize(self, token: str, pos: str) -> str:
        """Return the lemma of *token* for the tag *pos*.

        Only the tags 'verb', 'noun' and 'adjective' trigger suffix
        stripping; any other tag returns *token* unchanged.  For verbs a
        doubled final consonant left by the stripping is undoubled
        ("running" -> "runn" -> "run") unless it is l, s or z.
        """
        if pos == 'verb':
            stem = self._strip_suffix(token, self.verb_endings)
            if stem is None:
                return token
            if (len(stem) >= 2 and stem[-1] == stem[-2]
                    and stem[-1] not in self._KEEP_DOUBLED):
                stem = stem[:-1]
            return stem
        if pos == 'noun':
            stem = self._strip_suffix(token, self.noun_endings)
        elif pos == 'adjective':
            stem = self._strip_suffix(token, self.adj_endings)
        else:
            stem = None
        return token if stem is None else stem


In [7]:
# Manual check of the full tokenizer -> tagger -> lemmatizer chain.
if __name__ == "__main__":
    tokenizer = customtokenizer()
    tagger = POS_tagger()
    # `lemmatizer` is the class name until this cell runs once; afterwards it
    # refers to an instance (later cells rely on that).  Instantiating only
    # when it is still a class keeps the cell safe to re-run.
    if isinstance(lemmatizer, type):
        lemmatizer = lemmatizer()
    sample = "He's been running from the protagonists, fearing their abhored powers."
    tokens = tokenizer.tokenize(sample)
    print("tokens:", tokens)
    tagging_done = tagger.tagger(tokens)
    print("\nLemmatization:")
    for tok, tag in tagging_done:
        # Marker spelling must match what the tokenizer emits: "<REpEat:n>".
        if re.fullmatch(r'<REpEat:\d+>', tok):
            continue
        lemma = lemmatizer.lemmatize(tok, tag)
        print(f"{tok:15} ({tag}):{lemma}")

tokens: ['he', 'has', 'been', 'running', 'from', 'the', 'protagonists', ',', 'fearing', 'their', 'abhored', 'powers', '.']

Lemmatization:
he              (pronoun):he
has             (noun):has
been            (verb):been
running         (verb):run
from            (preposition):from
the             (determiners):the
protagonists    (noun):protagonists
,               (punctuation):,
fearing         (verb):fear
their           (pronoun):their
abhored         (verb):abhor
powers          (noun):powers
.               (punctuation):.


# Loading the dataset into the pipeline

In [None]:
!pip install wolta

Collecting wolta
  Downloading wolta-0.3.7-py3-none-any.whl.metadata (960 bytes)
Collecting catboost (from wolta)
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Collecting imblearn (from wolta)
  Downloading imblearn-0.0-py2.py3-none-any.whl.metadata (355 bytes)
Downloading wolta-0.3.7-py3-none-any.whl (19 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import re

In [None]:
# Load the two CSV files from the Colab working directory; the paths are
# absolute, so both files must already be present under /content.
df_true=pd.read_csv('/content/True.csv')
df_fake=pd.read_csv('/content/Fake.csv')

In [None]:
df_true.head()

In [None]:
# Pull the article bodies out of each frame as plain Python lists.
fake_texts = df_fake['text'].tolist()
true_texts = df_true['text'].tolist()

In [None]:
# Fake articles come first, then true ones; `labels` is built in the same
# order so that labels[i] belongs to texts[i].
texts = fake_texts + true_texts
labels = [0]*len(fake_texts) + [1]*len(true_texts)#0 for fake and 1 for true

# Integrating tokenizer, tagger and lemmatizer into the pipeline

In [None]:
tokenizer = customtokenizer()
tagger = POS_tagger()
# `lemmatizer` may still be the class, or may already have been rebound to an
# instance by an earlier demo cell; make sure it is an instance either way.
if isinstance(lemmatizer, type):
    lemmatizer = lemmatizer()


def preprocess(text):
    """Tokenize, POS-tag and lemmatize *text*; return the lemmas space-joined.

    Elongation markers (``<REpEat:n>``) are dropped so that only lexical
    tokens reach the vectorizer.
    """
    tokens = tokenizer.tokenize(text)
    tagged = tagger.tagger(tokens)
    lemmas = [
        lemmatizer.lemmatize(tok, tag)
        for tok, tag in tagged
        if not re.fullmatch(r'<REpEat:\d+>', tok)
    ]
    return ' '.join(lemmas)  # join back for vectorization

In [None]:
# Run the whole corpus through the preprocessing chain, one document at a time.
preprocessed_texts = [preprocess(doc) for doc in texts]

# Decoding the sentiments (Bag of Words)

In [None]:
class bag_of_words:
    """Count-based bag-of-words vectorizer.

    Tokens are the ``\\b\\w+\\b`` matches of the lowercased document.  A
    token enters the vocabulary when it occurs in at least ``min_freq``
    documents.
    """

    def __init__(self, min_freq=1):
        self.vocabulary = {}
        self.min_freq = min_freq

    def build_vocabulary(self, documents):
        """Rebuild ``self.vocabulary`` (token -> column index) from *documents*.

        Indices are contiguous, start at 0 and follow the order in which the
        tokens are first seen, so repeated fits on the same data give the
        same columns.
        """
        from collections import Counter

        doc_freq = Counter()
        for doc in documents:
            tokens = re.findall(r'\b\w+\b', doc.lower())
            # dict.fromkeys de-duplicates while keeping first-seen order;
            # iterating a set would make the column order vary between runs.
            for token in dict.fromkeys(tokens):
                doc_freq[token] += 1

        # Always start from an empty mapping, even when `documents` is empty.
        self.vocabulary = {}
        for word, count in doc_freq.items():
            if count >= self.min_freq:
                # len() of the mapping is the next free contiguous index.
                self.vocabulary[word] = len(self.vocabulary)

    def transformation(self, documents):
        """Return a dense integer array with one count vector per document.

        Tokens that are not in the vocabulary are ignored.
        """
        vocab_size = len(self.vocabulary)
        vectors = []
        for doc in documents:
            vector = np.zeros(vocab_size, dtype=int)
            for token in re.findall(r'\b\w+\b', doc.lower()):
                loc = self.vocabulary.get(token)
                if loc is not None:
                    vector[loc] += 1
            vectors.append(vector)
        return np.array(vectors)

    def fitting(self, documents):
        """Build the vocabulary from *documents* and return their count matrix."""
        self.build_vocabulary(documents)
        return self.transformation(documents)



In [None]:
# Bag-of-words features over the raw (un-preprocessed) texts.
vectorizer = bag_of_words(min_freq=2)
X = vectorizer.fitting(texts)
y = np.array(labels)

# The vocabulary lives in the `vocabulary` attribute; `build_vocabulary` is
# the method that fills it and has no len().
print(f"Vocabulary size: {len(vectorizer.vocabulary)}")
print(f"Feature vector shape: {X.shape}")


# TF-IDF implementation

In [None]:
from collections import defaultdict, Counter
from scipy.sparse import lil_matrix
import numpy as np
import math

class TFIDFVectorizer:
    """TF-IDF vectorizer over whitespace-separated tokens.

    ``tf`` is the token count divided by the number of tokens in the
    document; ``idf`` is the smoothed ``log((1 + N) / (1 + df)) + 1``.
    Tokens occurring in fewer than ``min_freq`` documents are left out.
    """

    def __init__(self, min_freq=1):
        self.min_freq = min_freq
        self.vocab = {}  # token -> column index
        self.idf = {}    # token -> inverse document frequency

    def fit(self, documents):
        """Learn the vocabulary and IDF weights from *documents*."""
        doc_freq = defaultdict(int)
        total_docs = len(documents)
        for doc in documents:
            # Unique tokens per document, in first-seen order so the column
            # layout is the same on every run (set order is not).
            for token in dict.fromkeys(doc.split()):
                doc_freq[token] += 1

        # Filter BEFORE numbering: numbering first leaves gaps in the indices
        # and makes some of them >= len(vocab), which overflows the matrix
        # allocated in `transform`.
        kept = [token for token, freq in doc_freq.items() if freq >= self.min_freq]
        self.vocab = {token: idx for idx, token in enumerate(kept)}
        # Rebuilt from scratch so a refit leaves no stale entries behind.
        self.idf = {
            token: math.log((1 + total_docs) / (1 + doc_freq[token])) + 1  # smooth IDF
            for token in kept
        }

    def transform(self, documents):
        """Return the TF-IDF matrix of *documents* as a float32 CSR matrix.

        The shape is ``(len(documents), len(self.vocab))``.  Tokens outside
        the vocabulary get no column but still count towards the document
        length used in the term frequency.
        """
        rows = len(documents)
        cols = len(self.vocab)
        X = lil_matrix((rows, cols), dtype=np.float32)

        for i, doc in enumerate(documents):
            tf = Counter(doc.split())
            total_terms = sum(tf.values())
            for token, count in tf.items():
                col = self.vocab.get(token)
                if col is not None:
                    X[i, col] = (count / total_terms) * self.idf[token]
        return X.tocsr()

    def fit_transform(self, documents):
        """Fit on *documents* and return their TF-IDF matrix."""
        self.fit(documents)
        return self.transform(documents)


In [None]:
updated_text=preprocessed_texts

In [None]:
# Build TF-IDF features from the preprocessed texts.  This rebinds
# `vectorizer`, `X` and `y`, replacing the bag-of-words versions created in
# the earlier cell; the models below are trained on these TF-IDF features.
if __name__ == "__main__":
    vectorizer = TFIDFVectorizer(min_freq=1)
    X = vectorizer.fit_transform(updated_text)
    y=np.array(labels)
    # print(f"Vocabulary: {vectorizer.vocab}")
    # print("TF-IDF matrix shape:", X.shape)
    # print(X.toarray())


# Training the model

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.svm import SVC

In [None]:
# Hold out 20% of the samples for testing; the fixed seed makes the split
# reproducible.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

Using Naive-Bayes model

---



In [None]:
# Multinomial Naive Bayes with default hyperparameters, trained on the
# training split and evaluated on the held-out split.
naive_bayes=MultinomialNB()
naive_bayes.fit(X_train,y_train)
y_pred_naive_bayes=naive_bayes.predict(X_test)

Using support vector machine

---



In [None]:
# Support vector classifier with default hyperparameters.
# NOTE(review): no kernel/C is chosen explicitly and training may be slow on
# the full TF-IDF matrix — confirm the defaults are intended.
svm=SVC()
svm.fit(X_train,y_train)
y_pred_svm=svm.predict(X_test)

# Visualisations

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def plot_wordcloud(texts, title='Word Cloud'):
    """Render one word cloud for all of *texts* under the heading *title*."""
    corpus = ' '.join(texts)
    cloud = WordCloud(width=800, height=400, background_color='white')
    cloud = cloud.generate(corpus)

    plt.figure(figsize=(12, 6))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')  # a word cloud has no meaningful axes
    plt.title(title, fontsize=16)
    plt.show()


plot_wordcloud(updated_text, title='common Words in News Dataset')


In [None]:
from collections import Counter

def plot_top_tokens(texts, top_n=20, title='Top Tokens'):
    """Bar-plot the *top_n* most frequent whitespace-separated tokens.

    Prints a message and returns without plotting when there is nothing to
    count (no texts, only blank texts, or ``top_n`` of 0).
    """
    token_counts = Counter()
    # Count text by text rather than joining the corpus into one big string.
    for text in texts:
        token_counts.update(text.split())

    common = token_counts.most_common(top_n)
    if not common:
        print("No tokens to plot.")
        return
    names, values = zip(*common)

    plt.figure(figsize=(10, 5))
    plt.bar(names, values, color='skyblue')
    plt.xticks(rotation=45, ha='right')
    plt.title(title)
    plt.ylabel('Frequency')
    plt.tight_layout()
    plt.show()
plot_top_tokens(updated_text, title='most Common Tokens')


In [None]:
import re

def plot_repeat_tokens(texts):
    """Plot how often each elongation length ``n`` occurs in *texts*.

    Looks for the tokenizer's ``<REpEat:n>`` markers; matching is
    case-insensitive so any capitalisation of the marker is counted.
    """
    marker = re.compile(r'<REPEAT:(\d+)>', re.IGNORECASE)
    repeat_counts = Counter()
    for text in texts:
        repeat_counts.update(int(n) for n in marker.findall(text))

    if not repeat_counts:
        print("No <REPEAT:n> tokens found.")
        return

    keys, values = zip(*sorted(repeat_counts.items()))
    plt.figure(figsize=(8, 5))
    plt.bar(keys, values, color='coral')
    plt.xlabel('Repeat Count (n)')
    plt.ylabel('Frequency')
    plt.title('Distribution of <REPEAT:n> Tokens')
    plt.xticks(keys)
    plt.show()
# NOTE(review): `updated_text` is the output of `preprocess`, which filters
# out the <REpEat:n> markers, so this call will probably report that none
# were found — confirm which text should be passed here.
plot_repeat_tokens(updated_text)


In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def plot_confusion_matrix(y_true, y_pred, labels=('Fake', 'Real'), title='Confusion Matrix'):
    """Plot the confusion matrix of *y_pred* against *y_true*.

    *labels* are the display names in ascending class order.  This file
    encodes fake as 0 and true as 1, hence the default ('Fake', 'Real').
    """
    cm = confusion_matrix(y_true, y_pred)
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=list(labels))
    disp.plot(cmap=plt.cm.Blues)
    plt.title(title)
    plt.show()


y_pred = svm.predict(X_test)
plot_confusion_matrix(y_test, y_pred)


In [None]:
def plot_class_distribution(labels, label_names=('Fake', 'Real')):
    """Bar-plot the number of samples per class.

    *label_names* is indexed by the integer class label.  This file encodes
    fake as 0 and true as 1, hence the default ('Fake', 'Real').
    """
    from collections import Counter

    count = Counter(labels)
    classes = sorted(count)
    keys = [label_names[k] for k in classes]
    values = [count[k] for k in classes]

    plt.figure(figsize=(6, 4))
    # Colours follow the bar order: fake in red, real in green.
    plt.bar(keys, values, color=['red', 'green'])
    plt.title('Class Distribution')
    plt.ylabel('Count')
    plt.show()


plot_class_distribution(y)


# Evaluation Metrics

In [None]:
from sklearn.metrics import roc_curve, auc

def plot_roc(model, X_test, y_test, title='ROC Curve'):
    """Plot the ROC curve of a fitted binary classifier on the test split.

    Uses ``decision_function`` scores when the model provides them and falls
    back to the positive-class column of ``predict_proba`` otherwise, so
    models such as Naive Bayes can be plotted as well.
    """
    if hasattr(model, "decision_function"):
        scores = model.decision_function(X_test)
    else:
        # Column 1 holds the probability of the second (positive) class.
        scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, scores)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 5))
    plt.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.2f})', color='red')
    # Diagonal reference line: a classifier that guesses at random.
    plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(title)
    plt.legend()
    plt.show()


plot_roc(svm, X_test, y_test, title='ROC Curve for SVM')


In [None]:
from sklearn.metrics import accuracy_score,classification_report,confusion_matrix,roc_curve,auc

In [None]:
# Accuracy and support-weighted F1 for both models on the held-out split.
accuracy_nb, f1_nb = (
    accuracy_score(y_test, y_pred_naive_bayes),
    f1_score(y_test, y_pred_naive_bayes, average='weighted'),
)
accuracy_svm, f1_svm = (
    accuracy_score(y_test, y_pred_svm),
    f1_score(y_test, y_pred_svm, average='weighted'),
)

# Print the scores as a quick sanity check.
print(f"Accuracy_nb: {accuracy_nb:.4f}")
print(f"F1 Score_nb: {f1_nb:.4f}")
print(f"Accuracy_svm: {accuracy_svm:.4f}")
print(f"F1 Score_svm: {f1_svm:.4f}")

In [None]:
def plot_model_performance(metrics_dict, title='Model Comparison'):
    """Draw a grouped bar chart comparing models across metrics.

    *metrics_dict* must contain a 'Model' entry with the model names; every
    other entry is a list of scores, one per model, plotted as one bar group
    member.
    """
    import pandas as pd

    df = pd.DataFrame(metrics_dict).set_index('Model')

    df.plot(kind='bar', figsize=(10, 6), colormap='viridis')
    plt.title(title)
    plt.ylabel('Score')
    plt.ylim(0, 1.05)  # scores lie in [0, 1]; leave a little headroom
    plt.xticks(rotation=0)
    plt.legend(loc='lower right')
    plt.tight_layout()
    plt.show()


metrics = {
    'Model': ['SVM', 'Naive Bayes'],
    'Accuracy': [accuracy_svm, accuracy_nb],
    'F1 Score': [f1_svm, f1_nb],
}
plot_model_performance(metrics)


In [None]:
# Predict once per model, then print the per-class reports followed by the
# confusion matrices.
nb_preds, svm_preds = naive_bayes.predict(X_test), svm.predict(X_test)

nb_report = classification_report(y_test, nb_preds)
print("Naive Bayes:\n", nb_report)
svm_report = classification_report(y_test, svm_preds)
print("SVM:\n", svm_report)

svm_matrix = confusion_matrix(y_test, svm_preds)
print("Confusion Matrix (SVM):\n", svm_matrix)
nb_matrix = confusion_matrix(y_test, nb_preds)
print("confusion matrix(NB):\n",nb_matrix)