# _N-gram Model for Gibberish words detection_

## N-grams:
ordered sub-strings of length n

### "GIBBERISH"

__Length:__ m = 9 <br>
__Number of n-grams:__ (m - n + 1) = (10 - n) <br>
__2-grams (8):__ GI, IB, BB, BE, ER, RI, IS, SH <br>
__3-grams (7):__ GIB, IBB, BBE, BER, ERI, RIS, ISH <br>
<br>
A 3-gram model is developed over alphabetic characters. <br>
The probability of a 3-gram is calculated using conditional probabilities. <br>
<br>
___$P(GIB) = P(G) * P(I|G) * P(B|GI)$___ <br><br>
The 3 probabilities on the right-hand side are calculated using transition matrices trained on a corpus of merchant names, and these transition matrices serve as our model. <br><br> Allowing for 27 characters (26 letters and a space), our model will have 3 arrays (transition matrices) of dimensions: 27, 27x27, 27$^{2}$x27 <br><br>
The 3-gram probability is calculated for each of the (m-2) possible 3-grams and the results are averaged. The average transition probability serves as our score, on which we can set a threshold.

In [None]:
import glob
import math
import os
import re
import time
import warnings
from itertools import accumulate

import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords, wordnet
from nltk.probability import FreqDist

In [None]:
# Load the training data from the Windows-style path \verbatim\train.csv.
# NOTE(review): read_data is neither defined nor imported in the visible
# cells -- confirm it is available before running this cell.
df_train = read_data("\\verbatim\\train.csv")

In [None]:
def standardize_text(df, text_field):
    """Normalise the text column ``text_field`` of ``df`` in place and return ``df``.

    The column is reduced to the model alphabet (lower-case letters plus
    ``-`` as the word separator):

    1. ASCII punctuation is deleted outright.
    2. Every remaining non-letter (digits, whitespace, ...) becomes a space.
    3. Runs of spaces are collapsed to a single space.
    4. The text is lower-cased and stripped of leading/trailing spaces.
    5. The remaining single spaces are replaced with ``-``.

    Args:
        df: DataFrame holding the column to clean. It is modified in place.
        text_field: Name of the column to clean. It is also used as a regex
            to select the source column via ``DataFrame.filter``.

    Returns:
        The same ``df`` object, so the call can be chained.
    """
    # Copy the column whose name matches the regex `text_field` into
    # df[text_field] (kept from the original implementation).
    df[text_field] = df.filter(regex=text_field)

    # regex=True is passed explicitly: since pandas 2.0 str.replace treats the
    # pattern as a literal string by default, which made these calls no-ops.
    # '-', '[', '\' and ']' are escaped so the whole set stays one character class.
    cleaned = df[text_field].str.replace(
        r"[!#$%&'()*+,\-./:;<=>?@\[\\\]^_`{|}~]", "", regex=True
    )
    cleaned = cleaned.str.replace(r"[^A-Za-z]", " ", regex=True)
    # Collapse runs of spaces instead of deleting every space, otherwise the
    # word separator never reaches the model.
    cleaned = cleaned.str.replace(r" {2,}", " ", regex=True)
    cleaned = cleaned.str.lower().str.strip()
    cleaned = cleaned.str.replace(" ", "-", regex=False)

    df[text_field] = cleaned
    return df

In [None]:
# Take the 'Merch_Name' column of standardize_text's result and drop missing
# values; df3 is the collection the train_* functions iterate over.
df3 = standardize_text(df_train, 'Merch_Name')['Merch_Name'].dropna()

In [None]:
# Model alphabet: the 26 lower-case letters followed by '-'.
accepted_chars = 'abcdefghijklmnopqrstuvwxyz-'
# Character -> row/column index in the transition matrices.
pos = {char: idx for idx, char in enumerate(accepted_chars)}

In [None]:
# Every ordered pair of accepted characters, first character varying slowest
# ('aa', 'ab', ..., '--'). Built with a comprehension so no unused loop
# indices or loop variables are left behind at module level.
possible_bigrams = [first + second for first in accepted_chars for second in accepted_chars]

In [None]:
# Bigram -> row index in the trigram transition matrix.
pos2 = {bigram: idx for idx, bigram in enumerate(possible_bigrams)}

In [None]:
def ngram(n, l):
    """Yield each contiguous window of ``n`` items of ``l`` joined into a string.

    Windows are produced left to right; nothing is yielded when ``l`` has
    fewer than ``n`` items.
    """
    window_count = len(l) - n + 1
    start = 0
    while start < window_count:
        yield ''.join(l[start:start + n])
        start += 1
        
def train_unigram():
    """Train the character (1-gram) log-probability table from ``df3``.

    Reads the module-level ``df3`` (iterable of cleaned strings),
    ``accepted_chars`` and ``pos``.

    Returns:
        A list holding one row of ``len(accepted_chars)`` floats; entry
        ``[0][pos[c]]`` is the log of the smoothed relative frequency of
        character ``c``.

    Raises:
        KeyError: if a string in ``df3`` contains a character that is not in
            ``pos``.
    """
    k = len(accepted_chars)
    # Start every count at 10 (pseudo-count) so that unseen characters never
    # produce log(0).
    counts = [[10 for _ in range(k)]]
    for line in df3:
        for a in ngram(1, line):
            counts[0][pos[a]] += 1
    # Normalise each row to probabilities and store them as logs.
    for row in counts:
        total = float(sum(row))
        row[:] = [math.log(count / total) for count in row]
    return counts

def train_bigram():
    """Train the 2-gram transition log-probability matrix from ``df3``.

    Reads the module-level ``df3`` (iterable of cleaned strings),
    ``accepted_chars`` and ``pos``.

    Returns:
        A ``k x k`` list of lists (``k = len(accepted_chars)``); entry
        ``[pos[a]][pos[b]]`` is the log of the smoothed probability of
        character ``b`` following character ``a``. Each row is normalised
        on its own.

    Raises:
        KeyError: if a string in ``df3`` contains a character that is not in
            ``pos``.
    """
    k = len(accepted_chars)
    # Start every count at 10 (pseudo-count) so that unseen transitions never
    # produce log(0).
    counts = [[10 for _ in range(k)] for _ in range(k)]
    for line in df3:
        for a, b in ngram(2, line):
            counts[pos[a]][pos[b]] += 1
    # Normalise each row to conditional probabilities and store them as logs.
    for row in counts:
        total = float(sum(row))
        row[:] = [math.log(count / total) for count in row]
    return counts

def train_trigram():
    """Train the 3-gram transition log-probability matrix from ``df3``.

    Reads the module-level ``df3`` (iterable of cleaned strings),
    ``accepted_chars``, ``possible_bigrams``, ``pos`` and ``pos2``.

    Returns:
        A ``k2 x k`` list of lists (``k2 = len(possible_bigrams)``,
        ``k = len(accepted_chars)``); entry ``[pos2[a + b]][pos[c]]`` is the
        log of the smoothed probability of character ``c`` following the
        bigram ``a + b``. Each row is normalised on its own.

    Raises:
        KeyError: if a string in ``df3`` contains a character that is not in
            ``pos``.
    """
    k = len(accepted_chars)
    k2 = len(possible_bigrams)
    # Start every count at 10 (pseudo-count) so that unseen transitions never
    # produce log(0).
    counts = [[10 for _ in range(k)] for _ in range(k2)]
    for line in df3:
        for a, b, c in ngram(3, line):
            # Rows are indexed by the leading bigram, so the lookup must go
            # through pos2 (bigram -> index), not pos (character -> index).
            counts[pos2[a + b]][pos[c]] += 1
    # Normalise each row to conditional probabilities and store them as logs.
    for row in counts:
        total = float(sum(row))
        row[:] = [math.log(count / total) for count in row]
    return counts

In [None]:
# Character log-probability table (a single row, one entry per accepted character).
unigram_matrix = train_unigram()

In [None]:
# Inspect the trained unigram log-probabilities.
print(unigram_matrix)

In [None]:
# Transition log-probabilities indexed [first character][second character].
bigram_matrix = train_bigram()

In [None]:
# Heatmap of the bi-gram transition log-likelihoods: rows are the first
# character, columns the second.
# plt.figure (not plt.subplot) is the call that takes figsize.
plt.figure(figsize = (10,8))
sns.heatmap(bigram_matrix, xticklabels = list(pos), yticklabels = list(pos), cmap = 'coolwarm')
plt.xlabel('Second Alphabet')
plt.ylabel('First Alphabet')
plt.title('Bi-gram transition log-likelihood')
plt.show()

In [None]:
# Transition log-probabilities indexed [leading bigram][third character].
trigram_matrix = train_trigram()

#### Average transition Probability

In [None]:
def avg_transition_prob(l, uni_mat = unigram_matrix, bi_mat = bigram_matrix, tri_mat = trigram_matrix):
    """Return the average negative log-probability of the 3-grams in ``l``.

    For every 3-gram ``abc`` the score adds
    ``log P(a) + log P(b|a) + log P(c|ab)``; the sum is divided by the number
    of 3-grams and negated, so larger values mean a less likely string.

    Args:
        l: Cleaned string made only of characters present in ``pos``.
        uni_mat: Unigram log-probability table, indexed ``[0][pos[a]]``.
        bi_mat: Bigram log-probability matrix, indexed ``[pos[a]][pos[b]]``.
        tri_mat: Trigram log-probability matrix, indexed
            ``[pos2[a + b]][pos[c]]``.

    Returns:
        The averaged score as a float, or ``0.0`` when ``l`` has fewer than
        three characters and therefore contains no 3-gram.

    Raises:
        KeyError: if ``l`` contains a character that is not in ``pos``.

    Note:
        The default matrices are bound when this function is defined; after
        re-training, re-run this definition or pass the matrices explicitly.
    """
    log_prob = 0.0
    transition_ct = 0
    for a, b, c in ngram(3, l):
        uniprob = uni_mat[0][pos[a]]
        biprob = bi_mat[pos[a]][pos[b]]
        triprob = tri_mat[pos2[a + b]][pos[c]]
        log_prob += uniprob + biprob + triprob
        transition_ct += 1
    # Guard explicitly: the previous `log_prob/transition_ct or 1` divided
    # before applying `or`, so short inputs raised ZeroDivisionError.
    if transition_ct == 0:
        return 0.0
    return -log_prob / transition_ct

In [None]:
# Score every training string with the default (trained) matrices.
train_predictions = [avg_transition_prob(x) for x in df3]

In [None]:
%matplotlib inline
# Overlay the normalised score distributions of both sets on shared bins
# (0 to 20 in steps of 0.1).
# NOTE(review): test_predictions is not defined in the visible cells --
# confirm it is computed before this cell runs.
plt.hist(train_predictions, density = True, bins = np.arange(0, 20, 0.1), label = 'train', alpha = 0.5)
plt.hist(test_predictions, density = True, bins = np.arange(0, 20, 0.1), label = 'gibberish', alpha = 0.5)
plt.xlabel('strangeness')
plt.ylabel('Density')
plt.legend(loc = 'best')
plt.show()

In [None]:
# Long-form table of the unigram model: one row per accepted character.
unigram_pd = pd.DataFrame({'char': list(accepted_chars), 'value': list(np.squeeze(unigram_matrix))})

In [None]:
# Long-form table of the bigram model: one row per (first, second) pair, in
# the same row-major order as bigram_matrix. reshape(-1) flattens whatever
# size the matrix has instead of hard-coding 27**2.
bigram_pd = pd.DataFrame({'char': [x+y for x in accepted_chars for y in accepted_chars],
                          'value': np.array(bigram_matrix).reshape(-1)})

In [None]:
# Long-form table of the trigram model: one row per (leading bigram, third
# character), in the same row-major order as trigram_matrix. reshape(-1)
# flattens whatever size the matrix has instead of hard-coding 27**3.
trigram_pd = pd.DataFrame({'char': [bigram + char for bigram in possible_bigrams for char in accepted_chars],
                           'value': np.array(trigram_matrix).reshape(-1)})