In [48]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [20]:
df = pd.read_csv("constitution_articlewise.csv")
df.head()

Unnamed: 0,PART-Article,Data
0,PART I - 1,"Name and territory of the Union.—(1) India, th..."
1,PART I - 2,Admission or establishment of new States.—Parl...
2,PART I - 2A,[Sikkim to be associated with the Union.] Rep....
3,PART I - 3,Formation of new States and alteration of area...
4,PART I - 4,Laws made under articles 2 and 3 to provide fo...


In [25]:
import string
def remPunct(text):
    text = text.replace("("," ").replace(")"," ")
    ans = "".join([c.lower() for c in text if c not in string.punctuation+"—"])
    return ans

In [26]:
df['Data'] = df["Data"].map(str).apply(lambda x: remPunct(x))
df.head()

Unnamed: 0,PART-Article,Data
0,PART I - 1,name and territory of the union 1 india that ...
1,PART I - 2,admission or establishment of new statesparlia...
2,PART I - 2A,sikkim to be associated with the union rep by ...
3,PART I - 3,formation of new states and alteration of area...
4,PART I - 4,laws made under articles 2 and 3 to provide fo...


In [29]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

def remStop(text):
    ans = [i for i in text.split() if i not in stopwords.words('english')]
    return " ".join(ans)
df['Data'] = df["Data"].apply(lambda x: remStop(x))
df.head()

[nltk_data] Downloading package stopwords to /home/cm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,PART-Article,Data
0,PART I - 1,name territory union 1 india bharat shall bea ...
1,PART I - 2,admission establishment new statesparliament m...
2,PART I - 2A,sikkim associated union rep constitution thirt...
3,PART I - 3,formation new states alteration areas boundari...
4,PART I - 4,laws made articles 2 3 provide amendmentof fir...


In [30]:
from nltk.stem import PorterStemmer
def Stemmer(text):
    ans = [PorterStemmer().stem(i) for i in text.split()]
    return " ".join(ans)
df['Data'] = df["Data"].apply(lambda x: Stemmer(x))
df.head()

Unnamed: 0,PART-Article,Data
0,PART I - 1,name territori union 1 india bharat shall bea ...
1,PART I - 2,admiss establish new statesparlia may lawadmit...
2,PART I - 2A,sikkim associ union rep constitut thirti sixth...
3,PART I - 3,format new state alter area boundari ornam exi...
4,PART I - 4,law made articl 2 3 provid amendmentof first f...


In [32]:
Corpus = []
for i in df.Data.values:
    Corpus.append(i.split())

In [36]:
!mkdir word2vec
PATH = "word2vec"
embedding_depth = 300
from gensim.models import Word2Vec
model_sb = Word2Vec(Corpus, sg=1, window=2, vector_size=embedding_depth, hs=1)
model_sb.save(f"{PATH}/modelSB0_{embedding_depth}")

In [49]:
data_emb = np.zeros((df.shape[0],embedding_depth))
mainc = 0
for i in df.Data:
    buff = i.split()
    buffn = np.zeros(embedding_depth)
    c = 0
    for j in buff:
        try:
            buffn+=model_sb.wv[j]
            c+=1
        except:
            pass
    data_emb[mainc] = buffn/c
    mainc+=1

In [52]:
data_emb

array([[-2.81573116e-02,  1.07740952e-01,  2.59164488e-03, ...,
        -2.30966160e-02, -7.03013568e-02, -1.04730112e-01],
       [-3.04501372e-02,  9.32923621e-02, -4.15646524e-02, ...,
        -5.82069147e-02, -3.38633081e-02, -6.76568768e-02],
       [ 1.75205692e-02,  6.45251479e-03, -2.51955011e-02, ...,
        -9.86967199e-03, -3.55671897e-02, -8.27735718e-02],
       ...,
       [ 3.03240847e-02, -3.89440760e-02, -1.31322937e-02, ...,
        -2.10742836e-02, -4.47861533e-02, -9.10109240e-02],
       [ 1.22016267e-02,  1.18826022e-02, -1.59249189e-02, ...,
        -1.15943460e-02, -5.83680177e-02, -6.48959460e-02],
       [ 5.21226540e-03,  3.35149296e-02, -5.62073665e-02, ...,
        -5.88831047e-02,  1.82900611e-05, -9.57289192e-02]])

In [139]:
def EQSense(sent, emb=data_emb, model=model_sb, df=df):
    sent = Stemmer(remStop(remPunct(sent)))
    buff = sent.split()
    query = np.zeros(emb.shape[1])
    
    c = 0
    for j in buff:
        try:
            query+=model_sb.wv[j]
            c+=1
        except:
            pass
    if query.all() == np.zeros(emb.shape[1]).all():
        return "Constitution was not violated"
    query = query/c
    
    cos_sim = []
    for i in emb:
        cos_sim.append(float(cosine_similarity(query.reshape(-1,1).T,i.reshape(-1,1).T)))
    ans = []
    for i in range(5):
        ind = np.argmax(cos_sim)
        cos_sim[ind] = -9999
        ans.append(df["PART-Article"][ind])
    return ans
    

In [150]:
EQSense("boy having age of 14 years was slapped by boss")

['PART III - 24',
 'PART VI - 157',
 'PART II - 5',
 'PART IV - 45',
 'PART V - 63']