# Loading Data


In [6]:
import pandas as pd

data = pd.read_csv('E:\Final_Data.csv', index_col=0)
#data.head()
#data.info()
data

Unnamed: 0,Title,Content
0,punishment crimes committed substance susch co...,person kills attempts cause death child woman ...
1,punishment abduction wimen children,person human trafficking prevention suppressio...
2,punishment ransom,person detains woman child ransom person shall...
3,punishment rapem death due rape,man rapes woman child shall punishable death p...
4,punishment inciting women commit suicide,woman commits suicide without consent woman pe...
...,...,...
195,adulteration food drink intended sale,whoever adulterates article food drink make ar...
196,punishment theft,whoever commits theft shall punished imprisonm...
197,theft dwellinghouse etc,whoever commits theft building tent vessel bui...
198,theft preparation made causing death hurt rest...,whoever commits theft made preparation causing...


## Cleaning Data

In [19]:
def clean_again(x):
    x = str(x).replace('whoever', '')
    x = str(x).replace('shall', '')
    x = str(x).replace('person', '')
    return x

In [20]:
data['Content'] = data['Content'].apply(lambda x: clean_again(x))
data.head()

Unnamed: 0,Title,Content
0,punishment crimes committed substance susch co...,kills attempts cause death child woman burnin...
1,punishment abduction wimen children,human trafficking prevention suppression act ...
2,punishment ransom,detains woman child ransom liable death life...
3,punishment rapem death due rape,man rapes woman child punishable death penalty...
4,punishment inciting women commit suicide,woman commits suicide without consent woman d...


# TF-IDF (Feature Extraction)
### Source of tf-idf: https://medium.com/@cmukesh8688/tf-idf-vectorizer-scikit-learn-dbc0244a911a

In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
#unicode
documents = data['Content'].values.astype("U")

vectorizer = TfidfVectorizer(stop_words='english')
features = vectorizer.fit_transform(documents)
features

<200x1094 sparse matrix of type '<class 'numpy.float64'>'
	with 4662 stored elements in Compressed Sparse Row format>

# Clustering
### Source of clustering: https://www.youtube.com/watch?v=ORpDAUQUnkU

In [22]:
from sklearn.cluster import KMeans
model = KMeans(n_clusters=8, init = 'k-means++', max_iter = 10, n_init = 1)
model.fit(features)

data['Cluster'] = model.labels_
data.to_csv('Pre_Label_Data.csv')
data

Unnamed: 0,Title,Content,Cluster
0,punishment crimes committed substance susch co...,kills attempts cause death child woman burnin...,1
1,punishment abduction wimen children,human trafficking prevention suppression act ...,1
2,punishment ransom,detains woman child ransom liable death life...,1
3,punishment rapem death due rape,man rapes woman child punishable death penalty...,1
4,punishment inciting women commit suicide,woman commits suicide without consent woman d...,1
...,...,...,...
195,adulteration food drink intended sale,adulterates article food drink make article no...,7
196,punishment theft,commits theft punished imprisonment either des...,4
197,theft dwellinghouse etc,commits theft building tent vessel building te...,4
198,theft preparation made causing death hurt rest...,commits theft made preparation causing death h...,4


# Printing top 10 features of the clusters

In [26]:
print("Cluster Centraoids: \n")
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

for i in range(8):
    print("Cluster %d:" % i)
    for j in order_centroids[i, :30]:
        print(' %s,'%terms[j])
    print('-------------')

Cluster Centraoids: 

Cluster 0:
 false,
 offence,
 evidence,
 convicted,
 gives,
 weapon,
 cause,
 punishable,
 knowing,
 extend,
 imprisonment,
 true,
 likely,
 gave,
 punished,
 years,
 fine,
 certificate,
 term,
 armed,
 believe,
 description,
 uses,
 drug,
 section,
 knows,
 used,
 deadly,
 declaration,
 warranty,
-------------
Cluster 1:
 act,
 child,
 woman,
 death,
 hurt,
 years,
 imprisonment,
 extend,
 causes,
 fine,
 term,
 description,
 section,
 punished,
 life,
 grievous,
 punishable,
 caused,
 means,
 case,
 cause,
 guilty,
 voluntarily,
 substance,
 human,
 intention,
 homicide,
 culpable,
 provided,
 taka,
-------------
Cluster 2:
 public,
 servant,
 property,
 extend,
 duty,
 term,
 fine,
 discharge,
 imprisonment,
 description,
 punished,
 injury,
 lawful,
 cause,
 taka,
 knows,
 years,
 office,
 knowing,
 sale,
 authority,
 obstructs,
 legally,
 intending,
 taken,
 likely,
 months,
 act,
 functions,
 bound,
-------------
Cluster 3:
 officer,
 sailor,
 soldier,
 airm

# Pie chart of the result

In [None]:
""""
%matplotlib inline
import matplotlib.pyplot as plt

exp_vals = [2, 13, 4, 10]
exp_labels = ["Cluster 0", "Cluster 1", "Cluster 2", "Cluster 3"]

plt.pie(exp_vals,labels=exp_labels, shadow=True, autopct='%1.1f%%', radius=2.5)
#plt.axis("equal")
plt.show()