<a href="https://colab.research.google.com/github/dea1013/NLP-Synonym-Clusterer/blob/main/NLP_Synonym_Clusterer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [252]:
import numpy as np
import pandas as pd

import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.stem import PorterStemmer
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.cluster import AgglomerativeClustering
from sklearn.decomposition import PCA

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Import Data

In [254]:
df = pd.read_csv('/content/drive/MyDrive/Data Science Projects/NLP Synonym Clusterer/Vocabulary.csv')
df

Unnamed: 0,Key,Word,Definition
0,1,的,"indicates possession, like adding 's to a noun"
1,2,我,I; me
2,3,你,you (singular)
3,4,是,be; is; are; am
4,5,了,indicates a completed or finished action
...,...,...,...
4995,4996,深情厚谊,profound friendship
4996,4997,武侠,knight-errant; a genre of swordplay martial ar...
4997,4998,将就,put up with; accept somewhat reluctantly
4998,4999,对联,rhyming couplet; vertical written couplet usua...


In [194]:
df.to_csv('/content/drive/MyDrive/Data Science Projects/NLP Synonym Clusterer/Synonyms.csv',index=0)

# Preprocessing

## Cleaning

In [255]:
# convert to lower case
df['Definition'] = df['Definition'].str.lower()

# remove punctuation
df['Definition'] = df['Definition'].str.replace(r'[^\w\s\d]','',regex=True)

# remove extra white space
df['Definition'] = df['Definition'].str.replace(r' +', ' ',regex=True)

df

Unnamed: 0,Key,Word,Definition
0,1,的,indicates possession like adding s to a noun
1,2,我,i me
2,3,你,you singular
3,4,是,be is are am
4,5,了,indicates a completed or finished action
...,...,...,...
4995,4996,深情厚谊,profound friendship
4996,4997,武侠,knighterrant a genre of swordplay martial arts...
4997,4998,将就,put up with accept somewhat reluctantly
4998,4999,对联,rhyming couplet vertical written couplet usual...


## Normalization

In [256]:
# lemmatization
# wordnet = WordNetLemmatizer()
# df['Definition'] = df['Definition'].apply(lambda x: x.split(' '))
# df['Definition'] = df['Definition'].apply(lambda x: [wordnet.lemmatize(token,'n' if pos not in ['a', 'r', 'n', 'v'] else pos) for token,pos in pos_tag(x)])
# df['Definition'] = df['Definition'].apply(lambda x: ' '.join(x))

# stemming
porter = PorterStemmer()
df['Definition'] = df['Definition'].apply(lambda x: x.split(' '))
df['Definition'] = df['Definition'].apply(lambda x: [porter.stem(word) for word in x])
df['Definition'] = df['Definition'].apply(lambda x: ' '.join(x))

df

Unnamed: 0,Key,Word,Definition
0,1,的,indic possess like ad s to a noun
1,2,我,i me
2,3,你,you singular
3,4,是,be is are am
4,5,了,indic a complet or finish action
...,...,...,...
4995,4996,深情厚谊,profound friendship
4996,4997,武侠,knighterr a genr of swordplay martial art movi...
4997,4998,将就,put up with accept somewhat reluctantli
4998,4999,对联,rhyme couplet vertic written couplet usual pla...


In [257]:
df.sample(10)

Unnamed: 0,Key,Word,Definition
4701,4702,家常,the daili life of a famili homestyl food
4254,4255,抱负,aspir ambit
2742,2743,心灵,heart soul smart quickwit
4941,4942,任重道远,lit a heavi load and a long road fig to bear h...
246,247,票,ticket bank note a vote
2534,2535,恶心,disgust nauseou make somebodi embarrass èxīn b...
1149,1150,邮局,post offic
3458,3459,准则,principl standard or norm criterion
1367,1368,便,then in that case as earlylittl as
2848,2849,犬,dog kangxi radic 94


# TF-IDF

## Word Embedding

In [258]:
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(df['Definition'].values)
vectors

<5000x5340 sparse matrix of type '<class 'numpy.float64'>'
	with 18298 stored elements in Compressed Sparse Row format>

## Cluster

In [259]:
clusters = AgglomerativeClustering(n_clusters=None,
                                   metric='cosine',
                                   linkage='average',
                                   distance_threshold=0.5).fit(vectors.toarray())
df['TF_IDF_Group_ID'] = clusters.labels_
np.unique(clusters.labels_).shape

(3819,)

In [260]:
dic = df.groupby('TF_IDF_Group_ID').groups
df['TF_IDF_Group'] = df['TF_IDF_Group_ID'].apply(lambda x: [k+1 for k in dic[x].tolist()])

## PCA

In [261]:
points = PCA(n_components=2).fit_transform(vectors.toarray())
norms = np.linalg.norm(points,axis=1)
points.T[0] /= norms
points.T[1] /= norms
df['TF_IDF_X'] = points.T[0]
df['TF_IDF_Y'] = points.T[1]

# BOW

## Word Embedding

## Cluster

# CBOW

## Word Embedding

## Cluster

# Skip-Gram

# GloVe

# BERT

# Export

In [271]:
df.to_csv('/content/drive/MyDrive/Data Science Projects/NLP Synonym Clusterer/Synonyms.csv',index=0)