<a href="https://colab.research.google.com/github/dea1013/NLP-Synonym-Clusterer/blob/main/NLP_Synonym_Clusterer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries

In [73]:
import numpy as np
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.stem import PorterStemmer
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Import Data

In [30]:
df = pd.read_csv('/content/drive/MyDrive/Data Science Projects/NLP Synonym Clusterer/Vocabulary.csv')
df = df.sort_values(by='Key')
df

Unnamed: 0,Key,Word,Definition
0,1,的,"indicates possession, like adding 's to a noun"
1,2,我,I; me
2,3,你,you (singular)
3,4,是,be; is; are; am
4,5,了,indicates a completed or finished action
...,...,...,...
4994,4996,深情厚谊,profound friendship
4995,4997,武侠,knight-errant; a genre of swordplay martial ar...
4996,4998,将就,put up with; accept somewhat reluctantly
4997,4999,对联,rhyming couplet; vertical written couplet usua...


# Preprocessing

## Cleaning

In [66]:
# convert to lower case
df['Definition'] = df['Definition'].str.lower()

# remove punctuation
df['Definition'] = df['Definition'].str.replace(r'[^\w\s\d]','',regex=True)

# remove extra white space
df['Definition'] = df['Definition'].str.replace(r' +', ' ',regex=True)

df

Unnamed: 0,Key,Word,Definition,Tokens
0,1,的,indicates possession like adding s to a noun,"[indic, possess, like, ad, s, to, a, noun]"
1,2,我,i me,"[i, me]"
2,3,你,you singular,"[you, singular]"
3,4,是,be is are am,"[be, is, are, am]"
4,5,了,indicates a completed or finished action,"[indic, a, complet, or, finish, action]"
...,...,...,...,...
4994,4996,深情厚谊,profound friendship,"[profound, friendship]"
4995,4997,武侠,knighterrant a genre of swordplay martial arts...,"[knighterr, a, genr, of, swordplay, martial, a..."
4996,4998,将就,put up with accept somewhat reluctantly,"[put, up, with, accept, somewhat, reluctantli]"
4997,4999,对联,rhyming couplet vertical written couplet usual...,"[rhyme, couplet, vertic, written, couplet, usu..."


## Tokenization

In [72]:
# split by whitespace
df['Tokens'] = df['Definition'].apply(lambda x: x.split(' '))

df

Unnamed: 0,Key,Word,Definition,Tokens
0,1,的,indicates possession like adding s to a noun,"[indicates, possession, like, adding, s, to, a..."
1,2,我,i me,"[i, me]"
2,3,你,you singular,"[you, singular]"
3,4,是,be is are am,"[be, is, are, am]"
4,5,了,indicates a completed or finished action,"[indicates, a, completed, or, finished, action]"
...,...,...,...,...
4994,4996,深情厚谊,profound friendship,"[profound, friendship]"
4995,4997,武侠,knighterrant a genre of swordplay martial arts...,"[knighterrant, a, genre, of, swordplay, martia..."
4996,4998,将就,put up with accept somewhat reluctantly,"[put, up, with, accept, somewhat, reluctantly]"
4997,4999,对联,rhyming couplet vertical written couplet usual...,"[rhyming, couplet, vertical, written, couplet,..."


## Normalization

In [74]:
# lemmatization
wordnet = WordNetLemmatizer()
df['Tokens'] = df['Tokens'].apply(lambda x: [wordnet.lemmatize(token,'n' if pos not in ['a', 'r', 'n', 'v'] else pos) for token,pos in pos_tag(x)])

# stemming
# porter = PorterStemmer()
# df['Tokens'] = df['Tokens'].apply(lambda x: [porter.stem(word) for word in x])

df

Unnamed: 0,Key,Word,Definition,Tokens
0,1,的,indicates possession like adding s to a noun,"[indicates, possession, like, adding, s, to, a..."
1,2,我,i me,"[i, me]"
2,3,你,you singular,"[you, singular]"
3,4,是,be is are am,"[be, is, are, am]"
4,5,了,indicates a completed or finished action,"[indicates, a, completed, or, finished, action]"
...,...,...,...,...
4994,4996,深情厚谊,profound friendship,"[profound, friendship]"
4995,4997,武侠,knighterrant a genre of swordplay martial arts...,"[knighterrant, a, genre, of, swordplay, martia..."
4996,4998,将就,put up with accept somewhat reluctantly,"[put, up, with, accept, somewhat, reluctantly]"
4997,4999,对联,rhyming couplet vertical written couplet usual...,"[rhyming, couplet, vertical, written, couplet,..."


In [70]:
df.sample(10)

Unnamed: 0,Key,Word,Definition,Tokens
2936,2938,本事,ability skill capability shì this matter liter...,"[abil, skill, capabl, shì, thi, matter, litera..."
1476,1477,基本,basic fundamental,"[basic, fundament]"
3853,3855,便条,informal note,"[inform, note]"
4526,4528,别致,unique unconventional fancy,"[uniqu, unconvent, fanci]"
731,732,随便,as one pleases informal random casual,"[as, one, pleas, inform, random, casual]"
796,797,刀,knife blade kangxi radical 18,"[knife, blade, kangxi, radic, 18]"
4163,4165,可观,considerable impressive,"[consider, impress]"
298,299,西瓜,watermelon,[watermelon]
4468,4470,董事长,chairman of the board,"[chairman, of, the, board]"
3657,3659,徘徊,pace back and forth hesitate waver,"[pace, back, and, forth, hesit, waver]"


# Word Embedding

## TF-IDF

## BOW

## CBOW

## Skip-Gram

## GloVe

## BERT

# Clustering