In [None]:
!pip install sentence-transformers
!pip install fasttext
!pip install gensim
!pip install autocorrect

In [None]:
# Colab-only setup: mount Google Drive at /content/gdrive so that the data paths
# used further down (under /content/gdrive/MyDrive/...) can be read and written.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import pandas as pd
import numpy as np
from ast import literal_eval
import re
from nltk.corpus import words as nlp_words
from sentence_transformers import SentenceTransformer
import fasttext
import gensim
import logging
import os
import nltk.data
import string
import spacy
from autocorrect import Speller
import nltk
nltk.download('words')

In [None]:
# Raw advice dataset stored on the mounted Google Drive.
data_path = '/content/gdrive/MyDrive/advices_assignment/data/advices_assignment.csv'
df = pd.read_csv(data_path)
# Check the whole frame for missing values. The earlier `df.isna().sum()[0]`
# looked up position 0 through a label-based index (deprecated in recent pandas)
# and only ever inspected the first column.
if df.isna().any().any():
    print('NULL VALUES PRESENT!!!')
df.drop_duplicates(inplace=True, ignore_index=True)
# 'advice' is stored as text; literal_eval turns each cell back into a Python
# object (later cells treat every value as a list of sentences).
df['advice'] = df['advice'].apply(literal_eval)

In [None]:
### COMBINING SENTENCES IN ADVICE, LOWERCASING AND DUPLICATE REMOVAL
def combine(x):
    """Merge the sentences of one advice entry into a single string.

    Parameters
    ----------
    x : sequence of str
        The individual sentences of an advice.

    Returns
    -------
    str
        The sentences joined with ', ' (no trailing separator); an empty
        sequence yields ''.
    """
    return ', '.join(x)

# One comma-separated string per advice.
df['combined'] = df['advice'].map(combine)
# Number of space-delimited pieces in the combined text.
df['count'] = df['combined'].map(lambda text: len(text.split(' ')))
# Exploratory look at the 95th percentile of the length; the value is not
# stored, and it is not displayed either because it is not the cell's last line.
df['count'].quantile(0.95)
# Case-fold so that advices differing only in capitalisation collapse together.
df['lowercased'] = df['combined'].map(str.lower)
df.drop_duplicates(subset='lowercased', ignore_index=True, inplace=True)

In [None]:
### GENERATING UNIQUE WORD CORPUS FOR ADVICE DATASET
# Flatten the per-row sentence lists in a single pass (sum(lists, []) re-copies
# the accumulated list for every row).
word_list = [sentence for advice in df['advice'] for sentence in advice]
corpus = ' '.join(word_list)

# Replace every non-word character with a space so punctuation never sticks to a token.
corpus = re.sub(r'[^\w]', ' ', corpus)
# split() without an argument collapses runs of spaces, so the empty string is
# no longer counted as a "word" (split(' ') produced '' tokens).
words = corpus.split()

# dict.fromkeys de-duplicates in linear time while keeping first-seen order,
# replacing the `word not in list` scan that was quadratic in the corpus size.
unique_corpus = list(dict.fromkeys(word.lower() for word in words))

print('LEN WORD CORPUS :: ', len(unique_corpus))

LEN WORD CORPUS ::  3578


In [None]:
### CLEANING THE CORPUS - STEP 1 : GENERATING OMISSION WORD LIST BY CHECKING WITH ENGLISH OCCURRENCES IN NLTK
corpus_tokens = set(unique_corpus)
# Corpus tokens that also appear in the NLTK English word list ...
unique_eng_words = list(corpus_tokens & set(nlp_words.words()))
# ... and the remaining corpus tokens, which become omission candidates.
omitted_words = list(corpus_tokens - set(unique_eng_words))

print('No of omitted tokens :: ', len(omitted_words))

No of omitted tokens ::  1659


In [None]:
### CLEANING THE CORPUS - STEP 2 : REMOVING NON ENGLISH OCCURRENCES

# Strip all non-ASCII characters from every omitted token and keep whatever
# non-empty text is left over in `corrections`.
ascii_forms = (token.encode('ascii', errors='ignore').decode() for token in omitted_words)
corrections = [form for form in ascii_forms if form != '']

# A token leaves the omitted list only when it equals one of the ASCII-stripped
# forms, so tokens containing non-ASCII characters stay omitted.
omitted_words = list(set(omitted_words).difference(corrections))
print('No of omitted tokens :: ', len(omitted_words))

No of omitted tokens ::  200


In [None]:
### CLEANING THE CORPUS - STEP 3 : REMOVING NUMERIC OCCURRENCES

# Tokens made up solely of digits move from `corrections` to the omitted list.
omitted_words_ = [token for token in corrections if token.isdigit()]

corrections = list(set(corrections).difference(omitted_words_))
omitted_words = omitted_words_ + omitted_words
print('No of omitted tokens :: ', len(omitted_words))

No of omitted tokens ::  286


In [None]:
### CLEANING THE CORPUS - STEP 4 : REMOVING GARBAGE WORDS WITH REPEATED LETTERS (letter thresh = 2)
# A token built from at most two distinct characters is treated as garbage.
omitted_words_ = [token for token in corrections if len(set(token)) <= 2]
omitted_words = omitted_words_ + omitted_words
# Whatever is now omitted must no longer be listed as a correction.
corrections = list(set(corrections).difference(omitted_words))
print('No of omitted tokens :: ', len(omitted_words))

No of omitted tokens ::  404


In [None]:
# Final vocabulary: every corpus token that did not end up on the omitted list.
final_unique_corpus = list(set(unique_corpus).difference(omitted_words))

In [None]:
def remove_garbage(sentence, allowed_word_list):
    """Keep only the words of ``sentence`` that belong to the allowed vocabulary.

    The sentence is tokenised the same way the vocabulary was built: every
    non-word character is replaced by a space before splitting. Splitting on
    spaces alone left punctuation glued to words ("water,"), so such words
    never matched the vocabulary and were dropped by mistake.

    Parameters
    ----------
    sentence : str
        Text to clean (the caller passes lower-cased advice text).
    allowed_word_list : iterable of str
        Vocabulary of words to keep. A ``set``/``frozenset`` is used as is;
        any other iterable is converted to a set once per call so that each
        word lookup is constant time.

    Returns
    -------
    str
        The surviving words joined by single spaces; '' when nothing survives.
    """
    if isinstance(allowed_word_list, (set, frozenset)):
        allowed = allowed_word_list
    else:
        allowed = set(allowed_word_list)

    # Same punctuation handling as the corpus construction step.
    tokens = re.sub(r'[^\w]', ' ', sentence).split()
    return ' '.join(token for token in tokens if token in allowed)


In [None]:
# Build the lookup set once: testing membership against the vocabulary list for
# every word of every row scans the whole list each time.
allowed_tokens = set(final_unique_corpus)
df['cleaned_advice'] = df['lowercased'].apply(lambda x : remove_garbage(x, allowed_tokens))
df.drop_duplicates(['cleaned_advice'], inplace=True, ignore_index=True)

### dropping one word advices
# NOTE(review): 'count' was computed from the un-cleaned 'combined' text, so this
# filters on the original length, not on the length of 'cleaned_advice' - confirm
# that this is the intended criterion.
drop_ind = df.query('count==1').index
df.drop(index=drop_ind, inplace=True)
df.reset_index(inplace=True, drop=True)

In [None]:
# Keep only the original advice, its cleaned text and the word count.
export_columns = ['advice', 'cleaned_advice', 'count']
df = df[export_columns]

In [None]:
# Sort the vocabulary and wrap it in a one-column frame for export.
# Note: this rebinds `final_unique_corpus` from a list to a DataFrame.
sorted_tokens = np.sort(final_unique_corpus)
final_unique_corpus = pd.DataFrame(sorted_tokens, columns=['tokens'])

In [None]:
# Write the token table and the cleaned advice frame to CSV files on the mounted
# Drive; index=False keeps the row index out of both files.
final_unique_corpus.to_csv('/content/gdrive/MyDrive/advices_assignment/data/corpus.csv', index=False)
df.to_csv('/content/gdrive/MyDrive/advices_assignment/data/cleaned_advices.csv', index=False)