In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

import re
from nltk.corpus import stopwords
from bs4 import BeautifulSoup

import nltk
from fuzzywuzzy import fuzz
from spellchecker import SpellChecker
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize




In [2]:
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cr245297\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cr245297\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cr245297\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
data = pd.read_excel('F:\\NLP\\dis 2022-2023 v2.xlsx')

In [4]:
data.shape

(79910, 13)

In [1]:
data.head()

NameError: name 'data' is not defined

In [6]:
def preprocess(q):
    
    q = str(q).lower().strip()
    
    # Replace certain special characters with their string equivalents
    q = q.replace('%', ' percent')
    q = q.replace('$', ' dollar ')
    q = q.replace('#', ' number ')
    q = q.replace('@', ' at ')
    
    

    
    # Decontracting words
    # https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
    # https://stackoverflow.com/a/19794953
    contractions = { 
    "ain't": "am not",
    "aren't": "are not",
    "can't": "can not",
    "can't've": "can not have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he'll've": "he will have",
    "he's": "he is",
    "how'd": "how did",
    "how'd'y": "how do you",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'd've": "i would have",
    "i'll": "i will",
    "i'll've": "i will have",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'd've": "it would have",
    "it'll": "it will",
    "it'll've": "it will have",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "mightn't've": "might not have",
    "must've": "must have",
    "mustn't": "must not",
    "mustn't've": "must not have",
    "needn't": "need not",
    "needn't've": "need not have",
    "o'clock": "of the clock",
    "oughtn't": "ought not",
    "oughtn't've": "ought not have",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "shan't've": "shall not have",
    "she'd": "she would",
    "she'd've": "she would have",
    "she'll": "she will",
    "she'll've": "she will have",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "shouldn't've": "should not have",
    "so've": "so have",
    "so's": "so as",
    "that'd": "that would",
    "that'd've": "that would have",
    "that's": "that is",
    "there'd": "there would",
    "there'd've": "there would have",
    "there's": "there is",
    "they'd": "they would",
    "they'd've": "they would have",
    "they'll": "they will",
    "they'll've": "they will have",
    "they're": "they are",
    "they've": "they have",
    "to've": "to have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'd've": "we would have",
    "we'll": "we will",
    "we'll've": "we will have",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what'll've": "what will have",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "when've": "when have",
    "where'd": "where did",
    "where's": "where is",
    "where've": "where have",
    "who'll": "who will",
    "who'll've": "who will have",
    "who's": "who is",
    "who've": "who have",
    "why's": "why is",
    "why've": "why have",
    "will've": "will have",
    "won't": "will not",
    "won't've": "will not have",
    "would've": "would have",
    "wouldn't": "would not",
    "wouldn't've": "would not have",
    "y'all": "you all",
    "y'all'd": "you all would",
    "y'all'd've": "you all would have",
    "y'all're": "you all are",
    "y'all've": "you all have",
    "you'd": "you would",
    "you'd've": "you would have",
    "you'll": "you will",
    "you'll've": "you will have",
    "you're": "you are",
    "you've": "you have"
    }

    q_decontracted = []

    for word in q.split():
        if word in contractions:
            word = contractions[word]

        q_decontracted.append(word)

    q = ' '.join(q_decontracted)
    q = q.replace("'ve", " have")
    q = q.replace("n't", " not")
    q = q.replace("'re", " are")
    q = q.replace("'ll", " will")
    
    # Removing HTML tags
    q = BeautifulSoup(q)
    q = q.get_text()
    
    # Remove punctuations
    pattern = re.compile('\W')
    q = re.sub(pattern, ' ', q).strip()

    # Remove stop words
    new_text = []
    
    for word in q.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    q = new_text[:]
    new_text.clear()
    return " ".join(q)
    

In [7]:
data['subcategory_comments'] = data['subcategory_comments'].apply(preprocess)

In [8]:
#spelling check and fix
def correct_spelling_data(text):
    spell = SpellChecker()
    
    #Split the text into words
    words = text.split()
    
    corrected_words = [spell.correction(word) if spell.correction(word) is not None else word for word in words]
         
    #join the corrected words back into the data
    corrected_text_data = ' '.join(corrected_words)
    
    return corrected_text_data

In [9]:
data['subcategory_comments'] = data['subcategory_comments'].apply(correct_spelling_data)

In [10]:
#lemmatization 
def lemmatize_data(text):
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    lemmatized_data = ' '.join(lemmatized_tokens)
    return lemmatized_data

In [11]:
data['subcategory_comments'] = data['subcategory_comments'].apply(lemmatize_data)

In [2]:
data.head()

NameError: name 'data' is not defined

In [13]:
data['subcategory_comments'].duplicated().sum()

35703

In [14]:
def clean_spaces(data_toclean):
    words = data_toclean.split()
    clean_data = ' '.join(words)
    return clean_data

In [15]:
data['subcategory_comments'] = data['subcategory_comments'].apply(clean_spaces)

In [16]:
data['subcategory_comments'].duplicated().sum()

35703

In [17]:
data = data.drop_duplicates(subset='subcategory_comments', keep='first')

In [18]:
data.shape

(44207, 13)

In [19]:
data.to_excel("cleaned_data.xlsx",index=False)

In [20]:
import itertools

In [41]:
#function to find documemts with first 5 words similar 

# similar_rows = []
# for i in range(len(data)):
#    for j in range(i + 1, len(data)):
#        words_i = data.iloc[i,9].split()[:5]
#        words_j = data.iloc[j,9].split()[:5]
#        if words_i == words_j:
#            similar_rows.append((i, j))
        
        

similar_rows = []
word_sets = [set(row.split()[:3]) for row in data.iloc[:, 9]]
for i, j in itertools.combinations(range(len(data)), 2):
   if word_sets[i] == word_sets[j]:
       similar_rows.append(data.index[j])



In [42]:
#print(len(rows_to_remove))
print(len(similar_rows))

234912


In [43]:
# Delete similar rows from the dataset
# for i, j in similar_rows:
#    del data[j]

data.drop(similar_rows, inplace=True)

In [44]:
data.shape

(16751, 13)

In [91]:
# # Function to find similarity between two strings
# def similarity(row1, row2):
#    # Handle NaN values
#    if type(row1) == float and np.isnan(row1):
#        row1 = ''
#    if type(row2) == float and np.isnan(row2):
#        row2 = ''
#    return fuzz.ratio(row1, row2)

# # # Function to find similarity between two strings
# # def similarity(row1, row2):
# #    return fuzz.ratio(row1, row2)
# # Threshold for similarity (adjust as needed)
# similarity_threshold = 30
# # List to store indices of rows to be removed
# rows_to_remove = []
# # Iterate through rows for comparison
# for i in range(len(data)):
#    if i not in rows_to_remove:
#        for j in range(i + 1, len(data)):
#            if j not in rows_to_remove:
#                #if similarity(data['subcategory_comments'][i], data['subcategory_comments'][j]) > similarity_threshold:
#                if similarity(data.iloc[i,9], data.iloc[j,9]) > similarity_threshold:     
#                    rows_to_remove.append(j)


In [45]:
# # Drop rows with high similarity
# data_cleaned = data.drop(rows_to_remove, axis='index')

indexes_to_keep = set(range(data.shape[0]))- set(similar_rows)
data_cleaned = data.take(list(indexes_to_keep))

In [46]:
# Save the cleaned DataFrame to a new CSV file
data_cleaned.to_excel('cleaned_data_new.xlsx', index=False)

In [17]:
print(data_cleaned.shape())

TypeError: 'tuple' object is not callable

In [19]:
# from nltk.stem import WordNetLemmatizer
# from nltk.tokenize import word_tokenize
# nltk.download('punkt')
# nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cr245297\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cr245297\AppData\Roaming\nltk_data...


True

In [26]:
# def lemmatize_data(text):
#     lemmatizer = WordNetLemmatizer()
#     tokens = word_tokenize(text)
#     lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
#     lemmatized_data = ' '.join(lemmatized_tokens)
#     return lemmatized_data

In [42]:
input_text = "I have three visions for India. In 3000 years of our history, people from all over the world have come and invaded us, captured our lands, conquered our minds"
cleaned_text = lemmatize_data(input_text)
print(cleaned_text)

I have three vision for India . In 3000 year of our history , people from all over the world have come and invaded u , captured our land , conquered our mind


In [48]:
from spellchecker import SpellChecker

In [53]:
# def correct_spelling_data(text):
#     spell = SpellChecker()
    
#     #Split the text into words
#     words = text.split()
    
#     corrected_words = []
#     for word in words:
#         #Get the corrected version of each word
#         corrected_word = spell.correction(word)
#         corrected_words.append(corrected_word)
        
#     #join the corrected words back into the data
#     corrected_text_data = ' '.join(corrected_words)
    
#     return corrected_text_data

In [55]:
input_text = "tis is a sample sentenc wth speling mistakes"
cleaned_text = correct_spelling_data(input_text)
print(cleaned_text)

tis is a sample sentence with spelling mistakes
