In [8]:
# Imports

import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
# Download required NLTK data files
from nltk.corpus import wordnet

# Initialize SpaCy for tokenization and part of speech tagging
import spacy
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /Users/oliviagao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Loading in Data

Data Description from Github:
_______________________

Word Complexity Lexicon (https://github.com/mounicam/lexical_simplification/tree/master/word_complexity_lexicon)
_______________________


lexicon.tsv : Each line consists of word and its complexity scores calculated by aggregating over human ratings. 
              The score belongs to a scale of 1-6, where 1 represents "very simple" and 6 represents "very complex"

lexion_annotations.tsv: Each line consists of a word in the lexicon and its individual ratings from 11 annotators.
                        Each rating again belongs to the scale of 1-6. -1 indicates that the annotator did not rate
                        the word.
                        
NOTE: Both the files are tab delimited.

In [2]:
lexicon_data = pd.read_csv('../WCL_data/lexicon.csv')
lexicon_ann = pd.read_csv('../WCL_data/lexicon_annotations.csv')

## Data Cleaning

In [3]:
# TODO: 
# Clean Data
# split into train and test
# Get dimensions, summary, etc. of data
print("Lexicon data")
print(lexicon_data.shape)
print(lexicon_data.head())

print("Lexicon ann")
print(lexicon_ann.shape)
print(lexicon_ann.head())

Lexicon data
(15180, 2)
            word  rating
0            wet  1.5714
1          cargo  2.8571
2        Arsenal  3.7143
3  Manufacturing  3.8333
4           East  1.2857
Lexicon ann
(15180, 12)
            word  ann_1  ann_2  ann_3  ann_4  ann_5  ann_6  ann_7  ann_8  \
0            wet     -1      1      1     -1      1      1      2      3   
1          cargo     -1     -1      2     -1      4      2      2      4   
2        Arsenal      4     -1     -1      4      5     -1      4      3   
3  Manufacturing     -1      4      4     -1     -1      3      4      4   
4           East     -1      2      1      1      1     -1      1      2   

   ann_9  ann_10  ann_11  
0     -1       2      -1  
1     -1       3       3  
2      3      -1       3  
3     -1       4      -1  
4      1      -1      -1  


In [4]:
complexity_dict = {
    str(word).strip().lower(): rating
    for word, rating in lexicon_data.set_index("word")["rating"].to_dict().items()
    if pd.notnull(word)  # Check to ensure word is not NaN
}
print(list(complexity_dict.items())[:3])

[('wet', 1.5714), ('cargo', 2.8571), ('arsenal', 3.7143)]


In [5]:
COMPLEXITY_THRESHOLD = 3

def identify_complex_words(text):
    words = word_tokenize(text)
    complex_words = [word for word in words if complexity_dict.get(word.lower(), 0) >= COMPLEXITY_THRESHOLD]
    return complex_words 

In [11]:
def is_complex(word, lexicon_data, threshold=COMPLEXITY_THRESHOLD):
    """Check if a word is complex based on the lexicon."""
    entry = lexicon_data[lexicon_data['word'] == word]
    if not entry.empty:
        rating = entry['rating'].values[0]
        return rating >= threshold
    return False

def get_simpler_synonym(word, lexicon_data):
    """Find a simpler synonym using WordNet."""
    synonyms = wordnet.synsets(word)
    simpler_candidates = []
    
    for syn in synonyms:
        for lemma in syn.lemmas():
            candidate = lemma.name().replace("_", " ")
            if candidate != word and candidate in lexicon_data['word'].values:
                candidate_score = lexicon_data.loc[lexicon_data['word'] == candidate, 'rating'].values[0]
                if candidate_score < COMPLEXITY_THRESHOLD:
                    simpler_candidates.append((candidate, candidate_score))
    # Sort synonyms by complexity and return the simplest one
    simpler_candidates = sorted(simpler_candidates, key=lambda x: x[1])
    return simpler_candidates[0][0] if simpler_candidates else word

def simplify_sentence(sentence, lexicon_data):
    """Simplify a sentence by replacing complex words with simpler synonyms."""
    doc = nlp(sentence)
    simplified_sentence = []

    for token in doc:
        word = token.text.lower()
        if is_complex(word, lexicon_data):
            simple_word = get_simpler_synonym(word, lexicon_data)
            simplified_sentence.append(simple_word)
        else:
            simplified_sentence.append(word)
    
    return " ".join(simplified_sentence)

In [13]:
# Test the simplification function
sentence = "This methodology utilizes multifarious techniques for comprehensive analysis."
print(identify_complex_words(sentence))
simplified_sentence = simplify_sentence(sentence, lexicon_data)
print("Original Sentence:", sentence)
print("Simplified Sentence:", simplified_sentence)

['methodology', 'utilizes', 'techniques', 'comprehensive']
Original Sentence: This methodology utilizes multifarious techniques for comprehensive analysis.
Simplified Sentence: this methodology use multifarious techniques for comprehensive analysis .
