In [10]:
# Imports

import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
# Download required NLTK data files
from nltk.corpus import wordnet

# Initialize SpaCy for tokenization and part of speech tagging
# import spacy
# nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package punkt to /Users/oliviagao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Loading in Data

Data Description from Github:
_______________________

Word Complexity Lexicon (https://github.com/mounicam/lexical_simplification/tree/master/word_complexity_lexicon)
_______________________


lexicon.tsv : Each line consists of word and its complexity scores calculated by aggregating over human ratings. 
              The score belongs to a scale of 1-6, where 1 represents "very simple" and 6 represents "very complex"

lexion_annotations.tsv: Each line consists of a word in the lexicon and its individual ratings from 11 annotators.
                        Each rating again belongs to the scale of 1-6. -1 indicates that the annotator did not rate
                        the word.
                        
NOTE: Both the files are tab delimited.

In [11]:
lexicon_data = pd.read_csv('../WCL_data/lexicon.csv')
lexicon_ann = pd.read_csv('../WCL_data/lexicon_annotations.csv')

## Data Cleaning

In [12]:
# TODO: 
# Clean Data
# split into train and test
# Get dimensions, summary, etc. of data
print("Lexicon data")
print(lexicon_data.shape)
print(lexicon_data.head())

print("Lexicon ann")
print(lexicon_ann.shape)
print(lexicon_ann.head())

data = lexicon_data.dropna(subset=["word", "rating"])  # Drop rows with NaN


Lexicon data
(15180, 2)
            word  rating
0            wet  1.5714
1          cargo  2.8571
2        Arsenal  3.7143
3  Manufacturing  3.8333
4           East  1.2857
Lexicon ann
(15180, 12)
            word  ann_1  ann_2  ann_3  ann_4  ann_5  ann_6  ann_7  ann_8  \
0            wet     -1      1      1     -1      1      1      2      3   
1          cargo     -1     -1      2     -1      4      2      2      4   
2        Arsenal      4     -1     -1      4      5     -1      4      3   
3  Manufacturing     -1      4      4     -1     -1      3      4      4   
4           East     -1      2      1      1      1     -1      1      2   

   ann_9  ann_10  ann_11  
0     -1       2      -1  
1     -1       3       3  
2      3      -1       3  
3     -1       4      -1  
4      1      -1      -1  


## Vectorizing, Regressor

Training a regressor so that for words not present in the vocabulary, we can predict the difficulty

In [34]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import joblib
X = data["word"].values  # Target words
y = data["rating"].values.astype(float)  # Continuous difficulty ratings

# Vectorize words
vectorizer = TfidfVectorizer()
X_vectors = vectorizer.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_vectors, y, test_size=0.2, random_state=42)

# Train regressor
regressor = RandomForestRegressor()
regressor.fit(X_train, y_train)

# Evaluate model
y_pred = regressor.predict(X_test)
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R-squared Score:", r2_score(y_test, y_pred))
joblib.dump(regressor, "../WCL_regressor.pkl")
# regressor = joblib.load("WCL_regressor.pkl")

Mean Squared Error: 3.250991774195391
R-squared Score: -4.050382118710537


['../WCL_regressor.pkl']

## Simplification Function

In [None]:
import nltk

def simplify_sentence(sentence, regressor, vectorizer, word2vec_model, difficulty_threshold=3):
    """
    Simplifies a sentence by replacing difficult words with simpler alternatives.
    
    Args:
        sentence (str): Input sentence to be simplified.
        regressor: Trained regressor model for predicting word difficulty.
        vectorizer: Trained vectorizer for transforming words into features.
        word2vec_model: Trained Word2Vec model for word similarity.
        difficulty_threshold (float): Threshold above which words are considered difficult.
    
    Returns:
        str: Simplified sentence.
    """
    # Tokenize the sentence into words
    words = nltk.word_tokenize(sentence)
    simplified_words = []
    changed_words = []

    for word in words:
        # Check if the word exists in the difficulty dictionary (or predict difficulty if not)
        vector = vectorizer.transform([word])
        if word not in X:
            difficulty = regressor.predict(vector)[0]  
        else: 
            difficulty = data[data['word']==word]['rating'].to_numpy()
        if difficulty > difficulty_threshold:
            try:
                similar_words = word2vec_model.wv.most_similar(word, topn=10)
                print(similar_words)

                for sim_word, _ in similar_words:
                    sim_vector = vectorizer.transform([sim_word])
                    sim_difficulty = regressor.predict(sim_vector)[0]

                    if sim_difficulty <= difficulty:
                        simplified_words.append(sim_word) 
                        changed_words.append((word, sim_word))
                        break
                    else:
                        simplified_words.append(word)                                                            
            except KeyError:
                simplified_words.append(word)
        else:
            simplified_words.append(word)

    simplified_sentence = " ".join(simplified_words)
    return simplified_sentence, changed_words


## Example Usage

In [33]:
import gensim.downloader as api
from rouge_score import rouge_scorer

word2vec_model = api.load('word2vec-google-news-300')
scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

test_sentences = [
    "This is a simple sentence.",
    "Although she was considered smart, she failed all her exams.",
    "Anachronism in historical contexts can be confusing.",
    "accumulated, thesaurus, differing, terror"
]
for sentence in test_sentences:

    simplified, changed_words = simplify_sentence(
        sentence, 
        regressor, 
        vectorizer, 
        word2vec_model,  
        difficulty_threshold=2
    )
    print("Original Sentence:", sentence)
    print("Simplified Sentence:", simplified)
    print("Words Changed:", changed_words, "\n")

    scores = scorer.score(sentence, simplified)
    for key in scores:
        print(f'{key}: {scores[key]}')
    print("-" * 50)


Original Sentence: This is a simple sentence.
Simplified Sentence: This is a simple sentences .
Words Changed: [('sentence', 'sentences')] 

rouge1: Score(precision=1.0, recall=1.0, fmeasure=1.0)
rouge2: Score(precision=1.0, recall=1.0, fmeasure=1.0)
rougeL: Score(precision=1.0, recall=1.0, fmeasure=1.0)
--------------------------------------------------
Original Sentence: Although she was considered smart, she failed all her exams.
Simplified Sentence: Although she was regarded smart , she failed all her exams .
Words Changed: [('considered', 'regarded')] 

rouge1: Score(precision=0.9, recall=0.9, fmeasure=0.9)
rouge2: Score(precision=0.7777777777777778, recall=0.7777777777777778, fmeasure=0.7777777777777778)
rougeL: Score(precision=0.9, recall=0.9, fmeasure=0.9)
--------------------------------------------------
Original Sentence: Anachronism in historical contexts can be confusing.
Simplified Sentence: Anachronism in historial context can be convoluted .
Words Changed: [('historical