In [1]:
# Imports

import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
# Download required NLTK data files
from nltk.corpus import wordnet

# Initialize SpaCy for tokenization and part of speech tagging
import spacy
nlp = spacy.load("en_core_web_sm")

  from pandas.core import (
[nltk_data] Downloading package punkt to /Users/emijhang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Loading in Data

Data Description from Github:
_______________________

Word Complexity Lexicon (https://github.com/mounicam/lexical_simplification/tree/master/word_complexity_lexicon)
_______________________


lexicon.tsv : Each line consists of word and its complexity scores calculated by aggregating over human ratings. 
              The score belongs to a scale of 1-6, where 1 represents "very simple" and 6 represents "very complex"

lexion_annotations.tsv: Each line consists of a word in the lexicon and its individual ratings from 11 annotators.
                        Each rating again belongs to the scale of 1-6. -1 indicates that the annotator did not rate
                        the word.
                        
NOTE: Both the files are tab delimited.

In [2]:
lexicon_data = pd.read_csv('../WCL_data/lexicon.csv')
lexicon_ann = pd.read_csv('../WCL_data/lexicon_annotations.csv')

## Data Cleaning

In [3]:
# TODO: 
# Clean Data
# split into train and test
# Get dimensions, summary, etc. of data
print("Lexicon data")
print(lexicon_data.shape)
print(lexicon_data.head())

print("Lexicon ann")
print(lexicon_ann.shape)
print(lexicon_ann.head())

Lexicon data
(15180, 2)
            word  rating
0            wet  1.5714
1          cargo  2.8571
2        Arsenal  3.7143
3  Manufacturing  3.8333
4           East  1.2857
Lexicon ann
(15180, 12)
            word  ann_1  ann_2  ann_3  ann_4  ann_5  ann_6  ann_7  ann_8  \
0            wet     -1      1      1     -1      1      1      2      3   
1          cargo     -1     -1      2     -1      4      2      2      4   
2        Arsenal      4     -1     -1      4      5     -1      4      3   
3  Manufacturing     -1      4      4     -1     -1      3      4      4   
4           East     -1      2      1      1      1     -1      1      2   

   ann_9  ann_10  ann_11  
0     -1       2      -1  
1     -1       3       3  
2      3      -1       3  
3     -1       4      -1  
4      1      -1      -1  


In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Load and clean data
data = lexicon_data.dropna(subset=["word", "rating"])  # Drop rows with NaN

X = data["word"].values  # Target words
y = data["rating"].values.astype(float)  # Continuous difficulty ratings

# Vectorize words
vectorizer = TfidfVectorizer()
X_vectors = vectorizer.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_vectors, y, test_size=0.2, random_state=42)

# Train regressor
regressor = RandomForestRegressor()
regressor.fit(X_train, y_train)

# Evaluate model
y_pred = regressor.predict(X_test)
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R-squared Score:", r2_score(y_test, y_pred))

Mean Squared Error: 3.282869642779129
R-squared Score: -4.099904058063263


In [19]:
import nltk

def simplify_sentence(sentence, regressor, vectorizer, word2vec_model, difficulty_threshold=3):
    """
    Simplifies a sentence by replacing difficult words with simpler alternatives.
    
    Args:
        sentence (str): Input sentence to be simplified.
        regressor: Trained regressor model for predicting word difficulty.
        vectorizer: Trained vectorizer for transforming words into features.
        word2vec_model: Trained Word2Vec model for word similarity.
        difficulty_threshold (float): Threshold above which words are considered difficult.
    
    Returns:
        str: Simplified sentence.
    """
    # Tokenize the sentence into words
    words = nltk.word_tokenize(sentence)
    simplified_words = []

    # Loop over each word in the sentence
    for word in words:
        # Check if the word exists in the difficulty dictionary (or predict difficulty if not)
        vector = vectorizer.transform([word])
        if word not in X:
            difficulty = regressor.predict(vector)[0]  # Predict difficulty of the word
        else: 
            difficulty = data[data['word']==word]['rating'].to_numpy()
        print(word)
        print(difficulty)
        if difficulty > difficulty_threshold:
            # Word is considered difficult, try finding a simpler alternative
            try:
                # Get similar words using Word2Vec model
                similar_words = word2vec_model.most_similar(word, topn=5)
                print(similar_words)

                # Try to find a similar word with a lower difficulty rating
                for sim_word, _ in similar_words:
                    sim_vector = vectorizer.transform([sim_word])
                    sim_difficulty = regressor.predict(sim_vector)[0]

                    if sim_difficulty <= difficulty_threshold:
                        simplified_words.append(sim_word)  # Replace with simpler word
                        break
                else:
                    # No suitable replacement found, keep the original word
                    simplified_words.append(word)
            except KeyError:
                # Word not in Word2Vec vocabulary, keep the original word
                simplified_words.append(word)
        else:
            # Word is not considered difficult, keep the original word
            simplified_words.append(word)

    # Join the simplified words to form the new sentence
    simplified_sentence = " ".join(simplified_words)
    return simplified_sentence


In [25]:
import gensim.downloader as api

# Load the Word2Vec model
word2vec_model = api.load('word2vec-google-news-300')

# Sample sentence
test_sentences = [
    "This is a simple sentence.",
    "Although she was considered smart, she failed all her exams.",
    "Anachronism in historical contexts can be confusing.",
    "accumulated, thesaurus, differing, terror"
]
for sentence in test_sentences:
    
    # Assuming you have defined or imported simplify_sentence, regressor, and vectorizer
    simplified = simplify_sentence(
        sentence, 
        regressor, 
        vectorizer, 
        word2vec_model,  # Use the model directly
        difficulty_threshold=2.5
    )
    print("Original Sentence:", sentence)
    print("Simplified Sentence:", simplified)


This
1.1410666233766245
is
[1.]
a
[1.]
simple
[1.6667]
sentence
[2.3333]
.
1.07645651015651
Original Sentence: This is a simple sentence.
Simplified Sentence: This is a simple sentence .
Although
[2.]
she
[1.1429]
was
[1.4286]
considered
[3.1429]
[('regarded', 0.7731536030769348), ('deemed', 0.6961521506309509), ('viewed', 0.6467924118041992), ('Considered', 0.6333076357841492), ('considers', 0.5919919610023499)]
smart
1.4231477655677651
,
1.07645651015651
she
[1.1429]
failed
[1.6667]
all
[1.2857]
her
[1.]
exams
[1.6667]
.
1.07645651015651
Original Sentence: Although she was considered smart, she failed all her exams.
Simplified Sentence: Although she was regarded smart , she failed all her exams .
Anachronism
1.07645651015651
in
[1.]
historical
[2.5]
contexts
[3.]
[('context', 0.6575323939323425), ('meanings', 0.5609514713287354), ('subjectivities', 0.5602607131004333), ('frameworks', 0.5575310587882996), ('milieux', 0.5472158789634705)]
can
[1.]
be
[1.]
confusing
[2.8571]
[('convolut

In [None]:
# import nltk
# nltk.download('wordnet')
# nltk.download('omw-1.4')  # Download additional multilingual WordNet data

# import gensim.downloader as api
# import nltk
# from nltk.corpus import wordnet as wn
# from nltk.tokenize import word_tokenize

# # Load Word2Vec model
# wv = api.load('word2vec-google-news-300')

# # Define a function to get WordNet synonyms
# def get_synonyms(word):
#     synonyms = set()
#     for syn in wn.synsets(word):
#         for lemma in syn.lemmas():
#             synonyms.add(lemma.name())  # Add synonym lemma to the set
#     return list(synonyms)

# # Define a function to simplify the sentence
# def simplify_sentence(sentence, model, difficulty_threshold=3):
#     words = word_tokenize(sentence)
#     simplified_words = []

#     for word in words:
#         # Skip punctuation and stopwords
#         if word.isalnum():
#             # Get Word2Vec vector for the word
#             try:
#                 word_vector = model[word.lower()]
                
#                 # If the word is deemed too difficult (based on length, etc.), replace it
#                 if len(word) > difficulty_threshold:
#                     synonyms = get_synonyms(word)
                    
#                     # Find the synonym with the most similar Word2Vec vector
#                     best_synonym = None
#                     max_similarity = -1
#                     for syn in synonyms:
#                         try:
#                             # Compare the cosine similarity between word and synonym
#                             similarity = model.similarity(word.lower(), syn)
#                             if similarity > max_similarity:
#                                 best_synonym = syn
#                                 max_similarity = similarity
#                         except KeyError:
#                             continue

#                     # If a good synonym is found, use it; otherwise, keep the original word
#                     if best_synonym:
#                         simplified_words.append(best_synonym)
#                     else:
#                         simplified_words.append(word)
#                 else:
#                     simplified_words.append(word)
#             except KeyError:
#                 # If the word is not in Word2Vec model, just keep the word
#                 simplified_words.append(word)
#         else:
#             # For punctuation or non-alphanumeric, just keep it
#             simplified_words.append(word)

#     return ' '.join(simplified_words)

# # Example sentence
# sentence = "Anachronism in historical contexts can be confusing."

# # Simplify the sentence
# simplified_sentence = simplify_sentence(sentence, wv)
# print("Original Sentence:", sentence)
# print("Simplified Sentence:", simplified_sentence)
