In [109]:
# Imports

import pandas as pd 
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import nltk
nltk.download('punkt')
from nltk.corpus import cmudict

# Download the CMU Pronouncing Dictionary for syllable counting
nltk.download('cmudict')
d = cmudict.dict()

[nltk_data] Downloading package punkt to /Users/oliviagao/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package cmudict to
[nltk_data]     /Users/oliviagao/nltk_data...
[nltk_data]   Package cmudict is already up-to-date!


In [110]:
lexicon_data = pd.read_csv('../WCL_data/lexicon.csv')
lexicon_ann = pd.read_csv('../WCL_data/lexicon_annotations.csv')

wikipedia_train = pd.read_csv('../CWID_train/Wikipedia_Train.csv')
wikipedia_dev = pd.read_csv('../CWID_train/Wikipedia_Dev.csv')
wikipedia_test = pd.read_csv('../CWID_test/Wikipedia_Test.csv')
news_train = pd.read_csv('../CWID_train/News_Train.csv')
news_dev = pd.read_csv('../CWID_train/News_Dev.csv')
news_test = pd.read_csv('../CWID_test/News_Test.csv')

coca_df = pd.read_csv('../COCA/COCA_tokens.csv')
# Convert words to lowercase for uniformity
coca_df['Token'] = coca_df['Token'].str.lower()

# Remove any non-alphabetic characters 
coca_df['Token'] = coca_df['Token'].str.replace(r'[^a-zA-Z]', '', regex=True)

In [111]:
# Define a frequency threshold to label words as simple or complex
FREQUENCY_THRESHOLD = 0.87  # Adjust this threshold based on experimentation

def syllable_count(word):
    """Return the syllable count for a word."""
    word = word.lower()
    if word in d:
        return max([len(list(y for y in x if y[-1].isdigit())) for x in d[word]])  # Get the max syllables
    else:
        return None

coca_df.drop(["DocuScope Category", "Document Count", "% of Documents"], axis=1, errors="ignore")
coca_df["Word Length"] = coca_df["Token"].apply(len)
coca_df["Syllable Count"] = coca_df["Token"].apply(syllable_count)
coca_df["Syllable Count"] = coca_df["Syllable Count"].fillna(coca_df["Syllable Count"].mean())
print(coca_df.head())

# Define complexity label based on frequency, syllable count, and word length
# Complexity is 1 (complex) if frequency is below threshold, word length is long, or syllables are high.
# Complexity is 0 (simple) if frequency is above threshold and the word is shorter/simpler
coca_df["Complexity"] = (
    (coca_df["Frequency (per mil. tokens)"] < FREQUENCY_THRESHOLD) |   # Median frequency
    (coca_df["Syllable Count"] > 2.7408510638297874) |  # Median syllable count
    (coca_df["Word Length"] > 9)       # Median word length
).astype(int)

# Prepare the features (X) and target (y)
X = coca_df[['Word Length', 'Syllable Count', 'Frequency (per mil. tokens)']]
y = coca_df['Complexity']

    Token           DocuScope Category   Count  Document Count  \
0      or  ReaderDirectedMetadiscourse  431525           40847   
1     but  ReaderDirectedMetadiscourse  282210           38696   
2     and                    Reasoning  215145           29204   
3  people               CharacterTypes  206259           32557   
4    more        InformationComparison  145672           35039   

   Frequency (per mil. tokens)  % of Documents  Word Length  Syllable Count  
0                      2425.94           95.62            2             1.0  
1                      1586.52           90.58            3             1.0  
2                      1209.50           68.36            3             1.0  
3                      1159.54           76.21            6             2.0  
4                       818.94           82.02            4             1.0  


In [112]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42, class_weight='balanced')
rf_classifier.fit(X_train, y_train)

# Predictions
y_pred = rf_classifier.predict(X_test)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Returns a 0 if simple, 1 if complex
def predict_complexity(word):
    # Fetch the row corresponding to the word from the DataFrame
    word_row = coca_df[coca_df['Token'] == word]
    
    # If the word is not in the DataFrame, use default values
    if word_row.empty:
        # Use the average values for missing word
        word_length = len(word)  # This is computed directly from the word
        syllables = syllable_count(word)  # Use the mean syllable count from the DataFrame
        word_frequency = coca_df['Frequency (per mil. tokens)'].median()  # Use the mean frequency from the DataFrame
    else:
        # Extract the values for 'Word Length', 'Syllable Count', and 'Frequency (per mil. tokens)' from the row
        word_length = len(word)  # This is computed directly from the word
        syllables = word_row['Syllable Count'].values[0]  # Extract syllable count from the dataset
        word_frequency = word_row['Frequency (per mil. tokens)'].values[0]  # Extract frequency from the dataset
    
    # Construct the feature DataFrame as done during training
    features_df = pd.DataFrame([[word_length, syllables, word_frequency]], 
                               columns=['Word Length', 'Syllable Count', 'Frequency (per mil. tokens)'])
    
    # Get the model's prediction
    prediction = rf_classifier.predict(features_df)
    
    # Return the result
    return prediction[0]

# Test the function with some example words
print(predict_complexity("dog"))
print(predict_complexity("but"))
print(predict_complexity("hello"))
print(predict_complexity("comprehensive"))
print(predict_complexity("multifarious"))
print(predict_complexity("techniques"))

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1434
           1       1.00      1.00      1.00      4236

    accuracy                           1.00      5670
   macro avg       1.00      1.00      1.00      5670
weighted avg       1.00      1.00      1.00      5670

0
0
0
1
1
1


In [120]:
# Set threshold for complexity
COMPLEXITY_THRESHOLD = 3

def is_complex(word, lexicon_data, threshold=COMPLEXITY_THRESHOLD):
    """Check if a word is complex based on the lexicon or using COCA and a Random Forest Classifier."""
    # First check in the lexicon
    entry = lexicon_data[lexicon_data['word'] == word]

    # If the word is found in the lexicon
    if not entry.empty:
        rating = entry['rating'].values[0] # Get the complexity rating of the word
        return rating >= threshold # Return True if the word's complexity rating is greater than or equal to the threshold
    
    # If the word is not in the lexicon, estimate its complexity using COCA and Random Forest
    predicted_complexity = predict_complexity(word) # Predict complexity using the Random Forest model

    return True if predicted_complexity == 1 else False

def simplify_sentence(sentence, lexicon_data):
    """Simplify a sentence by replacing complex words with simpler synonyms."""
    doc = nlp(sentence)
    simplified_sentence = []
    
    for token in doc:
        word = token.text.lower()
        
        # Check if the word is complex and replace with simpler synonym
        if is_complex(word, lexicon_data):
            print(word)
            # TODO: REPLACE WORD
            simple_word = word
            
            simplified_sentence.append(simple_word)
        else:
            simplified_sentence.append(word)
    
    return " ".join(simplified_sentence)

sentences = ["This is a simple sentence.", "Although she was considered smart, she failed all her exams.", "Anachronism"]
for sentence in sentences:
    simplified_sentence = simplify_sentence(sentence, lexicon_data)

    print("Original Sentence:", sentence)
    print("Simplified Sentence:", simplified_sentence)


Original Sentence: This is a simple sentence.
Simplified Sentence: this is a simple sentence .
considered
Original Sentence: Although she was considered smart, she failed all her exams.
Simplified Sentence: although she was considered smart , she failed all her exams .
anachronism
Original Sentence: Anachronism
Simplified Sentence: anachronism


In [None]:
# replacing complex words with simpler ones

# Load  NLTK data and SpaCy model
nltk.download('wordnet')
nltk.download('omw-1.4')
nlp = spacy.load("en_core_web_sm")

def find_simpler_synonym(word):
    """
    Find the simplest synonym for a given word using WordNet.
    Simplicity is determined based on the length of the synonym.
    """
    synonyms = wordnet.synsets(word)
    if not synonyms:
        return word  # Return the original word if no synonyms are found

    # Extract the lemmas (unique words) for all synonyms
    lemmas = set(lemma.name() for syn in synonyms for lemma in syn.lemmas())

    # Sort lemmas by length to find the simplest synonym
    simpler_synonym = min(lemmas, key=len, default=word)

    # Replace underscores with spaces for readability
    return simpler_synonym.replace('_', ' ') if simpler_synonym != word else word

def is_complex(word, lexicon_data, coca_df=None, rf_classifier=None, threshold=3):
    """
    Check if a word is complex using:
    1. Lexicon data 
    2. COCA dataset and Random Forest Classifier as fallback.
    """
    # Check the lexicon first
    entry = lexicon_data[lexicon_data["word"] == word]
    if not entry.empty:
        return entry["rating"].values[0] >= threshold

    # Fallback: Use Random Forest Classifier with COCA features
    if coca_df is not None and rf_classifier is not None:
        coca_entry = coca_df[coca_df["Token"] == word]
        if not coca_entry.empty:
            # Prepare features for the classifier
            features = coca_entry[["Word Length", "Syllable Count", "Frequency (per mil. tokens)"]]
            return rf_classifier.predict(features)[0] == 1

    # Default to not complex if no data is available
    return False


def simplify_sentence(sentence, lexicon_data=None, threshold=3):
    """
    Simplify a sentence by replacing complex words with simpler synonyms.
    """
    doc = nlp(sentence)
    simplified_sentence = []

    for token in doc:
        word = token.text
        # Check if the word is complex
        if is_complex(word.lower(), lexicon_data, threshold):
            # Replace with a simpler synonym
            simple_word = find_simpler_synonym(word)
            simplified_sentence.append(simple_word)
        else:
            simplified_sentence.append(word)

    return " ".join(simplified_sentence)

# test usage
test_sentences = [
    "This is a simple sentence.",
    "Although she was considered smart, she failed all her exams.",
    "Anachronism in historical contexts can be confusing."
]

# Dummy lexicon data for testing

#lexicon_data = pd.DataFrame({
#    "word": ["anachronism", "considered", "confusing"],
#    "rating": [5, 4, 4]
#})

# Simplify each sentence
for sentence in test_sentences:
    print("Original:", sentence)
    simplified = simplify_sentence(sentence, lexicon_data=lexicon_data)
    print("Simplified:", simplified)
    print()