In [1]:
import os
import pickle
from sklearn.neighbors import NearestCentroid
import pandas as pd

In [5]:
import pandas as pd
from sklearn.neighbors import NearestCentroid
from sklearn.impute import SimpleImputer
import os

# Paths of the pickled training feature tables, one per corpus.
wikipedia_word_features_file = 'features_NEW/Wikipedia_Train_NEW_Feats1.pkl'
wikinews_word_features_file = 'features_NEW/WikiNews_Train_NEW_Feats1.pkl'
news_word_features_file = 'features_NEW/News_Train_NEW_Feats1.pkl'

# Columns fed to every classifier trained in this cell, and the label column.
FEATURE_COLUMNS = ['syllables', 'characters', 'vowels', 'simple_wiki_freq', 'HIT_count', 'absTotalMatchCount', 'relTotalMatchCount']
LABEL_COLUMN = 'complex_binary'

# Constant-fill imputer: every NaN cell becomes 0.
imputer = SimpleImputer(missing_values=float("NaN"), strategy="constant", fill_value=0)


def _load_word_features(path):
    """Load a pickled feature DataFrame with empty strings and NaN replaced by 0.

    Args:
        path: Path of the pickle file, read with ``pd.read_pickle``.

    Returns:
        A new DataFrame with the same columns, built from the imputed values
        (so it carries a fresh default index).
    """
    # NOTE: pd.read_pickle unpickles the file, which can execute arbitrary
    # code; only use it on trusted files.
    # Empty strings are turned into NaN first so the imputer zeroes both.
    frame = pd.read_pickle(path).replace("", float("NaN"))
    return pd.DataFrame(imputer.fit_transform(frame), columns=frame.columns)


def _fit_nearest_centroid(word_features):
    """Fit a NearestCentroid classifier on FEATURE_COLUMNS against LABEL_COLUMN.

    Args:
        word_features: DataFrame holding the feature and label columns.

    Returns:
        The fitted ``NearestCentroid`` instance.
    """
    clf = NearestCentroid()
    clf.fit(word_features[FEATURE_COLUMNS].values, word_features[LABEL_COLUMN].values)
    return clf


def _save_pickled_model(classifier, output_folder, model_name):
    """Pickle ``classifier`` to ``<output_folder>/<model_name>_model.pkl``.

    This cell previously called ``save_model``, which is only defined in
    later cells, so running the notebook top to bottom on a fresh kernel
    failed with a NameError. The helper writes the same file name for the
    model names used here.

    Args:
        classifier: The object to pickle.
        output_folder: Destination directory; created if it does not exist.
        model_name: Prefix of the output file name.
    """
    os.makedirs(output_folder, exist_ok=True)
    with open(os.path.join(output_folder, f'{model_name}_model.pkl'), 'wb') as file:
        pickle.dump(classifier, file)


# Load and clean the three per-corpus feature tables.
wikipedia_word_features = _load_word_features(wikipedia_word_features_file)
wikinews_word_features = _load_word_features(wikinews_word_features_file)
news_word_features = _load_word_features(news_word_features_file)

# Train and save one model per corpus.
wikipedia_classifier = _fit_nearest_centroid(wikipedia_word_features)
_save_pickled_model(wikipedia_classifier, 'lmodel', 'Wikipedia')

wikinews_classifier = _fit_nearest_centroid(wikinews_word_features)
_save_pickled_model(wikinews_classifier, 'lmodel', 'WikiNews')

news_classifier = _fit_nearest_centroid(news_word_features)
_save_pickled_model(news_classifier, 'lmodel', 'News')

# Train one more model on the rows of all three corpora stacked together
# (the frame is concatenated once and reused for features and labels).
combined_word_features = pd.concat([
    wikipedia_word_features,
    wikinews_word_features,
    news_word_features
])
combined_classifier = _fit_nearest_centroid(combined_word_features)
_save_pickled_model(combined_classifier, 'lmodel', 'Combined')


In [None]:
# #DO NOT ALTER

# def train_classifier(word_features_file):
#     # Load the word_features DataFrame from the pickle file
#     with open(word_features_file, 'rb') as file:
#         word_features = pickle.load(file)

#     # Extract the features and labels from the word_features DataFrame
#     features_columns = ['syllables', 'characters', 'vowels', 'simple_wiki_freq', 'HIT_count', 'absTotalMatchCount','relTotalMatchCount', '1gram_freq', '2gram_freq', '3gram_freq', '4gram_freq', '5gram_freq']
#     X = word_features[features_columns].values
#     y = word_features['complex_binary'].values

#     # Create and train the Nearest Centroid classifier
#     clf = NearestCentroid()
#     clf.fit(X, y)

#     return clf

# def save_model(classifier, output_folder, word_features_file):
#     # Create the output folder if it doesn't exist
#     os.makedirs(output_folder, exist_ok=True)

#     # Get the base filename from the word_features_file
#     base_filename = os.path.splitext(os.path.basename(word_features_file))[0]
    
#     # Remove "Feats" from the base filename, if present
#     base_filename = base_filename.replace("Feats", "")

#     # Save the classifier model to a file
#     output_file_path = os.path.join(output_folder, f'{base_filename}_model.pkl')
#     with open(output_file_path, 'wb') as file:
#         pickle.dump(classifier, file)

# # Train the classifier
# #word_features_file = 'features/Wikipedia_Train_Feats.pkl'  # Replace with the path to your pickled word_features file
# word_features_file = 'features_NEW/Wikipedia_Train_NEW_Feats1.pkl'
# classifier = train_classifier(word_features_file)

# # Save the classifier model to the "lmodel" folder
# output_folder = 'lmodel'
# save_model(classifier, output_folder, word_features_file)


In [14]:
# 1- Classifier should now handle NaN values and change them to zero

def train_classifier(word_features_file):
    """Fit a NearestCentroid model on one pickled word-feature table.

    Empty-string cells and NaN cells of the loaded table are both turned
    into 0 before the feature matrix and the ``complex_binary`` label
    vector are extracted.

    Args:
        word_features_file: Path of the pickle file to load.

    Returns:
        The fitted ``NearestCentroid`` classifier.
    """
    # Predictors for this variant: word-shape and frequency features plus
    # the five raw n-gram frequency columns.
    predictors = [
        'syllables', 'characters', 'vowels', 'simple_wiki_freq', 'HIT_count',
        'absTotalMatchCount', 'relTotalMatchCount',
        '1gram_freq', '2gram_freq', '3gram_freq', '4gram_freq', '5gram_freq',
    ]

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(word_features_file, 'rb') as handle:
        table = pickle.load(handle)

    # Blank strings become NaN first so a single fill pass zeroes both gaps.
    table.replace("", float("NaN"), inplace=True)
    table.fillna(0, inplace=True)

    model = NearestCentroid()
    model.fit(table[predictors].values, table['complex_binary'].values)
    return model

def save_model(classifier, output_folder, word_features_file):
    """Pickle ``classifier`` into ``output_folder``, named after the feature file.

    The output file name is the feature file's base name without its
    extension, with every "Feats" substring removed, followed by
    ``_model.pkl``.

    Args:
        classifier: The object to pickle.
        output_folder: Destination directory; created if it does not exist.
        word_features_file: Path whose base name seeds the output file name.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Strip the directory and extension, then drop the "Feats" marker.
    stem, _extension = os.path.splitext(os.path.basename(word_features_file))
    stem = stem.replace("Feats", "")

    destination = os.path.join(output_folder, f'{stem}_model.pkl')
    with open(destination, 'wb') as handle:
        pickle.dump(classifier, handle)

# Input feature file and destination folder for this run.
output_folder = 'lmodel'
word_features_file = 'features_NEW/Wikipedia_Train_NEW_Feats1.pkl'

# Fit on the feature file, then pickle the fitted model into the folder.
classifier = train_classifier(word_features_file)
save_model(classifier, output_folder, word_features_file)


In [18]:
# Train the classifier without the raw n-gram data


# 1- Classifier should now handle NaN values and change them to zero

def train_classifier(word_features_file):
    """Train a NearestCentroid classifier without the raw n-gram columns.

    The pickled table is loaded, its empty strings and NaN cells are set
    to 0, and the model is fitted on seven feature columns against the
    ``complex_binary`` label.

    Args:
        word_features_file: Path of the pickle file to load.

    Returns:
        The fitted ``NearestCentroid`` classifier.
    """
    label_column = 'complex_binary'
    selected_columns = [
        'syllables',
        'characters',
        'vowels',
        'simple_wiki_freq',
        'HIT_count',
        'absTotalMatchCount',
        'relTotalMatchCount',
    ]

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    with open(word_features_file, 'rb') as source:
        features_table = pickle.load(source)

    # Two-step cleanup: "" -> NaN, then NaN -> 0.
    features_table.replace("", float("NaN"), inplace=True)
    features_table.fillna(0, inplace=True)

    inputs = features_table[selected_columns].values
    targets = features_table[label_column].values

    centroid_model = NearestCentroid()
    centroid_model.fit(inputs, targets)
    return centroid_model

def save_model(classifier, output_folder, word_features_file):
    """Write ``classifier`` as a pickle whose name is derived from the feature file.

    Args:
        classifier: The object to pickle.
        output_folder: Directory to write into; created when missing.
        word_features_file: Feature-file path. Its base name, minus the
            extension and minus any "Feats" substring, becomes the prefix
            of the ``<prefix>_model.pkl`` output file.
    """
    # Make sure the destination directory is there before writing.
    os.makedirs(output_folder, exist_ok=True)

    file_name = os.path.basename(word_features_file)
    base_filename = os.path.splitext(file_name)[0].replace("Feats", "")

    with open(os.path.join(output_folder, f'{base_filename}_model.pkl'), 'wb') as sink:
        pickle.dump(classifier, sink)

# Input feature file and destination folder for this run.
# NOTE(review): folder and feature file match the earlier 12-feature run in
# this notebook, and save_model derives the file name from them, so this run
# writes to the same model file and replaces it - confirm that is intended.
output_folder = 'lmodel'
word_features_file = 'features_NEW/Wikipedia_Train_NEW_Feats1.pkl'

# Fit on the feature file, then pickle the fitted model into the folder.
classifier = train_classifier(word_features_file)
save_model(classifier, output_folder, word_features_file)




In [30]:
# New code for combined data -- do not change

def train_classifier(word_features_file1, word_features_file2):
    """Fit a NearestCentroid model on two pickled feature tables stacked together.

    Both files are unpickled, concatenated row-wise with a fresh index, and
    the model is fitted on six feature columns against ``complex_binary``.
    No missing-value handling is applied here.

    Args:
        word_features_file1: Path of the first pickle file.
        word_features_file2: Path of the second pickle file.

    Returns:
        The fitted ``NearestCentroid`` classifier.
    """
    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    tables = []
    for path in (word_features_file1, word_features_file2):
        with open(path, 'rb') as handle:
            tables.append(pickle.load(handle))

    stacked = pd.concat(tables, ignore_index=True)

    predictors = ['syllables', 'length', 'vowels', 'simple_wiki_freq', 'HIT_freq', 'google frequency']
    model = NearestCentroid()
    model.fit(stacked[predictors].values, stacked['complex_binary'].values)
    return model

def save_model(classifier, output_folder, model_name):
    """Pickle ``classifier`` to ``<output_folder>/<model_name>_model.pkl``.

    Args:
        classifier: The object to pickle.
        output_folder: Destination directory; created if it does not exist.
        model_name: Prefix of the output file name.
    """
    os.makedirs(output_folder, exist_ok=True)

    destination = os.path.join(output_folder, f'{model_name}_model.pkl')
    with open(destination, 'wb') as handle:
        pickle.dump(classifier, handle)


In [33]:


# Where the model goes and what it is called.
output_folder = 'lm'
model_name = 'Wikipedia_combined_Dev_Train_Fast'

# The two Wikipedia feature files that are stacked for training.
word_features_file1 = 'features/Wikipedia_Dev_allInfo_1.pkl'
word_features_file2 = 'features/Wikipedia_Train_allInfo_1.pkl'

# Fit on both files, then pickle the model into the "lm" folder.
classifier = train_classifier(word_features_file1, word_features_file2)
save_model(classifier, output_folder, model_name)

In [5]:
import pandas as pd

# Read the Wikipedia Dev feature file.
df = pd.read_pickle('features_NEW/Wikipedia_Dev_NEW_Feats1.pkl')

# Columns inspected for missing values ('5gram_freq' is not checked here).
features_columns = [
    'syllables', 'characters', 'vowels', 'simple_wiki_freq', 'HIT_count',
    '1gram_freq', '2gram_freq', '3gram_freq', '4gram_freq',
]

# Per-column count of NaN cells.
nan_values = df[features_columns].isnull().sum()

# Report one line per inspected column.
for column in nan_values.index:
    count = nan_values[column]
    print(f"NaN values in column '{column}': {count}")

NaN values in column 'syllables': 0
NaN values in column 'characters': 0
NaN values in column 'vowels': 0
NaN values in column 'simple_wiki_freq': 0
NaN values in column 'HIT_count': 0
NaN values in column '1gram_freq': 113
NaN values in column '2gram_freq': 666
NaN values in column '3gram_freq': 691
NaN values in column '4gram_freq': 692


In [15]:
import pandas as pd

# Read the Wikipedia Train feature file.
df = pd.read_pickle('features_NEW/Wikipedia_Train_NEW_Feats1.pkl')

# Columns inspected for cells equal to zero.
features_columns = [
    'syllables', 'characters', 'vowels', 'simple_wiki_freq', 'HIT_count',
    '1gram_freq', '2gram_freq', '3gram_freq', '4gram_freq',
]

# Per-column count of cells that compare equal to 0.
zero_counts = df[features_columns].eq(0).sum()

# Report one line per inspected column.
for column in zero_counts.index:
    count = zero_counts[column]
    print(f"Zero counts in column '{column}': {count}")


Zero counts in column 'syllables': 78
Zero counts in column 'characters': 0
Zero counts in column 'vowels': 62
Zero counts in column 'simple_wiki_freq': 5551
Zero counts in column 'HIT_count': 6
Zero counts in column '1gram_freq': 0
Zero counts in column '2gram_freq': 0
Zero counts in column '3gram_freq': 0
Zero counts in column '4gram_freq': 0


In [16]:
import pandas as pd

# Read the Wikipedia Train feature file.
df = pd.read_pickle('features_NEW/Wikipedia_Train_NEW_Feats1.pkl')

# Keep only the rows whose "HIT_count" equals zero.
zero_hit_count_rows = df[df['HIT_count'].eq(0)]

# Dump those rows as plain text.
print(zero_hit_count_rows)


                                  ID  \
777   34D9ZRXCYRVYV0Y20MTM8EXYR5MASA   
1002  3UQ1LLR26A9BRN3CGDWLWSJ3ZBEALW   
2716  3D06DR5225KVX5LXGPX0W5YSZG4AMY   
2823  3P458N04Q1IHMEPXHH6U14VSUWW2X3   
4212  3O2Y2UIUCQVV38226T6CVTHF7RMKFG   
4842  3YZ7A3YHR5U1PUML5Q2503HCZRLS5M   

                                               sentence start_index end_index  \
777   Two also has the unique property such that and...          33        42   
1002  Aemilian was killed by them at Spoletium or th...         100       108   
2716  It is also well known for being the seat of im...          11        21   
2823  As of the census of 2000 , there were 24,276 p...           0         5   
4212  Koca Mi 'm âr Sinân Âğâ ( Ottoman Turkish : مع...          44        55   
4842  Zhang et al. suggest that unless Epidexipteryx...           6        11   

             word total_native total_non_native native_complex  \
777     such that           10               10              1   
1002     half way  

In [40]:
import pandas as pd

# Read the WikiNews Train feature file.
df = pd.read_pickle('features/WikiNews_Train_Feats.pkl')

# Columns inspected for missing values.
features_columns = [
    'syllables',
    'length',
    'vowels',
    'simple_wiki_freq',
    'HIT_count',
    'google frequency',
]

# Select every row that has a NaN in at least one inspected column.
nan_rows = df[df[features_columns].isnull().any(axis=1)]

# Dump the affected rows as plain text under a heading.
print("Rows with NaN values:")
print(nan_rows)



Rows with NaN values:
                                  ID  \
531   3OEWW2KGQJCHVF9LDF3FEAV3EC8OD6   
1282  3WYZV0QBFJEBARPT0AZ52XYWNV1BXR   
2371  3LCXHSGDLT71LDFEGRV84XNPBDCESM   
4683  3WPCIUYH1A9X87ET9WPE8K1QDKOTDW   
4720  3DIIW4IV8PWUZXFPM9PHR95BPQG4IE   

                                               sentence start_index end_index  \
531   #42-110 I still have some questions left unans...          60        64   
1282  #3-7 Anastasia Slonina, an actress working in ...          73        82   
2371  #42-103 Dalial Freitak, who is also on this pa...          82        86   
4683  #30-2 A woman in Spain has been fined €800 aft...         143       163   
4720  #39-1 Austrian police find dozens dead inside ...         140       151   

                      word total_native total_non_native native_complex  \
531                   ph.d           10               10              1   
1282             teatr.doc           10               10              0   
2371                  ph.

In [4]:
# Function for the final baseline model

def train_classifier(word_features_file1, word_features_file2, word_features_file3, word_features_file4, word_features_file5, word_features_file6):
    """Fit the baseline NearestCentroid model on six pickled feature tables.

    The six files are unpickled in argument order, concatenated row-wise
    with a fresh index, and the model is fitted on six feature columns
    against ``complex_binary``. No missing-value handling is applied here.

    Args:
        word_features_file1: Path of the first pickle file.
        word_features_file2: Path of the second pickle file.
        word_features_file3: Path of the third pickle file.
        word_features_file4: Path of the fourth pickle file.
        word_features_file5: Path of the fifth pickle file.
        word_features_file6: Path of the sixth pickle file.

    Returns:
        The fitted ``NearestCentroid`` classifier.
    """
    paths = (
        word_features_file1,
        word_features_file2,
        word_features_file3,
        word_features_file4,
        word_features_file5,
        word_features_file6,
    )

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    tables = []
    for path in paths:
        with open(path, 'rb') as handle:
            tables.append(pickle.load(handle))

    stacked = pd.concat(tables, ignore_index=True)

    predictors = ['syllables', 'length', 'vowels', 'simple_wiki_freq', 'HIT_freq', 'google frequency']
    model = NearestCentroid()
    model.fit(stacked[predictors].values, stacked['complex_binary'].values)
    return model

def save_model(model, output_folder, model_name):
    """Pickle ``model`` to ``<output_folder>/<model_name>_model.pkl``.

    Args:
        model: The object to pickle.
        output_folder: Destination directory; created if it does not exist.
        model_name: Prefix of the output file name.
    """
    # Ensure the destination directory exists before opening the file.
    os.makedirs(output_folder, exist_ok=True)

    with open(os.path.join(output_folder, f'{model_name}_model.pkl'), 'wb') as sink:
        pickle.dump(model, sink)


In [5]:
# Train the final baseline classifier on the Train and Dev feature files of
# each corpus, then save the model.
word_features_file1 = 'features/WikiNews_Train_allInfo_1.pkl'
word_features_file2 = 'features/WikiNews_Dev_allInfo_1.pkl'
word_features_file3 = 'features/News_Train_allInfo_1.pkl'
word_features_file4 = 'features/News_Dev_allInfo_1.pkl'
# Fix: files 5 and 6 used to repeat the WikiNews paths of files 1 and 2, so
# WikiNews rows were loaded twice and no Wikipedia rows were loaded at all.
# NOTE(review): assumes the Wikipedia files were intended (these are the same
# paths used for the Wikipedia Dev+Train model in this notebook) - confirm.
word_features_file5 = 'features/Wikipedia_Train_allInfo_1.pkl'
word_features_file6 = 'features/Wikipedia_Dev_allInfo_1.pkl'
model = train_classifier(word_features_file1, word_features_file2, word_features_file3, word_features_file4, word_features_file5, word_features_file6)

# Pickle the fitted model into the "lm" folder.
output_folder = 'lm'
model_name = 'Baseline_Binary_allInfo'
save_model(model, output_folder, model_name)


In [6]:
#Build model with all features

import pandas as pd
import pickle
import os
from sklearn.neighbors import NearestCentroid

def train_classifier(word_features_file1, word_features_file2, word_features_file3, word_features_file4, word_features_file5, word_features_file6):
    """Fit a NearestCentroid model on the full feature set of six pickled tables.

    The six files are unpickled in argument order, concatenated row-wise
    with a fresh index, and the model is fitted on forty feature columns
    against ``complex_binary``. No missing-value handling is applied here.

    Args:
        word_features_file1: Path of the first pickle file.
        word_features_file2: Path of the second pickle file.
        word_features_file3: Path of the third pickle file.
        word_features_file4: Path of the fourth pickle file.
        word_features_file5: Path of the fifth pickle file.
        word_features_file6: Path of the sixth pickle file.

    Returns:
        The fitted ``NearestCentroid`` classifier.
    """
    # NOTE(review): columns such as 'pos', 'lemma', 'synonyms' and 'ner' look
    # text-valued by name; confirm they are numerically encoded in the
    # pickled tables before fitting.
    predictors = [
        'syllables', 'length', 'vowels', 'pos', 'dep num', 'lemma',
        'synonyms', 'hypernyms', 'hyponyms', 'wikipedia_freq',
        'subtitles_freq', 'learner_corpus_freq', 'complex_lexicon',
        'bnc_freq', 'ogden', 'simple_wiki', 'cald', 'sub_imdb', 'cnc', 'img',
        'aoa', 'fam', 'google frequency', 'KFCAT', 'KFSMP', 'KFFRQ', 'NPHN',
        'TLFRQ', 'holonyms', 'meronyms', 'consonants', 'learners_bigrams',
        'simple_wiki_bigrams', 'ner', 'google_char_bigram',
        'google_char_trigram', 'simple_wiki_fourgram', 'learner_fourgram',
        'simple_wiki_freq', 'HIT_freq',
    ]

    paths = [
        word_features_file1, word_features_file2, word_features_file3,
        word_features_file4, word_features_file5, word_features_file6,
    ]

    # NOTE: unpickling can execute arbitrary code; only load trusted files.
    tables = []
    for path in paths:
        with open(path, 'rb') as handle:
            tables.append(pickle.load(handle))

    stacked = pd.concat(tables, ignore_index=True)

    model = NearestCentroid()
    model.fit(stacked[predictors].values, stacked['complex_binary'].values)
    return model

def save_model(model, output_folder, model_name):
    """Store ``model`` as a pickle named ``<model_name>_model.pkl`` in ``output_folder``.

    Args:
        model: The object to pickle.
        output_folder: Directory to write into; created when missing.
        model_name: Prefix of the output file name.
    """
    os.makedirs(output_folder, exist_ok=True)

    pickle_name = f'{model_name}_model.pkl'
    pickle_path = os.path.join(output_folder, pickle_name)

    with open(pickle_path, 'wb') as out:
        pickle.dump(model, out)



In [7]:
# NOTE(review): in the visible cells `word_features` is only ever assigned
# inside train_classifier(), so it does not exist at notebook scope and
# evaluating it here produces the NameError recorded in this cell's output.
word_features

NameError: name 'word_features' is not defined