In [3]:
import os
import pickle
from sklearn.neighbors import NearestCentroid
import pandas as pd

In [9]:
# Train the classifier without the raw n-gram data


# 1- Classifier should now handle NaN values and change them to zero

def train_classifier(word_features_file, features_columns=None):
    """Train a Nearest Centroid classifier from one pickled word-features DataFrame.

    Empty-string cells and NaN values anywhere in the loaded frame are
    replaced by 0 before the classifier is fitted.

    Parameters
    ----------
    word_features_file : str
        Path to a pickle file holding the word-features DataFrame.
    features_columns : sequence of str, optional
        Columns used as the feature matrix. When omitted, the seven columns
        this function has always used are selected, so existing callers
        behave exactly as before.

    Returns
    -------
    NearestCentroid
        Classifier fitted on the selected columns against the
        ``complex_binary`` column.
    """
    if features_columns is None:
        features_columns = ['syllables', 'characters', 'vowels', 'simple_wiki_freq', 'HIT_count', 'absTotalMatchCount', 'relTotalMatchCount']

    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    with open(word_features_file, 'rb') as file:
        word_features = pickle.load(file)

    # Treat empty strings as missing, then zero-fill every missing value.
    # Chained (non-inplace) calls keep the loaded frame's lineage explicit.
    word_features = word_features.replace("", float("NaN")).fillna(0)

    # Extract the feature matrix and the label vector
    X = word_features[list(features_columns)].values
    y = word_features['complex_binary'].values

    # Create and train the Nearest Centroid classifier
    clf = NearestCentroid()
    clf.fit(X, y)

    return clf

def save_model(classifier, output_folder, word_features_file):
    """Pickle ``classifier`` into ``output_folder``, naming it after the features file.

    The output name is the features file's base name without its extension,
    with every occurrence of "Feats" removed and "_model.pkl" appended.
    The folder is created when it does not exist yet.
    """
    os.makedirs(output_folder, exist_ok=True)

    # Strip directory and extension, then drop the "Feats" marker from the stem
    feature_stem, _extension = os.path.splitext(os.path.basename(word_features_file))
    model_stem = feature_stem.replace("Feats", "")

    destination = os.path.join(output_folder, f'{model_stem}_model.pkl')
    with open(destination, 'wb') as sink:
        pickle.dump(classifier, sink)

# Run configuration: input feature pickle and the "lmodel" destination folder
word_features_file = 'features_NEW/Wikipedia_Train_NEW_Feats1.pkl'
output_folder = 'lmodel'

# Fit the classifier on the feature file, then persist it to output_folder
classifier = train_classifier(word_features_file)
save_model(classifier, output_folder, word_features_file)




In [10]:
# New code for combined data -- do not change

def train_classifier(word_features_file1, word_features_file2):
    """Fit a Nearest Centroid classifier on two pickled word-features DataFrames.

    Both pickles are loaded, stacked row-wise in argument order with a fresh
    index, and the classifier is fitted on six feature columns against the
    ``complex_binary`` column.

    Returns the fitted ``NearestCentroid`` instance.
    """
    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    loaded_frames = []
    for pickle_path in (word_features_file1, word_features_file2):
        with open(pickle_path, 'rb') as handle:
            loaded_frames.append(pickle.load(handle))

    # Stack the frames into one table, renumbering the rows
    combined = pd.concat(loaded_frames, ignore_index=True)

    # Feature matrix and label vector taken from the combined table
    predictor_names = ['syllables', 'length', 'vowels', 'simple_wiki_freq', 'HIT_freq', 'google frequency']
    feature_matrix = combined[predictor_names].values
    labels = combined['complex_binary'].values

    centroid_model = NearestCentroid()
    centroid_model.fit(feature_matrix, labels)
    return centroid_model

def save_model(classifier, output_folder, model_name):
    """Pickle ``classifier`` to ``<output_folder>/<model_name>_model.pkl``.

    The output folder is created first when it is missing.
    """
    os.makedirs(output_folder, exist_ok=True)

    pickle_name = f'{model_name}_model.pkl'
    with open(os.path.join(output_folder, pickle_name), 'wb') as sink:
        pickle.dump(classifier, sink)


In [11]:


# Inputs: the Wikipedia Dev and Train feature pickles that get combined
word_features_file1 = 'features/Wikipedia_Dev_allInfo_1.pkl'
word_features_file2 = 'features/Wikipedia_Train_allInfo_1.pkl'

# Destination: the "lm" folder, under the combined Dev+Train model name
output_folder = 'lm'
model_name = 'Wikipedia_combined_Dev_Train_Fast'

# Fit on both files, then write the fitted classifier to disk
classifier = train_classifier(word_features_file1, word_features_file2)
save_model(classifier, output_folder, model_name)

In [12]:
# Function for the final baseline model

def train_classifier(word_features_file1, word_features_file2, word_features_file3, word_features_file4, word_features_file5, word_features_file6):
    """Fit a Nearest Centroid classifier on six pickled word-features DataFrames.

    The six pickles are loaded in argument order, stacked row-wise with a
    fresh index, and the classifier is fitted on six feature columns against
    the ``complex_binary`` column.

    Returns the fitted ``NearestCentroid`` instance.
    """
    pickle_paths = (
        word_features_file1,
        word_features_file2,
        word_features_file3,
        word_features_file4,
        word_features_file5,
        word_features_file6,
    )

    # NOTE: pickle.load can execute arbitrary code -- only load trusted files.
    loaded_frames = []
    for pickle_path in pickle_paths:
        with open(pickle_path, 'rb') as handle:
            loaded_frames.append(pickle.load(handle))

    # One table holding every row from all six inputs, rows renumbered
    combined = pd.concat(loaded_frames, ignore_index=True)

    # Feature matrix and label vector taken from the combined table
    predictor_names = ['syllables', 'length', 'vowels', 'simple_wiki_freq', 'HIT_freq', 'google frequency']
    feature_matrix = combined[predictor_names].values
    labels = combined['complex_binary'].values

    centroid_model = NearestCentroid()
    centroid_model.fit(feature_matrix, labels)
    return centroid_model

def save_model(model, output_folder, model_name):
    """Serialize ``model`` with pickle to ``<output_folder>/<model_name>_model.pkl``.

    Missing output folders are created before the file is written.
    """
    os.makedirs(output_folder, exist_ok=True)

    destination = os.path.join(output_folder, f'{model_name}_model.pkl')
    with open(destination, 'wb') as sink:
        pickle.dump(model, sink)


In [13]:
# Train the baseline classifier on the Train and Dev splits of all three
# corpora (WikiNews, News, Wikipedia) and save the model.
word_features_file1 = 'features/WikiNews_Train_allInfo_1.pkl'
word_features_file2 = 'features/WikiNews_Dev_allInfo_1.pkl'
word_features_file3 = 'features/News_Train_allInfo_1.pkl'
word_features_file4 = 'features/News_Dev_allInfo_1.pkl'
# Files 5 and 6 previously repeated the WikiNews paths, which counted WikiNews
# twice and left Wikipedia out of the baseline entirely.
# NOTE(review): Wikipedia paths taken from the combined Dev+Train cell -- confirm.
word_features_file5 = 'features/Wikipedia_Train_allInfo_1.pkl'
word_features_file6 = 'features/Wikipedia_Dev_allInfo_1.pkl'
model = train_classifier(word_features_file1, word_features_file2, word_features_file3, word_features_file4, word_features_file5, word_features_file6)
output_folder = 'lm'
model_name = 'Baseline_Binary_allInfo'
save_model(model, output_folder, model_name)


In [6]:
# #Build model with all CAMB features

# import pandas as pd
# import pickle
# import os
# from sklearn.neighbors import NearestCentroid

# def train_classifier(word_features_file1, word_features_file2, word_features_file3, word_features_file4, word_features_file5, word_features_file6):
#     # Load the word_features DataFrames from the pickle files
#     with open(word_features_file1, 'rb') as file:
#         word_features1 = pickle.load(file)
#     with open(word_features_file2, 'rb') as file:
#         word_features2 = pickle.load(file)
#     with open(word_features_file3, 'rb') as file:
#         word_features3 = pickle.load(file)
#     with open(word_features_file4, 'rb') as file:
#         word_features4 = pickle.load(file)
#     with open(word_features_file5, 'rb') as file:
#         word_features5 = pickle.load(file)
#     with open(word_features_file6, 'rb') as file:
#         word_features6 = pickle.load(file)

#     # Concatenate the word_features DataFrames
#     word_features = pd.concat([word_features1, word_features2, word_features3, word_features4, word_features5, word_features6], ignore_index=True)

#     # Extract the features and labels from the combined word_features DataFrame
#     features_columns = ['syllables', 'length', 'vowels', 'pos', 'dep num', 'lemma', 'synonyms', 'hypernyms', 'hyponyms', 'wikipedia_freq', 'subtitles_freq', 'learner_corpus_freq', 'complex_lexicon', 'bnc_freq', 'ogden', 'simple_wiki', 'cald', 'sub_imdb', 'cnc', 'img', 'aoa', 'fam', 'google frequency', 'KFCAT', 'KFSMP', 'KFFRQ', 'NPHN', 'TLFRQ', 'holonyms', 'meronyms', 'consonants', 'learners_bigrams', 'simple_wiki_bigrams', 'ner', 'google_char_bigram', 'google_char_trigram', 'simple_wiki_fourgram', 'learner_fourgram', 'simple_wiki_freq', 'HIT_freq']
#     X = word_features[features_columns].values
#     y = word_features['complex_binary'].values

#     # Create and train the Nearest Centroid classifier
#     clf = NearestCentroid()
#     clf.fit(X, y)

#     return clf

# def save_model(model, output_folder, model_name):
#     # Create the output folder if it doesn't exist
#     os.makedirs(output_folder, exist_ok=True)

#     # Save the model to a file
#     output_file_path = os.path.join(output_folder, f'{model_name}_model.pkl')
#     with open(output_file_path, 'wb') as file:
#         pickle.dump(model, file)

