This notebook takes the features from the CAMB inspired system and evaluates for the probabilitic complexity 

In [3]:
import pandas as pd
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error


In [6]:
def run_models(classifiers, testing_features):  # testing_features is now a DataFrame
    

    # Extract the feature columns from the testing features DataFrame
    feature_columns = ['syllables', 'length', 'dep num', 'synonyms', 'hypernyms', 'ogden', 'simple_wiki', 'CNC', 'IMG', 'sub_imdb', 'google frequency', 'KFCAT', 'FAM', 'KFSMP', 'KFFRQ', 'AOA', 'NPHN', 'T-LFRQ' ]
    X_test = testing_features[feature_columns].values

    # Replace NaN values with 0
    imputer = SimpleImputer(strategy='constant', fill_value=0)
    X_test = imputer.fit_transform(X_test)

    # Create an empty DataFrame to store the predictions
    predictions_df = pd.DataFrame()

    for classifier_file in classifiers:
        # Load the trained model from the pickle file
        with open(classifier_file, 'rb') as file:
            classifier = pickle.load(file)

        # Make predictions using the model
        y_pred = classifier.predict(X_test)

        # Add the predictions to the DataFrame
        predictions_df[classifier_file] = y_pred

    # Add the 'complex_probabilistic' label from the testing features file to the DataFrame
    predictions_df['complex_probabilistic'] = testing_features['complex_probabilistic'].values

    return predictions_df

def evaluate(predictions_df):
    # Get the predicted values and actual labels
    y_pred = predictions_df['complex_probabilistic']
    y_true = testing_features ['complex_probabilistic'] 

    # Calculate MAE for each model
    mae_per_model = {}
    for column in predictions_df.columns[:-1]:
        mae = mean_absolute_error(y_true, predictions_df[column])
        mae_per_model[column] = mae

    return mae_per_model

# Define the trained classifier files
classifiers = [
    "Models/prob/News_prob_CAMB.pkl",
    "Models/prob/Wikinews_prob_CAMB.pkl",
    "Models/prob/Wikipedia_prob_CAMB.pkl",
    "Models/prob/Combined_prob_CAMB.pkl"
]


# Define the testing features file paths

testing_features_files = [
    "All_features/Wikipedia_Dev_pp.pkl"]
    


# Load each testing features file into a separate DataFrame
testing_feature_dfs = [pd.read_pickle(file) for file in testing_features_files]

# Concatenate the separate DataFrames into a single DataFrame
testing_features = pd.concat(testing_feature_dfs)

# Run the models and make predictions
predictions_df = run_models(classifiers, testing_features)


# Evaluate the predictions using MAE for each model
mae_per_model = evaluate(predictions_df)

# Print the MAE for each model
for model, mae in mae_per_model.items():
    print(f"Model: {model}\tMAE: {mae}")

Model: Models/prob/News_prob_CAMB.pkl	MAE: 0.10285842685760904
Model: Models/prob/Wikinews_prob_CAMB.pkl	MAE: 0.1026810989773711
Model: Models/prob/Wikipedia_prob_CAMB.pkl	MAE: 0.10305216575989859
Model: Models/prob/Combined_prob_CAMB.pkl	MAE: 0.10148337967620562


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Model: lmodel/prob/News_prob.pkl	MAE: 0.11361416574728281
Model: lmodel/prob/Wikinews_prob.pkl	MAE: 0.11148948763200345
Model: lmodel/prob/Wikipedia_prob.pkl	MAE: 0.11352341079626468
Model: lmodel/prob/Combined_prob.pkl	MAE: 0.11281649160711754

In [9]:
import pandas as pd
from sklearn.metrics import mean_absolute_error

def evaluate(predictions_df, testing_feature_file):
    # Get the predicted values and actual labels
    y_pred = predictions_df['complex_probabilistic']
    y_true = testing_features['complex_probabilistic'] 

    # Define the model names
    model_names = ["NEWS", "WIKINEWS", "WIKIPEDIA", "Combined"]

    # Calculate MAE for each model
    mae_per_model = {}
    for i, column in enumerate(predictions_df.columns[:-1]):
        mae = mean_absolute_error(y_true, predictions_df[column])
        mae_per_model[model_names[i]] = mae

    # Create a DataFrame to store the MAE per model
    mae_df = pd.DataFrame(list(mae_per_model.items()), columns=['Model', 'MAE'])

    # Format MAE values to 4 decimal places
    mae_df['MAE'] = mae_df['MAE'].apply(lambda x: "{:.4f}".format(x))

    # Print the MAE DataFrame in LaTeX format
    latex_table = mae_df.to_latex(index=False, caption=f"MAE per Model - {testing_feature_file}", label=f"table:{testing_feature_file}")
    print(latex_table)

    return mae_per_model

# Replace 'predictions_df' and 'testing_features' with your actual data
# evaluate(predictions_df, testing_features_file)

evaluate(predictions_df, testing_features_file)

\begin{table}
\centering
\caption{MAE per Model - final_camb_feats_Test/combined_Test_Final.pkl}
\label{table:final_camb_feats_Test/combined_Test_Final.pkl}
\begin{tabular}{ll}
\toprule
    Model &    MAE \\
\midrule
     NEWS & 0.1079 \\
 WIKINEWS & 0.1062 \\
WIKIPEDIA & 0.1070 \\
 Combined & 0.1061 \\
\bottomrule
\end{tabular}
\end{table}



  latex_table = mae_df.to_latex(index=False, caption=f"MAE per Model - {testing_feature_file}", label=f"table:{testing_feature_file}")


{'NEWS': 0.10786376245846555,
 'WIKINEWS': 0.1061570633409926,
 'WIKIPEDIA': 0.10698594475258484,
 'Combined': 0.1060734513675542}

In [10]:
print(mae_per_model)

{'lmodel/prob/News_prob_CAMB.pkl': 0.10786376245846555, 'lmodel/prob/Wikinews_prob_CAMB.pkl': 0.1061570633409926, 'lmodel/prob/Wikipedia_prob_CAMB.pkl': 0.10698594475258484, 'lmodel/prob/Combined_prob_CAMB.pkl': 0.1060734513675542}
