In [8]:
#1 
import pandas as pd
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error





def run_models(classifiers, testing_features: pd.DataFrame, feature_columns=None) -> pd.DataFrame:
    """Predict with each pickled model and collect the results side by side.

    Parameters
    ----------
    classifiers : iterable of str
        Paths to pickle files. Each file must unpickle to an object that
        exposes ``predict(X)``.
    testing_features : pandas.DataFrame
        Table holding every feature column plus ``'complex_probabilistic'``.
    feature_columns : sequence of str, optional
        Columns handed to the models, in this order. When omitted, the seven
        columns listed in the body are used.

    Returns
    -------
    pandas.DataFrame
        One column per entry of ``classifiers`` (named by its path), followed
        by ``'complex_probabilistic'`` copied from ``testing_features``.

    Raises
    ------
    KeyError
        If ``testing_features`` lacks a feature column or the label column.
    """
    if feature_columns is None:
        feature_columns = ['syllables', 'characters', 'vowels', 'simple_wiki_freq', 'HIT_count', 'absTotalMatchCount', 'relTotalMatchCount']
    else:
        feature_columns = list(feature_columns)

    # Fail before any model is loaded if the table cannot be scored.
    required = feature_columns + ['complex_probabilistic']
    missing = [name for name in required if name not in testing_features.columns]
    if missing:
        raise KeyError(f"testing_features is missing required column(s): {missing}")

    # Zero-fill missing values directly in pandas: a constant fill has nothing
    # to learn from the data, and every selected column is kept even when it
    # is entirely NaN, so the matrix width always equals len(feature_columns).
    X_test = testing_features[feature_columns].fillna(0).values

    predictions_df = pd.DataFrame()

    for classifier_file in classifiers:
        # SECURITY: pickle.load runs arbitrary code embedded in the file.
        # Only point this at model files you produced yourself.
        with open(classifier_file, 'rb') as file:
            classifier = pickle.load(file)

        # Column name is the model path so results stay traceable to a file.
        predictions_df[classifier_file] = classifier.predict(X_test)

    # Keep the ground truth next to the predictions (always the last column).
    predictions_df['complex_probabilistic'] = testing_features['complex_probabilistic'].values

    return predictions_df

def evaluate(predictions_df):
    """Compute the mean absolute error of every prediction column.

    The ground truth is read from the ``'complex_probabilistic'`` column of
    ``predictions_df`` itself, so the result depends only on the argument and
    not on whatever ``testing_features`` happens to be in the notebook
    namespace at call time.

    Parameters
    ----------
    predictions_df : pandas.DataFrame
        Must contain a ``'complex_probabilistic'`` column with the true
        values; every other column is treated as one model's predictions.

    Returns
    -------
    dict
        Maps each prediction column name to its MAE against the labels, in
        column order.
    """
    label_column = 'complex_probabilistic'
    y_true = predictions_df[label_column]

    # Select prediction columns by name instead of assuming the label is last.
    return {
        column: mean_absolute_error(y_true, predictions_df[column])
        for column in predictions_df.columns
        if column != label_column
    }

# Held-out feature table the models are scored on.
testing_features_file = "features_NEW/Wikipedia_Dev_NEW_Feats1.pkl"
testing_features = pd.read_pickle(testing_features_file)

# Pickled regressors to compare, one per training corpus plus the combined one.
classifiers = [
    "lmodel/prob/News_prob_NEW.pkl",
    "lmodel/prob/Wikinews_prob_NEW.pkl",
    "lmodel/prob/Wikipedia_prob_NEW.pkl",
    "lmodel/prob/Combined_prob_NEW.pkl"
]

# Predict with every model, then score each prediction column with MAE.
predictions_df = run_models(classifiers, testing_features)
mae_per_model = evaluate(predictions_df)

# Report one tab-separated line per model.
for model in mae_per_model:
    mae = mae_per_model[model]
    print(f"Model: {model}\tMAE: {mae}")


Model: lmodel/prob/News_prob_NEW.pkl	MAE: 0.11843706234176962
Model: lmodel/prob/Wikinews_prob_NEW.pkl	MAE: 0.11779519861671522
Model: lmodel/prob/Wikipedia_prob_NEW.pkl	MAE: 0.11918676171268122
Model: lmodel/prob/Combined_prob_NEW.pkl	MAE: 0.11821856043100747


Model: lmodel/prob/News_prob.pkl	MAE: 0.11361416574728281
Model: lmodel/prob/Wikinews_prob.pkl	MAE: 0.11148948763200345
Model: lmodel/prob/Wikipedia_prob.pkl	MAE: 0.11352341079626468
Model: lmodel/prob/Combined_prob.pkl	MAE: 0.11281649160711754

In [26]:
def evaluate(predictions_df, testing_feature_file, model_names=None):
    """Compute MAE per model and print the result as a LaTeX table.

    The ground truth is read from the ``'complex_probabilistic'`` column of
    ``predictions_df`` itself, so the result does not depend on a
    ``testing_features`` global.

    Parameters
    ----------
    predictions_df : pandas.DataFrame
        Must contain a ``'complex_probabilistic'`` column with the true
        values; every other column is treated as one model's predictions.
    testing_feature_file : str
        Inserted verbatim into the table caption and label.
    model_names : sequence of str, optional
        Display names, matched to the prediction columns by position.
        Defaults to ``["NEWS", "WIKINEWS", "WIKIPEDIA", "Combined"]``. A
        prediction column beyond the end of this list keeps its own column
        name instead of raising ``IndexError``.

    Returns
    -------
    dict
        Maps each display name to the MAE of the corresponding column.
    """
    label_column = 'complex_probabilistic'
    if model_names is None:
        model_names = ["NEWS", "WIKINEWS", "WIKIPEDIA", "Combined"]

    y_true = predictions_df[label_column]

    # Select prediction columns by name instead of assuming the label is last.
    prediction_columns = [column for column in predictions_df.columns if column != label_column]

    # Calculate MAE for each model
    mae_per_model = {}
    for i, column in enumerate(prediction_columns):
        display_name = model_names[i] if i < len(model_names) else column
        mae_per_model[display_name] = mean_absolute_error(y_true, predictions_df[column])

    # Create a DataFrame to store the MAE per model
    mae_df = pd.DataFrame(list(mae_per_model.items()), columns=['Model', 'MAE'])

    # Print the MAE DataFrame in LaTeX format
    latex_table = mae_df.to_latex(index=False, caption=f"MAE per Model - {testing_feature_file}", label=f"table:{testing_feature_file}")
    print(latex_table)

    return mae_per_model
# Print the LaTeX table for the current predictions. As the last expression
# in the cell, the returned MAE dict is also shown as the cell's output.
evaluate(predictions_df, testing_features_file)

\begin{table}
\centering
\caption{MAE per Model - features_NEW/News_Dev_NEW_Feats1.pkl}
\label{table:features_NEW/News_Dev_NEW_Feats1.pkl}
\begin{tabular}{lr}
\toprule
    Model &      MAE \\
\midrule
     NEWS & 0.113614 \\
 WIKINEWS & 0.111489 \\
WIKIPEDIA & 0.113523 \\
 Combined & 0.112816 \\
\bottomrule
\end{tabular}
\end{table}



  latex_table = mae_df.to_latex(index=False, caption=f"MAE per Model - {testing_feature_file}", label=f"table:{testing_feature_file}")


{'NEWS': 0.11361416574728281,
 'WIKINEWS': 0.11148948763200345,
 'WIKIPEDIA': 0.11352341079626468,
 'Combined': 0.11281649160711754}

In [None]:
# Dump the raw MAE dict computed by the evaluation cell.
# NOTE(review): this cell has no execution count, and the saved output below
# uses model paths without the `_NEW` suffix, so it comes from an earlier run.
# Re-run before relying on these numbers.
print(mae_per_model)

{'lmodel/prob/News_prob.pkl': 0.008393695824554678, 'lmodel/prob/Wikinews_prob.pkl': 0.006723632283289598, 'lmodel/prob/Wikipedia_prob.pkl': 0.009716911911144625, 'lmodel/prob/Combined_prob.pkl': 0.0030427213866589574}
