In [1]:
### CAPSTONE PROJECT - 2025 - ARXIV.ORG METADATA - PREDICT PUBLICATION AND SELECT HIGHER QUALITY PAPERS

In [2]:
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tabulate import tabulate

import re
import ast

import warnings
from sklearn.exceptions import ConvergenceWarning

### Word Cloud
from wordcloud import WordCloud

### Modelling Libraries
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import MaxAbsScaler, MinMaxScaler
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import SequentialFeatureSelector

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, roc_auc_score, ConfusionMatrixDisplay

#SMOTE
from imblearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [3]:
#### 1. Business Understanding

In [4]:
###Overview of the Question to Be Solved:
#This project aims to predict if a paper on arxiv.org is going to be published or not in a research magazine.
#Primary question: "How can we effectively classify the Arxiv papers and predict if the paper is going to be published or not?"
# Target variable: a binary response (y) for the paper being published or not.
# Data sources: arxiv.org dataset for the subset category "cs.AI".

In [5]:
###Capstone Project Overview - Predicting if a research paper on arxiv.org will be published or not to select the high quality papers.

#The research question you intend to answer:
# - "To predict if a paper on arxiv.org is going to be published or not in a research magazine to help select the higher quality papers".

#Expected data source(s):
# - Two arxiv.org metadata files from the following sources:
#   - https://www.kaggle.com/datasets/Cornell-University/arxiv (filename = arxiv-metadata-oai-snapshot.json);
#   - https://www.kaggle.com/datasets/spsayakpaul/arxiv-paper-abstracts (filename = arxiv_data.csv)

#The techniques expected to be used in my analysis:
# - TF-IDF (Term Frequency-Inverse Document Frequency) for embeddings on title and abstract data
# - BERT for embeddings on title and abstract data
# - Word Cloud and Word Count to understand the title and abstract
# - PCA, K-means clustering, feature selection and hyperparameter tuning
# - Predictive modeling
#   - Techniques to be used for predictive modeling:
#     - Logistic Regression
#     - Decision Tree
#     - Random Forest
#     - Support Vector Machine (SVM)
#     - KNN: K-Nearest Neighbors
#     - Naive Bayes (Multinomial, i.e. sklearn's MultinomialNB)
#     - Gradient Boosting
#     - Neural Network

#The expected results
# - Target binary response variable (y) for an arxiv.org paper being published or not (dataset column "journal-ref" NaN or filled with content).
# - The column journal-ref contains the arxiv.org "Reference to the journal where the paper was published (if applicable)".
# - I assume NaN as "not published", and any content as "published".

#Why this question is important
# - As not all arxiv.org papers are actually published, this project can help researchers select the "higher quality" papers to read first, assuming
#   that a "higher quality" paper is normally published. This will help researchers save time on searching for and reading content for their projects.

In [6]:
#### 2. Data Understanding

In [7]:
#Preprocessing ARXIV.org metadata for AI (category = 'cs.AI')
#Source for data -- https://www.kaggle.com/datasets/Cornell-University/arxiv
#DataSets:
#Arxiv.org AI Research Papers Dataset: Contains metadata of 10,000 AI research papers from Arxiv.
# - https://www.kaggle.com/datasets/yasirabdaali/arxivorg-ai-research-papers-dataset
# - https://www.kaggle.com/datasets/yasirabdaali/arxivorg-ai-research-papers-dataset?select=arxiv_ai.csv

#Arxiv Paper Abstracts: A dataset for building multi-label text classifiers based on Arxiv paper abstracts.
# - https://www.kaggle.com/datasets/spsayakpaul/arxiv-paper-abstracts

In [8]:
#Metadata of the Arxiv.org OAI Snapshot dataset (main source)
#id: A unique identifier for each arXiv paper.
#submitter: The user who submitted the paper to arXiv.
#authors: A list of authors who contributed to the paper.
#title: The title of the paper.
#comments: Additional comments about the paper, such as notes on revisions or submission details.
#journal-ref: Reference to the journal where the paper was published (if applicable).
#doi: The DOI (Digital Object Identifier) of the paper (if available).
#report-no: Report number associated with the paper (if any).
#categories: Categories or subjects to which the paper belongs (e.g., cs.AI, math.ST, etc.).
#license: Licensing information for the paper.
#abstract: A brief summary or abstract of the paper.
#versions: Information about different versions of the paper submitted to arXiv.
#update_date: The date when the paper was last updated on arXiv.

In [9]:
# Location of the pre-processed arXiv dataset (JSON).
# The default is the author's local path; set the ARXIV_INPUT_FILE environment
# variable to run the notebook on another machine without editing this cell.
input_file = os.environ.get('ARXIV_INPUT_FILE', 'C:/Users/mnkub/Desktop/Capstone/data/arxiv_final.json')
#input_file = '../data/arxiv_final.json'

# Fail early with a clear message instead of a pandas read error further down
if not os.path.exists(input_file):
    raise FileNotFoundError(f"File {input_file} does not exist")

In [10]:
# Load the dataset at input_file (a JSON file) into the DataFrame `arxiv`
arxiv = pd.read_json(input_file)

In [11]:
# Preview the first rows of the loaded DataFrame
arxiv.head()

Unnamed: 0,id,submitter,authors,title,comments,journal_ref,doi,report_no,categories,license,...,abstract_tfidf,cluster_abstract_tfidf,title_tfidf,cluster_title_tfidf,abstract_embedding_bert,abstract_reduced_embeddings_bert,cluster_abstract_bert,title_embedding_bert,title_reduced_embeddings_bert,cluster_title_bert
0,704.1394,Tarik Hadvzic,Tarik Hadzic Rune Moller Jensen Henrik Reif An...,Calculating Valid Domains for BDDBased Interac...,,,,,cs.AI,,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,"[-0.0707399398, 0.2497202158, 0.0697884262, -0...","[-2.0193810463, -2.4993662834, -0.3357081413, ...",2,"[-0.762732029, -0.5272296667, -0.3555475771, -...","[2.5423688889, -0.6378098726, -1.2343144417, -...",2
1,704.201,Juliana Bernardes,Juliana S Bernardes Alberto Davila Vitor Santo...,A study of structural properties on profiles HMMs,"6 pages, 7 figures",,,,cs.AI,http://arxiv.org/licenses/nonexclusive-distrib...,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2,"[-0.3843331933, -0.2263615727, 0.2963011265, 0...","[1.3566942215, 1.1760284901, 1.6757059097, -1....",1,"[-0.451326102, -0.3827941716, -0.4721180499, -...","[0.7403473854, 0.6475714445, 0.193864598900000...",4
2,704.3433,Tshilidzi Marwala,Tshilidzi Marwala and Bodie Crossingham,Bayesian approach to rough set,"20 pages, 3 figures",,,,cs.AI,,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2,"[-0.3154313266, -0.24923917650000002, -0.32852...","[-0.27569633720000003, -2.1775341034, 2.087684...",3,"[-0.48346123100000005, -0.259116143, -0.323963...","[-1.4785642624, -1.9715181589, 2.7166180611, 0...",1
3,704.3515,Jegor Uglov Mr,J Uglov V Schetinin C Maple,Comparing Robustness of Pairwise and Multiclas...,,,10.1155/2008/468693,,cs.AI,,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",4,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3,"[0.040906731, 0.4251708388, -0.162817806, 0.18...","[1.2501780987, 1.6642855406, -0.07740855220000...",1,"[-0.5709663033, -0.6318161488, -0.868732035200...","[3.9551944733, 1.2371120453, -0.20816591380000...",2
4,704.3905,Marc Schoenauer,Christian Gagne INFORMATIQUE WGZ INC Michele S...,Ensemble Learning for Free with Evolutionary A...,,Dans GECCO (2007),,,cs.AI,,...,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",2,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",1,"[-0.12671619650000002, -0.0426417366, -0.20895...","[-0.2887310386, 2.3744587898000002, -0.4303132...",0,"[-0.4691320956, -0.2000201046, -0.2584528625, ...","[-0.4662218094, -1.8894429207, -0.305285722000...",3


In [12]:
# Summarise columns, non-null counts and dtypes of the loaded DataFrame
arxiv.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11597 entries, 0 to 11625
Data columns (total 35 columns):
 #   Column                            Non-Null Count  Dtype 
---  ------                            --------------  ----- 
 0   id                                11597 non-null  object
 1   submitter                         11597 non-null  object
 2   authors                           11597 non-null  object
 3   title                             11597 non-null  object
 4   comments                          11597 non-null  object
 5   journal_ref                       11597 non-null  object
 6   doi                               11597 non-null  object
 7   report_no                         11597 non-null  object
 8   categories                        11597 non-null  object
 9   license                           11597 non-null  object
 10  abstract                          11597 non-null  object
 11  versions                          11597 non-null  object
 12  update_date            

In [13]:
# Print the dtype of every column
print(arxiv.dtypes)

id                                  object
submitter                           object
authors                             object
title                               object
comments                            object
journal_ref                         object
doi                                 object
report_no                           object
categories                          object
license                             object
abstract                            object
versions                            object
update_date                          int64
authors_parsed                      object
pdf_url                             object
paper_published                      int64
update_year                          int64
newest_date                          int64
version1_year                        int64
num_pages                            int64
num_figures                          int64
submitter_frequency                  int64
title_keyword_count                  int64
abstract_ke

In [14]:
##### 5. Predictive Modelling - Which Articles Are More Likely to be Published?

In [15]:
### Distinct values and class counts of the target column 'paper_published'
published_flag = arxiv['paper_published']

# Unique target labels: first as an array, then as a plain list for readable printing
distinct_primary_categories = published_flag.unique()
distinct_primary_categories_list = distinct_primary_categories.tolist()
print(distinct_primary_categories_list)

# Number of rows carrying each label
counts = published_flag.value_counts()
print(counts)

[0, 1]
paper_published
0    9817
1    1780
Name: count, dtype: int64


In [16]:
# Column groups used as model inputs: scalar numeric columns and list-valued columns
count_cols = ['num_pages', 'num_figures', 'submitter_frequency',
              'title_keyword_count', 'abstract_keyword_count', 'comments_keyword_count']
year_cols = ['update_year', 'version1_year']
cluster_cols = ['cluster_abstract_tfidf', 'cluster_title_tfidf',
                'cluster_abstract_bert', 'cluster_title_bert']
numerical_cols = [*count_cols, *year_cols, *cluster_cols]

# One list-valued column per (representation, text field) pair; abstract comes before title
list_columns = [
    f"{text_field}_{representation}"
    for representation in ('tfidf', 'reduced_embeddings_bert', 'embedding_bert')
    for text_field in ('abstract', 'title')
]

numerical_and_list_columns = numerical_cols + list_columns

#categorical_cols = ['journal_ref', 'categories', 'license']

In [17]:
### Convert lists and scale arxiv; save to arxiv_scaled

def scale_fit_columns(df, list_columns, numeric_columns):
    """Add Min-Max scaled versions of list-valued and numeric columns to ``df``.

    For every name in ``list_columns`` whose cells are lists of numbers, the lists
    are right-padded with zeros to the longest list in that column, scaled
    feature-wise with ``MinMaxScaler`` and stored as lists in a new column
    ``scaled_<name>_padded``. For every name in ``numeric_columns`` with a numeric
    dtype, the scaled values are stored in ``scaled_<name>``.

    Columns that are missing or have the wrong content are skipped with a printed
    message. The original columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend. It is modified in place.
    list_columns : iterable of str
        Names of columns holding one list of numbers per row.
    numeric_columns : iterable of str
        Names of scalar numeric columns.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with the new columns added.
    """
    # Nothing can be fitted on zero rows (max() and the scaler would both raise)
    if df.empty:
        print("DataFrame is empty. Nothing to scale.")
        return df

    scaler = MinMaxScaler()
    # Accept Python and numpy scalars alike inside the lists
    numeric_types = (int, float, np.integer, np.floating)

    # Scale list columns
    for column in list_columns:
        if column not in df.columns:
            print(f"Column '{column}' not found in DataFrame. Skipping...")
            continue

        values = df[column]
        if not all(isinstance(x, list) and all(isinstance(y, numeric_types) for y in x) for x in values):
            print(f"Column '{column}' does not contain lists of numbers. Skipping...")
            continue

        # Pad every list with trailing zeros up to the longest list in the column
        max_length = max(len(x) for x in values)
        padded_arrays = np.array([list(x) + [0] * (max_length - len(x)) for x in values])

        # Each list position is treated as one feature by the scaler
        scaled_padded_arrays = scaler.fit_transform(padded_arrays)

        # Store one list per row again
        df[f"scaled_{column}_padded"] = [list(row) for row in scaled_padded_arrays]

    # Scale numeric columns
    for column in numeric_columns:
        if column not in df.columns:
            print(f"Column '{column}' not found in DataFrame. Skipping...")
            continue

        if not pd.api.types.is_numeric_dtype(df[column]):
            print(f"Column '{column}' does not contain numeric values. Skipping...")
            continue

        # fit_transform returns shape (n_rows, 1); flatten it to a 1-D column
        df[f"scaled_{column}"] = scaler.fit_transform(df[[column]]).ravel()

    return df

# Apply the function to the DataFrame.
# scale_fit_columns adds its columns to the frame it receives and returns that same
# object, so after this line `arxiv_scaled` and `arxiv` are two names for one DataFrame.
arxiv_scaled = scale_fit_columns(arxiv, list_columns, numerical_cols)

In [18]:
# Expand each list-valued column into one numeric column per list position.
# This reads the original list columns named in list_columns, not the
# scaled_<name>_padded columns.
new_feature_dfs = []

for column in list_columns:
    series = arxiv_scaled[column]

    # Right-pad shorter lists with zeros so every row has the widest row's length
    width = max(map(len, series))
    matrix = np.array([list(values) + [0] * (width - len(values)) for values in series])

    # One column per position, named <column>_feature_<i>, aligned on the frame's index
    feature_names = [f"{column}_feature_{i}" for i in range(matrix.shape[1])]
    new_feature_dfs.append(
        pd.DataFrame(matrix, columns=feature_names, index=arxiv_scaled.index)
    )

# Attach all expanded features with a single concat; copy() defragments the result
arxiv_scaled = pd.concat([arxiv_scaled, *new_feature_dfs], axis=1).copy()

In [19]:
# Remove the original list-valued columns
arxiv_scaled = arxiv_scaled.drop(columns=list_columns)

In [20]:
#### Combine feature columns that share a name prefix into one column per prefix
def combine_columns_by_prefix(df, prefixes, output_columns):
    """Collapse groups of columns into one absolute row-sum column per prefix.

    For each ``(prefix, output_column)`` pair, all columns of ``df`` whose name
    starts with ``prefix`` are summed row-wise and the absolute value of that sum
    is stored in ``df[output_column]``. Columns named in ``output_columns`` are
    never used as inputs, so calling the function again on its own result gives
    the same values instead of adding the previous sum to itself.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the source columns. It is modified in place.
    prefixes : iterable of str
        Name prefixes selecting the source columns of each group.
    output_columns : iterable of str
        Name of the result column for each prefix, in the same order.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with one added or overwritten column per prefix.
        A group with no matching columns yields a column of NaN (``min_count=1``).

    Raises
    ------
    ValueError
        If ``prefixes`` and ``output_columns`` differ in length.
    """
    prefixes = list(prefixes)
    output_columns = list(output_columns)
    if len(prefixes) != len(output_columns):
        raise ValueError(
            f"prefixes ({len(prefixes)}) and output_columns ({len(output_columns)}) "
            "must have the same length"
        )

    # Result columns can share a prefix with their inputs; keep them out of the sums
    reserved = set(output_columns)

    for prefix, output_column in zip(prefixes, output_columns):
        matching_columns = [
            col for col in df.columns
            if isinstance(col, str) and col.startswith(prefix) and col not in reserved
        ]

        # Row-wise sum of the group; abs() keeps the combined value non-negative
        df[output_column] = df[matching_columns].sum(axis=1, min_count=1).abs()
    return df

# Map each column-name prefix to the name of the combined column built from it
prefix_to_output = {
    'abstract_t': 'abstract_tfidf_feature_combined',
    'title_t': 'title_tfidf_feature_combined',
    'abstract_r': 'abstract_reduced_embeddings_bert_feature_combined',
    'title_r': 'title_reduced_embeddings_bert_feature_combined',
    'abstract_e': 'abstract_embedding_bert_feature_combined',
    'title_e': 'title_embedding_bert_feature_combined',
}

prefixes = list(prefix_to_output)
output_columns = list(prefix_to_output.values())

### Add the combined columns to the arxiv_scaled DataFrame
arxiv_scaled = combine_columns_by_prefix(arxiv_scaled, prefixes, output_columns)

In [21]:
### Verify the new columns
#print(arxiv_scaled[output_columns].head())

# Check if arxiv_scaled is defined
#if 'arxiv_scaled' in locals():
#    num_columns = len(arxiv_scaled.columns)
#    print(f"Number of columns in arxiv_scaled: {num_columns}")
#else:
#    print("arxiv_scaled is not defined.")

#arxiv_scaled.head()
#arxiv_scaled.info()

In [22]:
### Drop every column whose name starts with one of the prefixes, except the combined columns in output_columns
prefix_tuple = tuple(prefixes)
columns_to_keep = set(output_columns)
columns_to_drop = [
    col for col in arxiv_scaled.columns
    if col.startswith(prefix_tuple) and col not in columns_to_keep
]

arxiv_scaled.drop(columns=columns_to_drop, inplace=True)

# Report the number of columns dropped
num_columns_dropped = len(columns_to_drop)
print(f"Number of columns dropped: {num_columns_dropped}")

Number of columns dropped: 11792


In [23]:
### Drop the columns whose names start with 'scaled_'
if 'arxiv_scaled' not in locals():
    print("arxiv_scaled is not defined.")
else:
    # Collect the 'scaled_' columns and rebind arxiv_scaled to a frame without them
    columns_to_drop = [col for col in arxiv_scaled.columns if col.startswith('scaled_')]
    arxiv_scaled = arxiv_scaled.drop(columns=columns_to_drop)

    # Show what is left
    remaining_columns = arxiv_scaled.columns.tolist()
    print("Remaining columns:", remaining_columns)

Remaining columns: ['id', 'submitter', 'authors', 'title', 'comments', 'journal_ref', 'doi', 'report_no', 'categories', 'license', 'abstract', 'versions', 'update_date', 'authors_parsed', 'pdf_url', 'paper_published', 'update_year', 'newest_date', 'version1_year', 'num_pages', 'num_figures', 'submitter_frequency', 'title_keyword_count', 'abstract_keyword_count', 'comments_keyword_count', 'cluster_abstract_tfidf', 'cluster_title_tfidf', 'cluster_abstract_bert', 'cluster_title_bert', 'abstract_tfidf_feature_combined', 'title_tfidf_feature_combined', 'abstract_reduced_embeddings_bert_feature_combined', 'title_reduced_embeddings_bert_feature_combined', 'abstract_embedding_bert_feature_combined', 'title_embedding_bert_feature_combined']


In [24]:
### Redefine the feature lists: scalar numeric columns plus the combined (one value per row) columns

numerical_cols = (
    ['num_pages', 'num_figures', 'submitter_frequency']
    + ['title_keyword_count', 'abstract_keyword_count', 'comments_keyword_count']
    + ['update_year', 'version1_year']
    + ['cluster_abstract_tfidf', 'cluster_title_tfidf']
    + ['cluster_abstract_bert', 'cluster_title_bert']
)

# From here on list_columns holds the names of the combined columns
list_columns = [
    f"{base}_feature_combined"
    for base in (
        'abstract_tfidf', 'title_tfidf',
        'abstract_reduced_embeddings_bert', 'title_reduced_embeddings_bert',
        'abstract_embedding_bert', 'title_embedding_bert',
    )
]

# Full list of model input columns
numerical_and_list_columns = numerical_cols + list_columns
#numerical_and_list_columns = numerical_cols

In [25]:
### Print the column names of arxiv_scaled as a plain list
print(arxiv_scaled.columns.tolist())

['id', 'submitter', 'authors', 'title', 'comments', 'journal_ref', 'doi', 'report_no', 'categories', 'license', 'abstract', 'versions', 'update_date', 'authors_parsed', 'pdf_url', 'paper_published', 'update_year', 'newest_date', 'version1_year', 'num_pages', 'num_figures', 'submitter_frequency', 'title_keyword_count', 'abstract_keyword_count', 'comments_keyword_count', 'cluster_abstract_tfidf', 'cluster_title_tfidf', 'cluster_abstract_bert', 'cluster_title_bert', 'abstract_tfidf_feature_combined', 'title_tfidf_feature_combined', 'abstract_reduced_embeddings_bert_feature_combined', 'title_reduced_embeddings_bert_feature_combined', 'abstract_embedding_bert_feature_combined', 'title_embedding_bert_feature_combined']


In [26]:
# Preview the first rows of arxiv_scaled
arxiv_scaled.head()

Unnamed: 0,id,submitter,authors,title,comments,journal_ref,doi,report_no,categories,license,...,cluster_abstract_tfidf,cluster_title_tfidf,cluster_abstract_bert,cluster_title_bert,abstract_tfidf_feature_combined,title_tfidf_feature_combined,abstract_reduced_embeddings_bert_feature_combined,title_reduced_embeddings_bert_feature_combined,abstract_embedding_bert_feature_combined,title_embedding_bert_feature_combined
0,704.1394,Tarik Hadvzic,Tarik Hadzic Rune Moller Jensen Henrik Reif An...,Calculating Valid Domains for BDDBased Interac...,,,,,cs.AI,,...,2,1,2,2,4.339616,2.508605,0.51496,2.867864,6.452453,7.462064
1,704.201,Juliana Bernardes,Juliana S Bernardes Alberto Davila Vitor Santo...,A study of structural properties on profiles HMMs,"6 pages, 7 figures",,,,cs.AI,http://arxiv.org/licenses/nonexclusive-distrib...,...,4,2,1,4,8.82405,2.297803,0.380061,8.640687,7.046748,6.688396
2,704.3433,Tshilidzi Marwala,Tshilidzi Marwala and Bodie Crossingham,Bayesian approach to rough set,"20 pages, 3 figures",,,,cs.AI,,...,2,2,3,1,7.069886,2.186416,2.640302,1.810724,7.616964,7.551085
3,704.3515,Jegor Uglov Mr,J Uglov V Schetinin C Maple,Comparing Robustness of Pairwise and Multiclas...,,,10.1155/2008/468693,,cs.AI,,...,4,3,1,2,5.737238,3.045996,4.420754,4.224183,6.616488,8.452498
4,704.3905,Marc Schoenauer,Christian Gagne INFORMATIQUE WGZ INC Michele S...,Ensemble Learning for Free with Evolutionary A...,,Dans GECCO (2007),,,cs.AI,,...,2,1,0,3,7.184695,2.431577,1.79467,0.696287,7.978225,6.603048


In [27]:
# Summarise columns, non-null counts and dtypes of arxiv_scaled
arxiv_scaled.info()

<class 'pandas.core.frame.DataFrame'>
Index: 11597 entries, 0 to 11625
Data columns (total 35 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   id                                                 11597 non-null  object 
 1   submitter                                          11597 non-null  object 
 2   authors                                            11597 non-null  object 
 3   title                                              11597 non-null  object 
 4   comments                                           11597 non-null  object 
 5   journal_ref                                        11597 non-null  object 
 6   doi                                                11597 non-null  object 
 7   report_no                                          11597 non-null  object 
 8   categories                                         11597 non-null  object 
 9   license    

In [28]:
# Define features (X) and target (y).
# X: the columns listed in numerical_and_list_columns; y: the 'paper_published' column.
X = arxiv_scaled[numerical_and_list_columns]
y = arxiv_scaled['paper_published']

In [29]:
# Split data into training (80%) and testing (20%) sets with a fixed seed.
# stratify=y keeps the class proportions of y the same in both splits.
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # Without SMOTE
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)  # With SMOTE

In [30]:
# Resampling strategy used inside the model pipelines.
# For a binary target, a float sampling_strategy is the desired ratio
# (minority samples / majority samples) AFTER the resampling step.
# random_state makes the synthetic and dropped samples reproducible between runs.
over_sample = SMOTE(sampling_strategy=0.5, random_state=42)  # oversample minority up to 0.5 x majority
under_sample = RandomUnderSampler(sampling_strategy=0.8, random_state=42)  # undersample majority until minority/majority = 0.8

In [31]:
# Silence FutureWarning, UserWarning and ConvergenceWarning for the rest of the session
for ignored_category in (FutureWarning, UserWarning, ConvergenceWarning):
    warnings.simplefilter(action='ignore', category=ignored_category)

In [32]:
### Define models, each wrapped in a resampling pipeline (SMOTE over-sampling, then random under-sampling)

def make_resampling_pipeline(estimator):
    """Wrap ``estimator`` in a pipeline that resamples the data before fitting.

    Steps: 'over' (over_sample), 'under' (under_sample), 'model' (the estimator).
    ``Pipeline`` must be the imbalanced-learn class, because the first two steps
    are samplers; this file imports ``Pipeline`` from both sklearn and imblearn,
    and the imblearn import comes last.
    """
    return Pipeline([
        ('over', over_sample),
        ('under', under_sample),
        ('model', estimator),
    ])


# Base estimators. random_state is fixed on the stochastic ones so that
# results are reproducible between runs.
base_estimators = {
    'Logistic Regression': LogisticRegression(class_weight='balanced', max_iter=5000, random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42),
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Naive Bayes': MultinomialNB(),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Neural Network': MLPClassifier(max_iter=5000, random_state=42),
}

# Model name -> resampling pipeline around the estimator
models = {
    name: make_resampling_pipeline(estimator)
    for name, estimator in base_estimators.items()
}

In [33]:
# Initialize a dict that maps model name -> optimized model
optimized_models = {}

In [34]:
#### Hyperparameter search grids, one per model name (candidate values are lists)

# Keys are the estimators' own parameter names
param_grids = {
    'Logistic Regression': dict(
        C=[0.1, 1, 10, 100],
        penalty=['l1'],
        solver=['liblinear'],
        tol=[1e-6],
        max_iter=[1000],
    ),
    'Decision Tree': dict(
        criterion=['gini'],
        max_depth=[5],
        min_samples_split=[2],
        min_samples_leaf=[1],
    ),
    'Random Forest': dict(
        n_estimators=[100, 200],
        max_depth=[5, 10, None],
    ),
    'SVM': dict(
        C=[0.1, 1, 10],
        kernel=['linear', 'rbf'],
    ),
    'KNN': dict(
        n_neighbors=[5],
        weights=['uniform'],
        algorithm=['auto'],
    ),
    'Naive Bayes': dict(
        alpha=[0.1, 0.5, 1.0, 2.0],   # smoothing parameter
        fit_prior=[True, False],      # whether to learn class prior probabilities
    ),
    'Gradient Boosting': dict(
        n_estimators=[50],
        learning_rate=[0.1],
        max_depth=[5],
        min_samples_split=[2],
        min_samples_leaf=[1],
    ),
    'Neural Network': dict(
        hidden_layer_sizes=[(20,)],
        activation=['relu'],
        solver=['adam'],
        alpha=[0.0001],
        max_iter=[1000],
    ),
}

In [35]:
### Perform GridSearchCV for each model
#for name, model in models.items():
#    if name in param_grids:
#        grid_search = GridSearchCV(model, param_grids[name], cv=5, scoring='accuracy', error_score='raise')
#        grid_search.fit(X_train, y_train)
#        optimized_models[name] = grid_search.best_estimator_
#    else:
#        optimized_models[name] = model

In [36]:
### Print results of hyperparameters for each model
#for name, model in optimized_models.items():
#    print(f"Optimized {name}:")
#    print(model.get_params())
#    print()

In [37]:
### Selected hyperparameters per model (adjusted after GridSearchCV)

# Same layout as a search grid, but every parameter is pinned to a single value.
# Keys are the estimators' own parameter names.
param_grids = {
    'Logistic Regression': dict(
        C=[100],
        penalty=['l1'],
        solver=['liblinear'],    # solver used together with the L1 penalty
        max_iter=[1000],
        class_weight=[None],
        tol=[1e-06],
    ),
    'Decision Tree': dict(
        criterion=['gini'],
        max_depth=[5],
        min_samples_split=[2],
        min_samples_leaf=[1],
    ),
    'Random Forest': dict(
        n_estimators=[200],
        max_features=['sqrt'],
        bootstrap=[True],
        min_samples_split=[2],
        min_samples_leaf=[1],
    ),
    'SVM': dict(
        C=[0.1],
        kernel=['linear'],
        gamma=['scale'],
        tol=[0.001],
    ),
    'KNN': dict(
        n_neighbors=[5],
        weights=['uniform'],
        algorithm=['auto'],
    ),
    'Naive Bayes': dict(
        alpha=[0.1],
        fit_prior=[True],
    ),
    'Gradient Boosting': dict(
        n_estimators=[50],
        learning_rate=[0.1],
        max_depth=[5],
        min_samples_split=[2],
        min_samples_leaf=[1],
    ),
    'Neural Network': dict(
        hidden_layer_sizes=[(20,)],
        activation=['relu'],
        solver=['adam'],
        alpha=[0.0001],
        max_iter=[1000],
        tol=[0.0001],
    ),
}

In [None]:
### Train and evaluate each model with adjusted hyperparameters and SFS - feature selection

# Models that get a StandardScaler in front of them
scaled_model_names = ('Logistic Regression', 'SVM', 'Neural Network')

# SequentialFeatureSelector settings shared by all models
sfs_params = {
    'n_features_to_select': 0.5,    # keep half of the features
    'direction': 'forward',
    'scoring': 'accuracy',
    'cv': 3,
    'n_jobs': -1,                   # use all cores
    # NOTE(review): the scikit-learn docs state tol is only enabled when
    # n_features_to_select='auto', so it is presumably ignored here - confirm.
    'tol': 0.01
}

# Initialize results list (reset on every run of this cell)
results = []

for name, model in models.items():
    # Apply the selected hyperparameters from param_grids to the estimator.
    # param_grids stores each value as a list; its first entry is used. When the
    # model is a pipeline with a step named 'model', the parameters are addressed
    # as 'model__<param>'.
    is_model_pipeline = hasattr(model, 'named_steps') and 'model' in model.named_steps
    param_prefix = 'model__' if is_model_pipeline else ''
    tuned_params = {
        f"{param_prefix}{param}": values[0]
        for param, values in param_grids.get(name, {}).items()
    }
    if tuned_params:
        model.set_params(**tuned_params)

    # Put a scaler in front of the models listed in scaled_model_names
    if name in scaled_model_names:
        estimator = make_pipeline(StandardScaler(), model)
    else:
        estimator = model

    # Select features with cross-validation on the training data only
    sfs = SequentialFeatureSelector(estimator, **sfs_params)
    sfs.fit(X_train, y_train)
    X_train_sfs = sfs.transform(X_train)
    X_test_sfs = sfs.transform(X_test)

    # Fit once on the selected features, then predict both splits
    estimator.fit(X_train_sfs, y_train)
    y_pred_train = estimator.predict(X_train_sfs)
    y_pred_test = estimator.predict(X_test_sfs)

    # Get selected feature names (if using pandas DataFrame), else column indices
    try:
        selected_features = X_train.columns[sfs.get_support()].tolist()
    except AttributeError:
        selected_features = sfs.get_support(indices=True).tolist()

    # Calculate metrics on the test split
    accuracy = accuracy_score(y_test, y_pred_test)
    precision = precision_score(y_test, y_pred_test)
    recall = recall_score(y_test, y_pred_test)
    f1 = f1_score(y_test, y_pred_test)
    # labels=[0, 1] guarantees a 2x2 matrix even if one class is absent from
    # both y_test and the predictions
    conf_mat = confusion_matrix(y_test, y_pred_test, labels=[0, 1])
    TN, FP, FN, TP = conf_mat.ravel()
    # 'Bias' is the true-positive rate minus the false-positive rate, and
    # 'Variance' is the variance of the predicted 0/1 labels.
    bias = (TP / (TP + FN)) - (FP / (FP + TN)) if (TP + FN) != 0 and (FP + TN) != 0 else 0
    variance = np.var(y_pred_test)

    # Append results with feature selection info
    results.append({
        'Model': name,
        'Selected Features': selected_features,
        'Num Features': len(selected_features),
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1,
        'True Positives (TP)': TP,
        'False Positives (FP)': FP,
        'True Negatives (TN)': TN,
        'False Negatives (FN)': FN,
        'Bias': bias,
        'Variance': variance
    })

In [None]:
# Convert the list of per-model result dicts to a DataFrame (one row per model)
results_df = pd.DataFrame(results)

In [None]:
# Sort the DataFrame by Accuracy in descending order (best model first)
results_df = results_df.sort_values(by='Accuracy', ascending=False)

In [None]:
### Save results to a CSV file
# Folder where result files are written. The default is the author's local path;
# set the ARXIV_RESULTS_DIR environment variable to write somewhere else.
folder = os.environ.get('ARXIV_RESULTS_DIR', 'C:/Users/mnkub/Desktop/Capstone/results/')
#folder = '../results/'

filename = 'model_performance_feature_selection.csv'

# Create the folder if it doesn't exist
os.makedirs(folder, exist_ok=True)

# Build the full path and write the results table
filepath = os.path.join(folder, filename)
results_df.to_csv(filepath, index=False)

In [None]:
# Show the results table. A bare last expression uses the notebook's rich
# DataFrame rendering instead of the truncated plain-text output of print().
results_df

In [None]:
# Print formatted features table per model

# Expand each model's list of selected features into one column per feature.
# Building the frame straight from the list-of-lists avoids the slow row-wise
# `.apply(pd.Series)`; the original index is kept so the concat below aligns
# each feature row with its model.
features_expanded = pd.DataFrame(
    results_df['Selected Features'].tolist(),
    index=results_df.index,
)

# Combine the Model column with the expanded features
expanded_table = pd.concat([results_df['Model'], features_expanded], axis=1)

# Rename the columns for clarity
expanded_table.columns = ['Model'] + [
    f'Feature #{i+1}' for i in range(features_expanded.shape[1])
]

# Print the table as a grid; models with fewer features get blank cells
print(tabulate(expanded_table.fillna(''), headers='keys', tablefmt='grid'))

# Save the expanded table to an Excel workbook
filename = 'formatted_features_table.xlsx'

# Create the folder if it doesn't exist (exist_ok avoids a check-then-create race)
os.makedirs(folder, exist_ok=True)

# Build the full output path and write the workbook
filepath = os.path.join(folder, filename)
expanded_table.to_excel(filepath, index=False, engine='openpyxl')

In [None]:
### Plot Metrics Results

# Scores to draw: one line per metric, across the models in results_df order
metrics = ['Accuracy', 'Precision', 'Recall', 'F1 Score']

# Single-axes figure
fig, ax = plt.subplots(figsize=(10, 6))

# One marker line per metric
for metric in metrics:
    ax.plot(results_df['Model'], results_df[metric], label=metric, marker='o')

# Title, axis labels and legend
ax.set(
    title='Model Performance Metrics - Higher Accuracy on the Left',
    xlabel='Model',
    ylabel='Score',
)
ax.legend()
plt.xticks(rotation=45)
fig.tight_layout()

# Write the figure into the results folder
filename = "Model_Performance_Metrics.png"
filepath = os.path.join(folder, filename)
fig.savefig(filepath)

# Show the plot
plt.show()

In [None]:
###Plot Bias X Variance (note useful for all models)
# 'Bias' and 'Variance' are the columns stored by the evaluation loop:
# bias = TP/(TP+FN) - FP/(FP+TN), variance = np.var of the test predictions.
fig, ax = plt.subplots(figsize=(10, 6))

# One marker line per quantity, each with its own colour
for column, colour in (('Bias', 'blue'), ('Variance', 'orange')):
    ax.plot(results_df['Model'], results_df[column], marker='o', label=column, color=colour)

# Title, axis labels and legend
ax.set(
    title='Bias and Variance for Each Model - Higher Accuracy on the Left',
    xlabel='Model',
    ylabel='Value',
)
ax.legend()

# Tilt the model names so they stay readable
plt.xticks(rotation=45)

# Tighten the layout
fig.tight_layout()

# Write the figure into the results folder
filename = "Model_Bias_vs_Variance.png"
filepath = os.path.join(folder, filename)
fig.savefig(filepath)

# Show the plot
plt.show()

In [None]:
###Plot confusion matrix

# Size the grid from the number of evaluated models (4 panels per row) rather
# than assuming exactly 8: more models no longer overflow the axes array, and
# fewer models no longer leave blank panels behind. With 8 models this yields
# the same 2x4 grid and 24x12 figure as before.
n_models = len(results)
n_cols = 4
n_rows = max(1, -(-n_models // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(n_rows, n_cols, figsize=(6 * n_cols, 6 * n_rows), squeeze=False)
axes = axes.ravel()  # Flatten the grid to a 1D array

# Create confusion matrices from results
for idx, result in enumerate(results):
    # Extract the stored counts for this model
    model_name = result['Model']
    TN = result['True Negatives (TN)']
    FP = result['False Positives (FP)']
    FN = result['False Negatives (FN)']
    TP = result['True Positives (TP)']

    # Rebuild the 2x2 matrix in the same [[TN, FP], [FN, TP]] order the counts
    # were unpacked from confusion_matrix(...).ravel()
    conf_mat = np.array([[TN, FP],
                         [FN, TP]])

    # Create ConfusionMatrixDisplay
    disp = ConfusionMatrixDisplay(confusion_matrix=conf_mat,
                                  display_labels=['Negative', 'Positive'])

    # Plot on corresponding axis
    disp.plot(ax=axes[idx], colorbar=False)
    axes[idx].set_title(model_name, fontsize=12, pad=10)
    axes[idx].tick_params(axis='both', which='major', labelsize=10)

# Remove every unused panel (the slice is empty when the grid is exactly full)
for ax in axes[n_models:]:
    ax.remove()

# Adjust layout and save
plt.tight_layout(pad=3.0)

# Write the figure into the results folder
filename = "Confusion_Matrices.png"
filepath = os.path.join(folder, filename)
plt.savefig(filepath)  # Save figure

# Show the plot
plt.show()