In [31]:
import networkx as nx
import matplotlib.pyplot as plt
import os
import json
import random
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from sklearn import preprocessing
import nltk
import csv
import re
from time import time

In [32]:
nltk.download('punkt') # for tokenization
nltk.download('stopwords')
stpwds = set(nltk.corpus.stopwords.words("english"))
stemmer = nltk.stem.PorterStemmer()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vedch\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vedch\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [8]:
import pickle

# Load the graph using pickle
with open('new_build_parallel/citation_graph.gpickle', 'rb') as f:
    G = pickle.load(f)


# Display basic graph information
print(f"Number of nodes: {G.number_of_nodes()}")
print(f"Number of edges: {G.number_of_edges()}")

Number of nodes: 6545
Number of edges: 32874


In [25]:
def get_paper_info(node_id):
    """Retrieve title and abstract information for a paper given its node ID.
    
    Args:
        node_id: The ID of the paper node in the citation graph
        
    Returns:
        tuple: (title, abstract) where both can be None if not found
    """
    # Load the mapping from citation graph id to folder
    with open("new_build_parallel/citation_graph_id_to_folder.json", "r") as f:
        id_to_folder = json.load(f)
    

    # Get the folder path for the given node_id
    folder_path = id_to_folder.get(f"{node_id}")

    if not folder_path:
        return None, None
    
    folder_path = os.path.join("../dataset_papers/dataset_papers", folder_path) # set path as needed
    # Extract title from title.txt
    title = None
    title_path = os.path.join(folder_path, "title.txt")
    if os.path.exists(title_path):
        with open(title_path, "r", encoding='utf-8', errors='ignore') as f:
            title = f.read().strip()
            
    # Extract abstract from abstract.txt
    abstract = None
    abstract_path = os.path.join(folder_path, "abstract.txt")
    if os.path.exists(abstract_path):
        with open(abstract_path, "r", encoding='utf-8', errors='ignore') as f:
            abstract = f.read().strip()
            
    return title, abstract

### Create Train-Test Split

In [76]:
import networkx as nx
import pandas as pd
import random
from sklearn.model_selection import train_test_split

# Set random seed
random.seed(42)

# Step 1: Extract all real edges (positive samples)
edges = list(G.edges())

# Step 2: Train-test split on positive edges
train_edges, test_edges = train_test_split(edges, test_size=0.2, random_state=42)

# Step 3: Create training graph (with test edges removed)
train_G = G.copy()
train_G.remove_edges_from(test_edges)

# Step 4: Sample negative edges (non-edges)

nodes = list(train_G.nodes())

# Use sets to avoid duplicate samples
train_non_edges = set()
while len(train_non_edges) < len(train_edges):
    u, v = random.choice(nodes), random.choice(nodes)
    if u != v and not train_G.has_edge(u, v):
        train_non_edges.add((u, v))

test_non_edges = set()
while len(test_non_edges) < len(test_edges):
    u, v = random.choice(nodes), random.choice(nodes)
    if u != v and not G.has_edge(u, v):  # for test set, check on original full graph
        test_non_edges.add((u, v))

# Step 5: Prepare training and testing datasets
X_train = train_edges + list(train_non_edges)
y_train = [1] * len(train_edges) + [0] * len(train_non_edges)

X_test = test_edges + list(test_non_edges)
y_test = [1] * len(test_edges) + [0] * len(test_non_edges)

# Step 6: Create DataFrames
train_df = pd.DataFrame({
    'source': [u for u, v in X_train],
    'target': [v for u, v in X_train],
    'label': y_train
})

test_df = pd.DataFrame({
    'source': [u for u, v in X_test],
    'target': [v for u, v in X_test],
    'label': y_test
})

# Step 7: Print dataset statistics
print("\nTraining Set Statistics:")
print(f"Total samples: {len(train_df)}")
print(f"Positive samples: {train_df['label'].sum()}")
print(f"Negative samples: {len(train_df) - train_df['label'].sum()}")

print("\nTest Set Statistics:")
print(f"Total samples: {len(test_df)}")
print(f"Positive samples: {test_df['label'].sum()}")
print(f"Negative samples: {len(test_df) - test_df['label'].sum()}")

# Optionally preview the data
print("\nTraining Set Preview:")
display(train_df)
print("\nTest Set Preview:")
display(test_df)



Training Set Statistics:
Total samples: 52598
Positive samples: 26299
Negative samples: 26299

Test Set Statistics:
Total samples: 13150
Positive samples: 6575
Negative samples: 6575

Training Set Preview:


Unnamed: 0,source,target,label
0,4862,615,1
1,87,1150,1
2,91,4125,1
3,2349,6532,1
4,5547,1947,1
...,...,...,...
52593,2235,3092,0
52594,6403,4819,0
52595,3922,6388,0
52596,134,3347,0



Test Set Preview:


Unnamed: 0,source,target,label
0,3152,4222,1
1,772,471,1
2,6538,2791,1
3,4119,10,1
4,2060,1455,1
...,...,...,...
13145,5553,219,0
13146,3442,567,0
13147,1757,3733,0
13148,790,1729,0


### Feature Engineering

In [38]:
# Define useful information for the nodes
# Example: build id_to_title using your get_paper_info function
id_to_title = {}
id_to_abstract = {}
for node in G.nodes():
    title, abstract = get_paper_info(node)
    id_to_title[node] = title if title else ""
    id_to_abstract[node] = abstract if abstract else ""


In [40]:
import pandas as pd

def compute_title_overlap(df: pd.DataFrame, id_to_title: dict) -> pd.DataFrame:
    """
    Adds a column to the DataFrame with the number of overlapping words in titles of source and target nodes.
    
    Parameters:
    - df: DataFrame with 'source' and 'target' columns
    - id_to_title: Dictionary mapping node IDs to title strings
    
    Returns:
    - df with a new column 'title_overlap'
    """
    def overlap_count(row):
        title_u = id_to_title.get(row['source'], "").lower().split()
        title_v = id_to_title.get(row['target'], "").lower().split()
        return len(set(title_u) & set(title_v))
    
    df['title_overlap'] = df.apply(overlap_count, axis=1)
    return df


In [42]:
def compute_abstract_overlap(df: pd.DataFrame, id_to_abstract: dict) -> pd.DataFrame:
    """
    Adds a column to the DataFrame with the number of overlapping words in abstracts of source and target nodes.
    
    Parameters:
    - df: DataFrame with 'source' and 'target' columns
    - id_to_abstract: Dictionary mapping node IDs to abstract strings
    
    Returns:
    - df with a new column 'abstract_overlap'
    """
    def overlap_count(row):
        abstract_u = id_to_abstract.get(row['source'], "").lower().split()
        abstract_v = id_to_abstract.get(row['target'], "").lower().split()
        return len(set(abstract_u) & set(abstract_v))
    
    df['abstract_overlap'] = df.apply(overlap_count, axis=1)
    return df


In [45]:
def compute_title_cosine_similarity(df: pd.DataFrame, id_to_title: dict) -> pd.DataFrame:
    """
    Adds a column with cosine similarity between TF-IDF vectors of titles.
    
    Parameters:
    - df: DataFrame with 'source' and 'target'
    - id_to_title: Dictionary mapping node IDs to title strings
    
    Returns:
    - df with a new column 'title_cosine'
    """
    # Extract all unique titles needed
    all_ids = pd.concat([df['source'], df['target']]).unique()
    id_text_map = {i: id_to_title.get(i, "") for i in all_ids}

    # Vectorize all titles
    vectorizer = TfidfVectorizer()
    corpus = [id_text_map[i] for i in all_ids]
    tfidf_matrix = vectorizer.fit_transform(corpus)

    # Map node id to index in matrix
    id_to_index = {i: idx for idx, i in enumerate(all_ids)}

    # Compute cosine similarity for each pair
    def sim(row):
        i = id_to_index[row['source']]
        j = id_to_index[row['target']]
        return cosine_similarity(tfidf_matrix[i], tfidf_matrix[j])[0, 0]
    
    df['title_cosine'] = df.apply(sim, axis=1)
    return df


In [46]:
def compute_abstract_cosine_similarity(df: pd.DataFrame, id_to_abstract: dict) -> pd.DataFrame:
    """
    Adds a column with cosine similarity between TF-IDF vectors of abstracts.
    
    Parameters:
    - df: DataFrame with 'source' and 'target'
    - id_to_abstract: Dictionary mapping node IDs to abstract strings
    
    Returns:
    - df with a new column 'abstract_cosine'
    """
    all_ids = pd.concat([df['source'], df['target']]).unique()
    id_text_map = {i: id_to_abstract.get(i, "") for i in all_ids}

    vectorizer = TfidfVectorizer()
    corpus = [id_text_map[i] for i in all_ids]
    tfidf_matrix = vectorizer.fit_transform(corpus)
    id_to_index = {i: idx for idx, i in enumerate(all_ids)}

    def sim(row):
        i = id_to_index[row['source']]
        j = id_to_index[row['target']]
        return cosine_similarity(tfidf_matrix[i], tfidf_matrix[j])[0, 0]
    
    df['abstract_cosine'] = df.apply(sim, axis=1)
    return df


In [63]:
def jaccard_similarity(u, v, train_G):
    if not train_G.has_node(u) or not train_G.has_node(v):
        return 0  # or np.nan if you want to flag this
    neighbors_u = set(train_G.neighbors(u))
    neighbors_v = set(train_G.neighbors(v))
    intersection = len(neighbors_u.intersection(neighbors_v))
    union = len(neighbors_u.union(neighbors_v))
    return intersection / union if union != 0 else 0


def compute_jaccard_similarity_feature(df: pd.DataFrame, train_G) -> pd.DataFrame:
    df['jaccard_similarity'] = df.apply(lambda row: jaccard_similarity(row['source'], row['target'], train_G), axis=1)
    return df


In [65]:
def preferential_attachment(u, v, train_G):
    """
    Computes the Preferential Attachment score between two nodes in the training graph.
    
    Parameters:
    - u, v: Nodes for which to compute Preferential Attachment
    - train_G: The training graph
    
    Returns:
    - Preferential Attachment score between u and v
    """
    if not train_G.has_node(u) or not train_G.has_node(v):
        return 0  # or np.nan if you prefer
    deg_u = train_G.degree(u)
    deg_v = train_G.degree(v)
    return deg_u * deg_v

def compute_preferential_attachment_feature(df: pd.DataFrame, train_G) -> pd.DataFrame:
    df['preferential_attachment'] = df.apply(lambda row: preferential_attachment(row['source'], row['target'], train_G), axis=1)
    return df


In [69]:
import math

def adamic_adar(u, v, train_G):
    """
    Computes the Adamic-Adar Index between two nodes in the training graph.
    
    Parameters:
    - u, v: Nodes for which to compute Adamic-Adar Index
    - train_G: The training graph
    
    Returns:
    - Adamic-Adar Index between u and v
    """
    # Check if both nodes exist in the graph
    if not train_G.has_node(u) or not train_G.has_node(v):
        return 0  # or np.nan if you prefer

    # Get the common neighbors between u and v
    common_neighbors = set(train_G.neighbors(u)).intersection(train_G.neighbors(v))

    # Calculate the Adamic-Adar index
    return sum(1 / math.log(train_G.degree(w)) for w in common_neighbors if train_G.degree(w) > 1)

def compute_adamic_adar_feature(df: pd.DataFrame, train_G) -> pd.DataFrame:
    # Apply the Adamic-Adar index computation and check lengths
    adamic_adar_values = df.apply(lambda row: adamic_adar(row['source'], row['target'], train_G), axis=1)
    
    # Check if the length of computed values matches the length of the DataFrame
    print(f"Length of computed values: {len(adamic_adar_values)}, Length of DataFrame: {len(df)}")

    # If lengths match, assign the computed values
    if len(adamic_adar_values) == len(df):
        df['adamic_adar'] = adamic_adar_values
    else:
        print("Mismatch in lengths! Check the applied function.")
    
    return df


In [70]:
def common_neighbors(u, v, train_G):
    """
    Computes the number of common neighbors between two nodes in the training graph.
    
    Parameters:
    - u, v: Nodes for which to compute common neighbors
    - train_G: The training graph
    
    Returns:
    - Number of common neighbors between u and v
    """
    # Check if both nodes exist in the graph
    if not train_G.has_node(u) or not train_G.has_node(v):
        return 0  # or np.nan if you prefer

    # Get the neighbors for each node
    neighbors_u = set(train_G.neighbors(u))
    neighbors_v = set(train_G.neighbors(v))

    # Return the number of common neighbors
    return len(neighbors_u.intersection(neighbors_v))


def compute_common_neighbors_feature(df: pd.DataFrame, train_G) -> pd.DataFrame:
    df['common_neighbors'] = df.apply(lambda row: common_neighbors(row['source'], row['target'], train_G), axis=1)
    return df


In [71]:
def community_resource_allocation(u, v, train_G):
    """
    Computes the Community Resource Allocation (CRA) score between two nodes in the training graph.
    
    Parameters:
    - u, v: Nodes for which to compute CRA
    - train_G: The training graph
    
    Returns:
    - CRA score between u and v
    """
    # Ensure both nodes exist in the graph
    if not train_G.has_node(u) or not train_G.has_node(v):
        return 0  # or np.nan if you prefer
    
    # Get the common neighbors of u and v
    common_neighbors = set(train_G.neighbors(u)).intersection(train_G.neighbors(v))

    # If there are no common neighbors, return 0
    if not common_neighbors:
        return 0

    # Calculate the CRA score
    score = sum(1 / (train_G.degree(w) * (train_G.degree(u) + train_G.degree(v))) for w in common_neighbors)
    
    return score


def compute_community_resource_allocation_feature(df: pd.DataFrame, train_G) -> pd.DataFrame:
    df['community_resource_allocation'] = df.apply(lambda row: community_resource_allocation(row['source'], row['target'], train_G), axis=1)
    return df


In [117]:
import networkx as nx
import pandas as pd

def get_node_community_map(train_G):
    communities = list(nx.algorithms.community.modularity_max.greedy_modularity_communities(train_G))
    
    node_to_community = {}
    for idx, community in enumerate(communities):
        for node in community:
            node_to_community[node] = idx
    
    return node_to_community

def same_cluster_fast(u, v, node_to_community):
    return int(node_to_community.get(u) == node_to_community.get(v))

def compute_same_cluster_feature(df: pd.DataFrame, train_G) -> pd.DataFrame:
    node_to_community = get_node_community_map(train_G)
    df['same_cluster'] = df.apply(lambda row: same_cluster_fast(row['source'], row['target'], node_to_community), axis=1)

    return df


In [118]:
import pandas as pd
import networkx as nx
import numpy as np

def extract_all_features(df, train_G, id_to_title, id_to_abstract):
    """
    This function extracts all the features and appends them to the dataframe.

    ```
    Parameters:
    - df: The input dataframe containing the source and target nodes for link prediction.
    - train_G: The training graph for calculating graph-based features.

    Returns:
    - df: The dataframe with appended features.
    """
    # Overlapping words in title
    df = compute_title_overlap(df, id_to_title)

    # Overlapping words in abstract
    df = compute_abstract_overlap(df, id_to_abstract)

    # Cosine similarity of titles
    df = compute_title_cosine_similarity(df, id_to_title)

    # Cosine similarity of abstracts
    df = compute_abstract_cosine_similarity(df, id_to_abstract)

    # Jaccard similarity coefficient
    df = compute_jaccard_similarity_feature(df, train_G)

    # Preferential attachment score
    df = compute_preferential_attachment_feature(df, train_G)

    # Adamic Adar Index
    df = compute_adamic_adar_feature(df, train_G)

    # Common neighbors
    df = compute_common_neighbors_feature(df, train_G)

    # Same Cluster (community)
    df = compute_same_cluster_feature(df, train_G)

    # Community resource allocation
    df = compute_community_resource_allocation_feature(df, train_G)

    return df

# train_G is the graph used for feature extraction

train_df = extract_all_features(train_df, train_G, id_to_title, id_to_abstract)
test_df = extract_all_features(test_df, train_G, id_to_title, id_to_abstract)

# Optionally display the first few rows of the updated DataFrames

print(train_df.head())
print(test_df.head())


2025-05-04 21:33:07,810 - INFO - Computing title overlaps...


computing title overlap


2025-05-04 21:33:08,226 - INFO - Computing abstract overlaps...


computing abstract overlap
Length of computed values: 52598, Length of DataFrame: 52598


2025-05-04 21:34:45,350 - INFO - Computing title overlaps...
2025-05-04 21:34:45,456 - INFO - Computing abstract overlaps...


computing title overlap
computing abstract overlap
Length of computed values: 13150, Length of DataFrame: 13150
   source  target  label  title_overlap  abstract_overlap  title_cosine  \
0    4862     615      1              0                20      0.000000   
1      87    1150      1              2                24      0.205991   
2      91    4125      1              0                18      0.000000   
3    2349    6532      1              0                23      0.000000   
4    5547    1947      1              0                25      0.000000   

   abstract_cosine  jaccard_similarity  preferential_attachment  adamic_adar  \
0         0.105672            0.000000                      138     0.000000   
1         0.196653            0.000000                      522     0.000000   
2         0.051316            0.000000                      920     0.000000   
3         0.059388            0.040268                     4060     1.388428   
4         0.150296            0.00000

In [119]:
train_df.to_csv("ved_train_df.csv", index=False)
test_df.to_csv("ved_test_df.csv", index=False)

### Building Models

In [133]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Assuming train_df and test_df are already defined
X_train = train_df.drop(columns=['label'])  # Features for training
y_train = train_df['label']                # Target label for training

X_test = test_df.drop(columns=['label'])  # Features for testing
# Ensure X_test has the same columns as X_train
X_test = X_test[X_train.columns]
y_test = test_df['label']                # Target label for testing

logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train, y_train)

y_pred = logreg.predict(X_test)

# Print classification report
print(classification_report(y_test, y_pred))



              precision    recall  f1-score   support

           0       0.80      0.86      0.83      6575
           1       0.85      0.79      0.82      6575

    accuracy                           0.82     13150
   macro avg       0.82      0.82      0.82     13150
weighted avg       0.82      0.82      0.82     13150



In [134]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.90      0.88      6575
           1       0.89      0.85      0.87      6575

    accuracy                           0.87     13150
   macro avg       0.88      0.87      0.87     13150
weighted avg       0.88      0.87      0.87     13150



In [136]:
# Using XGBoost
import xgboost as xgb
model = xgb.XGBClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.90      0.88      6575
           1       0.90      0.85      0.87      6575

    accuracy                           0.88     13150
   macro avg       0.88      0.88      0.88     13150
weighted avg       0.88      0.88      0.88     13150



In [139]:
# Using LightGBM
import lightgbm as lgb
model = lgb.LGBMClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

[LightGBM] [Info] Number of positive: 26299, number of negative: 26299
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001388 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2111
[LightGBM] [Info] Number of data points in the train set: 52598, number of used features: 12
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
              precision    recall  f1-score   support

           0       0.86      0.90      0.88      6575
           1       0.90      0.85      0.88      6575

    accuracy                           0.88     13150
   macro avg       0.88      0.88      0.88     13150
weighted avg       0.88      0.88      0.88     13150



In [140]:
# Using CatBoost
from catboost import CatBoostClassifier
model = CatBoostClassifier(random_state=42, verbose=0)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.86      0.90      0.88      6575
           1       0.90      0.86      0.88      6575

    accuracy                           0.88     13150
   macro avg       0.88      0.88      0.88     13150
weighted avg       0.88      0.88      0.88     13150



In [141]:
from sklearn.svm import SVC
model = SVC(kernel='rbf', random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.93      0.81      6575
           1       0.89      0.63      0.74      6575

    accuracy                           0.78     13150
   macro avg       0.81      0.78      0.77     13150
weighted avg       0.81      0.78      0.77     13150



In [143]:
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=5)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.76      0.85      0.80      6575
           1       0.83      0.72      0.77      6575

    accuracy                           0.79     13150
   macro avg       0.79      0.79      0.79     13150
weighted avg       0.79      0.79      0.79     13150



In [145]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.65      0.97      0.78      6575
           1       0.94      0.48      0.63      6575

    accuracy                           0.72     13150
   macro avg       0.80      0.72      0.71     13150
weighted avg       0.80      0.72      0.71     13150



In [146]:
from sklearn.neural_network import MLPClassifier
model = MLPClassifier(hidden_layer_sizes=(100,), max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.89      0.75      0.81      6575
           1       0.78      0.91      0.84      6575

    accuracy                           0.83     13150
   macro avg       0.84      0.83      0.83     13150
weighted avg       0.84      0.83      0.83     13150



In [148]:
from sklearn.ensemble import AdaBoostClassifier
model = AdaBoostClassifier(random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.85      0.91      0.88      6575
           1       0.90      0.83      0.87      6575

    accuracy                           0.87     13150
   macro avg       0.87      0.87      0.87     13150
weighted avg       0.87      0.87      0.87     13150



In [149]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(penalty='l2', max_iter=1000, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.80      0.86      0.83      6575
           1       0.85      0.79      0.82      6575

    accuracy                           0.82     13150
   macro avg       0.82      0.82      0.82     13150
weighted avg       0.82      0.82      0.82     13150



In [150]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
model = LinearDiscriminantAnalysis()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.72      0.87      0.79      6575
           1       0.84      0.66      0.74      6575

    accuracy                           0.77     13150
   macro avg       0.78      0.77      0.76     13150
weighted avg       0.78      0.77      0.76     13150



In [151]:
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
model = QuadraticDiscriminantAnalysis()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.96      0.79      6575
           1       0.92      0.53      0.67      6575

    accuracy                           0.74     13150
   macro avg       0.80      0.74      0.73     13150
weighted avg       0.80      0.74      0.73     13150

