In [69]:
import pandas as pd
import numpy as np
import networkx as nx
import itertools
import statsmodels.api as sm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import itertools
import pickle


In [244]:
flat=None
with open("../040_model_formatting/flattened_data_10_dec.pkl", "rb") as f:
        flat = pickle.load(f)

if flat==None:
    print("Error retrieving pickle")


In [245]:
df_features = flat[2000]["sims_pairs"].copy(deep=True)
df_features["network_proximity"] = np.where(df_features["distance"] != np.inf, 1 / df_features["distance"], 0)
df_features["log_num_paths"] = np.where(df_features["num_paths"] != 0, np.log(1+df_features["num_paths"]), -1)
df_features["pair_min"] = df_features[["pair_1", "pair_2"]].min(axis=1)
df_features["pair_max"] = df_features[["pair_1", "pair_2"]].max(axis=1)


df_features_t = flat[2010]["sims_pairs"].copy(deep=True)
df_features_t["pair_min"] = df_features_t[["pair_1", "pair_2"]].min(axis=1)
df_features_t["pair_max"] = df_features_t[["pair_1", "pair_2"]].max(axis=1)
df_features_t["collabs_t"] = np.where(df_features_t["collabs"] != 0, 1, 0)


In [257]:
df_features["network_proximity"].value_counts()

network_proximity
0.000000    6048015
0.111111     178517
0.125000     177934
0.100000     160437
0.142857     153812
0.090909     132376
0.166667     113198
0.083333      99550
0.200000      70406
0.076923      67749
0.071429      41440
0.250000      36975
0.066667      23747
0.333333      17473
0.062500      12322
0.500000       7947
0.058824       5996
1.000000       4052
0.055556       2421
0.052632        909
0.050000        207
0.047619         44
0.045455          3
Name: count, dtype: int64

In [246]:
df_features_t.head()

Unnamed: 0,pair_1,pair_2,aff_jacc_sim,collabs,cit_cos_sim,distance,num_paths,pair_min,pair_max,collabs_t
0,14038,8213,0.0,1,0.742462,1.0,1.0,8213,14038,1
1,14038,10751,0.0,1,0.742462,1.0,1.0,10751,14038,1
2,14038,5152,0.166667,0,0.0,5.0,2.0,5152,14038,0
3,14038,13059,0.0,0,0.0,6.0,2.0,13059,14038,0
4,14038,11009,0.4,0,0.0,6.0,1.0,11009,14038,0


In [247]:
df_features.head()

Unnamed: 0,pair_1,pair_2,aff_jacc_sim,collabs,cit_cos_sim,distance,num_paths,network_proximity,log_num_paths,pair_min,pair_max
0,4927,8311,0.0,1,0.20998,1.0,1.0,1.0,0.693147,4927,8311
1,4927,7823,0.0,1,0.32075,1.0,1.0,1.0,0.693147,4927,7823
2,4927,16012,0.0,0,0.0,inf,0.0,0.0,-1.0,4927,16012
3,4927,2311,0.0,0,0.0,inf,0.0,0.0,-1.0,2311,4927
4,4927,8685,0.0,0,0.024845,inf,0.0,0.0,-1.0,4927,8685


In [248]:
authors_t_minus_1 = set(df_features["pair_1"]).union(set(df_features["pair_2"]))
authors_t = set(df_features_t["pair_1"]).union(set(df_features_t["pair_2"]))

# Detect new authors at t
new_authors_t = authors_t - authors_t_minus_1

In [249]:
print(len(authors_t_minus_1))
print(len(authors_t))
print(len(new_authors_t))

3836
5171
3597


In [250]:
df_features_t_strict = df_features_t[df_features_t["pair_1"].isin(authors_t_minus_1) & df_features_t["pair_2"].isin(authors_t_minus_1)]


In [251]:
df_features_t_strict["collabs_t"].value_counts()

collabs_t
0    1236517
1       1434
Name: count, dtype: int64

In [252]:
merged_df = df_features.merge(df_features_t_strict[["pair_min", "pair_max","collabs_t"]], on=["pair_min", "pair_max"], how="left")


In [253]:
df_features_t_strict["collabs_t"].unique()

array([0, 1])

In [254]:
merged_df.head()

Unnamed: 0,pair_1,pair_2,aff_jacc_sim,collabs,cit_cos_sim,distance,num_paths,network_proximity,log_num_paths,pair_min,pair_max,collabs_t
0,4927,8311,0.0,1,0.20998,1.0,1.0,1.0,0.693147,4927,8311,0.0
1,4927,7823,0.0,1,0.32075,1.0,1.0,1.0,0.693147,4927,7823,1.0
2,4927,16012,0.0,0,0.0,inf,0.0,0.0,-1.0,4927,16012,0.0
3,4927,2311,0.0,0,0.0,inf,0.0,0.0,-1.0,2311,4927,0.0
4,4927,8685,0.0,0,0.024845,inf,0.0,0.0,-1.0,4927,8685,0.0


In [255]:
merged_df=merged_df.fillna(0)

In [256]:

# Example data: Replace with your actual citation, affiliation, and paper data
# Citation matrix (authors x papers): 1 if author cites the paper, 0 otherwise
# Example format: rows=authors, cols=papers

# Affiliation data (author x paper): affiliation IDs per paper
# Example format: rows=authors, cols=papers

# Previous collaborations (author1, author2, count)
# Example format: author1, author2, count

# Convert to DataFrame
# columns = ["pair_1", "pair_2", "proximity", "log_num_paths", "citation_sim", "affiliation_sim", "prev_collab_count", "collaboration"]

# -------------------------------
# Step 5: Scale Features
# -------------------------------
scaler = MinMaxScaler()

X = merged_df[["network_proximity", "log_num_paths", "cit_cos_sim", "aff_jacc_sim", "collabs"]]
X_scaled = scaler.fit_transform(X)
y = merged_df["collabs_t"]

# -------------------------------
# Step 6: Train Logistic Regression Model
# -------------------------------
model = LogisticRegression()
model.fit(X_scaled, y)

# Print Model Coefficients
coefficients = pd.DataFrame({"Feature": X.columns, "Coefficient": model.coef_[0]})
print(coefficients)

# -------------------------------
# Step 7: Evaluate Model
# -------------------------------

y_pred = model.predict(X_scaled)
print(classification_report(y, y_pred, zero_division=1))
print("Confusion Matrix:")
print(confusion_matrix(y, y_pred))

             Feature  Coefficient
0  network_proximity     0.687988
1      log_num_paths    -0.144590
2        cit_cos_sim     0.375818
3       aff_jacc_sim     0.274898
4            collabs     0.107279
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00   7354096
         1.0       1.00      0.00      0.00      1434

    accuracy                           1.00   7355530
   macro avg       1.00      0.50      0.50   7355530
weighted avg       1.00      1.00      1.00   7355530

Confusion Matrix:
[[7354096       0]
 [   1434       0]]
