In [81]:
from gensim_utils import create_dic_authordid_lineauthor, tokenize_abstracts,latentSemanticIndexing,similarityIndex, get_author_abstract_similarity
import pandas as pd
import networkx as nx
from read_data import get_graph, get_train_data_json
import tqdm
from preprocess_utils import clean_columns, get_numpy_data
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np

In [82]:
def select_columns(data):
    columns = [
        "author",
        "hindex",
        "nb_paper",
        # "core_number",
        # "eigenvector_centrality",
        # "n_coauthors_with_hindex",
        "pagerank",
        "authority",
        "clustering_coef",
        "n_neighbors_dist_1",
        # "min_neighbors_dist_1",
        "mean_neighbors_dist_1",
        "max_neighbors_dist_1",
        # "max-min_neighbors_dist_1",
        # "n_neighbors_dist_2",
        # "min_neighbors_dist_2",
        # "mean_neighbors_dist_2",
        # "max_neighbors_dist_2",
    ]
    columns += [column for column in data if column.startswith("vector_coord_")]
    columns += [column for column in data if column.startswith("lda_cat_")]

    return data[columns]

In [83]:
def get_numpy_data_1(n):
    train = pd.read_csv("../tmp/processed_data.csv")[:n]
    train = train.sample(n=n, random_state=1)
    text = train["author"]
    train, test = train_test_split(train, random_state =1)
    train = select_columns(train)
    test = select_columns(test)
    X_train = train.drop(
        ["author", "hindex"], axis=1
    ).to_numpy()
    y_train = train["hindex"].to_numpy()
    X_test = test.drop(
        ["author", "hindex"], axis=1
    ).to_numpy()
    y_test = test["hindex"].to_numpy()
    return X_train, y_train, X_test, y_test,text

In [84]:
X_train, y_train, X_test, y_test, column = get_numpy_data_1(n=10000)

  X_train, y_train, X_test, y_test, column = get_numpy_data_1(n=10000)


In [85]:
import lightgbm as lgb
d_train = lgb.Dataset(X_train, label = y_train)
params = {}
params['learning_rate'] = 0.2
params['num_iterations'] = 100
params['boosting_type'] = 'dart'
params['objective'] = 'regression'
params['metric'] = 'rmse'
params['sub_feature'] = 0.6
params['num_leaves'] = 50
params['min_data'] = 60
params['max_depth'] = 35
clf = lgb.train(params, d_train, 100)
y_pred=clf.predict(X_test)
mean_squared_error(y_test,y_pred)



You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1731
[LightGBM] [Info] Number of data points in the train set: 7500, number of used features: 9
[LightGBM] [Info] Start training from score 11.701733


57.93060829667537

In [86]:
dic, corp = tokenize_abstracts(column)

num_topics = 250
lsi = latentSemanticIndexing(dic,corp,num_topics)

index = similarityIndex(lsi,corp,num_topics)

data = get_author_abstract_similarity(50)

TypeError: tokenize_abstracts() takes 0 positional arguments but 1 was given

In [None]:
graphArray = pd.read_csv("../tmp/similGraph_full.csv").to_numpy()


In [None]:
G = nx.Graph()
for i in range(len(graphArray)):
    for j in range(1,(len(graphArray[0])),2):
        G.add_edge(graphArray[i][0],graphArray[i][j],weight = graphArray[i][j+1])

In [None]:
def get_core_number(author_ids):
    core_number = nx.core_number(G)
    author_core_numbers = [core_number[author_id] for author_id in author_ids]
    df = pd.DataFrame({"author": author_ids, "new_core_number": author_core_numbers})
    return df


def get_page_rank(author_ids):
    core_number = nx.pagerank(G)
    author_pagerank = [core_number[author_id] for author_id in author_ids]
    df = pd.DataFrame({"author": author_ids, "new_pagerank": author_pagerank})
    return df


def get_authority(author_ids):
    authority, _ = nx.hits(G)
    author_authority = [authority[author_id] for author_id in author_ids]
    df = pd.DataFrame({"author": author_ids, "new_authority": author_authority})
    return df


def get_clustering_coef(author_ids):
    clustering_coefs = nx.clustering(G, nodes=author_ids)
    author_clusering_coef = [clustering_coefs[author_id] for author_id in author_ids]
    df = pd.DataFrame({"author": author_ids, "new_clustering_coef": author_clusering_coef})
    return df


def get_eigenvector_centrality(author_ids):
    eigenvector_centralities = nx.algorithms.centrality.eigenvector_centrality(G)
    author_eigenvector_centrality = [
        eigenvector_centralities[author_id] for author_id in author_ids
    ]
    df = pd.DataFrame(
        {"author": author_ids, "new_eigenvector_centrality": author_eigenvector_centrality}
    )
    return df
    
def add_features(data, new_features):
    return data.merge(new_features, left_on="author", right_on="author", how="inner")

In [None]:
def store_full_dataset_with_features(neighborhood_level=2):

    data = pd.read_csv("../tmp/data10000.csv")

    data = clean_columns(data, neighborhood_level=neighborhood_level)

    print("Starting data columns :", list(data.columns))

    if not "new_core_number" in data.columns:
        print("Add core number to data")
        data = add_features(data, get_core_number(data["author"]))

    if not "new_pagerank" in data.columns:
        print("Add pagerank to data")
        data = add_features(data, get_page_rank(data["author"]))

    if not "new_authority" in data.columns:
        print("Add authority to data")
        data = add_features(data, get_authority(data["author"]))

    if not "new_clustering_coef" in data.columns:
        print("Add clustering coef to data")
        data = add_features(data, get_clustering_coef(data["author"]))

    if not "new_eigenvector_centrality" in data.columns:
        print("Add eigenvector centrality to data")
        data = add_features(data, get_eigenvector_centrality(data["author"]))

    print("Ending data columns :", list(data.columns))

    data.to_csv("../tmp/dataGensim.csv", index=None)
    return data

In [None]:
def select_columns(data):
    columns = [
        "author",
        "hindex",
        "nb_paper",
        # "core_number",
        # "eigenvector_centrality",
        # "n_coauthors_with_hindex",
        "new_pagerank",
        "new_authority",
        "new_clustering_coef",
        "n_neighbors_dist_1",
        # "min_neighbors_dist_1",
        "mean_neighbors_dist_1",
        "max_neighbors_dist_1",
        # "max-min_neighbors_dist_1",
        # "n_neighbors_dist_2",
        # "min_neighbors_dist_2",
        # "mean_neighbors_dist_2",
        # "max_neighbors_dist_2",
    ]
    columns += [column for column in data if column.startswith("vector_coord_")]
    columns += [column for column in data if column.startswith("lda_cat_")]

    return data[columns]


In [None]:
def get_numpy_data_2(n=10000):
    train = pd.read_csv("../tmp/dataGensim.csv")[:10000]
    train = train.sample(n=n, random_state=1)
    train, test = train_test_split(train, random_state =1)
    train = select_columns(train)
    test = select_columns(test)
    X_train = train.drop(
        ["author", "hindex"], axis=1
    ).to_numpy()
    y_train = train["hindex"].to_numpy()
    X_test = test.drop(
        ["author", "hindex"], axis=1
    ).to_numpy()
    y_test = test["hindex"].to_numpy()
    return X_train, y_train, X_test, y_test

In [None]:
d = store_full_dataset_with_features()
X_train, y_train, X_test, y_test = get_numpy_data_2(n=10000)

In [None]:
import lightgbm as lgb
d_train = lgb.Dataset(X_train, label = y_train)
params = {}
params['learning_rate'] = 0.16
params['num_iterations'] = 100
params['boosting_type'] = 'dart'
params['objective'] = 'regression'
params['metric'] = 'rmse'
params['sub_feature'] = 0.6
params['num_leaves'] = 50
params['min_data'] = 60
params['max_depth'] = 35
clf = lgb.train(params, d_train, 100)
y_pred=clf.predict(X_test)
mean_squared_error(y_test,y_pred)