In [1]:
import csv
import os
import random
from os.path import exists

from autogluon.tabular import TabularDataset, TabularPredictor

In [22]:
# Notebook configuration: everything a reader might want to tune lives here.
property_label = "P106"  # property identifier; selects the claims.<property>.tsv input file
label = 'Yval'  # name of the target column in the generated training table
percent_train_data = .75  # FRACTION (0-1) of the loaded rows used for training; the rest is test data
# Seed the `random` module so that the shuffle and the per-node label sampling done in
# later cells give the same result on every Restart & Run All.
random_seed = 42
random.seed(random_seed)

In [23]:
# Load every claim row of the chosen property and index the claims by subject:
#   all_lines - every data row of the claims file (header excluded)
#   node_map  - first column of a row -> list of the third-column values seen for it
all_lines = []
node_map = {}
print("Reading claims data for property",property_label,"...(may take some time depending on chosen property)")
claims_path = f"../data/propertiesSplit_final/claims.{property_label}.tsv"
with open(claims_path, encoding="utf-8") as in_file:
    tsv_reader = csv.reader(in_file, delimiter="\t")
    # The first row is the column header; it is kept for the shuffled copy written later.
    header = next(tsv_reader)
    for row in tsv_reader:
        all_lines.append(row)
        # setdefault creates the list on first sight of a subject, then appends to it
        node_map.setdefault(row[0], []).append(row[2])
print(property_label,"claims:",len(all_lines))
print("Found",len(node_map),"distinct nodes with at least one",property_label,"relationship")

Reading claims data for property P106 ...(may take some time depending on chosen property)
P106 claims: 8273101
Found 6339032 distinct nodes with at least one P106 relationship


In [24]:
# Write a shuffled copy of the claims (same header, rows in random order) next to the input file.
print("Creating shuffled output file...")
# NOTE: shuffles in place, so all_lines stays in shuffled order for any later cell.
random.shuffle(all_lines)
with open(f"../data/propertiesSplit_final/claims.{property_label}.shuffled.tsv", "w", encoding="utf-8", newline='') as out_file:
    tsv_writer = csv.writer(out_file, delimiter="\t")
    tsv_writer.writerow(header)
    tsv_writer.writerows(all_lines)
# Report the file that was actually written (the ".shuffled" copy, not the input file).
print("Shuffled output written to claims."+property_label+".shuffled.tsv")

Creating shuffled output file...
Shuffled output written to claims.P106.tsv


In [25]:
# Build the embeddings-based training table: one row per embedded node that has at least
# one claim in node_map, labelled with one of that node's claim values picked at random.
# The file is only generated when it does not exist yet.
transE_out_path = f"../data/link_prediction_data/{property_label}.transE.tsv"
if exists(transE_out_path):
    print(f"File ../data/link_prediction_data/{property_label}.transE.tsv already exists!")
else:
    # Column layout: node id, 100 embedding positions (pos0..pos99), then the label column.
    transEheader = ['Qnode'] + ["pos"+str(i) for i in range(100)] + ["Yval"]
    linecount = 0
    print("Reading the transE file...(this will take some time)")
    with open(f"../embeddings/profile_graph_embeddings.transE.tsv", encoding="utf-8") as in_file, \
            open(transE_out_path, "w", encoding="utf-8", newline='') as out_file:
        # The embeddings file is space-delimited; the output is tab-delimited.
        tsv_reader = csv.reader(in_file, delimiter=" ")
        tsv_writer = csv.writer(out_file, delimiter="\t")
        # Skip the first line of the embeddings file (presumably a header/summary line - TODO confirm).
        next(tsv_reader)
        tsv_writer.writerow(transEheader)
        for line in tsv_reader:
            qnode = line[0]
            if qnode not in node_map:
                continue  # node has no claim for this property, so it cannot be labelled
            # Copy the embedding row unchanged and append one randomly chosen claim value.
            tsv_writer.writerow(line + [random.choice(node_map[qnode])])
            linecount += 1
    print("Created training data file for property",property_label,"with",linecount,"rows")

Reading the transE file...(this will take some time)
Created training data file for property P106 with 5852167 rows


In [28]:
# Make a positional train/test split of the generated training table.
# Format the fraction as a percentage arithmetically: slicing the string form of the
# float (the old approach) printed "8 %" for 0.8 and "725 %" for 0.725.
print("Making train/test split for property",property_label,"with",f"{percent_train_data * 100:g}","% training data...")
# Cap on the TOTAL number of rows used (training + test together).
max_rows = 200000
# NOTE: the complete file is loaded before the slice is taken; only the first max_rows
# rows, in file order, are kept.
all_data = TabularDataset(f"../data/link_prediction_data/{property_label}.transE.tsv")[:max_rows]
print("Data length:",len(all_data))
# Rows before split_index are used for training, the remainder for testing.
split_index = int(len(all_data)*percent_train_data)
train_data = all_data[:split_index]
test_data = all_data[split_index:]
print("Training data size:",len(train_data))
print("Test data size:",len(test_data))

Making train/test split for property P106 with 75 % training data...


Loaded data from: ../data/link_prediction_data/P106.transE.tsv | Columns = 102 / 102 | Rows = 5852167 -> 5852167


Data length: 200000
Training data size: 150000
Test data size: 50000


In [29]:
# Train the AutoGluon models on the training split and report which one performed best.
save_path = f'../embeddings/models/agModels.{property_label}.transE'
label_summary = train_data[label].describe()
print("Summary of class variable: \n", label_summary)
# Model families to train, each with its default settings (empty dict = no overrides).
model_hyperparameters = {'NN':{},'GBM':{},'XT':{},'KNN':{}}
# label_count_threshold is handed to the learner; presumably it controls how many examples
# a class needs in order to be kept for training - TODO confirm against the AutoGluon docs.
predictor = TabularPredictor(
    label=label,
    path=save_path,
    learner_kwargs={"label_count_threshold":10},
).fit(train_data, hyperparameters=model_hyperparameters)
best_model = predictor.get_model_best()
print("Best model:",best_model)
print(f"Model saved to ../embeddings/models/agModels.{property_label}.transE")

	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "../embeddings/models/agModels.P106.transE\"
AutoGluon Version:  0.4.0
Python Version:     3.8.0
Operating System:   Windows
Train Data Rows:    150000
Train Data Columns: 101
Label Column: Yval
Preprocessing data ...
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == object).
	First 10 (of 2568) unique label values:  ['Q1650915', 'Q5716684', 'Q3391743', 'Q1234713', 'Q1800680', 'Q13381376', 'Q47064', 'Q1622272', 'Q189290', 'Q82955']
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])


Summary of class variable: 
 count       150000
unique        2568
top       Q1650915
freq         42885
Name: Yval, dtype: object


Fraction of data from classes with at least 7 examples that will be kept for training models: 0.9760866666666667
Train Data Class Count: 764
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...
	Available Memory:                    17975.43 MB
	Train Data (Original)  Memory Usage: 126.75 MB (0.7% of available memory)
	Inferring data type of each feature based on column values. Set feature_metadata_in to manually specify special dtypes of the features.
	Stage 1 Generators:
		Fitting AsTypeFeatureGenerator...
	Stage 2 Generators:
		Fitting FillNaFeatureGenerator...
	Stage 3 Generators:
		Fitting IdentityFeatureGenerator...
		Fitting CategoryFeatureGenerator...
			Fitting CategoryMemoryMinimizeFeatureGenerator...
	Stage 4 Generators:
		Fitting DropUniqueFeatureGenerator...
	Unused Original Features (Count: 1): ['Qnode']
		These features were not used to generate any of the output features. Add a feature generator compatible with these features to u

Best model: WeightedEnsemble_L2
Model saved to ../embeddings/models/agModels.P106.transE


In [34]:
# Evaluate the predictor on the held-out test rows.
# Uncomment the 2 lines below to load the saved model; this is not necessary if the
# training cell above has just been run. The path is the save_path used by that cell.
# save_path = f'../embeddings/models/agModels.{property_label}.transE'
# predictor = TabularPredictor.load(save_path)
print("Making predictions over test data...")
y_test = test_data[label]
test_data_nolab = test_data.drop(columns=[label])
y_pred = list(predictor.predict(test_data_nolab))
entries = list(test_data["Qnode"])
# A prediction counts as correct when it equals ANY of the claim values recorded for the
# node in node_map, not only the single value that was sampled into the label column.
correct = sum(1 for entry, prediction in zip(entries, y_pred) if prediction in node_map[entry])
total = len(entries)
if total:
    print("Overall accuracy:",(correct/total))
else:
    # Avoid a ZeroDivisionError when the test split is empty.
    print("Overall accuracy: n/a (test data is empty)")

Making predictions over test data...
Overall accuracy: 0.73622


In [99]:
# Show the predicted class probabilities, highest first, for a few hand-picked nodes.
node_list = ["Q34969","Q38111"]
# Headings printed for the nodes above; a node without an entry here gets no heading.
node_names = {"Q34969": "Benjamin Franklin", "Q38111": "Leonardo DiCaprio"}

# The rows of the requested nodes are copied from the full training table into a small
# file. It is created on demand so that this cell also works on a fresh checkout /
# after Restart & Run All, and is reused when it already exists.
custom_test_path = f"../data/temp/custom_test_{property_label}.tsv"
if not exists(custom_test_path):
    print("Extracting rows for the requested nodes...")
    os.makedirs(os.path.dirname(custom_test_path), exist_ok=True)
    wanted_nodes = set(node_list)
    found = 0
    with open(f"../data/link_prediction_data/{property_label}.transE.tsv", encoding="utf-8") as in_file, \
            open(custom_test_path, "w", encoding="utf-8", newline='') as out_file:
        tsv_reader = csv.reader(in_file, delimiter="\t")
        tsv_writer = csv.writer(out_file, delimiter="\t")
        tsv_writer.writerow(next(tsv_reader))  # copy the header row unchanged
        for line in tsv_reader:
            if line and line[0] in wanted_nodes:
                tsv_writer.writerow(line)
                found += 1
                # Stop scanning the large file early; assumes each node has a single row.
                if found == len(wanted_nodes):
                    break

preddata = TabularDataset(custom_test_path).drop(columns=[label])
probabilities = predictor.predict_proba(preddata)
# Layout used below: {class label: {row index: probability}}
probabilities_dict = probabilities.to_dict()

for node in node_list:
    matching_rows = preddata.index[preddata["Qnode"] == node].tolist()
    if not matching_rows:
        # e.g. a reused file that was generated for a different node_list
        print(f"No row for {node} in {custom_test_path}; skipping")
        continue
    index = matching_rows[0]
    # Keep only the classes with a non-zero probability for this node. A separate name is
    # used so that node_list (the list being iterated) is not rebound to a dict.
    node_probabilities = {
        key: class_probabilities[index]
        for key, class_probabilities in probabilities_dict.items()
        if class_probabilities[index] > 0
    }
    if node in node_names:
        print("Probabilities for", node_names[node])
    print(dict(sorted(node_probabilities.items(), key=lambda item: item[1], reverse=True)))

Loaded data from: ../data/temp/custom_test_P106.tsv | Columns = 102 / 102 | Rows = 2 -> 2


Probabilities for Benjamin Franklin
{'Q1622272': 0.4000000059604645, 'Q10873124': 0.20000000298023224, 'Q170790': 0.20000000298023224, 'Q593644': 0.20000000298023224}
Probabilities for Leonardo DiCaprio
{'Q2259451': 0.6000000238418579, 'Q2405480': 0.20000000298023224, 'Q2526255': 0.20000000298023224}
