In [18]:
!pip install transformers torch sklearn


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting sklearn
  Downloading sklearn-0.0.post4.tar.gz (3.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: sklearn
  Building wheel for sklearn (setup.py) ... [?25l[?25hdone
  Created wheel for sklearn: filename=sklearn-0.0.post4-py3-none-any.whl size=2973 sha256=e0bb1244238a0fe03cb367ecfc25a89fa3df893a8a52fdbc6b6742a81b17a19c
  Stored in directory: /root/.cache/pip/wheels/d5/b2/a9/590d15767d34955f20a9a033e8db973b79cb5672d95790c0a9
Successfully built sklearn
Installing collected packages: sklearn
Successfully installed sklearn-0.0.post4


In [11]:
import json
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

# Load the training set: newline-delimited JSON, one document per line.
# UTF-8 is requested explicitly so decoding does not depend on the platform default.
data = []
with open('train.json', 'r', encoding='utf-8') as file:
    for line in file:
        if line.strip():  # tolerate blank / trailing empty lines
            data.append(json.loads(line))

# Preprocess the data: collect one (subject, object, relation) triple per fact.
# NOTE(review): only the first passage of each document is read, exactly as in
# the original code -- confirm that ignoring later passages is intended.
subjects, objects, relations = [], [], []
for item in data:
    # Slicing ([:1]) keeps the first-passage-only behaviour but does not raise
    # IndexError for a document whose passage list is empty.
    for passage in item["passages"][:1]:
        for fact in passage["facts"]:
            subjects.append(fact["subjectText"])
            objects.append(fact["objectText"])
            relations.append(fact["propertyId"])
print(subjects[10])
print(objects[10])


# Vectorize using Bag of Words.
# Subjects and objects each get their own vectorizer: refitting one shared
# instance overwrites the vocabulary learned from the subjects, so afterwards
# no fitted object could transform new subject strings consistently.
subject_vectorizer = CountVectorizer()
object_vectorizer = CountVectorizer()
subject_vectors = subject_vectorizer.fit_transform(subjects)
object_vectors = object_vectorizer.fit_transform(objects)
# Alias kept for any other cell that still refers to `vectorizer`; in the
# original code that name ended up fitted on the objects.
vectorizer = object_vectorizer


# Concatenate subject and object vectors column-wise into one feature matrix.
# The matrices are densified here (as in the original), which can be
# memory-hungry for large vocabularies.
X = np.hstack((subject_vectors.toarray(), object_vectors.toarray()))
y = np.array(relations)

# Split data into training and testing sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
print(X_test.shape)
print(X_test[100])
clf.fit(X_train, y_train)

# Evaluate the classifier on the held-out split
y_pred = clf.predict(X_test)
print(y_pred[10])
print(classification_report(y_test, y_pred))


James Patrick Harris
American
(1469, 7111)
[0 0 0 ... 0 0 0]
11
              precision    recall  f1-score   support

           1       0.94      0.49      0.65        67
          10       0.92      0.80      0.86        85
          11       0.26      0.83      0.40       139
          12       0.75      0.46      0.57       132
          14       0.70      0.51      0.59        63
          15       0.86      0.76      0.80        90
           2       0.47      0.27      0.34        81
          25       0.68      0.44      0.53       125
           3       0.79      0.51      0.62       181
          34       0.59      0.46      0.52       102
           4       0.53      0.48      0.50        71
          45       0.91      0.76      0.83        51
           5       0.72      0.37      0.49        71
           6       0.60      0.75      0.66       102
           9       0.73      0.81      0.77       109

    accuracy                           0.58      1469
   macro avg    

In [4]:
import json

# Read the original JSON data line by line (one JSON document per line).
# UTF-8 is requested explicitly so decoding does not depend on the platform default.
data = []
with open('train.json', 'r', encoding='utf-8') as file:
    for line in file:
        record = line.strip()
        if record:  # json.loads('') raises, so skip blank / trailing empty lines
            data.append(json.loads(record))

# Define the relations you want to keep
relations_to_keep = {
    'DATE_OF_BIRTH',
    'RESIDENCE',
    'BIRTHPLACE',
    'NATIONALITY',
    'EMPLOYEE_OF',
    'EDUCATED_AT'
}

# Create a new subset of documents. Filtering is done at passage level: a
# passage is kept, unchanged, when at least one of its
# 'exhaustivelyAnnotatedProperties' has a 'propertyName' in relations_to_keep.
# The contents of a kept passage are not filtered any further.
subset = []
for entry in data:
    new_passages = [
        passage
        for passage in entry['passages']
        if any(
            prop['propertyName'] in relations_to_keep
            for prop in passage['exhaustivelyAnnotatedProperties']
        )
    ]

    # Documents left with no matching passage are dropped entirely.
    if new_passages:
        # Shallow copy so the entry held in `data` keeps its full passage list.
        new_entry = entry.copy()
        new_entry['passages'] = new_passages
        subset.append(new_entry)

# Save the subset to a new JSON file, again one document per line
with open('subset_train.json', 'w', encoding='utf-8') as file:
    for item in subset:
        json.dump(item, file)
        file.write('\n')
