In [None]:
import json
import matplotlib.pyplot as plt
from collections import Counter

# Function to read JSON file and return data
def read_json(file_path):
    """Load a JSON file and return its parsed contents.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # locale-dependent default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Function to bin the counts into specified ranges
def bin_counts(data):
    """Tally how many values of ``data`` fall into each fixed count range.

    Every range excludes its lower bound and includes its upper bound (a
    value of exactly 500000 is counted under '300,000 to 500,000'). Any value
    that is not greater than 10 is counted under '1 to 10'.

    Args:
        data: Mapping whose values are the counts to classify.

    Returns:
        dict mapping each range label to the number of values in that range,
        ordered from the largest range to the smallest.
    """
    # (exclusive lower bound, label), checked from the largest bound down.
    thresholds = (
        (500000, 'Over 500,000'),
        (300000, '300,000 to 500,000'),
        (100000, '100,000 to 300,000'),
        (50000, '50,000 to 100,000'),
        (10000, '10,000 to 50,000'),
        (5000, '5,000 to 10,000'),
        (1000, '1,000 to 5,000'),
        (500, '500 to 1,000'),
        (100, '100 to 500'),
        (50, '50 to 100'),
        (10, '10 to 50'),
    )
    lowest_label = '1 to 10'

    tally = {label: 0 for _, label in thresholds}
    tally[lowest_label] = 0

    for value in data.values():
        for lower_bound, label in thresholds:
            if value > lower_bound:
                tally[label] += 1
                break
        else:
            # No threshold exceeded: the value belongs to the lowest range.
            tally[lowest_label] += 1

    return tally

# Function to plot the counts
def plot_counts(bins):
    """Draw and show a bar chart of the binned verb counts.

    Args:
        bins: Mapping of range label -> number of verbs in that range. Bars
            are drawn in the mapping's iteration order.
    """
    range_labels = list(bins)
    verbs_per_range = [bins[label] for label in range_labels]

    plt.figure(figsize=(14, 8))
    plt.bar(range_labels, verbs_per_range, color='skyblue')
    plt.title('Binned Counts of Verb Occurrences in Wikipedia')
    plt.xlabel('Count Ranges')
    plt.ylabel('Number of Verbs')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# Main function
def main(file_path):
    """Read the counts stored in ``file_path``, bin them and plot the bins."""
    plot_counts(bin_counts(read_json(file_path)))

# Example usage: bin and plot the counts stored in the JSON file named below.
if __name__ == "__main__":
    file_path = 'wiki_knowledge/verb_counts_0.json'  # Replace with your JSON file path
    main(file_path)


In [None]:
import json
import matplotlib.pyplot as plt
import numpy as np

# Function to read JSON file and return data
def read_json(file_path):
    """Load a JSON file and return its parsed contents.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The Python object produced by ``json.load`` for the file's contents.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # locale-dependent default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

# Function to dynamically build bins based on a Zipfian distribution
def build_zipfian_bins(data, num_bins=20):
    """Group the values of ``data`` into rank-based bins of log-spaced size.

    The values are sorted in descending order and the sorted list is cut at
    log-spaced rank positions (from 1 up to ``len(data)`` in ``num_bins``
    geometric steps, truncated to integers and de-duplicated). Early bins
    therefore hold a few of the largest values and later bins hold many small
    ones.

    Args:
        data: Mapping whose values are the numbers to bin.
        num_bins: Number of geometric steps used to generate the cut points.
            The number of bins returned can be smaller, because duplicate cut
            points and duplicate labels are merged.

    Returns:
        dict mapping a ``"<largest>-<smallest>"`` label (the actual extreme
        values inside the bin) to the number of values in that bin, in rank
        order. Bins that end up with the same label are merged. An empty
        ``data`` yields an empty dict.
    """
    values = sorted(data.values(), reverse=True)
    n = len(values)
    if n == 0:
        # Nothing to bin; also avoids log10(0) and indexing an empty list.
        return {}

    # Log-spaced rank cut points. Truncation to int can create duplicates,
    # and cut points at or beyond n are dropped because the last bin always
    # runs to the end of the list.
    cut_points = np.logspace(0, np.log10(n), num_bins + 1, base=10).astype(int)
    bin_edges = sorted({int(edge) for edge in cut_points if 0 < edge < n})

    # Consecutive boundaries delimit half-open rank ranges [start, stop).
    # With a single value there are no interior edges and one bin results.
    boundaries = [0, *bin_edges, n]

    bins = {}
    for start, stop in zip(boundaries, boundaries[1:]):
        label = f"{values[start]}-{values[stop - 1]}"
        # The bin size is the width of its rank range; accumulate so that
        # bins sharing a label are merged rather than overwritten.
        bins[label] = bins.get(label, 0) + (stop - start)

    return bins

# Function to plot the counts
def plot_counts(bins):
    """Show a bar chart of the bins, then print every bin as ``label: count``.

    Args:
        bins: Mapping of value-range label -> number of items in that range.
            Bars and printed lines follow the mapping's iteration order.
    """
    range_labels = list(bins)
    verbs_per_range = [bins[label] for label in range_labels]

    plt.figure(figsize=(14, 8))
    plt.bar(range_labels, verbs_per_range, color='skyblue')
    plt.title('Binned Counts of Verb Occurrences Based on Zipfian Distribution')
    plt.xlabel('Value Ranges')
    plt.ylabel('Number of Verbs')
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

    # Echo the plotted numbers as text underneath the chart.
    for range_label, verb_count in zip(range_labels, verbs_per_range):
        print(f"{range_label}: {verb_count}")

# Main function
def main(file_path, num_bins=20):
    """Read the counts in ``file_path``, bin them by rank and plot the bins.

    Args:
        file_path: Path of the JSON file holding the counts.
        num_bins: Passed through to ``build_zipfian_bins``.
    """
    plot_counts(build_zipfian_bins(read_json(file_path), num_bins))

# Example usage: bin and plot the counts stored in the JSON file named below.
if __name__ == "__main__":
    file_path = 'wiki_knowledge/link_counts.json'  # Replace with your JSON file path
    main(file_path, num_bins=20)  # Adjust num_bins for more or fewer bins


In [None]:
import matplotlib.pyplot as plt

# Bin summaries, one string per bin, written as "<first>-<second>: <count>"
bins_counts = [
    "10796855-10796855: 1",
    "5777540-5777540: 1",
    "3679427-2504580: 2",
    "2234089-1674826: 4",
    "1612306-1242745: 6",
    "1242107-940140: 10",
    "928497-779129: 17",
    "778630-566576: 28",
    "564589-360152: 49",
    "358906-225738: 84",
    "225639-133407: 141",
    "133345-74552: 241",
    "74545-36100: 410",
    "36082-15853: 696",
    "15847-5803: 1184",
    "5800-1768: 2014",
    "1764-388: 3424",
    "388-51: 5823",
    "51-6: 9902",
    "6-1: 16838"
]

# Keep only the number that precedes the first '-' of every entry
leftmost_numbers = [int(entry.partition('-')[0]) for entry in bins_counts]

# Line plot of those numbers against their position in the list
plt.figure(figsize=(14, 8))
plt.plot(leftmost_numbers, linestyle='-', marker='o')
plt.title('Leftmost Number in Each Bin')
plt.xlabel('Bin Index')
plt.ylabel('Leftmost Number')
plt.yscale('log')  # Log scale to better visualize the Zipfian distribution
plt.grid(True)
plt.show()

# Also print the extracted numbers for reference
print("Leftmost numbers:", leftmost_numbers)


In [None]:
import matplotlib.pyplot as plt

# Bin summaries, one string per bin, written as "<first>-<second>: <count>"
bins_counts = [
    "10796855-10796855: 1",
    "5777540-5777540: 1",
    "3679427-2504580: 2",
    "2234089-1674826: 4",
    "1612306-1242745: 6",
    "1242107-940140: 10",
    "928497-779129: 17",
    "778630-566576: 28",
    "564589-360152: 49",
    "358906-225738: 84",
    "225639-133407: 141",
    "133345-74552: 241",
    "74545-36100: 410",
    "36082-15853: 696",
    "15847-5803: 1184",
    "5800-1768: 2014",
    "1764-388: 3424",
    "388-51: 5823",
    "51-6: 9902",
    "6-1: 16838"
]

# Keep only the number that precedes the first '-' of every entry
leftmost_numbers = [int(entry.partition('-')[0]) for entry in bins_counts]

# Line plot of those numbers against their position in the list
plt.figure(figsize=(14, 8))
plt.plot(leftmost_numbers, linestyle='-', marker='o')
plt.title('Leftmost Number in Each Bin')
plt.xlabel('Bin Index')
plt.ylabel('Leftmost Number')
plt.yscale('log')  # Log scale to better visualize the Zipfian distribution
plt.grid(True)
plt.gca().invert_yaxis()  # Invert the y-axis to show the distribution in the "upside-down" format
plt.show()

# Also print the extracted numbers for reference
print("Leftmost numbers:", leftmost_numbers)


In [None]:
import sqlite3
import json

def create_table(cursor):
    """Ensure the ``records`` table (id, key, value) exists.

    Args:
        cursor: Cursor on which the ``CREATE TABLE IF NOT EXISTS`` statement
            is executed. Nothing is committed here.
    """
    create_records_sql = '''
        CREATE TABLE IF NOT EXISTS records (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            key TEXT,
            value TEXT
        )
    '''
    cursor.execute(create_records_sql)

def insert_record(cursor, key, value):
    """Add one ``(key, value)`` row to the ``records`` table.

    Args:
        cursor: Cursor on which the parameterized INSERT is executed. Nothing
            is committed here.
        key: Value stored in the ``key`` column.
        value: Value stored in the ``value`` column.
    """
    insert_sql = '''
        INSERT INTO records (key, value)
        VALUES (?, ?)
    '''
    row_values = (key, value)
    cursor.execute(insert_sql, row_values)

def retrieve_record(cursor, key):
    """Return the ``value`` of the first ``records`` row matching ``key``.

    Args:
        cursor: Cursor on which the parameterized SELECT is executed.
        key: Key to look up.

    Returns:
        The stored value, or ``None`` when no row has that key.
    """
    lookup_sql = '''
        SELECT value FROM records WHERE key=?
    '''
    cursor.execute(lookup_sql, (key,))
    match = cursor.fetchone()
    if not match:
        return None
    return match[0]

def load_db(db_filename):
    """Open the SQLite database at ``db_filename`` and return a cursor on it.

    The connection itself is not returned; it remains reachable through the
    cursor's ``connection`` attribute.

    Args:
        db_filename: Path of the database file handed to ``sqlite3.connect``.

    Returns:
        A new ``sqlite3.Cursor`` for the opened connection.
    """
    return sqlite3.connect(db_filename).cursor()

def test_database(cursor, target):
    """Look up ``target`` in the database and print its JSON-decoded value.

    This only reads: it does not create tables or insert records.

    Args:
        cursor: Cursor passed to ``retrieve_record`` for the lookup.
        target: Key to look up.

    Raises:
        json.JSONDecodeError: If the stored value is not valid JSON.
    """
    value_str = retrieve_record(cursor, target)
    if value_str is None:
        # retrieve_record returns None when nothing is found; json.loads(None)
        # would raise a TypeError, so report the miss instead.
        print(f"Key: {target} not found")
        return
    value = json.loads(value_str)
    print(f"Key: {target}, Value: {value}")

# Example usage: open the database and keep a module-level cursor that
# later cells use for lookups.
db_filename = 'wikilinksdata.db'
cursor = load_db(db_filename)



In [None]:
# Look up a single key; relies on the `cursor` returned by load_db() above.
test_database(cursor, "Ronald Reagan")

In [49]:
import sqlite3
import pandas as pd

# Function to read a preview of a table (default: level1unified) into a pandas DataFrame
def query_level1_table(db_filename, table_name='level1unified', limit=500):
    """Return the first ``limit`` rows of ``table_name`` as a pandas DataFrame.

    Args:
        db_filename: Path of the SQLite database file.
        table_name: Table to read. Defaults to ``level1unified``.
        limit: Maximum number of rows to fetch. Defaults to 500.

    Returns:
        A DataFrame holding the fetched rows, with the table's columns.

    Raises:
        pandas.errors.DatabaseError: If the query fails (e.g. missing table).
    """
    # Double-quote the identifier (doubling any embedded quote) so the table
    # name cannot break out of the statement; the row limit is bound as a
    # query parameter.
    quoted_table = '"' + table_name.replace('"', '""') + '"'
    conn = sqlite3.connect(db_filename)
    try:
        return pd.read_sql_query(
            f'SELECT * FROM {quoted_table} LIMIT ?', conn, params=(limit,)
        )
    finally:
        # Release the connection even when the query raises.
        conn.close()

# Function to get the total number of rows in a table (default: level1unified)
def get_row_count(db_filename, table_name='level1unified'):
    """Return the total number of rows in ``table_name``.

    Args:
        db_filename: Path of the SQLite database file.
        table_name: Table whose rows are counted. Defaults to
            ``level1unified``.

    Returns:
        The row count as an int.

    Raises:
        sqlite3.OperationalError: If the table does not exist.
    """
    # Double-quote the identifier (doubling any embedded quote) so the table
    # name cannot break out of the statement.
    quoted_table = '"' + table_name.replace('"', '""') + '"'
    conn = sqlite3.connect(db_filename)
    try:
        cursor = conn.execute(f'SELECT COUNT(*) FROM {quoted_table}')
        return cursor.fetchone()[0]
    finally:
        # Release the connection even when the query raises.
        conn.close()

# Example usage
db_filename = 'database/wikilinks_main.db'
# Fetch a preview of the table (the query itself caps how many rows are read)
df_level1 = query_level1_table(db_filename)
# Get and display the total number of rows
total_rows = get_row_count(db_filename)
print(f"Total number of rows in the table: {total_rows}")
# Display the first 50 rows of the preview DataFrame
df_level1.head(50)




Total number of rows in the table: 22946567


Unnamed: 0,id,keys,name,names
0,1,"[3075584, 10290947, 9859588, 8950408, 2739723,...",Draft:Gysbert Reitz Hofmeyr (1871-1942),"[""Historical"", ""Coloured"", ""References"", ""Albe..."
1,2,"[10040322, 4850180, 5428238, 6678032, 4922386,...",List of solved missing person cases: pre-2000,"[""JewishEncyclopedia.com"", ""Islamic"", ""Babysit..."
2,3,"[603905, 1582211, 603910, 8626322, 648212, 100...",Bakugan: Armored Alliance,"[""7a"", ""Secrets"", ""Bakugan"", ""3b"", ""Premieres""..."
3,4,"[4646912, 9859588, 1500677, 4196888, 7182368, ...",History of Palestine,"[""Islamic"", ""Neo-Babylonian"", ""3700"", ""Rural"",..."
4,5,"[5169409, 9464577, 4261635, 10314372, 2801415,...",List of acts of the Parliament of Great Britai...,"[""Shore"", ""Amsinck"", ""Merchants"", ""Terms"", ""Ri..."
5,6,"[3209224, 2739723, 8046606, 6678032, 1329173, ...",Foreign relations of Australia,"[""Islamic"", ""Nuku'alofa"", ""RAMSI"", ""Right"", ""T..."
6,7,"[4920713, 2709130, 10704397, 7544980, 7068571,...",List of The Beverly Hillbillies episodes,"[""Topless"", ""References"", ""Muscles"", ""Hungry"",..."
7,8,"[9471876, 10174855, 3746056, 3562634, 4198027,...",List of In Our Time programmes,"[""Historical"", ""Patristic"", ""Islamic"", ""Archit..."
8,9,"[3808131, 5083284, 11119124, 14491, 6488363, 2...",List of cases of the January 6 United States C...,"[""Iowan"", ""Sweet"", ""Leader"", ""Shalvey"", ""Insur..."
9,10,"[9859588, 7193611, 8496653, 9738253, 9494035, ...",2023 in American television,"[""Right"", ""Blacklist"", ""Rural"", ""WNBA"", ""Premi..."


In [2]:
import sqlite3

def create_index(db_filename, table_name, column_name, index_name=None):
    """Create an index on one column of a table if no such index exists.

    Args:
        db_filename: Path of the SQLite database file.
        table_name: Table that owns the column.
        column_name: Column to index.
        index_name: Name of the index. Defaults to ``idx_<column_name>``.
            ``IF NOT EXISTS`` matches on the index name only, so pass an
            explicit name when the same column name is indexed in more than
            one table; otherwise the second call is a silent no-op.

    Raises:
        sqlite3.OperationalError: If the table or column does not exist.
    """
    if index_name is None:
        index_name = f'idx_{column_name}'

    conn = sqlite3.connect(db_filename)
    try:
        conn.execute(f'CREATE INDEX IF NOT EXISTS {index_name} ON {table_name}({column_name})')
        conn.commit()
    finally:
        # Release the connection even when the statement raises.
        conn.close()
    print(f"Index on {column_name} created successfully.")

# Example usage: index the `emb_idx` column of the `records` table
db_filename = 'wikilinksdata.db'
table_name = 'records'
column_name = 'emb_idx'
create_index(db_filename, table_name, column_name)


Index on emb_idx created successfully.


In [3]:
import sqlite3
import pandas as pd

# Function to read a preview of a table (default: embeddings) into a pandas DataFrame
def query_level1_table(db_filename, table_name='embeddings', limit=5):
    """Return the first ``limit`` rows of ``table_name`` as a pandas DataFrame.

    Args:
        db_filename: Path of the SQLite database file.
        table_name: Table to read. Defaults to ``embeddings``.
        limit: Maximum number of rows to fetch. Defaults to 5.

    Returns:
        A DataFrame holding the fetched rows, with the table's columns.

    Raises:
        pandas.errors.DatabaseError: If the query fails (e.g. missing table).
    """
    # Double-quote the identifier (doubling any embedded quote) so the table
    # name cannot break out of the statement; the row limit is bound as a
    # query parameter.
    quoted_table = '"' + table_name.replace('"', '""') + '"'
    conn = sqlite3.connect(db_filename)
    try:
        return pd.read_sql_query(
            f'SELECT * FROM {quoted_table} LIMIT ?', conn, params=(limit,)
        )
    finally:
        # Release the connection even when the query raises.
        conn.close()

# Function to count the embeddings rows whose "values" entry is longer than 5
def get_row_count(db_filename):
    """Count the rows of ``embeddings`` where ``LENGTH("values") > 5``.

    This is not the table's total row count: rows whose ``values`` entry has
    a length of 5 or less (for example an empty list ``[]``) are excluded.

    Args:
        db_filename: Path of the SQLite database file.

    Returns:
        The number of matching rows as an int.

    Raises:
        sqlite3.OperationalError: If the ``embeddings`` table does not exist.
    """
    conn = sqlite3.connect(db_filename)
    try:
        # NOTE(review): SQLite may treat a double-quoted name that matches no
        # column as a string literal, in which case the filter would be
        # always true. Confirm the table really has a "values" column.
        cursor = conn.execute('SELECT COUNT(*) FROM embeddings WHERE LENGTH("values") > 5')
        return cursor.fetchone()[0]
    finally:
        # Release the connection even when the query raises.
        conn.close()

# Example usage
db_filename = 'database/final_embedding.db'
df_level1 = query_level1_table(db_filename)
# get_row_count() in this cell counts only the rows with LENGTH("values") > 5,
# so the number is labelled as such rather than as the table's total.
total_rows = get_row_count(db_filename)
print(f'Number of rows with LENGTH("values") > 5: {total_rows}')
# Display the preview rows
df_level1.head(50)




Total number of rows in the table: 12983465


Unnamed: 0,id,emb_idx,values
0,1,UNKNOWN,"[[0, 0.02531645569620253], [1, 0.0238095238095..."
1,2,2881033,"[[0, 0.43037974683544306], [1, 0.5119047619047..."
2,3,1094758,"[[0, 0.6329113924050633], [1, 0.67857142857142..."
3,4,UNKNOWN,[]
4,5,303091,"[[0, 0.4177215189873419], [1, 0.28571428571428..."


In [9]:
import sqlite3

def delete_all_from_table(db_filename, table_name):
    """Delete every row of ``table_name`` and commit the change.

    The table itself is kept; only its rows are removed.

    Args:
        db_filename: Path of the SQLite database file.
        table_name: Table to empty. It is interpolated into the SQL text
            as-is, so it must be a trusted identifier.

    Raises:
        sqlite3.OperationalError: If the table does not exist.
    """
    conn = sqlite3.connect(db_filename)
    try:
        conn.execute(f'DELETE FROM {table_name}')
        conn.commit()
    finally:
        # Release the connection even when the statement raises.
        conn.close()
    print(f"All contents from table {table_name} have been deleted.")

# Example usage
# WARNING: destructive. Every run of this cell removes all rows from the
# table named below.
db_filename = 'wikilinksdata.db'
table_name = 'level1'
delete_all_from_table(db_filename, table_name)


All contents from table level1 have been deleted.


In [None]:
import sqlite3

def reset_level1_table(db_filename):
    """Drop the ``embeddings`` table from the database, if it exists.

    Despite the function's name, the statement targets ``embeddings``, not
    ``level1``. The table is only dropped; nothing here recreates it.

    Args:
        db_filename: Path of the SQLite database file.
    """
    conn = sqlite3.connect(db_filename)
    try:
        conn.execute('DROP TABLE IF EXISTS embeddings')
        conn.commit()
    finally:
        # Release the connection even when the statement raises.
        conn.close()
    # Report what actually happened: the table is gone, not reset.
    print("The embeddings table has been dropped; it must be recreated before it can be used again.")

# Example usage
# WARNING: destructive. reset_level1_table() drops the `embeddings` table of
# the database named below.
db_filename = 'embeddings.db'
reset_level1_table(db_filename)


In [57]:
import sqlite3

def count_rows_in_table(db_filename, table_name, limit=None):
    """Fetch the rows of ``table_name``.

    Despite its name, this returns the rows themselves, not a count; use
    ``len()`` on the result to get the number of rows fetched.

    Args:
        db_filename: Path of the SQLite database file.
        table_name: Table to read. It is interpolated into the SQL text
            as-is, so it must be a trusted identifier.
        limit: Optional maximum number of rows to fetch. ``None`` (the
            default) loads the whole table into memory.

    Returns:
        A list of row tuples.

    Raises:
        sqlite3.OperationalError: If the table does not exist.
    """
    query = f'SELECT * FROM {table_name}'
    params = ()
    if limit is not None:
        query += ' LIMIT ?'
        params = (limit,)

    conn = sqlite3.connect(db_filename)
    try:
        return conn.execute(query, params).fetchall()
    finally:
        # Release the connection even when the query raises (e.g. the table
        # is missing).
        conn.close()

# Example usage
# NOTE: count_rows_in_table() returns the fetched rows (a list), so
# `row_count` holds rows and the slice below prints the first 20 of them.
db_filename = 'database/wikilinksdata_0.db'
table_name = 'embeddings'
row_count = count_rows_in_table(db_filename, table_name)
print(row_count[:20])


OperationalError: no such table: embeddings

In [18]:
import sqlite3
import pandas as pd
import json

# Function to query the level1 table and return a pandas DataFrame
def query_level1_table(db_filename):
    """Read the whole ``level1`` table into a pandas DataFrame.

    No row limit is applied, so the full table is loaded into memory.

    Args:
        db_filename: Path of the SQLite database file.

    Returns:
        A DataFrame with every row and column of ``level1``.

    Raises:
        pandas.errors.DatabaseError: If the query fails (e.g. missing table).
    """
    conn = sqlite3.connect(db_filename)
    try:
        return pd.read_sql_query('SELECT * FROM level1', conn)
    finally:
        # Release the connection even when the query raises.
        conn.close()

# Function to count the total number of keys (links) in a JSON-list column
def count_total_keys(df, column_name='keys'):
    """Sum the lengths of the JSON-encoded lists stored in one column.

    Args:
        df: DataFrame whose ``column_name`` column holds JSON text, one
            encoded collection per row.
        column_name: Column to read. Defaults to ``'keys'``.

    Returns:
        The total number of elements across all rows (0 for an empty frame).

    Raises:
        json.JSONDecodeError: If a cell does not contain valid JSON.
    """
    # Iterate the column directly; iterrows() would build a full Series for
    # every row just to read a single cell.
    return sum(len(json.loads(cell)) for cell in df[column_name])

# Example usage
# NOTE: query_level1_table() in this cell reads the whole level1 table
# (no LIMIT), so the full table is held in memory.
db_filename = 'wikilinksdata.db'
df_level1 = query_level1_table(db_filename)

# Count the total number of keys (links) in the 'keys' column
total_keys = count_total_keys(df_level1, 'keys')

# Display the total number of keys
print(f"Total number of keys in the table: {total_keys}")


Total number of keys in the table: 8008908


In [3]:
import sqlite3

def list_tables(db_filename):
    """Return the names of all tables recorded in the database's schema.

    Args:
        db_filename: Path of the SQLite database file.

    Returns:
        A list of table names read from ``sqlite_master``.

    Raises:
        sqlite3.DatabaseError: If the file is not a readable SQLite database.
    """
    conn = sqlite3.connect(db_filename)
    try:
        # sqlite_master holds one row per schema object; keep only tables.
        rows = conn.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
    finally:
        # Release the connection even when the query raises.
        conn.close()
    return [name for (name,) in rows]

def main(db_filename='embeddings.db'):
    """Print the name of every table in a SQLite database.

    Args:
        db_filename: Path of the database file. Defaults to
            ``'embeddings.db'``, the path this function previously hard-coded.
    """
    tables = list_tables(db_filename)

    print(f"Tables in {db_filename}:")
    for table in tables:
        print(table)

# Run the table listing defined in this cell.
main()


Tables in embeddings.db:
embeddings
sqlite_sequence


In [2]:
import sqlite3
import pandas as pd

# Function to read a preview of a table (default: Entities) into a pandas DataFrame
def query_level1_table(db_filename, table_name='Entities', limit=50):
    """Return the first ``limit`` rows of ``table_name`` as a pandas DataFrame.

    Args:
        db_filename: Path of the SQLite database file.
        table_name: Table to read. Defaults to ``Entities``.
        limit: Maximum number of rows to fetch. Defaults to 50.

    Returns:
        A DataFrame holding the fetched rows, with the table's columns.

    Raises:
        pandas.errors.DatabaseError: If the query fails (e.g. missing table).
    """
    # Double-quote the identifier (doubling any embedded quote) so the table
    # name cannot break out of the statement; the row limit is bound as a
    # query parameter.
    quoted_table = '"' + table_name.replace('"', '""') + '"'
    conn = sqlite3.connect(db_filename)
    try:
        return pd.read_sql_query(
            f'SELECT * FROM {quoted_table} LIMIT ?', conn, params=(limit,)
        )
    finally:
        # Release the connection even when the query raises.
        conn.close()

# Function to get the total number of rows in a table (default: Entities)
def get_row_count(db_filename, table_name='Entities'):
    """Return the total number of rows in ``table_name``.

    Args:
        db_filename: Path of the SQLite database file.
        table_name: Table whose rows are counted. Defaults to ``Entities``.

    Returns:
        The row count as an int.

    Raises:
        sqlite3.OperationalError: If the table does not exist.
    """
    # Double-quote the identifier (doubling any embedded quote) so the table
    # name cannot break out of the statement.
    quoted_table = '"' + table_name.replace('"', '""') + '"'
    conn = sqlite3.connect(db_filename)
    try:
        cursor = conn.execute(f'SELECT COUNT(*) FROM {quoted_table}')
        return cursor.fetchone()[0]
    finally:
        # Release the connection even when the query raises.
        conn.close()

# Example usage
db_filename = 'ukwacentity.db'
# Fetch a preview of the Entities table (the query in this cell uses LIMIT 50)
df_level1 = query_level1_table(db_filename)
# Get and display the total number of rows
total_rows = get_row_count(db_filename)
print(f"Total number of rows in the table: {total_rows}")
# Display the preview; head(500) exceeds the 50 rows fetched, so every
# fetched row is shown
df_level1.head(500)




Total number of rows in the table: 519315


Unnamed: 0,sentence,entities,tagged
0,If you feel that would like to make a donation...,"[""IsItFair couk Christine Melsom Willow Cottag...",If you feel that would like to make a donation...
1,Someone else you should contact is your local ...,"[""Philip Rees Director TEL"", ""FAX"", ""wwwresult...",Someone else you should contact is your local ...
2,That is it cannot play songs from Apple s onl...,"[""Apple"", ""iTunes Store"", ""Internet"", ""Tacoma ...",That is it cannot play songs from <entity> Ap...
3,Try the Logitech ChillStream,"[""Logitech ChillStream""]",Try the <entity> Logitech ChillStream </entity>
4,In Listening Post iPod transmitters to be lega...,"[""Listening Post"", ""Addict 3D"", ""MP3"", ""Decemb...",In <entity> Listening Post </entity> iPod tran...
5,One important point to note is that once aware...,"[""Duncan"", ""Robertson Struan"", ""Gordon Stewart...",One important point to note is that once aware...
6,1592 Hague Roll 1672 Alexander Duncan of Seas...,"[""Hague Roll"", ""Alexander Duncan"", ""Seaside"", ...",1592 <entity> Hague Roll </entity> 1672 <enti...
7,Motto Deo Juvante Vinco 1809 Alexander Duncan...,"[""Deo Juvante Vinco"", ""Alexander Duncan"", ""Par...",Motto <entity> Deo Juvante Vinco </entity> 18...
8,Motto Suffer 1876 MorisonDuncan of Naughton Q...,"[""MorisonDuncan"", ""Naughton"", ""Gules"", ""Argent...",Motto Suffer 1876 <entity> MorisonDuncan </en...
9,Motto Beneath Secundis dubiisque rectus 1897 ...,"[""Motto Beneath"", ""Secundis"", ""Lieut""]",<entity> Motto Beneath </entity> <entity> Sec...
