In [None]:
import sqlite3
import spacy
import de_dep_news_trf
import pandas as pd


In [None]:
def run_query(database_path, query, params=()):
    """Run a SQL query on a SQLite database and return the first column as strings.

    Args:
        database_path: Path to the SQLite database file. ``sqlite3.connect``
            creates the file if it does not exist yet.
        query: SQL statement to execute.
        params: Optional values bound to placeholders in ``query``
            (sequence or mapping). Defaults to no parameters.

    Returns:
        A list containing ``str(row[0])`` for every fetched row. If executing
        or fetching fails with ``sqlite3.Error``, the error is printed and an
        empty list is returned, so callers can always iterate over the result.

    Note:
        No ``commit()`` is issued; the connection is simply closed afterwards.
    """
    # Connection errors are intentionally not caught here and propagate to the caller.
    connection = sqlite3.connect(database_path)

    try:
        cursor = connection.cursor()
        cursor.execute(query, params)

        # Keep only the first column of each row, converted to str for further processing.
        return [str(row[0]) for row in cursor.fetchall()]

    except sqlite3.Error as e:
        print(f"An error occurred: {e}")
        # Return an empty list instead of an implicit None so callers' loops do not crash.
        return []

    finally:
        # Always release the database handle, on success and on failure.
        connection.close()

# spaCy German

In [None]:
# Shell escape: runs spaCy's download command for the "de_core_news_lg" package,
# the same name that a later cell passes to spacy.load().
!python -m spacy download de_core_news_lg

In [None]:
def extract_person_names(text, nlp_model, person_labels=("PERSON", "PER")):
    """Return the text of every person entity that ``nlp_model`` finds in ``text``.

    Args:
        text: The string to analyse.
        nlp_model: A callable (e.g. a loaded spaCy pipeline) that returns an
            object exposing ``ents``; each entity must provide ``text`` and
            ``label_``.
        person_labels: Entity labels treated as "person". Both ``"PERSON"``
            and ``"PER"`` are accepted by default, because the label name
            differs between pipelines (the German news pipelines use ``PER``).

    Returns:
        A list of entity texts, in the order the entities appear in ``doc.ents``.
    """
    doc = nlp_model(text)
    wanted = set(person_labels)
    return [ent.text for ent in doc.ents if ent.label_ in wanted]


In [55]:
import spacy

# Load the large German news pipeline (de_core_news_lg).
# Note: this is not the transformer package (de_dep_news_trf) imported at the top.
de_dep_nlp = spacy.load('de_core_news_lg')

database_path = 'openregister.db'  # Path to the SQLite database file
query = 'SELECT name FROM company LIMIT 100;'  # At most 100 names from the company table

# Fall back to an empty list if the query produced nothing usable, so the loop below is safe.
company_names = run_query(database_path, query) or []

results = []
# nlp.pipe processes the names as a batch instead of calling the pipeline once per name.
for company_name, doc in zip(company_names, de_dep_nlp.pipe(company_names)):
    # One record per token: the company it came from, the token text and its entity type.
    for token in doc:
        results.append({
            'Company Name': company_name,
            'Token': token.text,
            # ent_type_ is an empty string for tokens outside any entity
            'Entity': token.ent_type_ or 'No entity'
        })

# Build the DataFrame once from the collected records
df_results = pd.DataFrame(results)

# Display the DataFrame
print(df_results)

# Persist the token-level results next to the notebook
df_results.to_csv('ner_results.csv', index=False)

                                Company Name                  Token     Entity
0               olly UG (haftungsbeschränkt)                   olly        ORG
1               olly UG (haftungsbeschränkt)                     UG        ORG
2               olly UG (haftungsbeschränkt)                      (  No entity
3               olly UG (haftungsbeschränkt)     haftungsbeschränkt  No entity
4               olly UG (haftungsbeschränkt)                      )  No entity
..                                       ...                    ...        ...
464                Muehlhan Deutschland GmbH            Deutschland        ORG
465                Muehlhan Deutschland GmbH                   GmbH  No entity
466  Harburg-Freudenberger Maschinenbau GmbH  Harburg-Freudenberger        ORG
467  Harburg-Freudenberger Maschinenbau GmbH           Maschinenbau        ORG
468  Harburg-Freudenberger Maschinenbau GmbH                   GmbH        ORG

[469 rows x 3 columns]


In [54]:
# Hand-written German samples for a quick check of the entity recogniser
company_names = [
    " Opel ist eine tolle deutsche christlich Firma",
    "Volkswagen baut gute Autos",
    "BASF",
]

for company_name in company_names:
    doc = de_dep_nlp(company_name)
    # One output line per recognised entity: "<text> - <label>"
    for ent in doc.ents:
        print(f"{ent.text} - {ent.label_}")


Opel - ORG
deutsche - MISC
Volkswagen - ORG
BASF - ORG


In [None]:
# Extract entities labeled as PERSON
filtered_labels = [ent.text for ent in doc.ents if ent.label_ == "PERSON"]

# Extract tokens and their part-of-speech tags
tokens_pos = [(w.text, w.pos_) for w in doc]

# Create a DataFrame for entities
df_entities = pd.DataFrame(filtered_labels, columns=['Person'])

# Create a DataFrame for tokens and POS tags
df_tokens_pos = pd.DataFrame(tokens_pos, columns=['Token', 'POS'])

print("Entities DataFrame:")
print(df_entities)

print("\nTokens and POS Tags DataFrame:")
print(df_tokens_pos)

# spaCy English + Google Translate