<a href="https://colab.research.google.com/github/clizar5302/NLP-Data-Extraction-ColeLizar/blob/main/unstructuredDataExtractionNLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Set up the Environment

In [None]:
!pip install spacy pandas
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Load the Data

In [None]:
import pandas as pd

# Three hand-written sample sentences act as the stand-in corpus.
data = dict(
    documents=[
        "Apple Inc. is based in Cupertino, California and was founded by Steve Jobs.",
        "Tesla, headquartered in Palo Alto, is known for its electric cars.",
        "Microsoft, founded by Bill Gates, is a technology company in Redmond.",
    ]
)

# One row per sentence, all held in a single "documents" column.
df = pd.DataFrame(data=data)
print(df)

                                           documents
0  Apple Inc. is based in Cupertino, California a...
1  Tesla, headquartered in Palo Alto, is known fo...
2  Microsoft, founded by Bill Gates, is a technol...


Load the Model

In [None]:
import spacy

# Instantiate the small English pipeline (downloaded in the setup cell) as `nlp`.
nlp = spacy.load("en_core_web_sm")

Extract Named Entities with spaCy

In [None]:
def extract_entities(doc):
    """Collect the named entities attached to a processed document.

    Args:
        doc: An object exposing an ``ents`` iterable whose items carry
            ``text`` and ``label_`` attributes (in this notebook, the
            result of calling ``nlp`` on a string).

    Returns:
        A list of ``(text, label)`` tuples, one per item of ``doc.ents``,
        in the order they are yielded.
    """
    pairs = []
    for span in doc.ents:
        pairs.append((span.text, span.label_))
    return pairs

# Apply the model to every document in one batched pass. nlp.pipe processes
# the texts as a stream, in order, which is spaCy's recommended way to handle
# many texts instead of calling nlp() once per row.
df['entities'] = [
    extract_entities(parsed) for parsed in nlp.pipe(df['documents'].tolist())
]

# View the results
print(df[['documents', 'entities']])

                                           documents  \
0  Apple Inc. is based in Cupertino, California a...   
1  Tesla, headquartered in Palo Alto, is known fo...   
2  Microsoft, founded by Bill Gates, is a technol...   

                                            entities  
0  [(Apple Inc., ORG), (Cupertino, GPE), (Califor...  
1                   [(Tesla, ORG), (Palo Alto, GPE)]  
2  [(Microsoft, ORG), (Bill Gates, PERSON), (Redm...  


Organize and Structure the Data

In [None]:
# Flatten the per-document entity lists into one record per (document, entity) pair.
structured_data = [
    {'Document': doc, 'Entity': entity, 'Label': label}
    for doc, entities in zip(df['documents'], df['entities'])
    for entity, label in entities
]

# Create a structured DataFrame. The column names are passed explicitly so the
# frame (and the CSV header written below) keeps the same three columns even
# when no entities were extracted and structured_data is empty.
structured_df = pd.DataFrame(structured_data, columns=['Document', 'Entity', 'Label'])
print(structured_df)

# Save to CSV
structured_df.to_csv('extracted_entities.csv', index=False)

                                            Document      Entity   Label
0  Apple Inc. is based in Cupertino, California a...  Apple Inc.     ORG
1  Apple Inc. is based in Cupertino, California a...   Cupertino     GPE
2  Apple Inc. is based in Cupertino, California a...  California     GPE
3  Apple Inc. is based in Cupertino, California a...  Steve Jobs  PERSON
4  Tesla, headquartered in Palo Alto, is known fo...       Tesla     ORG
5  Tesla, headquartered in Palo Alto, is known fo...   Palo Alto     GPE
6  Microsoft, founded by Bill Gates, is a technol...   Microsoft     ORG
7  Microsoft, founded by Bill Gates, is a technol...  Bill Gates  PERSON
8  Microsoft, founded by Bill Gates, is a technol...     Redmond     GPE
