<a href="https://www.kaggle.com/code/danishjavedcodes/unstructured-financial-text-into-tables?scriptVersionId=162889571" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in names):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/financial-raw-data/data.txt


**Importing Dependencies**

In [2]:
import pandas as pd
import re
import pandas as pd
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.tag import pos_tag
import spacy
# Load the `en_core_web_sm` spaCy pipeline once at import time; find_actions()
# calls `nlp` on each sentence it processes.
nlp = spacy.load("en_core_web_sm")

**Reading Data from file**

In [3]:
import pandas as pd

# Load the raw text from the input file.
# The encoding is stated explicitly so decoding does not depend on the
# platform/locale default of whichever machine runs the notebook.
with open('/kaggle/input/financial-raw-data/data.txt', 'r', encoding='utf-8') as file:
    text_data = file.read()

**Converting to Data frame**

In [4]:
# Split every "Title: Details" line at its first colon.
# - splitlines() copes with both "\n" and "\r\n" line endings.
# - Lines without a colon are skipped: they would otherwise produce a row whose
#   Details is None, which breaks sentence tokenization later on.
# - Surrounding whitespace is stripped from both fields.
data = [
    [part.strip() for part in line.split(":", 1)]
    for line in text_data.splitlines()
    if ":" in line
]

# Create a dataframe with one row per scenario
df = pd.DataFrame(data, columns=["Title", "Details"])
df

Unnamed: 0,Title,Details
0,Real Estate Investment,John is considering buying a rental property ...
1,Stock Market Trading,"Emily monitors the stock market daily, lookin..."
2,Car Purchase Negotiation,Sarah visits a local car dealership to buy a ...
3,Cryptocurrency Investment,David follows the cryptocurrency market close...
4,Antique Collection,Rebecca is passionate about antiques and regu...
5,Foreign Exchange Trading,James is a forex trader who specializes in cu...
6,Art Investment,Michael attends an art auction featuring work...
7,Commodity Futures Trading,Lisa trades commodity futures contracts and n...
8,Startup Equity Investment,Tom is approached by a startup seeking fundin...
9,Precious Metals Trading,Jessica diversifies her investment portfolio ...


**Tokenizing Descriptions**

In [5]:
# Rebuild the frame from `data` so this cell gives the same result on every run,
# then split each Details string into a list of sentences.
df = pd.DataFrame(data, columns=["Title", "Details"])
tokenized_details = df['Details'].apply(sent_tokenize)
df['Tokenized Details'] = tokenized_details

**Functions for Data Extraction**

In [6]:
def find_proper_nouns(text):
    """Return the tokens of ``text`` that NLTK tags as proper nouns.

    The text is word-tokenized and POS-tagged; tokens tagged ``NNP`` or
    ``NNPS`` are returned in their original order.
    """
    proper_noun_tags = ('NNP', 'NNPS')
    tagged = pos_tag(word_tokenize(text))
    return [token for token, tag in tagged if tag in proper_noun_tags]

def find_actions(text):
    """Return the text of every token in ``text`` whose coarse POS is ``VERB``.

    The text is run through the module-level spaCy pipeline ``nlp`` and the
    matching tokens are returned in document order.
    """
    # `pos_` carries coarse Universal POS labels ('VERB', 'NOUN', ...). 'VB' is
    # a fine-grained Penn Treebank tag that spaCy exposes on `tag_`, so the
    # former `token.pos_ == 'VB'` comparison could never be true and was dropped.
    return [token.text for token in nlp(text) if token.pos_ == 'VERB']

# A number with optional thousands separators and up to two decimals,
# e.g. "50", "200,000", "30.50".
_PRICE_AMOUNT = r'\d+(?:,\d{3})*(?:\.\d{1,2})?'

# Compiled once instead of on every call. Matches either "$<amount>" or
# "<amount> USD" / "<amount> dollar(s)". The trailing \b stops the currency
# word from matching as a prefix of a longer token (e.g. "100 USDT").
_PRICE_PATTERN = re.compile(
    r'\$\s?' + _PRICE_AMOUNT
    + r'|' + _PRICE_AMOUNT + r'\s?(?:USD|dollars?)\b'
)


def find_prices(text):
    """Return every price-like substring found in ``text``.

    Recognised forms are a dollar sign followed by an amount ("$200,000",
    "$ 5.99") and an amount followed by "USD", "dollar" or "dollars"
    ("50 USD", "1 dollar", "30.50 dollars").

    Args:
        text: The string to scan.

    Returns:
        A list of the matched substrings in order of appearance; empty when
        nothing matches.
    """
    return [match.group(0) for match in _PRICE_PATTERN.finditer(text)]

def find_product_types(text):
    """Return the tokens of ``text`` that NLTK tags as common nouns.

    Tokens tagged ``NN`` (singular) or ``NNS`` (plural) are treated as
    candidate product types and returned in their original order.
    """
    noun_tags = {'NN', 'NNS'}
    tagged = pos_tag(word_tokenize(text))
    return [token for token, tag in tagged if tag in noun_tags]

def find_product_conditions(text):
    """Return the tokens of ``text`` that NLTK tags as adjectives (``JJ``).

    The adjectives are treated as candidate product conditions and returned
    in their original order.
    """
    tagged_words = pos_tag(word_tokenize(text))
    # The earlier version also listed an ('RB', 'JJ') "adverb + adjective"
    # pattern, but it was checked with all(pos == p ...), which requires one
    # tag to equal both 'RB' and 'JJ' at once. That never holds, so only the
    # single-adjective rule ever matched; the dead pattern has been removed.
    # NOTE(review): matching real adverb+adjective pairs would need a bigram
    # scan over `tagged_words` and would change the output.
    return [word for word, pos in tagged_words if pos == 'JJ']

**Generating new Detailed Data Frame Table**

In [7]:
# Start from an empty table that already has the output columns.
new_df = pd.DataFrame(
    columns=[
        "Title",
        "Person",
        "Product",
        "Condition",
        "Actions",
        "Price",
    ],
)

In [8]:
# Extract one output row per sentence of the tokenized details.
#
# Rows are collected in a plain list and turned into a DataFrame once at the
# end. Calling pd.concat inside the loop copies the whole frame on every
# iteration, and appending to a frame made in another cell duplicated every
# row whenever this cell was re-run.
records = []
title = ""  # title of the most recently emitted row
for _, row in df.iterrows():
    new_title = row['Title']
    for sentence in row['Tokenized Details']:
        records.append({
            # Show the title only on the first sentence of a scenario; later
            # sentences carry a single-space placeholder.
            "Title": new_title if new_title != title else " ",
            "Person": " ,".join(find_proper_nouns(sentence)),
            "Product": " ,".join(find_product_types(sentence)),
            "Condition": " ,".join(find_product_conditions(sentence)),
            "Actions": " ,".join(find_actions(sentence)),
            "Price": " ,".join(find_prices(sentence)),
        })
        title = new_title

new_df = pd.DataFrame(
    records,
    columns=["Title", "Person", "Product", "Condition", "Actions", "Price"],
)
new_df

Unnamed: 0,Title,Person,Product,Condition,Actions,Price
0,Real Estate Investment,John,"property ,bustling ,neighborhood",rental,"considering ,buying",
1,,,"listings ,two-bedroom ,apartment","various ,cozy","researching ,finds ,listed","$200,000"
2,,,"income ,properties ,area ,purchase ,decision","potential ,rental ,similar","evaluates ,compares ,making",
3,Stock Market Trading,,"stock ,market ,opportunities ,stocks",,"monitors ,looking ,buy ,sell",
4,,,"shares ,tech ,company ,market ,volatility",due,"notices ,eyeing ,dropped",
5,,,"company ,prospects ,shares ,plans ,price ,rebo...",long-term,"Believing ,decides ,buy ,sell ,rebounds",$50
6,Car Purchase Negotiation,Sarah,"car ,dealership ,vehicle","local ,new",buy,
7,,,"models ,sedan","test-driving ,several ,sleek","driving ,settles ,priced","$30,000"
8,,,deal,,"knows ,negotiate",
9,,,"knowledge ,car ,pricing ,market ,trends ,sales...",,"Using ,haggles ,secures","$28,000"
