# Data Parsing | Practice

Installing required libraries

In [None]:
!pip install langchain-community
!pip install openai
!pip install python-dotenv

In [None]:
!pip install "unstructured[all-docs]"

In [None]:
!pip install jq

Importing required libraries

In [92]:
import pandas as pd
import numpy as np
import requests
import json

In [42]:
from itertools import islice
from PIL import Image
from io import BytesIO
from langchain.document_loaders import CSVLoader, TextLoader, JSONLoader
from unstructured.partition.csv import partition_csv
from unstructured.partition.json import partition_json
from unstructured.partition.text import partition_text
from IPython.core.display import display, HTML

## CSV / EXCEL

### A. pandas - `read_csv()`

In [3]:
FILE_PATH = '/content/csv/customers-100.csv'

In [4]:
df = pd.read_csv(FILE_PATH)
df.head(3)

Unnamed: 0,Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website
0,1,DD37Cf93aecA6Dc,Sheryl,Baxter,Rasmussen Group,East Leonard,Chile,229.077.5154,397.884.0519x718,zunigavanessa@smith.info,2020-08-24,http://www.stephenson.com/
1,2,1Ef7b82A4CAAD10,Preston,Lozano,Vega-Gentry,East Jimmychester,Djibouti,5153435776,686-620-1820x944,vmata@colon.com,2021-04-23,http://www.hobbs.com/
2,3,6F94879bDAfE5a6,Roy,Berry,Murillo-Perry,Isabelborough,Antigua and Barbuda,+1-539-402-0259,(496)978-3969x58947,beckycarr@hogan.com,2020-03-25,http://www.lawrence.com/


In [5]:
URL = df.Website[2]

try:
  response = requests.get(URL)
  print(response)
except:
  print("Failed to get response!")

<Response [200]>


### B. LangChain - `CSVLoader()`

In [6]:
FILE_PATH = '/content/csv/imdb_top_1000.csv'

In [7]:
loader = CSVLoader(FILE_PATH)
docs = loader.load()
print(type(docs))
print(len(docs))

for doc in docs[:15]:
  print(doc.page_content[:15]) # all column-value as single line text
  print(doc.metadata)

<class 'list'>
1000
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 0}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 1}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 2}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 3}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 4}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 5}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 6}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 7}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 8}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 9}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 10}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 11}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 12}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000

**Using `lazy_loader()` - returns a generator, saves memory : )**

In [8]:
loader = CSVLoader(
              FILE_PATH,
              csv_args = {'delimiter': ','}
            )
docs = loader.lazy_load()

In [9]:
print(type(docs))

for i, _ in enumerate(islice(docs, 15)):
  doc = next(docs) # because docs is a generator object
  print(doc.page_content[:15]) # all column-value as single line text
  print(doc.metadata)

<class 'generator'>
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 1}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 3}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 5}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 7}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 9}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 11}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 13}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 15}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 17}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 19}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 21}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 23}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 25}
Poster_Link: ht
{'source': '/content/csv/imdb_top

**Showing Images from Links**   

In [10]:
loader = CSVLoader(FILE_PATH)
docs = loader.load()

In [11]:
column_name = "Poster_Link"  # Replace with your column name
cnt = 1
for doc in docs[:5]:
    lines = doc.page_content.split("\n")
    for line in lines:
        if line.startswith(f"{column_name}:"):
            image_link = line.split(": ", 1)[1].strip()  # Extract the value and strip spaces
            try:
                response = requests.get(image_link)
                if response.status_code != 200:
                    print(f"HTTP Error {response.status_code} for {image_link}")
                    continue
                if 'image' not in response.headers.get('Content-Type', ''):
                    print(f"Not an image URL: {image_link}")
                    continue
                image = Image.open(BytesIO(response.content))

                # image.show() # not working so, saving instead
                # image.save(f"{image_link.split('/')[-1]}.jpg")

                image.save(f"{cnt}.jpg")
                cnt += 1

            except Exception as e:
                print(f"Failed to load image: {image_link}, Error: {e}")


### C. Unstructured - `partition_csv()`

In [26]:
elements = partition_csv(filename="/content/csv/people-100.csv")
print(type(elements), len(elements))

elements[0].metadata
print(vars(elements[0].metadata))

<class 'list'> 1
{'file_directory': '/content/csv', 'filename': 'people-100.csv', 'last_modified': '2024-12-27T03:48:29', 'text_as_html': '<table><tr><td>Index</td><td>User Id</td><td>First Name</td><td>Last Name</td><td>Sex</td><td>Email</td><td>Phone</td><td>Date of birth</td><td>Job Title</td></tr><tr><td>1</td><td>88F7B33d2bcf9f5</td><td>Shelby</td><td>Terrell</td><td>Male</td><td>elijah57@example.net</td><td>001-084-906-7849x73518</td><td>1945-10-26</td><td>Games developer</td></tr><tr><td>2</td><td>f90cD3E76f1A9b9</td><td>Phillip</td><td>Summers</td><td>Female</td><td>bethany14@example.com</td><td>214.112.6044x4913</td><td>1910-03-24</td><td>Phytotherapist</td></tr><tr><td>3</td><td>DbeAb8CcdfeFC2c</td><td>Kristine</td><td>Travis</td><td>Male</td><td>bthompson@example.com</td><td>277.609.7938</td><td>1992-07-02</td><td>Homeopath</td></tr><tr><td>4</td><td>A31Bee3c201ef58</td><td>Yesenia</td><td>Martinez</td><td>Male</td><td>kaitlinkaiser@example.com</td><td>584.094.6111</td><td>2

In [27]:
html_content = elements[0].metadata.text_as_html
display(HTML(html_content))

0,1,2,3,4,5,6,7,8
Index,User Id,First Name,Last Name,Sex,Email,Phone,Date of birth,Job Title
1,88F7B33d2bcf9f5,Shelby,Terrell,Male,elijah57@example.net,001-084-906-7849x73518,1945-10-26,Games developer
2,f90cD3E76f1A9b9,Phillip,Summers,Female,bethany14@example.com,214.112.6044x4913,1910-03-24,Phytotherapist
3,DbeAb8CcdfeFC2c,Kristine,Travis,Male,bthompson@example.com,277.609.7938,1992-07-02,Homeopath
4,A31Bee3c201ef58,Yesenia,Martinez,Male,kaitlinkaiser@example.com,584.094.6111,2017-08-03,Market researcher
5,1bA7A3dc874da3c,Lori,Todd,Male,buchananmanuel@example.net,689-207-3558x7233,1938-12-01,Veterinary surgeon
6,bfDD7CDEF5D865B,Erin,Day,Male,tconner@example.org,001-171-649-9856x5553,2015-10-28,Waste management officer
7,bE9EEf34cB72AF7,Katherine,Buck,Female,conniecowan@example.com,+1-773-151-6685x49162,1989-01-22,Intelligence analyst
8,2EFC6A4e77FaEaC,Ricardo,Hinton,Male,wyattbishop@example.com,001-447-699-7998x88612,1924-03-26,Hydrogeologist
9,baDcC4DeefD8dEB,Dave,Farrell,Male,nmccann@example.net,603-428-2429x27392,2018-10-06,Lawyer


## TXT

### A. LangChain - `TextLoader()`

In [29]:
txt_loader = TextLoader("/content/txt/guidetoinvestors.txt")
txt_docs = txt_loader.load()
type(txt_docs), len(txt_docs)

(list, 1)

In [None]:
# dir(txt_docs[0])
type(txt_docs[0])
vars(txt_docs[0])

In [31]:
lines = txt_docs[0].page_content.split("\n")
for line in lines[:15]:
  print(line)

April 2007(This essay is derived from a keynote talk at the 2007 ASES Summit
at Stanford.)The world of investors is a foreign one to most hackers—partly
because investors are so unlike hackers, and partly because they
tend to operate in secret.  I've been dealing with this world for
many years, both as a founder and an investor, and I still don't
fully understand it.In this essay I'm going to list some of the more surprising things
I've learned about investors.  Some I only learned in the past year.Teaching hackers how to deal with investors is probably the second
most important thing we do at Y Combinator.  The most important
thing for a startup is to make something good.  But everyone knows
that's important.  The dangerous thing about investors is that
hackers don't know how little they know about this strange world.1. The investors are what make a startup hub.About a year ago I tried to figure out what you'd need to reproduce
Silicon Valley.  I decided the 
critical ingredients were

### B. Unstructured - `partition_text()`

In [43]:
txt_docs = partition_text(filename="/content/txt/nike_2023_annual_report.txt")
type(txt_docs), len(txt_docs)

(list, 4321)

In [45]:
vars(txt_docs[0])

{'text': 'FORM 10-K FORM 10-KUNITED STATES',
 'embeddings': None,
 '_element_id': '2f8520361729ee18412fc10b012c0e55',
 'metadata': <unstructured.documents.elements.ElementMetadata at 0x798dc46b4d00>}

In [47]:
text = "\n".join([doc.text for doc in txt_docs])
print(text[:500])

FORM 10-K FORM 10-KUNITED STATES
SECURITIES AND EXCHANGE COMMISSION
Washington, D.C. 20549
FORM 10-K
(Mark One)
☑ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
FOR THE FISCAL YEAR ENDED MAY 31, 2023
OR
☐TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(D) OF THE SECURITIES EXCHANGE ACT OF 1934
FOR THE TRANSITION PERIOD FROM TO .
Commission File No. 1-10635
NIKE, Inc.
(Exact name of Registrant as specified in its charter)
Oregon 93-0584541
(State or other juris


## JSON

### A. LangChain - `JSONLoader()`

In [75]:
jq_schema = ".[].question"

In [76]:
json_loader = JSONLoader("/content/json/q_a.json", jq_schema)
json_docs = json_loader.load()

In [78]:
for doc in json_docs[:5]:
  print(doc.page_content)
  print(doc.metadata)

What does climate change refer to?
{'source': '/content/json/q_a.json', 'seq_num': 1}
What encompasses the planet's overall weather patterns?
{'source': '/content/json/q_a.json', 'seq_num': 2}
What activities have significantly contributed to climate change over the past century?
{'source': '/content/json/q_a.json', 'seq_num': 3}
How many cycles of glacial advance and retreat have occurred over the past 650,000 years?
{'source': '/content/json/q_a.json', 'seq_num': 4}
What marked the beginning of the modern climate era and human civilization?
{'source': '/content/json/q_a.json', 'seq_num': 5}


### Using JSON library

In [96]:
json_docs = json.load(open('/content/json/ChatEval_raw_messages.json'))
type(json_docs), len(json_docs)
print(json_docs[0].keys())

dict_keys(['msg_id', 'user', 'text', 'ts', 'reply_to'])
