# Data Parsing | Practice

Installing required libraries

In [None]:
!pip install langchain-community
!pip install openai
!pip install python-dotenv

Importing required libraries

In [14]:
import pandas as pd
import numpy as np
import requests

In [44]:
from itertools import islice
from PIL import Image
from io import BytesIO
from langchain.document_loaders import CSVLoader

## CSV / EXCEL

### A. pandas - `read_csv()`

In [18]:
FILE_PATH = '/content/csv/customers-100.csv'

In [19]:
df = pd.read_csv(FILE_PATH)
df.head(3)

Unnamed: 0,Index,Customer Id,First Name,Last Name,Company,City,Country,Phone 1,Phone 2,Email,Subscription Date,Website
0,1,DD37Cf93aecA6Dc,Sheryl,Baxter,Rasmussen Group,East Leonard,Chile,229.077.5154,397.884.0519x718,zunigavanessa@smith.info,2020-08-24,http://www.stephenson.com/
1,2,1Ef7b82A4CAAD10,Preston,Lozano,Vega-Gentry,East Jimmychester,Djibouti,5153435776,686-620-1820x944,vmata@colon.com,2021-04-23,http://www.hobbs.com/
2,3,6F94879bDAfE5a6,Roy,Berry,Murillo-Perry,Isabelborough,Antigua and Barbuda,+1-539-402-0259,(496)978-3969x58947,beckycarr@hogan.com,2020-03-25,http://www.lawrence.com/


In [20]:
URL = df.Website[2]

try:
  response = requests.get(URL)
  print(response)
except:
  print("Failed to get response!")

<Response [200]>


### B. LangChain - `CSVLoader()`

In [21]:
FILE_PATH = '/content/csv/imdb_top_1000.csv'

In [30]:
loader = CSVLoader(FILE_PATH)
docs = loader.load()
print(type(docs))
print(len(docs))

for doc in docs[:15]:
  print(doc.page_content[:15]) # all column-value as single line text
  print(doc.metadata)

<class 'list'>
1000
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 0}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 1}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 2}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 3}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 4}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 5}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 6}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 7}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 8}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 9}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 10}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 11}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 12}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000

**Using `lazy_loader()` - returns a generator, saves memory : )**

In [32]:
loader = CSVLoader(
              FILE_PATH,
              csv_args = {'delimiter': ','}
            )
docs = loader.lazy_load()

In [40]:
print(type(docs))

for i, _ in enumerate(islice(docs, 15)):
  doc = next(docs) # because docs is a generator object
  print(doc.page_content[:15]) # all column-value as single line text
  print(doc.metadata)

<class 'generator'>
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 92}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 94}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 96}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 98}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 100}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 102}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 104}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 106}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 108}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 110}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 112}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 114}
Poster_Link: ht
{'source': '/content/csv/imdb_top_1000.csv', 'row': 116}
Poster_Link: ht
{'source': '/conten

**Showing Images from Links**   

In [41]:
loader = CSVLoader(FILE_PATH)
docs = loader.load()

In [47]:
column_name = "Poster_Link"  # Replace with your column name
cnt = 1
for doc in docs[:5]:
    lines = doc.page_content.split("\n")
    for line in lines:
        if line.startswith(f"{column_name}:"):
            image_link = line.split(": ", 1)[1].strip()  # Extract the value and strip spaces
            try:
                response = requests.get(image_link)
                if response.status_code != 200:
                    print(f"HTTP Error {response.status_code} for {image_link}")
                    continue
                if 'image' not in response.headers.get('Content-Type', ''):
                    print(f"Not an image URL: {image_link}")
                    continue
                image = Image.open(BytesIO(response.content))

                # image.show() # not working so, saving instead
                # image.save(f"{image_link.split('/')[-1]}.jpg")

                image.save(f"{cnt}.jpg")
                cnt += 1

            except Exception as e:
                print(f"Failed to load image: {image_link}, Error: {e}")
