<h1> URL Image list to Text </h1>
<h2> Requirements </h2>
<ul>
    <li> Tesseract
        <ol>
            <li> <a href="https://digi.bib.uni-mannheim.de/tesseract/tesseract-ocr-w64-setup-v5.0.0.20211201.exe">Download link (Windows 64-bit installer)</a></li>
            <li> Add the Tesseract install folder to your PATH </li>
        </ol>
    </li>
    <li> PyTesseract
        <ol>
            <li> pip install pytesseract </li>
        </ol>
    </li>
    <li> LangDetect
        <ol>
            <li> pip install langdetect</li>
        </ol>
    </li>
    <li>Pillow
        <ol>
            <li>pip install Pillow</li>
        </ol>
    </li>
    <li>Pandas
        <ol>
            <li>pip install pandas</li>
        </ol>
    </li>
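    <li> ISO-639
        <ol>
            <li>pip install iso-639</li>
        </ol>
    </li>
    <li> Requests
        <ol>
            <li>pip install requests</li>
        </ol>
    </li>
    <li> TextBlob
        <ol>
            <li>pip install textblob</li>
        </ol>
    </li>
</ul>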

In [50]:
import os
import pytesseract
import requests
import pandas as pd
from PIL import Image, ImageFilter, ImageEnhance
from typing import List
from langdetect import detect
from iso639 import languages
import time
from textblob import TextBlob
from copy import deepcopy

In [47]:
# Defining constants: input/output locations used by the rest of the notebook.

# Text file listing the image URLs, one per line (read by get_urls).
url_filename = "./images.txt"
# CSV holding the analysis results; check_csv creates it when it is missing.
csv_filename = "./data.csv"

def check_csv(filepath : str) -> pd.DataFrame:
    """Return the results table stored at ``filepath``, creating the file if needed.

    When no file exists at ``filepath`` an empty CSV containing only the
    expected header row is written first. The file is then read back and the
    index column that ``DataFrame.to_csv`` writes by default (it comes back as
    ``"Unnamed: 0"``) is dropped.

    Args:
        filepath: Location of the results CSV.

    Returns:
        The stored rows as a DataFrame without the ``"Unnamed: 0"`` column.

    Raises:
        KeyError: If the CSV on disk has no ``"Unnamed: 0"`` column.
    """
    expected_columns = ["Image URL", "Raw Text", "Language", "Polarity", "Subjectivity","Lang Code"]
    file_missing = not os.path.isfile(filepath)
    if file_missing:
        empty_results = pd.DataFrame(columns=expected_columns)
        empty_results.to_csv(filepath)
    stored = pd.read_csv(filepath)
    return stored.drop(columns=["Unnamed: 0"])

def get_urls(filepath : str) -> List[str]:
    """Read the image URLs listed one per line in ``filepath``.

    Surrounding whitespace is stripped from every line. Lines that are empty
    after stripping (for example the trailing newline at the end of the file)
    are skipped, because an empty string is not a fetchable URL.

    Args:
        filepath: Path to a text file containing one URL per line.

    Returns:
        The non-empty, stripped lines in file order.
    """
    # The context manager guarantees the handle is closed even if reading fails.
    with open(filepath, "r") as url_file:
        stripped_lines = (line.strip() for line in url_file)
        return [url for url in stripped_lines if url]

# Load the results saved so far (check_csv creates an empty file on first run)
# and the list of image URLs to process.
image_data_df = check_csv(csv_filename)
image_urls = get_urls(url_filename)
# Browser-like User-Agent sent with every image request made by analyse_all_urls.
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}


In [3]:
def get_text(img : Image) -> str:
    """Run OCR on ``img`` and return the recognised text on a single line.

    The image is converted to greyscale and then binarised: pixel values below
    140 become black (0) and all others white (255). The result is handed to
    Tesseract, and every newline in its output is replaced with a space.

    Args:
        img: The PIL image to read.

    Returns:
        The OCR output with newlines replaced by spaces.
    """
    greyscale = img.convert('L')
    black_and_white = greyscale.point(lambda level: 255 if level >= 140 else 0)
    recognised = pytesseract.image_to_string(black_and_white)
    return " ".join(recognised.split("\n"))

In [40]:
def analyse_text(text: str) -> List:
    """Detect the language of ``text`` and, for English text, score its sentiment.

    Args:
        text: The text to analyse (typically OCR output).

    Returns:
        ``[text, language, polarity, subjectivity, lang_code]`` where

        * ``language`` is the ISO 639 language name, or ``"Unknown"`` when the
          language cannot be detected or the detected code has no ISO 639 entry;
        * ``polarity`` and ``subjectivity`` are TextBlob scores for English
          text and empty strings otherwise;
        * ``lang_code`` is the code returned by ``detect``, or an empty string
          when detection fails.
    """
    # Imported locally: the notebook's import cell only brings in ``detect``.
    from langdetect.lang_detect_exception import LangDetectException

    polarity, subjectivity = "", ""

    try:
        lang_code = detect(text)
    except LangDetectException:
        # Raised when the text has nothing to detect on (empty string, only
        # digits/symbols) - common for OCR output. Report it instead of
        # aborting the whole batch.
        return [text, "Unknown", polarity, subjectivity, ""]

    try:
        language = languages.get(alpha2=lang_code).name
    except KeyError:
        # The detector can return region-qualified codes such as "zh-cn",
        # which are not plain ISO 639-1 codes; retry with the base code.
        try:
            language = languages.get(alpha2=lang_code.split("-")[0]).name
        except KeyError:
            language = "Unknown"

    if language == "English":
        # Sentiment is only computed for English text, so only build the blob here.
        blob = TextBlob(text)
        polarity = blob.polarity
        subjectivity = blob.subjectivity
    return [text, language, polarity, subjectivity, lang_code]

In [5]:
def analyse_all_urls(image_url : List[str], dataframe : pd.DataFrame) -> None:
    """Download each image, OCR and analyse it, and append the result to ``dataframe``.

    For every URL that answers with HTTP 200 a row
    ``[url, text, language, polarity, subjectivity, lang_code]`` is appended to
    ``dataframe`` in place. Any other status is reported on stdout and the URL
    is skipped. Requests are spaced 0.5 seconds apart.

    Args:
        image_url: URLs of the images to process. Empty entries are ignored.
        dataframe: Results table that is extended in place, one row per image.
    """
    for url in image_url:
        if not url:
            # Blank entries (e.g. from an empty line in the URL file) cannot be fetched.
            continue
        # The context manager releases the streamed connection once the image is read.
        with requests.get(url, stream=True, headers=headers) as response:
            if response.status_code == 200:
                # Have urllib3 undo any gzip/deflate transfer encoding so PIL
                # receives the actual image bytes from the raw stream.
                response.raw.decode_content = True
                row = [url] + analyse_text(get_text(Image.open(response.raw)))
                dataframe.loc[len(dataframe)] = row
            else:
                # status_code is an int; it must be converted before concatenation.
                print("Error " + str(response.status_code) + "\n" + response.text)
        time.sleep(0.5)

In [52]:
def test_method(image_url : List[str], dataframe : pd.DataFrame, image_dir : str = "./Images") -> None:
    """Run the OCR + analysis pipeline on local image files instead of URLs.

    For each file name a row
    ``[file_name, text, language, polarity, subjectivity, lang_code]`` is
    appended to ``dataframe`` in place.

    Args:
        image_url: File names of the images, relative to ``image_dir``.
        dataframe: Results table that is extended in place, one row per image.
        image_dir: Directory containing the images. Defaults to ``"./Images"``.
    """
    for file_name in image_url:
        # Close each image file as soon as its text has been extracted.
        with Image.open(os.path.join(image_dir, file_name)) as img:
            row = [file_name] + analyse_text(get_text(img))
        dataframe.loc[len(dataframe)] = row
        # No delay between files: nothing is fetched over the network here.

# Try the pipeline on the local sample images. The first entry yielded by
# os.walk is (dirpath, dirnames, filenames) for ./Images itself, so its last
# element is the list of files directly inside that folder.
test_urls = next(os.walk('./Images'))[-1]
# Work on a deep copy so the frame loaded from the CSV is not modified.
test_df = deepcopy(image_data_df)
test_method(test_urls, test_df)
test_df

Unnamed: 0,Image URL,Raw Text,Language,Polarity,Subjectivity,Lang Code
0,0.png,ethan_cummiskey ™2« .o°: the gars,English,0.0,0.0,en
1,2.png,Feedback de dentincia Devido a relatos recent...,Portuguese,,,pt
2,3.jpg,- % |NORTH SIDE BLOODS| = -t/Reme...,English,0.0,0.0,en
3,4.jpg,THUNDER,English,0.0,0.0,en
4,5.png,a Your Secret Friend would love this Baby's...,English,0.075,0.433333,en


In [57]:
def save_dataframe(filename : str, dataframe : pd.DataFrame) -> None:
    """Write ``dataframe`` to ``filename`` as CSV and print a confirmation.

    The index is written as the first column (the pandas default).

    Args:
        filename: Destination path of the CSV file.
        dataframe: The table to save.
    """
    dataframe.to_csv(path_or_buf=filename)
    confirmation = "Data Saved to " + filename
    print(confirmation)