In [1]:
# Python dependencies, pinned to the versions this notebook was last run with
# (see the install log below). %pip installs into the running kernel's environment.
%pip install streamlit==1.2.0
%pip install pdf2image==1.16.0
# pdf2image needs poppler's command-line tools; -y keeps the install non-interactive.
!apt-get install -y poppler-utils 


Collecting streamlit
  Downloading streamlit-1.2.0-py2.py3-none-any.whl (9.1 MB)
[K     |████████████████████████████████| 9.1 MB 4.7 MB/s 
Collecting base58
  Downloading base58-2.1.1-py3-none-any.whl (5.6 kB)
Collecting blinker
  Downloading blinker-1.4.tar.gz (111 kB)
[K     |████████████████████████████████| 111 kB 39.7 MB/s 
Collecting pydeck>=0.1.dev5
  Downloading pydeck-0.7.1-py2.py3-none-any.whl (4.3 MB)
[K     |████████████████████████████████| 4.3 MB 28.5 MB/s 
Collecting pympler>=0.9
  Downloading Pympler-0.9.tar.gz (178 kB)
[K     |████████████████████████████████| 178 kB 55.0 MB/s 
Collecting watchdog
  Downloading watchdog-2.1.6-py3-none-manylinux2014_x86_64.whl (76 kB)
[K     |████████████████████████████████| 76 kB 4.1 MB/s 
Collecting gitpython!=3.1.19
  Downloading GitPython-3.1.24-py3-none-any.whl (180 kB)
[K     |████████████████████████████████| 180 kB 56.7 MB/s 
Collecting validators
  Downloading validators-0.18.2-py3-none-any.whl (19 kB)
Collecting gitdb<

Collecting pdf2image
  Downloading pdf2image-1.16.0-py3-none-any.whl (10 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.16.0
Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 37 not upgraded.
Need to get 154 kB of archives.
After this operation, 613 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 poppler-utils amd64 0.62.0-2ubuntu2.12 [154 kB]
Fetched 154 kB in 1s (224 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 155222 files and directories currently installed.)
Preparing to unpack .../poppler-utils_0.62.0-2ubuntu2.12_amd64.deb ...
Unpacking poppler-utils (0.62.0-2ubuntu2.12) ...
Setting up poppler-utils (0.62.0-2ubuntu2.12) ...
Processing triggers for man-db (2.8.3-2ubuntu0.1) ...


In [2]:
%pip install albumentations==0.4.6
# apt-get with -y: an interactive `apt install` may stop at a confirmation
# prompt that cannot be answered from a notebook cell.
!sudo apt-get install -y tesseract-ocr
# NOTE(review): pytesseract is not pinned; pin it once the working version is confirmed.
%pip install pytesseract

Collecting albumentations==0.4.6
  Downloading albumentations-0.4.6.tar.gz (117 kB)
[K     |████████████████████████████████| 117 kB 5.2 MB/s 
Collecting imgaug>=0.4.0
  Downloading imgaug-0.4.0-py2.py3-none-any.whl (948 kB)
[K     |████████████████████████████████| 948 kB 23.2 MB/s 
Building wheels for collected packages: albumentations
  Building wheel for albumentations (setup.py) ... [?25l[?25hdone
  Created wheel for albumentations: filename=albumentations-0.4.6-py3-none-any.whl size=65172 sha256=faa1674916be7f9a1dd9833ff7c732e863d3ee47d897d1f490f1216a1b511401
  Stored in directory: /root/.cache/pip/wheels/cf/34/0f/cb2a5f93561a181a4bcc84847ad6aaceea8b5a3127469616cc
Successfully built albumentations
Installing collected packages: imgaug, albumentations
  Attempting uninstall: imgaug
    Found existing installation: imgaug 0.2.9
    Uninstalling imgaug-0.2.9:
      Successfully uninstalled imgaug-0.2.9
  Attempting uninstall: albumentations
    Found existing installation: album

In [67]:
%%writefile app.py
import streamlit as st
import pdf2image
import numpy as np
import pandas as pd
from PIL import Image
import cv2
import time
from datetime import datetime
import torch
import torch.nn as nn
import torchvision
import albumentations as A
from albumentations.pytorch import ToTensorV2
import pytesseract
from io import StringIO


# Path of the tesseract executable that pytesseract should invoke.
pytesseract.pytesseract.tesseract_cmd = r'/usr/bin/tesseract'

# Preprocessing pipeline applied to each page before inference:
# per-channel normalisation with the mean/std below, then ToTensorV2.
TRANSFORM = A.Compose(
    [
        A.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
            max_pixel_value=255,
        ),
        ToTensorV2(),
    ]
)

class DenseNet(nn.Module):
    """DenseNet-121 feature extractor that exposes three intermediate outputs.

    The ``features`` stack of ``torchvision.models.densenet121`` is split into
    three consecutive stages, each registered under the layer's original index
    so that parameter names stay stable:

    * ``densenet_out_1`` -- feature layers 0..7
    * ``densenet_out_2`` -- feature layers 8..9
    * ``densenet_out_3`` -- feature layer 10

    Args:
        pretrained: forwarded to ``densenet121``. Pass ``False`` when the
            weights are going to be loaded from a checkpoint afterwards.
        requires_grad: when ``False`` every parameter of the three stages is
            frozen (``requires_grad = False``).
    """

    def __init__(self, pretrained = True, requires_grad = True):
        super(DenseNet, self).__init__()
        # Honour the caller's `pretrained` flag. It used to be hard-coded to
        # True, so DenseNet(pretrained=False) still fetched pretrained weights.
        denseNet = torchvision.models.densenet121(pretrained=pretrained).features
        self.densenet_out_1 = torch.nn.Sequential()
        self.densenet_out_2 = torch.nn.Sequential()
        self.densenet_out_3 = torch.nn.Sequential()

        # Sub-modules keep their index in `features` as their name.
        for x in range(8):
            self.densenet_out_1.add_module(str(x), denseNet[x])
        for x in range(8,10):
            self.densenet_out_2.add_module(str(x), denseNet[x])

        self.densenet_out_3.add_module(str(10), denseNet[10])

        if not requires_grad:
            for param in self.parameters():
                param.requires_grad = False

    def forward(self, x):
        """Run the three stages in sequence and return all three outputs.

        The shape comments below are the original author's notes
        (presumably for a 1x3x1024x1024 input -- TODO confirm).
        """
        out_1 = self.densenet_out_1(x) #torch.Size([1, 256, 64, 64])
        out_2 = self.densenet_out_2(out_1) #torch.Size([1, 512, 32, 32])
        out_3 = self.densenet_out_3(out_2) #torch.Size([1, 1024, 32, 32])
        return out_1, out_2, out_3

class TableDecoder(nn.Module):
    """Decoder branch producing a single-channel output map.

    Args:
        channels: ``channels[0]`` is the channel count of ``pool_4_out`` and
            ``channels[1]`` the channel count of ``pool_3_out`` (the two skip
            connections concatenated in ``forward``).
        kernels: four kernel sizes, one per layer in construction order.
        strides: four strides, one per layer in construction order.
    """

    def __init__(self, channels, kernels, strides):
        super().__init__()
        # 256 -> 256 convolution applied to the incoming features.
        self.conv_7_table = nn.Conv2d(
            256, 256, kernel_size=kernels[0], stride=strides[0]
        )
        # 256 -> 128 transposed convolution.
        self.upsample_1_table = nn.ConvTranspose2d(
            256, 128, kernel_size=kernels[1], stride=strides[1]
        )
        # Input is the 128-channel map concatenated with pool_4_out.
        self.upsample_2_table = nn.ConvTranspose2d(
            128 + channels[0], 256, kernel_size=kernels[2], stride=strides[2]
        )
        # Input is the 256-channel map concatenated with pool_3_out; 1 output channel.
        self.upsample_3_table = nn.ConvTranspose2d(
            256 + channels[1], 1, kernel_size=kernels[3], stride=strides[3]
        )

    def forward(self, x, pool_3_out, pool_4_out):
        """Decode ``x``, concatenating the two skip tensors along dim 1."""
        reduced = self.conv_7_table(x)
        stage_1 = self.upsample_1_table(reduced)
        with_pool_4 = torch.cat((stage_1, pool_4_out), dim=1)
        stage_2 = self.upsample_2_table(with_pool_4)
        with_pool_3 = torch.cat((stage_2, pool_3_out), dim=1)
        return self.upsample_3_table(with_pool_3)

class ColumnDecoder(nn.Module):
    """Decoder branch producing a single-channel output map.

    Same layout as the table branch except for the first stage, which is a
    conv -> ReLU -> Dropout(0.8) -> conv stack.

    Args:
        channels: ``channels[0]`` is the channel count of ``pool_4_out`` and
            ``channels[1]`` the channel count of ``pool_3_out``.
        kernels: four kernel sizes; ``kernels[0]`` is used by both
            convolutions of the first stage.
        strides: four strides; ``strides[0]`` is used by both convolutions
            of the first stage.
    """

    def __init__(self, channels, kernels, strides):
        super().__init__()
        # Positions inside the Sequential (0..3) are part of the parameter names.
        first_stage = [
            nn.Conv2d(256, 256, kernel_size=kernels[0], stride=strides[0]),
            nn.ReLU(inplace=True),
            nn.Dropout(0.8),
            nn.Conv2d(256, 256, kernel_size=kernels[0], stride=strides[0]),
        ]
        self.conv_8_column = nn.Sequential(*first_stage)
        # 256 -> 128 transposed convolution.
        self.upsample_1_column = nn.ConvTranspose2d(
            256, 128, kernel_size=kernels[1], stride=strides[1]
        )
        # Input is the 128-channel map concatenated with pool_4_out.
        self.upsample_2_column = nn.ConvTranspose2d(
            128 + channels[0], 256, kernel_size=kernels[2], stride=strides[2]
        )
        # Input is the 256-channel map concatenated with pool_3_out; 1 output channel.
        self.upsample_3_column = nn.ConvTranspose2d(
            256 + channels[1], 1, kernel_size=kernels[3], stride=strides[3]
        )

    def forward(self, x, pool_3_out, pool_4_out):
        """Decode ``x``, concatenating the two skip tensors along dim 1."""
        reduced = self.conv_8_column(x)
        stage_1 = self.upsample_1_column(reduced)
        with_pool_4 = torch.cat((stage_1, pool_4_out), dim=1)
        stage_2 = self.upsample_2_column(with_pool_4)
        with_pool_3 = torch.cat((stage_2, pool_3_out), dim=1)
        return self.upsample_3_column(with_pool_3)

class TableNet(nn.Module):
    """Two-headed network: a DenseNet encoder feeding a table and a column decoder.

    ``forward`` returns ``(table_out, column_out)``; both decoders receive the
    same ``conv6`` features and the same two encoder skip tensors.
    """

    def __init__(self):
        super().__init__()

        self.base_model = DenseNet(pretrained=False, requires_grad=True)
        # Channel counts of the two skip tensors, in the order the decoders index them.
        self.pool_channels = [512, 256]
        # Channel count expected by the first conv6 layer.
        self.in_channels = 1024
        self.kernels = [(1, 1), (1, 1), (2, 2), (16, 16)]
        self.strides = [(1, 1), (1, 1), (2, 2), (16, 16)]

        # Layers shared by both decoder branches.
        shared_layers = [
            nn.Conv2d(self.in_channels, 256, kernel_size=(1, 1)),
            nn.ReLU(inplace=True),
            nn.Dropout(0.8),
            nn.Conv2d(256, 256, kernel_size=(1, 1)),
            nn.ReLU(inplace=True),
            nn.Dropout(0.8),
        ]
        self.conv6 = nn.Sequential(*shared_layers)

        self.table_decoder = TableDecoder(self.pool_channels, self.kernels, self.strides)
        self.column_decoder = ColumnDecoder(self.pool_channels, self.kernels, self.strides)

    def forward(self, x):
        """Encode ``x`` once and decode it with both branches."""
        skip_a, skip_b, deepest = self.base_model(x)
        shared = self.conv6(deepest)
        table_out = self.table_decoder(shared, skip_a, skip_b)
        column_out = self.column_decoder(shared, skip_a, skip_b)
        return table_out, column_out

@st.cache(allow_output_mutation=True)
def load_model():
    """Build a TableNet, load its weights from the local checkpoint on CPU,
    and return it in eval mode. Wrapped in ``st.cache``.
    """
    net = TableNet()
    checkpoint = torch.load("densenet_config_4_model_checkpoint.pth.tar", map_location='cpu')
    net.load_state_dict(checkpoint['state_dict'])
    # nn.Module.eval() returns the module itself.
    return net.eval()

def _ocr_text_to_table(data):
    """Parse OCR text into a DataFrame, falling back to the raw text.

    Tries a pipe-separated parse first, then a whitespace-separated parse.
    Returns the first DataFrame that parses, or ``data`` unchanged when both
    attempts fail (including when the text is empty).
    """
    # r'\s+' is the documented replacement for the deprecated delim_whitespace=True.
    for separator in (r'\|', r'\s+'):
        try:
            # NOTE(review): lineterminator=r'\n' is the two-character string
            # backslash + "n", not a newline; kept exactly as in the original
            # call -- TODO confirm it is intended.
            return pd.read_csv(StringIO(data), sep=separator, lineterminator=r'\n', engine='python')
        except (pd.errors.ParserError, pd.errors.EmptyDataError):
            # EmptyDataError is raised for empty OCR output and is not a
            # ParserError, so it has to be caught explicitly.
            continue
    return data


def predict(img_path):
    """Detect tables on the first page and show their OCR'd contents.

    Args:
        img_path: sequence of PIL images (despite the name); only element 0
            is processed.

    Side effects: writes the annotated page, the elapsed time and one section
    per detected table to the Streamlit page. Uses the module-level ``model``
    and ``TRANSFORM``. Returns ``None``.
    """
    with st.spinner('Processing..'):
        orig_image = img_path[0].resize((1024, 1024))
        # Grayscale round-trip: the model input is a 3-channel gray image.
        test_img = np.array(orig_image.convert('LA').convert("RGB"))

        start_time = datetime.now()
        image = TRANSFORM(image = test_img)["image"]
        with torch.no_grad():
            table_out, _ = model(image.unsqueeze(0))
            table_out = torch.sigmoid(table_out)

        # Binarise the probability map at 0.5 -> uint8 mask of shape (H, W, 1).
        table_mask = (table_out.detach().numpy().squeeze(0).transpose(1,2,0) > 0.5).astype(np.uint8)

        # findContours returns 2 or 3 values depending on the OpenCV major
        # version; the contour list is always the second-to-last one.
        contours = cv2.findContours(table_mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)[-2]

        #ref: https://www.pyimagesearch.com/2015/02/09/removing-contours-image-using-python-opencv/
        # Drop small contours (area <= 3000 px).
        table_contours = [c for c in contours if cv2.contourArea(c) > 3000]

        if not table_contours:
            # Nothing to draw or OCR: stop here instead of rendering an
            # empty "Predicted Tables" section.
            st.write("No Table detected")
            return

        # (x, y, w, h) boxes, sorted so tables are listed in a stable order.
        table_boundRect = sorted(
            cv2.boundingRect(cv2.approxPolyDP(c, 3, True)) for c in table_contours
        )

        # Draw the bounding boxes on a copy of the resized page.
        annotated = np.array(orig_image)
        color = (0,0,255)
        thickness = 4
        for x,y,w,h in table_boundRect:
            cv2.rectangle(annotated, (x,y),(x+w,y+h), color, thickness)

        st.image(annotated)

        # Report real seconds; the old code printed str(timedelta)
        # (e.g. "0:00:01.23") with a "secs" suffix and shadowed the `time` module.
        elapsed = datetime.now() - start_time
        st.write(f"{elapsed.total_seconds():.2f} secs")

        st.write("Predicted Tables")

        # Single gray channel of the page, used for cropping and OCR.
        gray_page = test_img[...,0].reshape(1024, 1024).astype(np.uint8)

        for i,(x,y,w,h) in enumerate(table_boundRect):
            image_crop = gray_page[y:y+h,x:x+w]
            st.image(image_crop)
            data = pytesseract.image_to_string(image_crop)
            st.write(f" ## Table {i+1}")
            st.write(_ocr_text_to_table(data))



# Load the model behind a spinner; load_model is wrapped in st.cache.
with st.spinner("Loading Last Checkpoint"):
    model = load_model()

st.header("Data Extraction from Tables")
# The upload is decoded with pdf2image below, so it has to be a PDF:
# the prompt says so and the picker is restricted to .pdf files.
file = st.file_uploader("Please upload a PDF file", type=["pdf"])


if file is not None:
    # One PIL image per PDF page.
    pages = pdf2image.convert_from_bytes(file.read())
    st.image(pages, use_column_width=True)
    predict(pages)
    
  

Overwriting app.py


In [None]:
# Start the Streamlit app in the background (&), then open a localtunnel to port 8501.
!streamlit run app.py & npx localtunnel --port 8501

[K[?25hnpx: installed 22 in 3.46s
your url is: https://tender-bat-76.loca.lt
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Network URL: [0m[1mhttp://172.28.0.2:8501[0m
[34m  External URL: [0m[1mhttp://35.184.19.124:8501[0m
[0m
