### Project goals:
* query all data with the AVAIL label
* identify the product name, price, and seller phone number

In [8]:
from sql_functions import query_all, text_preprocessing
import warnings
# Suppress every warning category for the rest of the kernel session.
warnings.filterwarnings("ignore")

# query_all is a project helper; presumably it returns a DataFrame holding the
# requested columns (the result is indexed by column name below) -- TODO confirm.
data = query_all(['shortcode','caption','label'])

# Keep only the rows whose label is exactly "AVAIL".
avail = data.loc[data['label'] == "AVAIL"]

avail

Unnamed: 0,shortcode,caption,label
0,CdkKXK9p7hOdKxgtry4sggDIJtX94oAjuaEvz00,Nama barang : sepatu size 37\nKondisi : new bl...,AVAIL
2,CdkJlJmpzRLEaFIxrn3FMQprY5ulloR_lnpC_40,Nama barang : VANS SLIP ON BLACK CLASSIC 7 us ...,AVAIL
5,Cdj-Qtwp6hQe589X95b9akSYwQZKN4dIeH-ArQ0,1. Nama barang : Bruklat hitam fit to L\nKondi...,AVAIL
6,Cdj99kDJ1EIscvToboJ2lR3cOVz6_PPgNhbhCY0,Nama barang : Sepatu Heels Payless VE Janine\n...,AVAIL
7,Cdj6_M0JF6eVSkOcLvfUeMZvwW20NUG0qORlGY0,Nama Barang : ip xr 64gb red\nKondisi : mulus ...,AVAIL
...,...,...,...
1539,CeFiIkcPf4asBIrv-2sQ1TeAe7AvmBI4hkcPSo0,• Di Jual : Realme 3\n• Kondisi : Normal smua\...,AVAIL
1540,CeFhttXP66xSJQjtBgBjaaioAJFTjM-9WAme4I0,• Di Jual : Samsung A6 plus\n• Kondisi : norma...,AVAIL
1541,CeFg_PovzeAgjQ2hPSIVhhp2xnjc0ZD34F5Owc0,📌Dijual : Kamera SONY ZV-1 & SONY Earbud WF-C5...,AVAIL
1542,CeFgY6sva-HsgW5HkLq1svWZ3b9GAVWC5QxMXg0,📌Dijual : HDD External SEAGATE\n📌Keterangan : ...,AVAIL


In [26]:
# Grab a single caption to inspect by eye: row position 1, column position 1
# (positional, not label-based; column 1 is 'caption' given the column order
# requested from query_all above).
caption = avail.iloc[1,1]
caption

'Nama barang : VANS SLIP ON BLACK CLASSIC 7 us / 39 eur / 25 cm\nKondisi : Baru, belum pernah dipakai\nHarga : 485.000 nego\n\nKontak\nWA : 088233813908\nIG : @citradewiandafebrian\n\nLokasi : tlogoadi mlati sleman yogyakarta'

In [90]:
import re

def to_list(line):
    """Return every run of lowercase ASCII letters found in ``line``.

    Anything else (digits, underscores, punctuation, emoji, uppercase letters)
    acts as a separator, e.g. ``"no_wa/telp 08"`` -> ``["no", "wa", "telp"]``.
    """
    return re.findall("[a-z]+", line)


def word_refiner(keyword):
    """Map a raw caption key onto a canonical field name.

    Args:
        keyword: a normalised key as produced by ``Caption.to_dict``
            (lowercase, spaces replaced by underscores).

    Returns:
        One of ``"nama_barang"``, ``"keterangan"``, ``"ig"``, ``"harga_beli"``,
        ``"harga_jual"``, ``"lokasi"``, ``"kontak"``; or ``keyword`` unchanged
        when no rule applies.
    """
    tokens = to_list(keyword)

    # "alasan_dijual" (reason for selling) is a description. It must be tested
    # before the product-name rule, which would otherwise match on "dijual".
    if "alasan" in keyword:
        return "keterangan"

    # Price keys are tested before the product-name rule as well: "harga_jual"
    # contains "jual" and used to be misclassified as "nama_barang".
    if "harga" in keyword:
        return "harga_beli" if "beli" in keyword else "harga_jual"

    if any(x in keyword for x in ("nama", "dijual", "di_jual", "jual")):
        return "nama_barang"

    # "ket" stays a substring match on purpose: it also catches misspellings
    # of "keterangan".
    if any(x in keyword for x in ("keterangan", "kondisi", "ket", "spesifikasi")):
        return "keterangan"

    # "ig" must be a standalone token; as a substring it also matches
    # unrelated keys such as "original" or "design".
    if "ig" in tokens or "instagram" in keyword:
        return "ig"

    if any(x in keyword for x in ("lokasi", "alamat")):
        return "lokasi"

    # "wa" must be a standalone token too, otherwise "warna" (colour) would be
    # treated as a WhatsApp contact.
    if "wa" in tokens or any(x in keyword for x in ("kontak", "whatsapp", "minat_hub")):
        return "kontak"

    return keyword

class Caption:
    """Wrap one raw listing caption and expose it as parsed fields.

    Attributes:
        caption: the caption text as given (empty string for non-string input).
        text: the caption split on newline characters, one entry per line.
    """

    def __init__(self, text):
        """Store the caption and its individual lines.

        Args:
            text: the raw caption. Anything that is not a ``str`` (for example
                ``None`` or a NaN cell from a DataFrame) is treated as an
                empty caption instead of raising.
        """
        if not isinstance(text, str):
            text = ""
        self.caption = text
        self.text = text.split("\n")

    def to_dict(self):
        """Parse ``key : value`` lines into a dict.

        Each line is split on its first colon. Keys are stripped, lowercased
        and have spaces replaced by underscores; values are stripped and
        uppercased. For the key ``"ig"`` the value is lowercased and any
        leading ``@`` is removed. Lines with an empty key or an empty value
        are skipped. When a key repeats, the last occurrence wins.
        """
        instance = {}
        for line in self.text:
            raw_key, _, raw_value = line.partition(":")
            # Normalise before the emptiness check so a whitespace-only key
            # (e.g. " : value") is skipped rather than stored under "".
            key = raw_key.strip().lower().replace(" ", "_")
            value = raw_value.strip().upper()
            if not key or not value:
                continue
            if key == "ig":
                value = value.lower().lstrip("@")
            instance[key] = value
        return instance

    def to_list(self):
        """Return the caption's non-blank lines, each stripped of whitespace."""
        return [d.strip() for d in self.text if d.strip() != ""]

    def refine_dict(self):
        """Return the parsed fields as a list of single-entry dicts.

        Each entry maps ``word_refiner(key)`` to the value from ``to_dict``.
        Because entries are kept separate, several raw keys that refine to the
        same canonical name all survive.
        """
        instance = self.to_dict()
        return [{word_refiner(key): value} for key, value in instance.items()]

    def __str__(self):
        """Return the caption on one line, whitespace runs collapsed to a space."""
        # Raw string: "\s" in a normal string literal is an invalid escape.
        return re.sub(r"\s+", " ", self.caption).strip()
    
if __name__ == "__main__":

    # Walk every AVAIL caption and gather the refined key names. A dict is
    # used as an ordered set so each key is kept once, in first-seen order.
    keys = {}
    for i in avail.index:
        caption = Caption(avail.loc[i, 'caption'])
        new_keys = [next(iter(d)) for d in caption.refine_dict()]
        keys.update(dict.fromkeys(new_keys))
    keys = list(keys)
print(len(keys))
keys

242


['nama_barang',
 'keterangan',
 'harga_jual',
 'kontak',
 'ig',
 'lokasi',
 'warna',
 'size',
 'tinggi_heels',
 'minus',
 'bahan_meja',
 'bahan_kursi',
 'untuk_tanggal',
 'harga_beli',
 'kelengkapan',
 'note',
 'size_l',
 'shade',
 'size_s_(p',
 '-_detail_kain',
 '-_audio',
 'masa_pemakaian',
 'bonus',
 'kondiai',
 'fasilitas',
 'check-in',
 'check-out',
 '~_rangka_kaki',
 '~_kain',
 '~_bantalan_sandaran_tangan',
 '~_dudukan',
 '~_sandaran_tangan',
 '~_lebar_arm_rest',
 '~_tinggi_sandaran',
 '~_tinggi_dudukan',
 '~_tinggi_arm_rest',
 '~_dimensi_dudukan_kursi',
 '~_dimensi_bantal_recline_kecil',
 '~_dimensi_posisi_normal',
 '~_dimensi_posisi_recline',
 'bh',
 'kabupaten',
 'provinsi',
 'luas_tanah',
 'lebar_depan',
 'hadap',
 'sertipikat',
 'warna_body_sepeda',
 'baterai',
 'kecepatan',
 'jarak_tempuh',
 'dinamo',
 'ban',
 'ukuran_sepeda_(cm)',
 'lainnya',
 '1._monitor',
 '2._pross',
 '3._mobo',
 '4._vga',
 '5._ram',
 '6._ssd',
 '7._hd',
 '10._psu',
 '11._case',
 'fitur',
 '•minus',
 'u