# Reading in the data

In [None]:
import os
from google.colab import drive

# Mount Google Drive inside the Colab VM, then make "My Drive" the working
# directory so the relative file names opened later resolve against it.
drive.mount('/content/drive')
os.chdir("/content/drive/My Drive/")

Mounted at /content/drive


In [None]:
import csv

# Every row of the CSV is kept as a list of strings, in file order.
sample901 = []

# newline='' is what the csv module asks for when opening a file for reading:
# it lets the reader itself interpret line breaks, including those embedded
# inside quoted fields.
with open('s901_corr.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    sample901.extend(csv_reader)

In [None]:
len(sample901)

15058

In [None]:
# Names for the seven fields of each sample901 row, in the order they appear in a row.
col_names = ['uuid', 'origin_id', 'description', 'rasul_cleaned_description', 'hs_code', 'cleaned_description', 'noised_description']

In [None]:
print(len(sample901), "rows imported")
sample901[0]

15058 rows imported


['890abe46-f7a3-4054-b88e-88f1a1b153e7',
 '1985872',
 'CERAMIC HARDWARE :   40MM PORCELAIN FLOW ER KNOB WHITE MOTIF BLACK',
 'ceramic hardware porcelain flower knob white motif black',
 '6914100000',
 'ceramic hardware mm porcelain flow er knob white motif black',
 'ceramic hardwear mm porcelian flow r knob wite motif blak']

# Dividing into train, dev, and test sets
We will use a 60% / 20% / 20% split.

In [None]:
import random

# Shuffle the rows in place with a fixed seed before cutting the list, so the
# three partitions are random but repeatable on a fresh kernel.
random.seed(0)
random.shuffle(sample901)

# Cut points for the 60% / 20% / 20% split (shuffling does not change the length).
split1 = int(len(sample901) * 0.6)
split2 = int(len(sample901) * 0.8)

train901, dev901, test901 = (
    sample901[:split1],
    sample901[split1:split2],
    sample901[split2:],
)

In [None]:
print(len(train901),"rows in train")
print(len(dev901),"rows in dev")
print(len(test901),"rows in test")
train901[0]

1276078 rows in train
425360 rows in dev
425360 rows in test


['a68d46ce-7d18-473b-94ea-d8e893c2c1bc',
 '546647',
 '100% COTTON WOVEN MENS COAT',
 'cottonwoven mens coat',
 '6201190000',
 'cotton woven mens coat',
 'cotten wvoen mens oat']

# Preprocessing

When preprocessing the description, we will
* lowercase the text
* replace every character that is not a standard Latin letter (a-z) with a space
* normalize whitespace (collapse runs of whitespace into single spaces and trim the ends)

In [None]:
def preprocess(description):
  """Normalize a raw description to lowercase a-z words separated by single spaces.

  The text is lowercased, every character that is not one of the 26 letters
  a-z (digits, punctuation, whitespace, accented letters, ...) is turned into
  a space, and the result is re-joined so that words are separated by exactly
  one space with no leading or trailing whitespace.
  """
  allowed = 'abcdefghijklmnopqrstuvwxyz'

  # Keep allowed letters, blank out everything else
  letters_only = ''.join(
      symbol if symbol in allowed else ' '
      for symbol in description.lower()
  )

  # Collapse whitespace runs and trim the ends
  return " ".join(letters_only.split())

#Example
print(preprocess('FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd Table'))
print(preprocess('READYMADE GARMENTS BOYS INFANT F/S SUIT 100% COTTON STYLE NO. 41156 SIZE 0 TO 24 MONTS (PURPLE CEATION GSTIN 27AAAC'))

furniture aluminium alu iron tripod rd table
readymade garments boys infant f s suit cotton style no size to monts purple ceation gstin aaac


# Noising

In [None]:
import numpy as np

# Pool of characters used for random insertions.
letters = list('abcdefghijklmnopqrstuvwxyz')

def introduce_simple_noise(rows, threshold=0.9):
    """Return a character-level noised copy of the third field of every row.

    For each character of ``row[2]`` a uniform draw in [0, 1) is compared with
    ``threshold``; below it the character is copied unchanged. Otherwise a
    second draw picks the kind of typo:

    * > 0.67  -> swap the character with the next one (at the last character
                 nothing is emitted and both draws are simply repeated),
    * < 0.33  -> insert a random letter from ``letters`` before the character,
    * else    -> drop the character.

    Args:
        rows: iterable of indexable rows; only ``row[2]`` is read.
        threshold: probability-like cut-off for leaving a character untouched.

    Returns:
        A list with one noised string per input row, in input order.
    """
    noisy_rows = []
    for row in rows:
        sentence = row[2]
        last_index = len(sentence) - 1
        out_chars = []
        pos = 0
        while pos < len(sentence):
            keep_draw = np.random.uniform(0, 1, 1)
            if keep_draw < threshold:
                out_chars.append(sentence[pos])
                pos += 1
                continue

            kind_draw = np.random.uniform(0, 1, 1)
            if kind_draw > 0.67:
                if pos == last_index:
                    # No neighbour to swap with: retry this position with fresh draws.
                    continue
                out_chars.extend((sentence[pos + 1], sentence[pos]))
                pos += 2
            elif kind_draw < 0.33:
                out_chars.append(np.random.choice(letters, 1)[0])
                out_chars.append(sentence[pos])
                pos += 1
            else:
                # Deletion: skip the character without emitting anything.
                pos += 1
        noisy_rows.append("".join(out_chars))

    return noisy_rows

In [None]:
def noise_deletion(description, frequency = 1):
  """Introduce deletion typos: remove up to `frequency` randomly chosen characters.

  Each of the `frequency` rounds deletes one character at a uniformly random
  position. If the text runs out of characters the remaining rounds are
  skipped, so the result is never shorter than the empty string.

  Values that are not strings (for example None or NaN cells) are returned
  unchanged. Unlike a blanket `except`, genuine errors such as a missing
  `random` import are no longer hidden.

  Uses the module-level `random` module, so seed it for repeatable output.
  """
  if not isinstance(description, str):
    return description
  for _ in range(frequency):
    if not description:
      # Nothing left to delete.
      break
    delete_index = random.randrange(len(description)) #selects which character to delete
    description = description[:delete_index] + description[delete_index+1:]
  return description

#Example
print(noise_deletion('furniture aluminium alu iron tripod rd table'))
print(noise_deletion('furniture aluminium alu iron tripod rd table',3))

furniture aluminium alu iron tripod rd table
furniture aluminium alu iron tripod rd table


In [None]:
def noise_switch(description, frequency = 1):
  """Introduce transposition typos: swap a random character with its right neighbour.

  The swap is repeated `frequency` times; each round picks a fresh position,
  so later swaps may move or undo earlier ones. Text shorter than two
  characters has no adjacent pair and is returned unchanged. The length of
  the text never changes.

  Uses the module-level `random` module, so seed it for repeatable output.
  """
  if len(description) < 2:
    return description
  # Highest valid start index is len - 2, i.e. randrange(len - 1).
  pair_count = len(description) - 1
  for _ in range(frequency):
    switch_index = random.randrange(pair_count) #selects which character to switch with the next
    description = (description[:switch_index]
                   + description[switch_index+1]
                   + description[switch_index]
                   + description[switch_index+2:])
  return description

#Example
print(noise_switch('furniture aluminium alu iron tripod rd table'))
print(noise_switch('furniture aluminium alu iron tripod rd table',3))

furniture aluminium alu iron tripod rd table
furniture aluminium alu iron tripod rd table


In [None]:
#Kaggle dataset for commonly misspelt words.
#https://www.kaggle.com/bittlingmayer/spelling?select=birkbeck.txt
import random
# Maps the text before the first ':' on each line to the list of
# whitespace-separated tokens that follow it.
kaggle_misspell = {}

with open('birkbeck.txt', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=':')
    for row in csv_reader:
      if len(row) < 2:
        # Blank line or a line without ':' -- nothing to record.
        continue
      kaggle_misspell[row[0]] = row[1].split()

def noise_kaggle(description, frequency = None):
  """Replace words of `description` with misspellings from `kaggle_misspell`.

  Words are looked up in lowercase. A word that has an entry is replaced by
  one of its listed misspellings, chosen at random; words without an entry
  (or with an empty entry) are kept exactly as written.

  Args:
    description: text to noise; it is split on whitespace.
    frequency: None to consider every word. Otherwise the number of word
      positions drawn at random as replacement candidates; values larger than
      the number of words are clamped to it and negative values count as 0.
      A drawn position only changes if its word has a known misspelling, so
      fewer than `frequency` words may actually be replaced.

  Returns:
    The words re-joined with single spaces (original whitespace runs are
    therefore collapsed).
  """
  og_words = description.split()
  words = description.lower().split()
  if frequency is None:
    candidate_indexes = range(len(og_words))
  else:
    # random.sample raises ValueError if asked for more items than exist.
    sample_size = max(0, min(frequency, len(og_words)))
    candidate_indexes = set(random.sample(range(len(og_words)), k=sample_size))
  for i, word in enumerate(words):
    if i not in candidate_indexes:
      continue
    misspellings = kaggle_misspell.get(word)
    if misspellings:
      og_words[i] = random.choice(misspellings)
  return " ".join(og_words)

#Example
print(noise_kaggle('FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd Table'))
print(noise_kaggle('FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd Table'))
print(noise_kaggle('FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd Table'))
print(noise_kaggle('FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd Table'))
print(noise_kaggle('FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd Table'))

print(noise_kaggle('FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd Table', 0))
print(noise_kaggle('FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd Table', 1))
print(noise_kaggle('FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd Table', 2))
print(noise_kaggle('FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd Table', 3))
print(noise_kaggle('FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd Table', 4))

FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd tabel
FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd tabl
FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd tabel
FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd tabl
FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd tasble
FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd Table
FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd Table
FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd Table
FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd tabel
FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd tabel


In [None]:
def noise_description(description, del_freq = 1, switch_freq = 1):
  """Return a noised description built from all three typo generators.

  The stages run in a fixed order: `noise_deletion` with `del_freq`, then
  `noise_switch` with `switch_freq`, and finally `noise_kaggle` with its
  default arguments on the result.
  """
  with_deletions = noise_deletion(description, del_freq)
  with_switches = noise_switch(with_deletions, switch_freq)
  return noise_kaggle(with_switches)

#Example
print(noise_description('FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd Table'))
print(noise_description('FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd Table'))
print(noise_description('FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd Table'))
print(noise_description('FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd Table'))
print(noise_description('FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd Table'))
print(noise_description('READYMADE GARMENTS BOYS INFANT F/S SUIT 100% COTTON STYLE NO. 41156 SIZE 0 TO 24 MONTS (PURPLE CEATION GSTIN 27AAAC'))
print(noise_description('READYMADE GARMENTS BOYS INFANT F/S SUIT 100% COTTON STYLE NO. 41156 SIZE 0 TO 24 MONTS (PURPLE CEATION GSTIN 27AAAC'))
print(noise_description('READYMADE GARMENTS BOYS INFANT F/S SUIT 100% COTTON STYLE NO. 41156 SIZE 0 TO 24 MONTS (PURPLE CEATION GSTIN 27AAAC'))
print(noise_description('READYMADE GARMENTS BOYS INFANT F/S SUIT 100% COTTON STYLE NO. 41156 SIZE 0 TO 24 MONTS (PURPLE CEATION GSTIN 27AAAC'))
print(noise_description('READYMADE GARMENTS BOYS INFANT F/S SUIT 100% COTTON STYLE NO. 41156 SIZE 0 TO 24 MONTS (PURPLE CEATION GSTIN 27AAAC'))

FURNITURE (ALUINIUM)- Alu/Iron Tripod Rd tabel
FURNITUER (ALUMINIUM) - Alu/Iron ripod Rd tasble
FURNITURE (ALUMINIUM) - Alu/Iron Tripod R dTble
FURNITURE (ALUINIUM)- Alu/Iron Tripod Rd tabel
FURNITURE (ALUMIIUM) - Alu/Iron Tripod dR tabl
READYMADE GARMENTS boyes enfant F/S sute 100% cotten stile NO. 41156 sise 0 tow 2 MONTS (PUPRLE CEATION GSTIN 27AAAC
READYMADE GARMENTS boy enfant F/S suite 100% cotten STYL N.O 41156 sise 0 the 24 MONTS (PURPLE CEATION GSTIN 27AAAC
READYMADE GRMENTS boy INFAN TF/S suite 100% cotten stile NO. 41156 sise 0 ot 24 MONTS (PURPLE CEATION GSTIN 27AAAC
READYMAD GARMENTS boyes enfant F/S sute 100% cotten stile NO. 41156 sixze 0 ro 24 MONTS( pulper CEATION GSTIN 27AAAC
READYMADE GARMENTS boyes enfant F/S sute 100% cotten STYL NO. 41156 sise 0 OT 24 MONTS (PURPLE CEATION GSTIN 27AAAC


In [None]:
def random_noise_description(description):
  """Noise a description with randomly chosen typo counts.

  Three counts are drawn first: a number of word positions for
  `noise_kaggle` (from 0 up to one less than the word count), and 0 or 1 each
  for `noise_switch` and `noise_deletion`. The generators are then applied in
  that order, so a description may come back unchanged.

  A description without any words (empty or whitespace only) is returned
  as is, because there is nothing to draw a word count from.
  """
  word_count = len(description.split())
  if word_count == 0:
    # random.randrange(0) would raise ValueError.
    return description
  freqs = [random.randrange(word_count), random.randrange(2), random.randrange(2)]
  noised_description = noise_kaggle(description, frequency = freqs[0])
  noised_description = noise_switch(noised_description, frequency = freqs[1])
  noised_description = noise_deletion(noised_description, frequency = freqs[2])
  return noised_description

#Example  
print(random_noise_description('FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd Table'))
print(random_noise_description('FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd Table'))
print(random_noise_description('FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd Table'))
print(random_noise_description('FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd Table'))
print(random_noise_description('FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd Table'))
print(random_noise_description('READYMADE GARMENTS BOYS INFANT F/S SUIT 100% COTTON STYLE NO. 41156 SIZE 0 TO 24 MONTS (PURPLE CEATION GSTIN 27AAAC'))
print(random_noise_description('READYMADE GARMENTS BOYS INFANT F/S SUIT 100% COTTON STYLE NO. 41156 SIZE 0 TO 24 MONTS (PURPLE CEATION GSTIN 27AAAC'))
print(random_noise_description('READYMADE GARMENTS BOYS INFANT F/S SUIT 100% COTTON STYLE NO. 41156 SIZE 0 TO 24 MONTS (PURPLE CEATION GSTIN 27AAAC'))
print(random_noise_description('READYMADE GARMENTS BOYS INFANT F/S SUIT 100% COTTON STYLE NO. 41156 SIZE 0 TO 24 MONTS (PURPLE CEATION GSTIN 27AAAC'))
print(random_noise_description('READYMADE GARMENTS BOYS INFANT F/S SUIT 100% COTTON STYLE NO. 41156 SIZE 0 TO 24 MONTS (PURPLE CEATION GSTIN 27AAAC'))


print(random_noise_description('cotton woven mens coat'))
print(random_noise_description('cotton woven mens coat'))
print(random_noise_description('cotton woven mens coat'))
print(random_noise_description('cotton woven mens coat'))
print(random_noise_description('cotton woven mens coat'))
print(random_noise_description('cotton woven mens coat'))

FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd bale
FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd Table
FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd Table
FURNITURE (ALUMINIUM) - Alu/Iron Tripod Rd Table
FURINTURE (ALUMINIUM) - Alu/Iron Tripod Rd Table
READYMADE GARMENTS boyes enfant F/S SUIT 100% cotten stile NO. 41156 sixze 0 ro 24 MONTS (PURPLE CEATION GSTIN 27ACA
READYMADE GARMENTS boy INFANT F/S suite 100% cotten STYLE NO. 41156 SIZE 0 of 24 MONTS (PURPLE CEATION GSTIN 27AAAC
READYMADE GARMENTS boyes enfant F/S suite 100% cotten STYLE NO. 4116 sise 0 th 24 MONTS (PURPLE CEATION GSTIN 27AAAC
READYMADE GARMENTS boyes enfant F/S SUIT 100% cotten stile NO. 41156 sixze 0 two 24 MONTS (PURPLE CEATON GSTIN 27AAAC
READYMADE GARMENTS BOYS INFANT F/S SUIT 100% COTTON STYL NO. 41156 SIZE 0 TO 24 MONTS (PURPLE CEATION GSTIN 27AAAC
cotton wove nmens ct
cotto nwoven mens coat
cotten owven mens coat
coton woven mens coat
cotetn woven mens coat
cotton woven mens coat


# Noised training dataset

The noised descriptions serve as data augmentation for the existing descriptions. They also provide more stable typo-to-correct word pairings.

In [None]:
import pandas as pd

# train901 rows have seven fields, so exactly seven column names are passed;
# a longer list makes the DataFrame constructor raise ValueError.
noised901_1 = pd.DataFrame(train901, columns = ['uuid', 'origin_id', 'description', 'rasul_cleaned_description', 'hs_code', 'cleaned_description', 'noised_description'])
noised901_1 = noised901_1[noised901_1['description'].notna()]

# Three independent copies that each get a different 'description' variant:
#   _1 raw description, _2 cleaned description,
#   _3 noised raw description, _4 noised cleaned description.
noised901_2 = noised901_1.copy(deep=True)
noised901_3 = noised901_1.copy(deep=True)
noised901_4 = noised901_1.copy(deep=True)

noised901_2['description'] = noised901_1['cleaned_description']

noised901_3['description'] = noised901_3['description'].apply(noise_description)

noised901_4['description'] = noised901_4['cleaned_description'].apply(noise_description)

print(noised901_1.shape,noised901_2.shape,noised901_3.shape,noised901_4.shape)

In [None]:
# Stack the four description variants row-wise into one training frame.
noised901 = pd.concat(
    (noised901_1, noised901_2, noised901_3, noised901_4),
    axis=0,
)
print(noised901.shape)

(737916, 8)


In [None]:
import pandas as pd

# One DataFrame row per sample901 row, with the seven fields named in order.
rng_noised901 = pd.DataFrame(
    data=sample901,
    columns=[
        'uuid',
        'origin_id',
        'description',
        'rasul_cleaned_description',
        'hs_code',
        'cleaned_description',
        'noised_description',
    ],
)

In [None]:
rng_noised901.head()

Unnamed: 0,uuid,origin_id,description,rasul_cleaned_description,hs_code,cleaned_description,noised_description
0,890abe46-f7a3-4054-b88e-88f1a1b153e7,1985872,CERAMIC HARDWARE : 40MM PORCELAIN FLOW ER KN...,ceramic hardware porcelain flower knob white m...,6914100000,ceramic hardware mm porcelain flow er knob whi...,ceramic hardwear mm porcelian flow r knob wite...
1,c02296c5-8b30-4215-8c80-826ea61c7c03,358953,READYMADE GARMENTS - 100% COTTON WOVEN P/L LA...,readymadegarments cottonwoven ladies dress,6204420090,readymade garments cotton woven p l ladies dress,readymade garmet scotton woven p l adies dres
2,e3ac9e71-96f6-42ea-a213-10a3877df4f1,1543075,SPARE PARTS-- (LIGHT ACC)(1.7 KGS),spareparts light acc kgs,9405990090,spare parts light acc kgs,pare parts ligth acc kgs
3,8c308910-e5cb-4943-b70d-67cfed4e2a82,662266,STYLISH KID S COTTON SPANDEX CLOTHING SET,stylish kid cotton spandex clothing set,6204228090,stylish kid s cotton spandex clothing set,stylis kid s cotten spandex clotihng sat
4,23691adb-cf44-4e9a-8111-1a1e408a50b9,1833319,BRAKE FRICTION PLATE (PART NO. AC-738) (TRACTO...,brake friction plate part tractor parts,8708309990,brake friction plate part no ac tractor parts,brakke frictin blate pard on ac tracto rparts


In [None]:
# Overwrite the noised_description values loaded from the file with freshly
# generated noise, computed row by row from cleaned_description.
rng_noised901['noised_description'] = rng_noised901['cleaned_description'].apply(random_noise_description)

In [None]:
rng_noised901.head()

Unnamed: 0,uuid,origin_id,description,rasul_cleaned_description,hs_code,cleaned_description,noised_description
0,890abe46-f7a3-4054-b88e-88f1a1b153e7,1985872,CERAMIC HARDWARE : 40MM PORCELAIN FLOW ER KN...,ceramic hardware porcelain flower knob white m...,6914100000,ceramic hardware mm porcelain flow er knob whi...,ceramic hardwear mm porcelain flow er knob whi...
1,c02296c5-8b30-4215-8c80-826ea61c7c03,358953,READYMADE GARMENTS - 100% COTTON WOVEN P/L LA...,readymadegarments cottonwoven ladies dress,6204420090,readymade garments cotton woven p l ladies dress,readymade garments cotton woven p l ladies dress
2,e3ac9e71-96f6-42ea-a213-10a3877df4f1,1543075,SPARE PARTS-- (LIGHT ACC)(1.7 KGS),spareparts light acc kgs,9405990090,spare parts light acc kgs,ster parts ight acc kgs
3,8c308910-e5cb-4943-b70d-67cfed4e2a82,662266,STYLISH KID S COTTON SPANDEX CLOTHING SET,stylish kid cotton spandex clothing set,6204228090,stylish kid s cotton spandex clothing set,stylish kid s cotton spandex clothing set
4,23691adb-cf44-4e9a-8111-1a1e408a50b9,1833319,BRAKE FRICTION PLATE (PART NO. AC-738) (TRACTO...,brake friction plate part tractor parts,8708309990,brake friction plate part no ac tractor parts,brake friction plate art no ac tractor parts


In [None]:
rng_noised901.shape

(15058, 7)

In [None]:
# Build a second copy whose noised_description is simply the cleaned text,
# then stack it under the noised rows so every description appears both
# with and without typos.
# NOTE: rng_noised901 is reassigned from itself, so re-running this cell
# appends yet another copy; run it once per fresh kernel.
rng_noised901_2 = rng_noised901.copy(deep=True)
rng_noised901_2['noised_description'] = rng_noised901_2['cleaned_description']
rng_noised901 = pd.concat([rng_noised901,rng_noised901_2], axis=0)

rng_noised901.head()

Unnamed: 0,uuid,origin_id,description,rasul_cleaned_description,hs_code,cleaned_description,noised_description
0,a68d46ce-7d18-473b-94ea-d8e893c2c1bc,546647,100% COTTON WOVEN MENS COAT,cottonwoven mens coat,6201190000,cotton woven mens coat,cotte nwoven mens cot
1,ba263c49-82c9-41c0-872d-70e82138b840,1058388,PISTON - 11.03455-4274 [COMPONENTS FOR MARINE ...,piston components marine diesel engine,8409990090,piston components for marine diesel engine,piston componnts for marine diesel engnie
2,f0992bf1-f7c9-4b5e-acb0-53c000b8babe,1153247,"SPARE PARTS FOR EXCAVATOR - PIPEFUEL,FUEL FILT...",spareparts excavator pipe fuel fuel filter lea...,8431498000,spare parts for excavator pipefuel fuel filter...,spare parts for excavator pipefuel fl filter l...
3,66ae8795-e482-483b-bea2-6d0b869c3059,1775737,Parts Of Automobile Steering System: P. S. G. ...,parts automobile steering system box bar order...,8708949900,parts of automobile steering system p s g box ...,parts of automobile steering syste p s g bax r...
4,af91342f-af2d-4a92-9fee-dfe62a69e475,1823954,"AUTO SPARE PARTS PANEL ASSY-FRONT DOOR,R",autospareparts panel assy frontdoor,8708999790,auto spare parts panel assy front door r,outo esr parts panel assy front do r


In [None]:
rng_noised901.shape

(4253596, 7)

In [None]:
# Shuffle all rows; a fixed random_state makes the row order repeatable.
rng_noised901 = rng_noised901.sample(frac = 1, random_state = 0)

In [None]:
rng_noised901.shape

(4253596, 7)

In [None]:
rng_noised901.head()

Unnamed: 0,uuid,origin_id,description,rasul_cleaned_description,hs_code,cleaned_description,noised_description
136931,1124409f-47dd-4f51-a759-d44c2e7b4b18,1665424,EMPTY HOUSING FOR PANEL 12W SQ / RING / BOTTOM...,empty housing panel ring bottom set,9405990090,empty housing for panel w sq ring bottom set,empty housing for panel w sq ring bottom set
864751,e414757c-5277-4b9c-908a-8ea4a78baf40,1781139,PARTS FOR AGRICULTURAL TRACTORS -CONTROL VALVE...,parts agricultural tractors control valve slee...,8708999790,parts for agricultural tractors control valve ...,parts fre agricultura tractosr contioll valve ...
553431,36a555c6-2520-4385-a14d-d777205d83f7,153413,SUIT CASE MADE OF PP WITHOUT ALUMINIUM F RAME ...,suitcase made without aluminium rame samsonite...,4202129990,suit case made of pp without aluminium f rame ...,suit case made of pp without aluminium f rame ...
2112376,f4e04f9e-7440-42a9-9ee7-d540eed0669c,1108865,"THRUST BALL BEARING, 51128 (NET WEIGHT 1.850 K...",thrust ball bearing net weight for ndustrial use,8482109090,thrust ball bearing net weight kgs pcs for i n...,thrust boll bearing net weght kgs pcs for i nd...
1961397,19729e15-c3f9-4f67-b92a-6b45da1de48d,1811336,AUTO PARTS - RING GEAR,autoparts ring gear,8708999790,auto parts ring gear,ato parst ringgear


In [None]:
# Without a path, to_csv only returns the CSV text as a string and writes
# nothing to disk, so a target file is given explicitly.
# NOTE(review): the file name is a placeholder -- confirm the intended output path.
rng_noised901.to_csv('rng_noised901.csv', index=False)