In [1]:
import pandas as pd
import numpy as np
from nltk import word_tokenize
import nltk
import re, string, unicodedata
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from word2number import w2n
from ast import literal_eval
from nltk.corpus import wordnet
import qgrid

In [3]:
# Load the tokenized descriptions. low_memory=False makes pandas infer column
# dtypes from the whole file instead of chunk by chunk.
df = pd.read_csv('prepared_data_tokenized.csv', low_memory=False)
# The token lists come back from the CSV as strings; parse each one into a
# real Python list.
df['description_text'] = df['description_text'].map(literal_eval)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1220834 entries, 0 to 1220833
Data columns (total 2 columns):
description_text     1220834 non-null object
harmonized_number    1220834 non-null object
dtypes: object(2)
memory usage: 18.6+ MB


Unnamed: 0,description_text,harmonized_number
0,"[WOODWORKING, MACHINE, AND, SPARE, PARTS, H.S....",846591
1,"[WOODWORKING, MACHINE, AND, SPARE, PARTS, PO, ...",846591
2,"[STAND, ,, ZERO, CLEARANCE, THROAT, PLATE, ,, ...",846591
3,"[., ., ., ., ., ., ., .]",846591
4,"[., ., ., ., ., .]",846591


In [4]:
def to_lowercase(words):
    """Return a new list with every token converted to lowercase.

    Args:
        words: iterable of string tokens.

    Returns:
        list of str with the same length and order as ``words``.
    """
    return [token.lower() for token in words]

# Stage 1: lower-case every token of every description.
df['description_text'] = df['description_text'].map(to_lowercase)

In [5]:
def remove_non_ascii(words):
    """Strip non-ASCII characters from every token.

    Each token is NFKD-normalised first, so an accented letter decomposes
    into a base letter plus combining marks; everything that is not ASCII is
    then discarded. Tokens are never dropped: a token made only of non-ASCII
    characters becomes the empty string.

    Args:
        words: iterable of string tokens.

    Returns:
        list of str with the same length and order as ``words``.
    """
    def _ascii_only(token):
        decomposed = unicodedata.normalize('NFKD', token)
        ascii_bytes = decomposed.encode('ascii', 'ignore')
        return ascii_bytes.decode('utf-8', 'ignore')

    return [_ascii_only(token) for token in words]

# Stage 2: reduce every token to its ASCII form.
df['description_text'] = df['description_text'].map(remove_non_ascii)

In [6]:
def word_to_num(words, converter=None):
    """Replace spelled-out numbers with their digit string.

    Args:
        words: iterable of string tokens.
        converter: optional callable that maps a token to a number and
            raises when the token is not a number word. Defaults to
            ``w2n.word_to_num``.

    Returns:
        list of str with the same length and order as ``words``; tokens the
        converter rejects are passed through unchanged.
    """
    if converter is None:
        converter = w2n.word_to_num

    new_words = []
    for word in words:
        try:
            new_word = str(converter(word))
        except Exception:
            # Best effort: anything the converter cannot parse is kept as-is.
            # Catching Exception (not a bare `except:`) lets KeyboardInterrupt
            # and SystemExit still stop a long-running apply.
            new_word = word
        new_words.append(new_word)
    return new_words

# Stage 3: turn number words into digit strings.
df['description_text'] = df['description_text'].map(word_to_num)

In [7]:
def remove_non_alpha(words):
    """Keep only the tokens that consist entirely of alphabetic characters.

    Tokens containing digits, punctuation or whitespace, as well as empty
    strings, are dropped (``str.isalpha`` is False for all of them).

    Args:
        words: iterable of string tokens.

    Returns:
        list of the surviving tokens in their original order.
    """
    return [token for token in words if token.isalpha()]

# Stage 4: drop every token that is not purely alphabetic. This also discards
# the digit strings produced by word_to_num in the previous stage.
df['description_text'] = df['description_text'].map(remove_non_alpha)

In [8]:
def remove_punctuation(words):
    """Delete punctuation characters from every token.

    Any character that is neither a word character nor whitespace is removed.
    Tokens that end up empty are dropped from the result.

    Args:
        words: iterable of string tokens.

    Returns:
        list of the cleaned, non-empty tokens in their original order.
    """
    stripped = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped if token != '']

# Stage 5: strip punctuation from the tokens.
# NOTE(review): remove_non_alpha has already run, so every remaining token is
# purely alphabetic and this pass looks like a no-op - confirm before removing.
df['description_text'] = df['description_text'].map(remove_punctuation)

In [9]:
# Cache for the NLTK stop-word list so it is loaded once, not once per token.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, loaded on first use."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def remove_stopwords(words, stop_words=None):
    """Remove stop words from a list of tokenized words.

    Args:
        words: iterable of string tokens.
        stop_words: optional container of tokens to drop. Defaults to NLTK's
            English stop-word list, which is loaded once and cached.

    Returns:
        list of the tokens that are not stop words, in their original order.
    """
    if stop_words is None:
        # The original evaluated stopwords.words('english') for every token
        # and scanned the resulting list; the cached frozenset does one load
        # and constant-time membership tests.
        stop_words = _english_stopwords()
    return [word for word in words if word not in stop_words]

# Stage 6: drop English stop words.
df['description_text'] = df['description_text'].map(remove_stopwords)

In [14]:
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to use when lemmatizing ``word``.

    The word is tagged on its own with ``nltk.pos_tag``; the first letter of
    the resulting tag selects adjective (J), verb (V) or adverb (R). Nouns
    (N) and every other tag map to ``wordnet.NOUN``.
    """
    _token, tag = nltk.pos_tag([word])[0]
    initial = tag[0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and any unrecognised tag both resolve to noun.
    return wordnet.NOUN

def lemmatize(words):
    """Lemmatize every token in a list of tokenized words.

    Each token is lemmatized with the part of speech that
    ``get_wordnet_pos`` guesses for it (noun, verb, adjective or adverb).

    Args:
        words: iterable of string tokens.

    Returns:
        list of lemmas with the same length and order as ``words``.
    """
    wnl = WordNetLemmatizer()
    return [wnl.lemmatize(token, get_wordnet_pos(token)) for token in words]

# Stage 7: reduce every token to its WordNet lemma.
df['description_text'] = df['description_text'].map(lemmatize)

In [17]:
# Cache for the NLTK word list so the large vocabulary set is built only once.
_ENGLISH_VOCAB_CACHE = {}


def _english_vocabulary():
    """Return the NLTK ``words`` corpus as a frozenset, built on first use."""
    if 'words' not in _ENGLISH_VOCAB_CACHE:
        _ENGLISH_VOCAB_CACHE['words'] = frozenset(nltk.corpus.words.words())
    return _ENGLISH_VOCAB_CACHE['words']


def remove_non_english(words, vocabulary=None):
    """Remove tokens that are not in the English vocabulary.

    Args:
        words: iterable of string tokens.
        vocabulary: optional container of accepted tokens. Defaults to the
            NLTK ``words`` corpus, which is loaded once and cached.

    Returns:
        list of the tokens found in the vocabulary, in their original order.
    """
    if vocabulary is None:
        # The original rebuilt set(nltk.corpus.words.words()) on every call,
        # i.e. once per DataFrame row; the cached set is built a single time.
        vocabulary = _english_vocabulary()
    return [word for word in words if word in vocabulary]

# Stage 8: keep only tokens that appear in the NLTK English word list.
df['description_text'] = df['description_text'].map(remove_non_english)

In [21]:
# Build a token-frequency table and browse it interactively, to pick the
# domain-specific noise words filtered out in the next step.
df2 = df.copy()
df2['description_text'] = df2['description_text'].apply(' '.join)
# regex=False: '[' and ']' are regex metacharacters, and the default of the
# `regex` argument differs between pandas versions. These are meant as
# literal characters.
df2['description_text'] = (
    df2['description_text']
    .str.replace(',', '', regex=False)
    .str.replace('[', '', regex=False)
    .str.replace(']', '', regex=False)
)
mf = pd.Series(' '.join(df2['description_text']).lower().split()).value_counts()
qgrid.show_grid(mf)

QgridWidget(grid_options={'fullWidthRows': True, 'syncColumnCellResize': True, 'forceFitColumns': True, 'defau…

In [22]:
# Domain-specific noise tokens (shipping/customs boilerplate, colours, country
# names, single letters). Built once as a frozenset so membership tests are
# constant-time; duplicate entries in the literal are harmless.
_NOISE_WORDS = frozenset([
    'hs','code','hts','invoice','pallet','kg','certify','po','pack','industry','office','expiration',
    'voyage','cargo','clearance','date','onto','loading', 'de', 'appliance','en','ca','un',
    'freight','package', 'load','shipper','qty', 'net','contain','container','order','number',
    'weight','contract','carrier','shipment','dhl', 'notify','collect','shipper','certified',
    'pc','forwarding','delivery','note', 'ref', 'packed', 'gross','product','loaded',
    'piece','export','of','and','no','po','on','for','nw','number','article','classification',
    'tariff','china','brazil','argentine','date','new','unpacked','limited','traffic','pay',
    'ctn','nr','no','name','am','serial','ex','exceed','payable','regulate','fi','exclude',
    'blk','gr','international','fca','due','eta','etd','accordance','deliver',
    'stc','gross','order','pcs','total','per','the','prepaid','consist','distribution',
    'plt','contain','equipment','capacity','class','white','description','approve',
    'shipped','nos','ncm','tariff','category','ready','license','violation',
    'country','count','nesoi','ncm','ruc','complete','id','quantity','regulation',
    'destination','nac','pkg','declare','declared','fax','cargo','transportation',
    'commercial','contact','nvocc','nbr', 'prepaid','brand','continuation',
    'hc','mm','customer','orange','violet','rose','company',
    'imo','sc','tsca','cm','hscode','harmless','applicable',
    'po','ship','nw','y','pkg', 'esd','email','sc','pack','detail','transport',
    'banq','origin','pa','charge','account','via',
    'rate', 'package','certify','container', 'consignee','declare',
    'purchase','payment','bill','abroad','express','certify',
    'red','green','black','grey','blue','yellow','white','account','charge','local',
    'date','id','name','item','customer','lot','duty','invoice','commodity',
    'address','comply','say','mexico','canada','japan','uk','germany','france',
    'shipping','exporter', "import", "information", "declaration",'supplier','release',
    'loader','agreement','liability','compliant', 'registration','compliance','weigh',
    'foreign', 'corporation','discharge','certificate','detention','logistics','vessel',
    'transit','temperature','quality','transfer','impact','requirement','global','tax','reception',
    'manufacturer','carriage','enterprise','trading','emergency','free','description','agreement','terminal',
    'a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z',
])


def remove_noise(words):
    """Remove domain-specific noise tokens from a list of tokenized words.

    Args:
        words: iterable of string tokens.

    Returns:
        list of the tokens that are not in the noise vocabulary, in their
        original order.
    """
    # The original rebuilt the list on every call and scanned it linearly for
    # each token; the shared frozenset gives the same result without that cost.
    return [word for word in words if word not in _NOISE_WORDS]

# Stage 9: drop the domain-specific noise tokens.
df['description_text'] = df['description_text'].map(remove_noise)

In [23]:
# Drop rows whose token list ended up empty after cleaning and renumber the
# index from 0. reset_index(drop=True) discards the old index directly instead
# of materialising it as an 'index' column and dropping it again.
df = df[df['description_text'].map(len) > 0].reset_index(drop=True)

# NOTE: the CSV stores each token list as text; a reader has to parse the
# column back into lists (as the load cell at the top does with literal_eval).
df.to_csv('final_lemmatized.csv', index=False)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1120838 entries, 0 to 1120837
Data columns (total 2 columns):
description_text     1120838 non-null object
harmonized_number    1120838 non-null object
dtypes: object(2)
memory usage: 17.1+ MB


Unnamed: 0,description_text,harmonized_number
0,"[woodworking, machine, spare, part]",846591
1,"[woodworking, machine, spare, part, spare, par...",846591
2,"[stand, throat, plate, glide, pad]",846591
3,"[garment, men, soccer, sock, soccer, sock, jun...",611595
4,"[cover, connect, prestige]",611595


In [21]:
# NOTE(review): this cell repeats the save/inspect statements of the previous
# cell, and its execution count (In [21]) is out of order. Its recorded output
# shows a different row count, with description_text rendered as quoted
# strings, so it appears to have been run against a different `df` state.
# Confirm whether it is stale and can be removed so that
# Restart & Run All reproduces a single, well-defined final_lemmatized.csv.
df.to_csv('final_lemmatized.csv', index=False)
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1125750 entries, 0 to 1125749
Data columns (total 2 columns):
description_text     1125750 non-null object
harmonized_number    1125750 non-null object
dtypes: object(2)
memory usage: 17.2+ MB


Unnamed: 0,description_text,harmonized_number
0,"['woodworking', 'machine', 'spare', 'part']",846591
1,"['woodworking', 'machine', 'spare', 'part', 's...",846591
2,"['stand', 'throat', 'plate', 'glide', 'pad']",846591
3,"['garment', 'men', 'soccer', 'sock', 'soccer',...",611595
4,"['cover', 'connect', 'prestige']",611595
