# Data Preprocessing for Text Classification with MUSE Embeddings
## Contents:
1. Get MUSE Embeddings
2. Clean, tokenize and save train and test data


In [1]:
import numpy as np
import pandas as pd
import nltk
import re

In [2]:
# Run the helper shell script that fetches the MUSE vectors; per its log below it
# downloads the English and Russian wiki.multi files (~600 MB each).
!sh get_embeddings.sh

*** Looking for MUSE embeddings...
*** Downloading English MUSE embeddings
--2020-05-10 18:26:22--  https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.en.vec
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 104.22.75.142
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 628614720 (599M) [text/plain]
Saving to: <<./muse_embeddings/wiki.multi.en.vec>>


2020-05-10 18:45:56 (524 KB/s) - <<./muse_embeddings/wiki.multi.en.vec>> saved [628614720/628614720]

*** Downloading Russian MUSE  embeddings
--2020-05-10 18:45:56--  https://dl.fbaipublicfiles.com/arrival/vectors/wiki.multi.ru.vec
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.75.142, 104.22.74.142
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 630591038 (601M) [text/plain]
Saving to

In [3]:
# Raw input files (tab-separated).
TRAIN_DATA = "train_data.tsv"
TEST_DATA = "test_data.tsv"

# Cleaned and tokenized outputs written by this notebook.
CLEAN_TRAIN_DATA = "clean_train_data.csv"
CLEAN_TEST_DATA = "clean_test_data.csv"

In [4]:
def clean_text(t):
    """Lower-case a string and remove punctuation from it.

    Args:
        t: text to clean (str).
    Returns:
        The lower-cased text with Windows line breaks ('\\r\\n') replaced by
        a space, the punctuation characters listed in ``mask`` removed, and
        leading/trailing whitespace stripped. Digits are preserved.
    """
    t = t.lower()
    t = t.replace('\r\n', ' ')
    # The hyphen must be escaped: an unescaped '+-_' inside a character class
    # is the range 0x2B-0x5F, which also deletes every digit and ':;<>?[]'.
    # The backslash is listed explicitly so it is still removed.
    mask = r'[",.`%$#@&*()!|^~/\\=+\-_]'
    t = re.sub(mask, '', t)
    return t.strip()

In [5]:
def textTokens(sent):
    """Tokenize a string with NLTK's word tokenizer.

    Args:
        sent: text to split.
    Returns:
        A new list holding the tokens in the order the tokenizer yields them.
    """
    return list(nltk.word_tokenize(sent))

### Read train data and label classes

In [6]:
# Load the full train set: tab-separated file, first column used as the row index.
df = pd.read_csv(TRAIN_DATA, sep='\t', index_col=0)
# Summarise columns/dtypes, then preview five random rows.
df.info()
df.sample(5)

  mask |= (ar1 == a)


<class 'pandas.core.frame.DataFrame'>
Int64Index: 3983203 entries, 0 to 3983202
Data columns (total 3 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   ru_name   object
 1   eng_name  object
 2   answer    bool  
dtypes: bool(1), object(2)
memory usage: 95.0+ MB


Unnamed: 0,ru_name,eng_name,answer
539901,"ООО ""САТУРН""","Limited liability company ""K.I.K.""",False
3709679,"Общество с ограниченной ответственностью ""АйДи...","LLC ""iD-Tech""",True
1591731,"Общество с ограниченной ответственностью ""Прим...","""Adamant Estate""",False
285589,"ООО ""Арт Вью""","""EveregServer""",False
659982,"Общество с ограниченной ответственностью ""Бухг...",Chaste taste,False


In [7]:
# Create a 0/1 'label' column from the boolean 'answer' column and drop the original.
# A vectorized cast replaces the per-row Python lambda; the resulting int64 column
# and column order are the same.
df = df.assign(label=df['answer'].astype('int64')).drop(columns='answer')

In [8]:
def clean_tokenize(df):
    """Clean and tokenize the company-name columns of a data frame.

    The 'ru_name' and 'eng_name' columns are cleaned with ``clean_text`` and
    split with ``textTokens``; the token lists are stored in new 'ru_tocks'
    and 'eng_tocks' columns and the raw name columns are dropped.

    Args:
        df: data frame with 'ru_name' and 'eng_name' string columns.
            It is not modified.
    Returns:
        A new data frame with every other column of ``df`` followed by
        'ru_tocks' and 'eng_tocks'.
    """
    # Work on derived Series instead of assigning back into ``df`` so the
    # caller's frame keeps its original columns and values.
    ru_clean = df['ru_name'].apply(clean_text)
    eng_clean = df['eng_name'].apply(clean_text)
    return df.drop(['ru_name', 'eng_name'], axis=1).assign(
        ru_tocks=ru_clean.apply(textTokens),
        eng_tocks=eng_clean.apply(textTokens),
    )


### Clean, tokenize and save train data

In [9]:
%%time
# Clean and tokenize the whole train set; the recorded run below took about 12 minutes.
df = clean_tokenize(df)

CPU times: user 11min 43s, sys: 2.67 s, total: 11min 46s
Wall time: 11min 46s


In [10]:
# Inspect the tokenized train frame; the raw name columns were already dropped
# inside clean_tokenize, so no extra drop is needed here.
df.info()
df.sample(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3983203 entries, 0 to 3983202
Data columns (total 3 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   label      int64 
 1   ru_tocks   object
 2   eng_tocks  object
dtypes: int64(1), object(2)
memory usage: 121.6+ MB


Unnamed: 0,label,ru_tocks,eng_tocks
1128279,0,"[общество, с, ограниченной, ответственностью, ...","[imperial, russian, ballet, llc]"
2791743,0,"[ооо, руслазер]","[international, cooperation, services, ics, ltd]"
1562657,0,"[ооо, топ, сервис]",[dekod]


In [11]:
# Save tokenized train data.
# NOTE: the token columns hold Python lists, which a CSV stores as text; they have to
# be parsed back into lists when the file is loaded (e.g. with ast.literal_eval).
df.to_csv(CLEAN_TRAIN_DATA)

In [12]:
# Load the test set: tab-separated file, first column used as the row index.
df_t = pd.read_csv(TEST_DATA, sep='\t', index_col=0)
# Summarise columns/dtypes, then preview five random rows.
df_t.info()
df_t.sample(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 996052 entries, 0 to 996051
Data columns (total 2 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   ru_name   996052 non-null  object
 1   eng_name  996052 non-null  object
dtypes: object(2)
memory usage: 22.8+ MB


Unnamed: 0,ru_name,eng_name
50917,"Общество с ограниченной ответственностью ""БОРВ...",Limited Liability Company Chandler Group
882310,"Общество с ограниченной ответственностью ""ВиВа-Т""","Closed Joint-Stock Company ""Company groups ""Cl..."
452263,"ООО ""СК ""ПИТЕР-ГОЛЬФ""","""CAN"" Limited"
626503,"ООО ""КОРЕНЪ""","JSC ""IC ""Delta Don"""
864765,"ООО ""Элит Стоун""",N 11 VIII


### Clean, tokenize and save test data

In [13]:
# Clean and tokenize test data with the same function used for the train set,
# then check the resulting columns and preview three random rows.
df_t = clean_tokenize(df_t)
df_t.info()
df_t.sample(3)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 996052 entries, 0 to 996051
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   ru_tocks   996052 non-null  object
 1   eng_tocks  996052 non-null  object
dtypes: object(2)
memory usage: 22.8+ MB


Unnamed: 0,ru_tocks,eng_tocks
126667,"[общество, с, ограниченной, ответственностью, ...","[nauchnoproizvodstvennoe, predpriyatie, androm..."
814532,"[общество, с, ограниченной, ответственностью, ...","[ddec, service, ltd]"
37019,"[ооо, торговый, дом, сиоен, эксклюзивный, дист...","[diverse, communications]"


In [14]:
# Save tokenized test data (token lists are stored as text in the CSV).
df_t.to_csv(CLEAN_TEST_DATA)