# Mount drive


In [None]:
# Mount Google Drive at /content/drive inside the Colab runtime
# (force_remount=True is passed to drive.mount).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Project root on the mounted drive -- change this to your desired directory.
ROOT = '/content/drive/MyDrive/Machine_Learning'
# Dataset directory, derived from ROOT -- change this too if your layout differs.
ROOT_DATA = f'{ROOT}/shopee_sentiment_data_set'

# List the contents of ROOT via an IPython shell escape ($ROOT is expanded by IPython).
!ls $ROOT

Mounted at /content/drive
cache  colab		   envibert_original
ckpt   envibert_augmented  shopee_sentiment_data_set


# Install packages


In [None]:
# Install the third-party packages through IPython shell escapes.
# NOTE(review): no versions are pinned, so each run installs whatever pip
# resolves at that time -- consider pinning for reproducibility.
!pip install fairseq
!pip install pytorch-lightning
!pip install transformers
!pip install sentencepiece

Collecting fairseq
  Downloading fairseq-0.10.2-cp37-cp37m-manylinux1_x86_64.whl (1.7 MB)
[K     |████████████████████████████████| 1.7 MB 5.1 MB/s 
Collecting dataclasses
  Downloading dataclasses-0.6-py3-none-any.whl (14 kB)
Collecting sacrebleu>=1.4.12
  Downloading sacrebleu-2.0.0-py3-none-any.whl (90 kB)
[K     |████████████████████████████████| 90 kB 8.9 MB/s 
Collecting hydra-core
  Downloading hydra_core-1.1.1-py3-none-any.whl (145 kB)
[K     |████████████████████████████████| 145 kB 62.0 MB/s 
Collecting colorama
  Downloading colorama-0.4.4-py2.py3-none-any.whl (16 kB)
Collecting portalocker
  Downloading portalocker-2.3.2-py2.py3-none-any.whl (15 kB)
Collecting omegaconf==2.1.*
  Downloading omegaconf-2.1.1-py3-none-any.whl (74 kB)
[K     |████████████████████████████████| 74 kB 3.1 MB/s 
Collecting antlr4-python3-runtime==4.8
  Downloading antlr4-python3-runtime-4.8.tar.gz (112 kB)
[K     |████████████████████████████████| 112 kB 45.4 MB/s 
[?25hCollecting PyYAML>=5.1

Collecting pytorch-lightning
  Downloading pytorch_lightning-1.5.3-py3-none-any.whl (523 kB)
[K     |████████████████████████████████| 523 kB 5.1 MB/s 
Collecting pyDeprecate==0.3.1
  Downloading pyDeprecate-0.3.1-py3-none-any.whl (10 kB)
Collecting fsspec[http]!=2021.06.0,>=2021.05.0
  Downloading fsspec-2021.11.1-py3-none-any.whl (132 kB)
[K     |████████████████████████████████| 132 kB 60.4 MB/s 
Collecting future>=0.17.1
  Downloading future-0.18.2.tar.gz (829 kB)
[K     |████████████████████████████████| 829 kB 41.5 MB/s 
[?25hCollecting torchmetrics>=0.4.1
  Downloading torchmetrics-0.6.0-py3-none-any.whl (329 kB)
[K     |████████████████████████████████| 329 kB 53.7 MB/s 
Collecting aiohttp
  Downloading aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 41.6 MB/s 
Collecting frozenlist>=1.1.1
  Downloading frozenlist-1.2.0-cp37-cp37m-manylinux_2_5_x86_64.m

# Pre-processing

In [None]:
import json
import math
import pandas as pd
import re
import requests
import unicodedata

# preview files
def preview_file(root_data=None):
    """Print the last rows of the reference and the pre-processed training data.

    Reads ``train_preprocess_unsegment.csv`` (columns ``text`` and
    ``preprocess_text``) and ``p_train.csv`` (column ``text``) and prints the
    tail of each of those columns.

    Args:
        root_data: Directory that holds the CSV files. When omitted, the
            notebook-level ``ROOT_DATA`` is used.
    """
    if root_data is None:
        root_data = ROOT_DATA

    data = pd.read_csv(f'{root_data}/train_preprocess_unsegment.csv')
    # ``tail`` has to be *called*: printing the bare attribute only shows the
    # bound-method repr, not the trailing rows.
    print(data['text'].tail())
    print(data['preprocess_text'].tail())

    data = pd.read_csv(f'{root_data}/p_train.csv')
    print(data['text'].tail())

# removing spaces adjacent to accents
def rm_accents_spaces(s: str):
    """Re-attach detached combining tone marks by dropping the spaces around them.

    A tone mark that is preceded by a space is considered detached: that
    space is removed so the mark follows the previous character, and one
    space directly after the mark (if any) is removed as well, e.g.
    ``'e \\u0323 p'`` becomes ``'e\\u0323p'``.

    A mark that already follows a non-space character is left untouched
    together with its neighbours, so the base letter and a following word
    separator are never deleted. A mark at the very start of the string is
    also left as is.

    Args:
        s: Input text.

    Returns:
        The text with spaces around detached tone marks removed.
    """
    # Combining tone marks: grave, tilde, acute, hook above, dot below.
    accents = {'\u0300', '\u0303', '\u0301', '\u0309', '\u0323'}

    out = []
    drop_next_space = False
    for ch in s:
        if drop_next_space:
            drop_next_space = False
            if ch == ' ':
                # the space right after a re-attached mark
                continue
        if ch in accents and out and out[-1] == ' ':
            # detached mark: remove the space in front of it ...
            out.pop()
            out.append(ch)
            # ... and remember to remove one space behind it
            drop_next_space = True
        else:
            out.append(ch)

    return ''.join(out)

# segment words that are stuck together using word frequency
def segment_word(word: str, word_dict: dict):
    """Split a run-together token into the cheapest sequence of pieces.

    Dynamic programming over prefixes: the cost of a prefix is the minimum,
    over every possible last piece, of the cost of the remaining prefix plus
    the ``'cost'`` of that piece in ``word_dict``. Pieces missing from
    ``word_dict`` cost infinity.

    Args:
        word: Token to split.
        word_dict: Maps a piece to a dict that has a ``'cost'`` entry.

    Returns:
        List of pieces in original order (empty list for an empty ``word``).
    """
    fallback = {'appearance': 0, 'cost': float('inf')}
    length = len(word)

    best_cost = [0]   # best_cost[k]: cheapest cost found for word[:k]
    last_start = []   # last_start[k-1]: start index of the last piece of word[:k]

    for end in range(1, length + 1):
        cheapest = float('inf')
        chosen_start = 0
        # Shortest last piece first; the strict comparison keeps the first
        # candidate on ties.
        for start in range(end - 1, -1, -1):
            entry = word_dict.get(word[start:end], fallback)
            candidate = best_cost[start] + entry['cost']
            if candidate < cheapest:
                cheapest = candidate
                chosen_start = start
        last_start.append(chosen_start)
        best_cost.append(cheapest)

    # Walk the back-pointers from the end of the word to the beginning.
    pieces = []
    end = length
    while end > 0:
        start = last_start[end - 1]
        pieces.append(word[start:end])
        end = start

    return pieces[::-1]

# segment a sentence
def segment_sentence(sentence: str, word_dict: dict):
    """Segment every space-separated token of a sentence and re-join the pieces.

    Args:
        sentence: Text whose tokens are separated by single space characters;
            empty tokens produced by repeated spaces are ignored.
        word_dict: Cost dictionary forwarded to ``segment_word``.

    Returns:
        All pieces of all tokens joined by single spaces.
    """
    pieces = [
        piece
        for token in sentence.split(' ')
        if token  # skip empty tokens
        for piece in segment_word(token, word_dict)
    ]
    return ' '.join(pieces)

# load word dictionary
def load_word_dict(cached: bool = False):
    """Return the word -> {'appearance', 'cost'} dictionary used for segmentation.

    With ``cached=True`` the dictionary is read from
    ``{ROOT_DATA}/word_dict.json``. Otherwise the unigram list is downloaded,
    parsed, extended with the numbers 0..100, converted to costs and written
    to that same JSON file.

    The cost of a word is ``log(total_count / appearance)``, so frequent
    words are cheap.

    Args:
        cached: Read the JSON cache instead of downloading.

    Returns:
        dict mapping each word (whitespace replaced by ``_``) to a dict with
        the keys ``'appearance'`` and ``'cost'``.

    Raises:
        requests.HTTPError: The download returned an error status.
        FileNotFoundError: ``cached=True`` but the cache file does not exist.
    """
    cache_path = f'{ROOT_DATA}/word_dict.json'

    # Check if cached
    if cached:
        with open(cache_path, 'r', encoding='utf8') as f:
            return json.load(f)

    # Get the online data
    print('Fetching word list . . .')
    url = 'https://raw.githubusercontent.com/garfieldnate/vi_experiments/master/wiki_word_list/wikipedia_unigrams.txt'
    # A timeout keeps the cell from hanging forever, and the status check
    # stops an HTTP error page from being parsed and cached as a word list.
    response = requests.get(url, timeout=60)
    response.raise_for_status()

    # Parse the data
    print('Processing word list . . .')
    # The first line is skipped -- presumably a header row; verify against the file.
    lines = response.text.split('\n')[1:]
    word_dict = {}
    total_count = 0
    for line in lines:
        fields = line.split('\t')
        if len(fields) != 2:
            continue
        appearance = int(fields[1])
        if appearance <= 0:
            # a non-positive count would break the log(total / appearance) below
            continue
        total_count += appearance
        word_dict[re.sub(r'\s+', '_', fields[0])] = {
            'appearance': appearance,
            'cost': 0
        }

    # Give the numbers 0..100 a fixed pseudo-frequency of 100.
    for i in range(101):
        word_dict[str(i)] = {
            'appearance': 100,
            'cost': 0
        }
        total_count += 100

    for entry in word_dict.values():
        entry['cost'] = math.log(total_count / entry['appearance'])

    # Cache the data
    print('Caching word list . . .')
    with open(cache_path, 'w+', encoding='utf8') as f:
        json.dump(word_dict, f)

    return word_dict

# preprocess starts here
def preprocess(filename: str = 'train.csv', field: str = 'text', underscore_mode: bool = False):
    """Clean and word-segment one text column of a CSV file in ``ROOT_DATA``.

    Each cell of ``field`` is lower-cased, whitespace-collapsed, has detached
    tone marks re-attached, is NFC-normalised, has every non-word character
    replaced by a space and is finally segmented with the word dictionary.
    The column is overwritten with the result and the frame is written to
    ``{ROOT_DATA}/p_{filename}``.

    Args:
        filename: CSV file name inside ``ROOT_DATA``.
        field: Name of the text column to process.
        underscore_mode: When ``False`` underscores are replaced by spaces in
            the output; otherwise they are kept.
    """
    data = pd.read_csv(f'{ROOT_DATA}/{filename}')

    reviews = data[field]
    p_reviews = []

    word_list = load_word_dict()

    for count, review in enumerate(reviews, start=1):
        if count % 1000 == 1:
            print(f'Processing row {count}-{min(len(reviews), count+999)}')

        # Empty CSV cells are loaded as float NaN, which has no ``.lower()``;
        # treat them as empty text and coerce any other non-string value.
        review = '' if pd.isna(review) else str(review)

        review = review.lower()
        review = re.sub(r'\s+', ' ', review)
        # must run before NFC so the re-attached marks can be composed
        review = rm_accents_spaces(review)
        review = unicodedata.normalize('NFC', review)
        # every non-word character (punctuation, symbols) becomes one space
        review = re.sub(r'\s*\W\s*', ' ', review)
        review = re.sub(r'\s+', ' ', review)
        review = segment_sentence(review, word_list)
        if underscore_mode is False:
            review = re.sub(r'_', ' ', review)
        review = re.sub(r'\s+', ' ', review)
        p_reviews.append(review)

    data[field] = p_reviews
    # NOTE(review): to_csv is called without index=False, so the frame index is
    # written too; kept as is because later cells may rely on this layout.
    data.to_csv(f'{ROOT_DATA}/p_{filename}')

# main
if __name__ == '__main__':
    # Pre-process the training split, then the test split, with identical
    # settings, and finish with a preview of the results.
    for csv_name in ('train.csv', 'test.csv'):
        preprocess(csv_name, 'text', False)
    preview_file()


Fetching word list . . .
Processing word list . . .
Caching word list . . .
Processing row 1-1000
Processing row 1001-2000
Processing row 2001-3000
Processing row 3001-4000
Processing row 4001-5000
Processing row 5001-6000
Processing row 6001-7000
Processing row 7001-8000
Processing row 8001-9000
Processing row 9001-10000
Processing row 10001-11000
Processing row 11001-12000
Processing row 12001-13000
Processing row 13001-14000
Processing row 14001-15000
Processing row 15001-16000
Processing row 16001-17000
Processing row 17001-18000
Processing row 18001-19000
Processing row 19001-20000
Processing row 20001-21000
Processing row 21001-22000
Processing row 22001-23000
Processing row 23001-24000
Processing row 24001-25000
Processing row 25001-26000
Processing row 26001-27000
Fetching word list . . .
Processing word list . . .
Caching word list . . .
Processing row 1-1000
Processing row 1001-2000
Processing row 2001-3000
<bound method NDFrame.tail of 0        Đến quán 2 lần thôi , rất là t