In [1]:
# Mount Google Drive into the Colab runtime at /content/drive; the cells below
# read their input from, and write their output to, paths under this mount.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install the third-party packages imported by the cells below.
# NOTE(review): versions are not pinned, so a fresh runtime may resolve
# different releases than the ones this notebook was last run with.
%pip install nltk
%pip install contractions
%pip install pandas
%pip install tqdm



In [None]:
# from spellchecker import SpellChecker
import contractions
import urllib.parse
import re
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import nltk
# Fetch the NLTK resources used by CleanText: the English stop-word list,
# 'punkt' (downloaded for nltk.word_tokenize) and the WordNet data behind
# the lemmatizer.
for resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(resource)


class CleanText:
    """Text-cleaning pipeline for English comments.

    Two public entry points are provided:

    * ``clean_lstm(text)`` - noise filtering, contraction expansion,
      stop-word removal, letter-only filtering and lemmatization.
    * ``clean_w2v(text)`` - the same pipeline without stop-word removal.

    Every ``_``-prefixed method is a single, independently usable cleaning
    step that takes a string and returns a string (``_check_url`` returns a
    bool).
    """

    # Patterns are compiled once at class-definition time and shared by all
    # instances.
    _HTML_TAG_RE = re.compile(r'<.*?>')
    _EMAIL_RE = re.compile(r"[\w\.-]+@[\w\.-]+\.\w+")
    _MENTION_RE = re.compile(r"@\w+")
    # Everything from the scheme up to the next whitespace is treated as the
    # URL, so multi-level hosts, long paths and query strings are consumed
    # whole instead of leaving fragments behind.
    _URL_RE = re.compile(r'https?://\S+')
    _SYMBOL_RE = re.compile(r'[^\w\s]')
    _SINGLE_CHAR_RE = re.compile(r'\b(?<![a-hj-z])[a-z](?![a-z])\b')
    _WHITESPACE_RE = re.compile(r'\s+')
    _DIGIT_RE = re.compile(r'[0-9]')
    _REPEAT_RE = re.compile(r'(.)\1+')
    _NON_ALPHA_RE = re.compile('[^a-zA-Z]')
    # NOTE(review): the last range (U+24C2 - U+1F251) is very wide and also
    # spans many non-emoji code points (e.g. CJK text). It is kept unchanged
    # here; confirm whether stripping that text is intended.
    _EMOJI_RE = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        "]+",
        flags=re.UNICODE,
    )

    def __init__(self):
        """Load the lemmatizer and build the stop-word list.

        Requires the NLTK 'stopwords' and 'wordnet' resources to be
        available.
        """
        # Spell checking is optional. No checker is configured by default;
        # assign any object exposing ``correction(word)`` to enable
        # ``_fit_word`` / ``_correct_word``.
        self.eng_checker = None
        self.lemmatizer = nltk.WordNetLemmatizer()
        # Words that NLTK lists as stop words but that are kept in the text.
        kept_words = {
            'not', 'no', 'never', 'nor',
            # location words
            "off", "out", "over", "under", "up", "down",
        }
        self.stop_words = [
            word for word in stopwords.words('english')
            if word not in kept_words
        ]

    def _lower_case(self, text):
        """Return ``text`` converted to lower case."""
        return text.lower()

    def _check_url(self, url):
        """Return True when ``url`` parses with both a scheme and a host."""
        parsed = urllib.parse.urlparse(url)
        return bool(parsed.scheme and parsed.netloc)

    def _remove_html(self, text):
        """Delete every ``<...>`` tag from ``text``.

        The pattern is applied as a regular expression; ``str.replace``
        would only match the literal characters ``<.*?>``.
        """
        return self._HTML_TAG_RE.sub('', text)

    def _expaned_contractions(self, text):
        """Expand contractions using ``contractions.fix``."""
        return contractions.fix(text)

    def _remove_emails(self, text):
        """Delete e-mail addresses from ``text`` (regex match, not literal)."""
        return self._EMAIL_RE.sub("", text)

    def _change_user_name(self, text):
        """Replace every ``@name`` mention with the token ``user``."""
        return self._MENTION_RE.sub("user", text)

    def _replace_urls(self, text):
        """Replace every ``http://`` / ``https://`` URL with the token ``url``.

        The whole URL up to the next whitespace is replaced, including any
        path and query string.
        """
        return self._URL_RE.sub('url', text)

    def _replace_symbol_to_space(self, text):
        """Replace every character that is neither word nor space with a space."""
        return self._SYMBOL_RE.sub(' ', text)

    def _remove_single_char(self, text):
        """Replace each stand-alone lowercase ASCII letter with a space.

        A letter is stand-alone when it sits between word boundaries; this
        includes the words "a" and "i".
        """
        return self._SINGLE_CHAR_RE.sub(' ', text)

    def _remove_multiple_space(self, text):
        """Collapse every run of whitespace into a single space."""
        return self._WHITESPACE_RE.sub(' ', text)

    def _remove_number(self, text):
        """Delete all ASCII digits."""
        return self._DIGIT_RE.sub('', text)

    def _remove_repeat_word(self, text):
        """Shorten any run of one repeated character to two occurrences.

        For example ``"sooooo"`` becomes ``"soo"``. Runs of exactly two
        characters are left as they are.
        """
        return self._REPEAT_RE.sub(r'\1\1', text)

    def _fit_word(self, text):
        """Spell-correct each whitespace-separated word of ``text``.

        Without a configured ``eng_checker`` the words are returned
        unchanged (re-joined with single spaces).
        """
        return ' '.join(self._correct_word(word) for word in text.split())

    def _correct_word(self, text):
        """Return the spell-checked form of a single word.

        Falls back to the input when no checker is configured or when the
        checker has no suggestion (returns ``None``).
        """
        checker = getattr(self, 'eng_checker', None)
        if checker is None:
            return text
        fixed = checker.correction(text)
        return text if fixed is None else fixed

    def _remove_emoji(self, text):
        """Delete the characters covered by the class-level emoji ranges."""
        return self._EMOJI_RE.sub(r'', text)

    def _remove_stopwords(self, text):
        """Tokenize ``text`` and drop tokens found in ``self.stop_words``.

        Requires the NLTK tokenizer data used by ``nltk.word_tokenize``.
        """
        # A set gives constant-time membership tests; it is rebuilt per call
        # so later changes to ``self.stop_words`` are always honoured.
        stop_words = set(self.stop_words)
        kept = [
            token for token in nltk.word_tokenize(text)
            if token not in stop_words
        ]
        return ' '.join(kept)

    def _lemmatize_text(self, text):
        """Lemmatize each space-separated word of ``text``.

        Each word is passed through the lemmatizer four times in sequence,
        as noun, verb, adjective and adverb. The result is stripped of
        leading and trailing whitespace.
        """
        lemmas = []
        for word in text.split(" "):
            for pos in ("n", "v", "a", "r"):
                word = self.lemmatizer.lemmatize(word, pos=pos)
            lemmas.append(word)
        return " ".join(lemmas).strip()

    def _fit_in_alphabet(self, text):
        """Replace every character outside ``a-z`` / ``A-Z`` with a space."""
        return self._NON_ALPHA_RE.sub(' ', text)

    def _filter_noise(self, text):
        """Shared first stage: lower-case and strip markup-like noise."""
        text = self._lower_case(text)
        text = self._remove_html(text)
        text = self._replace_urls(text)
        # E-mails are removed before mentions so an address is not partly
        # rewritten by the ``@name`` replacement.
        text = self._remove_emails(text)
        text = self._change_user_name(text)
        text = self._remove_emoji(text)
        return self._remove_repeat_word(text)

    def _strip_leftovers(self, text):
        """Shared last stage: drop lone letters and normalize whitespace."""
        text = self._remove_single_char(text)
        text = self._remove_multiple_space(text)
        return text.strip()

    def clean_lstm(self, text):
        """Clean ``text`` for the LSTM model (stop words removed).

        Returns the cleaned, whitespace-normalized string.
        """
        # noise filtering
        text = self._filter_noise(text)

        # normalization
        text = self._expaned_contractions(text)
        # text = self._fit_word(text)
        text = self._remove_stopwords(text)
        text = self._fit_in_alphabet(text)
        text = self._lemmatize_text(text)

        # remove the unnecessary leftovers
        return self._strip_leftovers(text)

    def clean_w2v(self, text):
        """Clean ``text`` for word2vec training (stop words are kept).

        Returns the cleaned, whitespace-normalized string.
        """
        # noise filtering
        text = self._filter_noise(text)

        # normalization
        text = self._expaned_contractions(text)
        text = self._fit_in_alphabet(text)
        text = self._lemmatize_text(text)

        # remove the unnecessary leftovers
        return self._strip_leftovers(text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Load Data


In [None]:
import pandas as pd
from tqdm import tqdm
# Register tqdm's pandas integration; this is what makes `progress_apply`
# (used in the cleaning loop below) available on pandas objects.
tqdm.pandas()

In [None]:
# Read the raw comments lazily, `chunksize` rows at a time; `chunks` is an
# iterator of DataFrames rather than one fully loaded frame.
chunksize = 100_000

chunks = pd.read_csv(
    '/content/drive/MyDrive/KLTN/data/reddit_comments.csv',
    chunksize=chunksize,
)


In [None]:
# Clean the `body` column of every chunk with the word2vec pipeline and write
# each chunk to its own numbered CSV file on Drive.
cleaner = CleanText()
for index, chunk in enumerate(chunks):
    print(f'{index}')
    # Missing bodies are replaced by a placeholder before cleaning.
    bodies = chunk['body'].fillna('no comment')
    chunk['body'] = bodies.progress_apply(cleaner.clean_w2v)
    out_path = f'/content/drive/MyDrive/KLTN/data/reddit_comments_cleaned_{index}.csv'
    chunk.to_csv(out_path)

0


100%|██████████| 100000/100000 [00:35<00:00, 2846.11it/s]


1


100%|██████████| 100000/100000 [00:32<00:00, 3032.57it/s]


2


100%|██████████| 100000/100000 [00:33<00:00, 2992.76it/s]


3


100%|██████████| 100000/100000 [00:35<00:00, 2845.35it/s]


4


100%|██████████| 100000/100000 [00:37<00:00, 2689.59it/s]


5


100%|██████████| 100000/100000 [00:40<00:00, 2496.38it/s]


6


100%|██████████| 100000/100000 [00:41<00:00, 2431.64it/s]


7


100%|██████████| 100000/100000 [00:39<00:00, 2527.76it/s]


8


100%|██████████| 100000/100000 [00:40<00:00, 2463.26it/s]


9


100%|██████████| 100000/100000 [00:41<00:00, 2411.53it/s]


10


100%|██████████| 100000/100000 [00:42<00:00, 2374.18it/s]


11


100%|██████████| 100000/100000 [00:41<00:00, 2407.72it/s]


12


100%|██████████| 100000/100000 [00:41<00:00, 2392.21it/s]


13


100%|██████████| 100000/100000 [00:53<00:00, 1872.08it/s]


14


100%|██████████| 100000/100000 [00:40<00:00, 2488.88it/s]


15


100%|██████████| 100000/100000 [00:39<00:00, 2531.30it/s]


16


100%|██████████| 100000/100000 [00:39<00:00, 2528.16it/s]


17


100%|██████████| 100000/100000 [00:37<00:00, 2671.44it/s]


18


100%|██████████| 100000/100000 [00:35<00:00, 2846.53it/s]


19


100%|██████████| 100000/100000 [00:33<00:00, 2946.98it/s]


20


100%|██████████| 100000/100000 [00:33<00:00, 3008.75it/s]


21


100%|██████████| 100000/100000 [00:36<00:00, 2750.84it/s]


22


100%|██████████| 100000/100000 [00:39<00:00, 2553.04it/s]


23


100%|██████████| 100000/100000 [00:39<00:00, 2519.61it/s]


24


100%|██████████| 100000/100000 [00:39<00:00, 2562.85it/s]


25


100%|██████████| 100000/100000 [00:38<00:00, 2621.21it/s]


26


100%|██████████| 100000/100000 [00:37<00:00, 2647.33it/s]


27


100%|██████████| 100000/100000 [00:37<00:00, 2657.54it/s]


28


100%|██████████| 100000/100000 [00:37<00:00, 2654.36it/s]


29


100%|██████████| 100000/100000 [00:44<00:00, 2225.45it/s]


30


100%|██████████| 100000/100000 [00:37<00:00, 2637.89it/s]


31


100%|██████████| 100000/100000 [00:37<00:00, 2666.62it/s]


32


100%|██████████| 100000/100000 [00:36<00:00, 2718.97it/s]


33


100%|██████████| 100000/100000 [00:36<00:00, 2745.76it/s]


34


100%|██████████| 100000/100000 [00:36<00:00, 2734.48it/s]


35


100%|██████████| 100000/100000 [00:33<00:00, 3013.99it/s]


36


100%|██████████| 100000/100000 [00:33<00:00, 2956.37it/s]


37


100%|██████████| 100000/100000 [00:36<00:00, 2758.08it/s]


38


100%|██████████| 100000/100000 [00:38<00:00, 2564.72it/s]


39


100%|██████████| 100000/100000 [00:40<00:00, 2465.10it/s]


40


100%|██████████| 100000/100000 [00:39<00:00, 2550.71it/s]


41


100%|██████████| 100000/100000 [00:38<00:00, 2567.89it/s]


42


100%|██████████| 100000/100000 [00:39<00:00, 2538.54it/s]


43


100%|██████████| 100000/100000 [00:39<00:00, 2560.60it/s]


44


100%|██████████| 100000/100000 [00:38<00:00, 2581.92it/s]


45


100%|██████████| 100000/100000 [00:46<00:00, 2151.27it/s]


46


100%|██████████| 100000/100000 [00:35<00:00, 2830.15it/s]


47


100%|██████████| 100000/100000 [00:36<00:00, 2706.04it/s]


48


100%|██████████| 100000/100000 [00:37<00:00, 2663.00it/s]


49


100%|██████████| 100000/100000 [00:35<00:00, 2816.91it/s]


50


100%|██████████| 100000/100000 [00:36<00:00, 2717.48it/s]


51


100%|██████████| 100000/100000 [00:34<00:00, 2860.27it/s]


52


100%|██████████| 100000/100000 [00:32<00:00, 3043.47it/s]


53


100%|██████████| 100000/100000 [00:40<00:00, 2493.78it/s]


54


100%|██████████| 100000/100000 [00:40<00:00, 2453.76it/s]


55


100%|██████████| 100000/100000 [00:41<00:00, 2401.37it/s]


56


100%|██████████| 100000/100000 [00:40<00:00, 2491.11it/s]


57


100%|██████████| 100000/100000 [00:42<00:00, 2363.14it/s]


58


100%|██████████| 100000/100000 [00:42<00:00, 2334.86it/s]


59


100%|██████████| 100000/100000 [00:43<00:00, 2311.74it/s]


60


100%|██████████| 100000/100000 [00:43<00:00, 2297.76it/s]


61


100%|██████████| 100000/100000 [00:43<00:00, 2282.90it/s]


62


100%|██████████| 100000/100000 [00:43<00:00, 2300.27it/s]


63


100%|██████████| 100000/100000 [00:43<00:00, 2307.95it/s]


64


100%|██████████| 100000/100000 [00:42<00:00, 2357.13it/s]


65


100%|██████████| 100000/100000 [00:42<00:00, 2364.43it/s]


66


100%|██████████| 100000/100000 [00:48<00:00, 2064.60it/s]


67


100%|██████████| 100000/100000 [00:38<00:00, 2593.72it/s]


68


100%|██████████| 100000/100000 [00:36<00:00, 2761.79it/s]


69


100%|██████████| 100000/100000 [00:35<00:00, 2824.55it/s]


70


100%|██████████| 100000/100000 [00:36<00:00, 2726.13it/s]


71


100%|██████████| 100000/100000 [00:37<00:00, 2679.33it/s]


72


100%|██████████| 100000/100000 [00:39<00:00, 2521.46it/s]


73


100%|██████████| 100000/100000 [00:41<00:00, 2392.63it/s]


74


100%|██████████| 100000/100000 [00:41<00:00, 2382.83it/s]


75


100%|██████████| 100000/100000 [00:42<00:00, 2366.02it/s]


76


100%|██████████| 100000/100000 [00:43<00:00, 2304.68it/s]


77


100%|██████████| 100000/100000 [00:43<00:00, 2304.87it/s]


78


100%|██████████| 100000/100000 [00:43<00:00, 2293.25it/s]


79


100%|██████████| 100000/100000 [00:42<00:00, 2358.04it/s]


80


100%|██████████| 100000/100000 [00:43<00:00, 2318.71it/s]


81


100%|██████████| 100000/100000 [00:42<00:00, 2339.49it/s]


82


100%|██████████| 100000/100000 [00:41<00:00, 2390.45it/s]


83


100%|██████████| 100000/100000 [00:40<00:00, 2443.44it/s]


84


100%|██████████| 100000/100000 [00:41<00:00, 2438.91it/s]


85


100%|██████████| 100000/100000 [00:40<00:00, 2491.88it/s]


86


100%|██████████| 100000/100000 [00:39<00:00, 2532.35it/s]


87


100%|██████████| 100000/100000 [00:37<00:00, 2659.09it/s]


88


100%|██████████| 100000/100000 [00:36<00:00, 2727.65it/s]


89


100%|██████████| 100000/100000 [00:38<00:00, 2565.02it/s]


90


100%|██████████| 100000/100000 [00:38<00:00, 2610.43it/s]


91


100%|██████████| 100000/100000 [00:40<00:00, 2490.92it/s]


92


100%|██████████| 100000/100000 [00:42<00:00, 2327.34it/s]


93


100%|██████████| 100000/100000 [00:41<00:00, 2428.71it/s]


94


100%|██████████| 100000/100000 [00:48<00:00, 2081.59it/s]


95


100%|██████████| 100000/100000 [00:41<00:00, 2381.23it/s]


96


100%|██████████| 100000/100000 [00:42<00:00, 2370.43it/s]


97


100%|██████████| 100000/100000 [00:42<00:00, 2356.97it/s]


98


100%|██████████| 100000/100000 [00:41<00:00, 2395.66it/s]


99


100%|██████████| 100000/100000 [00:42<00:00, 2371.09it/s]


100


100%|██████████| 100000/100000 [00:41<00:00, 2398.08it/s]


101


100%|██████████| 100000/100000 [00:41<00:00, 2405.99it/s]


102


100%|██████████| 100000/100000 [00:40<00:00, 2484.58it/s]


103


100%|██████████| 100000/100000 [00:41<00:00, 2428.17it/s]


104


100%|██████████| 100000/100000 [00:41<00:00, 2418.40it/s]


105


100%|██████████| 100000/100000 [00:39<00:00, 2504.04it/s]


106


100%|██████████| 100000/100000 [00:37<00:00, 2656.83it/s]


107


100%|██████████| 100000/100000 [00:36<00:00, 2758.85it/s]


108


100%|██████████| 100000/100000 [00:38<00:00, 2619.15it/s]


109


100%|██████████| 100000/100000 [00:37<00:00, 2636.58it/s]


110


100%|██████████| 100000/100000 [00:39<00:00, 2505.43it/s]


111


100%|██████████| 100000/100000 [00:42<00:00, 2337.68it/s]


112


100%|██████████| 100000/100000 [00:41<00:00, 2419.10it/s]


113


100%|██████████| 100000/100000 [00:43<00:00, 2309.36it/s]


114


100%|██████████| 100000/100000 [00:42<00:00, 2354.77it/s]


115


100%|██████████| 100000/100000 [00:43<00:00, 2311.46it/s]


116


100%|██████████| 100000/100000 [00:43<00:00, 2313.33it/s]


117


100%|██████████| 100000/100000 [00:43<00:00, 2308.74it/s]


118


100%|██████████| 100000/100000 [00:42<00:00, 2340.54it/s]


119


100%|██████████| 100000/100000 [00:41<00:00, 2418.95it/s]


120


100%|██████████| 100000/100000 [00:41<00:00, 2420.55it/s]


121


100%|██████████| 100000/100000 [00:40<00:00, 2445.17it/s]


122


100%|██████████| 100000/100000 [00:39<00:00, 2560.07it/s]


123


100%|██████████| 100000/100000 [00:46<00:00, 2131.01it/s]


124


100%|██████████| 100000/100000 [00:39<00:00, 2556.99it/s]


125


100%|██████████| 100000/100000 [00:38<00:00, 2593.71it/s]


126


100%|██████████| 100000/100000 [00:38<00:00, 2567.72it/s]


127


100%|██████████| 100000/100000 [00:37<00:00, 2673.63it/s]


128


100%|██████████| 100000/100000 [00:38<00:00, 2618.60it/s]


129


100%|██████████| 100000/100000 [00:39<00:00, 2522.54it/s]


130


100%|██████████| 100000/100000 [00:41<00:00, 2406.19it/s]


131


100%|██████████| 100000/100000 [00:41<00:00, 2432.70it/s]


132


100%|██████████| 100000/100000 [00:39<00:00, 2504.91it/s]


133


100%|██████████| 100000/100000 [00:40<00:00, 2441.57it/s]


134


100%|██████████| 100000/100000 [00:41<00:00, 2389.24it/s]


135


100%|██████████| 100000/100000 [00:42<00:00, 2379.03it/s]


136


100%|██████████| 100000/100000 [00:42<00:00, 2364.80it/s]


137


100%|██████████| 100000/100000 [00:42<00:00, 2329.15it/s]


138


100%|██████████| 100000/100000 [00:42<00:00, 2340.66it/s]


139


100%|██████████| 100000/100000 [00:43<00:00, 2276.89it/s]


140


100%|██████████| 100000/100000 [00:41<00:00, 2392.07it/s]


141


100%|██████████| 100000/100000 [00:39<00:00, 2511.43it/s]


142


100%|██████████| 100000/100000 [00:39<00:00, 2517.42it/s]


143


100%|██████████| 100000/100000 [00:37<00:00, 2697.73it/s]


144


100%|██████████| 100000/100000 [00:36<00:00, 2749.59it/s]


145


100%|██████████| 100000/100000 [00:35<00:00, 2851.99it/s]


146


100%|██████████| 100000/100000 [00:36<00:00, 2717.97it/s]


147


100%|██████████| 100000/100000 [00:38<00:00, 2593.25it/s]


148


100%|██████████| 100000/100000 [00:40<00:00, 2455.26it/s]


149


100%|██████████| 100000/100000 [00:46<00:00, 2140.56it/s]


150


100%|██████████| 100000/100000 [00:39<00:00, 2563.53it/s]


151


100%|██████████| 100000/100000 [00:38<00:00, 2574.64it/s]


152


100%|██████████| 100000/100000 [00:39<00:00, 2518.44it/s]


153


100%|██████████| 100000/100000 [00:39<00:00, 2532.11it/s]


154


100%|██████████| 100000/100000 [00:38<00:00, 2598.00it/s]


155


100%|██████████| 100000/100000 [00:38<00:00, 2581.70it/s]


156


100%|██████████| 100000/100000 [00:38<00:00, 2603.84it/s]


157


100%|██████████| 100000/100000 [00:38<00:00, 2629.68it/s]


158


100%|██████████| 100000/100000 [00:36<00:00, 2762.66it/s]


159


100%|██████████| 100000/100000 [00:35<00:00, 2808.39it/s]


160


100%|██████████| 100000/100000 [00:36<00:00, 2762.22it/s]


161


100%|██████████| 100000/100000 [00:37<00:00, 2699.72it/s]


162


100%|██████████| 100000/100000 [00:37<00:00, 2645.72it/s]


163


100%|██████████| 100000/100000 [00:40<00:00, 2484.30it/s]


164


100%|██████████| 100000/100000 [00:40<00:00, 2478.19it/s]


165


100%|██████████| 100000/100000 [00:40<00:00, 2457.64it/s]


166


100%|██████████| 100000/100000 [00:39<00:00, 2508.48it/s]


167


100%|██████████| 100000/100000 [00:39<00:00, 2508.41it/s]


168


100%|██████████| 100000/100000 [00:40<00:00, 2479.54it/s]


169


100%|██████████| 100000/100000 [00:40<00:00, 2493.51it/s]


170


100%|██████████| 100000/100000 [00:40<00:00, 2488.36it/s]


171


100%|██████████| 100000/100000 [00:38<00:00, 2567.74it/s]


172


100%|██████████| 100000/100000 [00:36<00:00, 2714.81it/s]


173


100%|██████████| 100000/100000 [00:37<00:00, 2643.46it/s]


174


100%|██████████| 100000/100000 [00:37<00:00, 2642.45it/s]


175


100%|██████████| 100000/100000 [00:36<00:00, 2758.89it/s]


176


100%|██████████| 100000/100000 [00:37<00:00, 2671.27it/s]


177


100%|██████████| 100000/100000 [00:37<00:00, 2697.12it/s]


178


100%|██████████| 100000/100000 [00:39<00:00, 2559.13it/s]


179


100%|██████████| 100000/100000 [00:41<00:00, 2413.33it/s]


180


100%|██████████| 100000/100000 [00:49<00:00, 2012.77it/s]


181


100%|██████████| 100000/100000 [00:40<00:00, 2440.46it/s]


182


100%|██████████| 100000/100000 [00:42<00:00, 2366.10it/s]


183


100%|██████████| 100000/100000 [00:42<00:00, 2359.91it/s]


184


100%|██████████| 100000/100000 [00:42<00:00, 2358.12it/s]


185


100%|██████████| 100000/100000 [00:42<00:00, 2345.89it/s]


186


100%|██████████| 100000/100000 [00:42<00:00, 2357.65it/s]


187


100%|██████████| 100000/100000 [00:42<00:00, 2376.67it/s]


188


100%|██████████| 100000/100000 [00:42<00:00, 2362.35it/s]


189


100%|██████████| 100000/100000 [00:41<00:00, 2414.66it/s]


190


100%|██████████| 100000/100000 [00:39<00:00, 2505.33it/s]


191


100%|██████████| 100000/100000 [00:39<00:00, 2520.91it/s]


192


100%|██████████| 100000/100000 [00:38<00:00, 2570.81it/s]


193


100%|██████████| 100000/100000 [00:37<00:00, 2668.85it/s]


194


100%|██████████| 100000/100000 [00:35<00:00, 2808.89it/s]


195


100%|██████████| 100000/100000 [00:37<00:00, 2680.28it/s]


196


100%|██████████| 100000/100000 [00:39<00:00, 2557.13it/s]


197


100%|██████████| 100000/100000 [00:41<00:00, 2398.78it/s]


198


100%|██████████| 100000/100000 [00:40<00:00, 2470.29it/s]


199


100%|██████████| 100000/100000 [00:40<00:00, 2497.50it/s]


200


100%|██████████| 100000/100000 [00:41<00:00, 2426.89it/s]


201


100%|██████████| 100000/100000 [00:42<00:00, 2374.25it/s]


202


100%|██████████| 100000/100000 [00:42<00:00, 2341.87it/s]


203


100%|██████████| 100000/100000 [00:49<00:00, 2021.16it/s]


204


100%|██████████| 100000/100000 [00:41<00:00, 2403.46it/s]


205


100%|██████████| 100000/100000 [00:41<00:00, 2400.40it/s]


206


100%|██████████| 100000/100000 [00:41<00:00, 2423.47it/s]


207


100%|██████████| 100000/100000 [00:40<00:00, 2493.20it/s]


208


100%|██████████| 100000/100000 [00:40<00:00, 2450.53it/s]


209


100%|██████████| 100000/100000 [00:40<00:00, 2474.36it/s]


210


100%|██████████| 100000/100000 [00:37<00:00, 2666.47it/s]


211


100%|██████████| 100000/100000 [00:36<00:00, 2774.47it/s]


212


100%|██████████| 100000/100000 [00:34<00:00, 2939.80it/s]


213


100%|██████████| 100000/100000 [00:35<00:00, 2842.41it/s]


214


100%|██████████| 100000/100000 [00:35<00:00, 2783.67it/s]


215


100%|██████████| 100000/100000 [00:39<00:00, 2508.99it/s]


216


100%|██████████| 100000/100000 [00:41<00:00, 2422.60it/s]


217


100%|██████████| 100000/100000 [00:42<00:00, 2362.89it/s]


218


100%|██████████| 100000/100000 [00:39<00:00, 2521.59it/s]


219


100%|██████████| 100000/100000 [00:42<00:00, 2369.27it/s]


220


100%|██████████| 100000/100000 [00:42<00:00, 2369.96it/s]


221


100%|██████████| 100000/100000 [00:42<00:00, 2380.34it/s]


222


100%|██████████| 100000/100000 [00:41<00:00, 2391.84it/s]


223


100%|██████████| 100000/100000 [00:42<00:00, 2361.51it/s]


224


100%|██████████| 100000/100000 [00:42<00:00, 2340.92it/s]


225


100%|██████████| 100000/100000 [00:49<00:00, 2036.55it/s]


226


100%|██████████| 100000/100000 [00:40<00:00, 2457.17it/s]


227


100%|██████████| 100000/100000 [00:40<00:00, 2493.82it/s]


228


100%|██████████| 100000/100000 [00:39<00:00, 2535.15it/s]


229


100%|██████████| 100000/100000 [00:39<00:00, 2538.37it/s]


230


100%|██████████| 100000/100000 [00:37<00:00, 2667.50it/s]


231


100%|██████████| 100000/100000 [00:36<00:00, 2733.06it/s]


232


100%|██████████| 100000/100000 [00:35<00:00, 2809.16it/s]


233


100%|██████████| 100000/100000 [00:37<00:00, 2654.51it/s]


234


100%|██████████| 100000/100000 [00:42<00:00, 2364.44it/s]


235


100%|██████████| 100000/100000 [00:44<00:00, 2244.94it/s]


236


100%|██████████| 100000/100000 [00:44<00:00, 2237.01it/s]


237


100%|██████████| 100000/100000 [00:41<00:00, 2424.28it/s]


238


100%|██████████| 100000/100000 [00:42<00:00, 2369.77it/s]


239


100%|██████████| 100000/100000 [00:43<00:00, 2304.24it/s]


240


100%|██████████| 100000/100000 [00:44<00:00, 2240.46it/s]


241


100%|██████████| 100000/100000 [00:44<00:00, 2252.32it/s]


242


100%|██████████| 100000/100000 [00:43<00:00, 2308.80it/s]


243


100%|██████████| 100000/100000 [00:42<00:00, 2363.35it/s]


244


100%|██████████| 100000/100000 [00:41<00:00, 2396.34it/s]


245


100%|██████████| 100000/100000 [00:40<00:00, 2447.70it/s]


246


100%|██████████| 100000/100000 [00:40<00:00, 2475.47it/s]


247


100%|██████████| 100000/100000 [00:39<00:00, 2500.19it/s]


248


100%|██████████| 100000/100000 [00:39<00:00, 2523.32it/s]


249


100%|██████████| 100000/100000 [00:36<00:00, 2729.28it/s]


250


100%|██████████| 100000/100000 [00:36<00:00, 2759.50it/s]


251


100%|██████████| 100000/100000 [00:35<00:00, 2784.24it/s]


252


100%|██████████| 100000/100000 [00:37<00:00, 2694.26it/s]


253


100%|██████████| 100000/100000 [00:37<00:00, 2665.28it/s]


254


100%|██████████| 100000/100000 [00:41<00:00, 2434.10it/s]


255


100%|██████████| 100000/100000 [00:39<00:00, 2504.24it/s]


256


100%|██████████| 100000/100000 [00:40<00:00, 2478.63it/s]


257


100%|██████████| 100000/100000 [00:41<00:00, 2397.83it/s]


258


100%|██████████| 100000/100000 [00:47<00:00, 2110.05it/s]


259


100%|██████████| 100000/100000 [00:41<00:00, 2417.56it/s]


260


100%|██████████| 100000/100000 [00:41<00:00, 2400.82it/s]


261


100%|██████████| 100000/100000 [00:41<00:00, 2433.81it/s]


262


100%|██████████| 100000/100000 [00:41<00:00, 2402.56it/s]


263


100%|██████████| 100000/100000 [00:40<00:00, 2452.70it/s]


264


100%|██████████| 100000/100000 [00:41<00:00, 2431.41it/s]


265


100%|██████████| 100000/100000 [00:39<00:00, 2545.33it/s]


266


100%|██████████| 100000/100000 [00:38<00:00, 2579.35it/s]


267


100%|██████████| 100000/100000 [00:38<00:00, 2628.73it/s]


268


100%|██████████| 100000/100000 [00:36<00:00, 2714.01it/s]


269


100%|██████████| 100000/100000 [00:36<00:00, 2729.76it/s]


270


100%|██████████| 100000/100000 [00:38<00:00, 2583.22it/s]


271


100%|██████████| 100000/100000 [00:39<00:00, 2542.71it/s]


272


100%|██████████| 100000/100000 [00:39<00:00, 2522.30it/s]


273


100%|██████████| 100000/100000 [00:36<00:00, 2723.69it/s]


274


100%|██████████| 100000/100000 [00:36<00:00, 2743.61it/s]


275


100%|██████████| 100000/100000 [00:36<00:00, 2728.14it/s]


276


100%|██████████| 100000/100000 [00:37<00:00, 2656.49it/s]


277


100%|██████████| 100000/100000 [00:37<00:00, 2683.82it/s]


278


100%|██████████| 100000/100000 [00:37<00:00, 2669.05it/s]


279


100%|██████████| 100000/100000 [00:38<00:00, 2621.08it/s]


280


100%|██████████| 100000/100000 [00:38<00:00, 2607.71it/s]


281


100%|██████████| 100000/100000 [00:39<00:00, 2531.30it/s]


282


100%|██████████| 100000/100000 [00:37<00:00, 2678.71it/s]


283


100%|██████████| 100000/100000 [00:36<00:00, 2762.08it/s]


284


100%|██████████| 100000/100000 [00:36<00:00, 2716.99it/s]


285


100%|██████████| 100000/100000 [00:44<00:00, 2225.46it/s]


286


100%|██████████| 100000/100000 [00:39<00:00, 2561.40it/s]


287


100%|██████████| 100000/100000 [00:40<00:00, 2467.38it/s]


288


100%|██████████| 100000/100000 [00:40<00:00, 2475.59it/s]


289


100%|██████████| 100000/100000 [00:39<00:00, 2535.19it/s]


290


100%|██████████| 100000/100000 [00:39<00:00, 2529.18it/s]


291


100%|██████████| 100000/100000 [00:38<00:00, 2572.27it/s]


292


100%|██████████| 100000/100000 [00:39<00:00, 2541.32it/s]


293


100%|██████████| 100000/100000 [00:38<00:00, 2628.90it/s]


294


100%|██████████| 100000/100000 [00:37<00:00, 2654.12it/s]


295


100%|██████████| 100000/100000 [00:37<00:00, 2642.26it/s]


296


100%|██████████| 100000/100000 [00:37<00:00, 2659.31it/s]


297


100%|██████████| 100000/100000 [00:38<00:00, 2566.50it/s]


298


100%|██████████| 100000/100000 [00:37<00:00, 2636.54it/s]


299


100%|██████████| 100000/100000 [00:36<00:00, 2734.44it/s]


300


100%|██████████| 100000/100000 [00:37<00:00, 2663.24it/s]


301


100%|██████████| 100000/100000 [00:40<00:00, 2458.05it/s]


302


100%|██████████| 100000/100000 [00:42<00:00, 2375.88it/s]


303


100%|██████████| 100000/100000 [00:44<00:00, 2259.81it/s]


304


100%|██████████| 100000/100000 [00:43<00:00, 2321.52it/s]


305


100%|██████████| 100000/100000 [00:42<00:00, 2350.98it/s]


306


100%|██████████| 100000/100000 [00:41<00:00, 2405.50it/s]


307


100%|██████████| 100000/100000 [00:41<00:00, 2382.90it/s]


308


100%|██████████| 100000/100000 [00:49<00:00, 2009.88it/s]


309


100%|██████████| 100000/100000 [00:42<00:00, 2372.68it/s]


310


100%|██████████| 100000/100000 [00:42<00:00, 2371.17it/s]


311


100%|██████████| 100000/100000 [00:41<00:00, 2418.38it/s]


312


100%|██████████| 100000/100000 [00:43<00:00, 2284.56it/s]


313


100%|██████████| 100000/100000 [00:41<00:00, 2392.85it/s]


314


100%|██████████| 100000/100000 [00:40<00:00, 2445.60it/s]


315


100%|██████████| 100000/100000 [00:40<00:00, 2492.99it/s]


316


100%|██████████| 100000/100000 [00:37<00:00, 2635.54it/s]


317


100%|██████████| 100000/100000 [00:36<00:00, 2737.92it/s]


318


100%|██████████| 100000/100000 [00:36<00:00, 2746.66it/s]


319


100%|██████████| 100000/100000 [00:39<00:00, 2540.25it/s]


320


100%|██████████| 100000/100000 [00:41<00:00, 2402.01it/s]


321


100%|██████████| 100000/100000 [00:42<00:00, 2373.31it/s]


322


100%|██████████| 100000/100000 [00:41<00:00, 2419.11it/s]


323


100%|██████████| 100000/100000 [00:41<00:00, 2425.10it/s]


324


100%|██████████| 100000/100000 [00:42<00:00, 2372.93it/s]


325


100%|██████████| 100000/100000 [00:42<00:00, 2352.05it/s]


326


100%|██████████| 100000/100000 [00:41<00:00, 2384.84it/s]


327


100%|██████████| 100000/100000 [00:42<00:00, 2375.81it/s]


328


100%|██████████| 100000/100000 [00:42<00:00, 2354.06it/s]


329


100%|██████████| 100000/100000 [00:42<00:00, 2373.21it/s]


330


100%|██████████| 100000/100000 [00:51<00:00, 1939.29it/s]


331


100%|██████████| 100000/100000 [00:55<00:00, 1812.75it/s]


332


100%|██████████| 100000/100000 [00:49<00:00, 2001.07it/s]


333


100%|██████████| 100000/100000 [00:40<00:00, 2461.30it/s]


334


100%|██████████| 100000/100000 [00:39<00:00, 2544.50it/s]


335


100%|██████████| 100000/100000 [00:37<00:00, 2636.16it/s]


336


100%|██████████| 100000/100000 [00:37<00:00, 2670.54it/s]


337


100%|██████████| 100000/100000 [00:36<00:00, 2707.68it/s]


338


100%|██████████| 100000/100000 [00:36<00:00, 2741.71it/s]


339


100%|██████████| 100000/100000 [00:38<00:00, 2586.94it/s]


340


100%|██████████| 100000/100000 [00:42<00:00, 2358.48it/s]


341


100%|██████████| 100000/100000 [00:53<00:00, 1856.66it/s]


342


100%|██████████| 100000/100000 [00:40<00:00, 2492.17it/s]


343


100%|██████████| 100000/100000 [00:41<00:00, 2424.58it/s]


344


100%|██████████| 100000/100000 [00:41<00:00, 2383.67it/s]


345


100%|██████████| 100000/100000 [00:41<00:00, 2403.81it/s]


346


100%|██████████| 100000/100000 [00:41<00:00, 2429.75it/s]


347


100%|██████████| 100000/100000 [00:41<00:00, 2410.64it/s]


348


100%|██████████| 100000/100000 [00:41<00:00, 2420.91it/s]


349


100%|██████████| 100000/100000 [00:41<00:00, 2436.89it/s]


350


100%|██████████| 100000/100000 [00:40<00:00, 2472.79it/s]


351


100%|██████████| 100000/100000 [00:41<00:00, 2422.83it/s]


352


100%|██████████| 100000/100000 [00:40<00:00, 2495.30it/s]


353


100%|██████████| 100000/100000 [00:39<00:00, 2529.95it/s]


354


100%|██████████| 100000/100000 [00:35<00:00, 2791.41it/s]


355


100%|██████████| 100000/100000 [00:33<00:00, 2970.80it/s]


356


100%|██████████| 100000/100000 [00:33<00:00, 2943.73it/s]


357


100%|██████████| 100000/100000 [00:36<00:00, 2746.23it/s]


358


100%|██████████| 100000/100000 [00:38<00:00, 2580.97it/s]


359


100%|██████████| 100000/100000 [00:40<00:00, 2457.82it/s]


360


100%|██████████| 100000/100000 [00:40<00:00, 2451.19it/s]


361


100%|██████████| 100000/100000 [00:41<00:00, 2434.58it/s]


362


100%|██████████| 100000/100000 [00:41<00:00, 2384.56it/s]


363


100%|██████████| 100000/100000 [00:48<00:00, 2072.74it/s]


364


100%|██████████| 100000/100000 [00:40<00:00, 2439.39it/s]


365


100%|██████████| 100000/100000 [00:41<00:00, 2437.35it/s]


366


100%|██████████| 100000/100000 [00:41<00:00, 2433.98it/s]


367


100%|██████████| 100000/100000 [00:40<00:00, 2484.12it/s]


368


100%|██████████| 100000/100000 [00:39<00:00, 2539.35it/s]


369


100%|██████████| 100000/100000 [00:38<00:00, 2564.71it/s]


370


100%|██████████| 100000/100000 [00:39<00:00, 2541.92it/s]


371


100%|██████████| 100000/100000 [00:39<00:00, 2534.78it/s]


372


100%|██████████| 100000/100000 [00:39<00:00, 2545.32it/s]


373


100%|██████████| 100000/100000 [00:39<00:00, 2555.18it/s]


374


100%|██████████| 100000/100000 [00:37<00:00, 2694.51it/s]


375


100%|██████████| 100000/100000 [00:36<00:00, 2723.89it/s]


376


100%|██████████| 100000/100000 [00:38<00:00, 2573.22it/s]


377


100%|██████████| 100000/100000 [00:36<00:00, 2704.91it/s]


378


100%|██████████| 100000/100000 [00:37<00:00, 2649.06it/s]


379


100%|██████████| 100000/100000 [00:37<00:00, 2634.94it/s]


380


100%|██████████| 100000/100000 [00:40<00:00, 2464.14it/s]


381


100%|██████████| 100000/100000 [00:41<00:00, 2410.20it/s]


382


100%|██████████| 100000/100000 [00:41<00:00, 2393.35it/s]


383


100%|██████████| 100000/100000 [00:40<00:00, 2469.88it/s]


384


100%|██████████| 100000/100000 [00:41<00:00, 2435.37it/s]


385


100%|██████████| 100000/100000 [00:41<00:00, 2431.91it/s]


386


100%|██████████| 100000/100000 [00:40<00:00, 2457.92it/s]


387


100%|██████████| 100000/100000 [00:40<00:00, 2443.08it/s]


388


100%|██████████| 100000/100000 [00:40<00:00, 2449.68it/s]


389


100%|██████████| 100000/100000 [00:39<00:00, 2532.38it/s]


390


100%|██████████| 100000/100000 [00:36<00:00, 2703.71it/s]


391


100%|██████████| 100000/100000 [00:35<00:00, 2828.60it/s]


392


100%|██████████| 100000/100000 [00:37<00:00, 2681.53it/s]


393


100%|██████████| 100000/100000 [00:38<00:00, 2627.22it/s]


394


100%|██████████| 100000/100000 [00:39<00:00, 2518.72it/s]


395


100%|██████████| 100000/100000 [00:45<00:00, 2176.98it/s]


396


100%|██████████| 100000/100000 [00:38<00:00, 2620.95it/s]


397


100%|██████████| 100000/100000 [00:38<00:00, 2600.27it/s]


398


100%|██████████| 100000/100000 [00:38<00:00, 2567.72it/s]


399


100%|██████████| 100000/100000 [00:39<00:00, 2506.38it/s]


400


100%|██████████| 100000/100000 [00:39<00:00, 2553.86it/s]


401


100%|██████████| 100000/100000 [00:38<00:00, 2600.40it/s]


402


100%|██████████| 100000/100000 [00:38<00:00, 2612.56it/s]


403


100%|██████████| 100000/100000 [00:37<00:00, 2663.61it/s]


404


100%|██████████| 100000/100000 [00:37<00:00, 2667.08it/s]


405


100%|██████████| 100000/100000 [00:36<00:00, 2758.07it/s]


406


100%|██████████| 100000/100000 [00:36<00:00, 2774.91it/s]


407


100%|██████████| 100000/100000 [00:36<00:00, 2745.03it/s]


408


100%|██████████| 100000/100000 [00:39<00:00, 2511.90it/s]


409


100%|██████████| 100000/100000 [00:41<00:00, 2413.64it/s]


410


100%|██████████| 100000/100000 [00:44<00:00, 2269.91it/s]


411


100%|██████████| 100000/100000 [00:41<00:00, 2400.90it/s]


412


100%|██████████| 100000/100000 [00:39<00:00, 2546.06it/s]


413


100%|██████████| 100000/100000 [00:39<00:00, 2501.78it/s]


414


100%|██████████| 100000/100000 [00:40<00:00, 2492.12it/s]


415


100%|██████████| 100000/100000 [00:40<00:00, 2460.63it/s]


416


100%|██████████| 100000/100000 [00:40<00:00, 2492.52it/s]


417


100%|██████████| 100000/100000 [00:40<00:00, 2476.15it/s]


418


100%|██████████| 100000/100000 [00:40<00:00, 2474.30it/s]


419


100%|██████████| 100000/100000 [00:39<00:00, 2524.66it/s]


420


100%|██████████| 100000/100000 [00:38<00:00, 2586.80it/s]


421


100%|██████████| 100000/100000 [00:36<00:00, 2713.21it/s]


422


100%|██████████| 100000/100000 [00:36<00:00, 2718.72it/s]


423


100%|██████████| 100000/100000 [00:49<00:00, 2037.40it/s]


424


100%|██████████| 100000/100000 [00:42<00:00, 2361.81it/s]


425


100%|██████████| 100000/100000 [00:42<00:00, 2337.05it/s]


426


100%|██████████| 100000/100000 [00:42<00:00, 2329.22it/s]


427


100%|██████████| 100000/100000 [00:43<00:00, 2300.23it/s]


428


100%|██████████| 100000/100000 [00:43<00:00, 2312.06it/s]


429


100%|██████████| 100000/100000 [00:42<00:00, 2342.23it/s]


430


100%|██████████| 100000/100000 [00:42<00:00, 2369.88it/s]


431


100%|██████████| 100000/100000 [00:43<00:00, 2323.79it/s]


432


100%|██████████| 100000/100000 [00:42<00:00, 2358.59it/s]


433


100%|██████████| 100000/100000 [00:41<00:00, 2394.12it/s]


434


100%|██████████| 100000/100000 [00:41<00:00, 2390.38it/s]


435


100%|██████████| 100000/100000 [00:43<00:00, 2301.42it/s]


436


100%|██████████| 100000/100000 [00:39<00:00, 2511.27it/s]


437


100%|██████████| 100000/100000 [00:37<00:00, 2639.91it/s]


438


100%|██████████| 100000/100000 [00:35<00:00, 2845.30it/s]


439


100%|██████████| 100000/100000 [00:37<00:00, 2639.51it/s]


440


100%|██████████| 100000/100000 [00:39<00:00, 2527.18it/s]


441


100%|██████████| 100000/100000 [00:42<00:00, 2364.25it/s]


442


100%|██████████| 100000/100000 [00:45<00:00, 2213.54it/s]


443


100%|██████████| 100000/100000 [00:42<00:00, 2349.52it/s]


444


100%|██████████| 100000/100000 [00:42<00:00, 2350.40it/s]


445


100%|██████████| 100000/100000 [00:43<00:00, 2275.43it/s]


446


100%|██████████| 100000/100000 [00:44<00:00, 2234.96it/s]


447


100%|██████████| 100000/100000 [00:44<00:00, 2238.36it/s]


448


100%|██████████| 100000/100000 [00:49<00:00, 2009.29it/s]


449


100%|██████████| 100000/100000 [00:45<00:00, 2215.97it/s]


450


100%|██████████| 100000/100000 [00:44<00:00, 2266.34it/s]


451


100%|██████████| 100000/100000 [00:44<00:00, 2251.52it/s]


452


100%|██████████| 100000/100000 [00:43<00:00, 2280.80it/s]


453


100%|██████████| 100000/100000 [00:43<00:00, 2318.71it/s]


454


100%|██████████| 100000/100000 [00:42<00:00, 2376.70it/s]


455


100%|██████████| 100000/100000 [00:40<00:00, 2447.41it/s]


456


100%|██████████| 100000/100000 [00:38<00:00, 2606.37it/s]


457


100%|██████████| 100000/100000 [00:37<00:00, 2638.81it/s]


458


100%|██████████| 100000/100000 [00:37<00:00, 2657.85it/s]


459


100%|██████████| 100000/100000 [00:41<00:00, 2424.37it/s]


460


100%|██████████| 100000/100000 [00:43<00:00, 2316.56it/s]


461


100%|██████████| 100000/100000 [00:42<00:00, 2337.76it/s]


462


100%|██████████| 100000/100000 [00:42<00:00, 2353.65it/s]


463


100%|██████████| 100000/100000 [00:41<00:00, 2430.04it/s]


464


100%|██████████| 100000/100000 [00:43<00:00, 2291.59it/s]


465


100%|██████████| 100000/100000 [00:43<00:00, 2305.25it/s]


466


100%|██████████| 100000/100000 [00:44<00:00, 2248.19it/s]


467


100%|██████████| 100000/100000 [00:43<00:00, 2287.68it/s]


468


100%|██████████| 100000/100000 [00:43<00:00, 2293.17it/s]


469


100%|██████████| 100000/100000 [00:42<00:00, 2336.28it/s]


470


100%|██████████| 100000/100000 [00:42<00:00, 2336.13it/s]


471


100%|██████████| 100000/100000 [00:43<00:00, 2321.68it/s]


472


100%|██████████| 100000/100000 [00:42<00:00, 2348.99it/s]


473


100%|██████████| 100000/100000 [00:41<00:00, 2404.43it/s]


474


100%|██████████| 100000/100000 [00:41<00:00, 2437.12it/s]


475


100%|██████████| 100000/100000 [00:39<00:00, 2555.24it/s]


476


100%|██████████| 100000/100000 [00:37<00:00, 2668.82it/s]


477


100%|██████████| 100000/100000 [00:37<00:00, 2638.92it/s]


478


100%|██████████| 100000/100000 [00:44<00:00, 2231.43it/s]


479


100%|██████████| 100000/100000 [00:40<00:00, 2451.11it/s]


480


100%|██████████| 100000/100000 [00:43<00:00, 2286.59it/s]


481


100%|██████████| 100000/100000 [00:43<00:00, 2288.42it/s]


482


100%|██████████| 100000/100000 [00:42<00:00, 2374.81it/s]


483


100%|██████████| 100000/100000 [00:42<00:00, 2336.74it/s]


484


100%|██████████| 100000/100000 [00:42<00:00, 2347.99it/s]


485


100%|██████████| 100000/100000 [00:43<00:00, 2278.28it/s]


486


100%|██████████| 100000/100000 [00:43<00:00, 2310.40it/s]


487


100%|██████████| 100000/100000 [00:42<00:00, 2356.14it/s]


488


100%|██████████| 100000/100000 [00:43<00:00, 2301.64it/s]


489


100%|██████████| 100000/100000 [00:42<00:00, 2374.18it/s]


490


100%|██████████| 100000/100000 [00:41<00:00, 2409.09it/s]


491


100%|██████████| 100000/100000 [00:40<00:00, 2445.42it/s]


492


100%|██████████| 100000/100000 [00:39<00:00, 2514.85it/s]


493


100%|██████████| 100000/100000 [00:38<00:00, 2571.65it/s]


494


100%|██████████| 100000/100000 [00:39<00:00, 2551.91it/s]


495


100%|██████████| 100000/100000 [00:38<00:00, 2566.20it/s]


496


100%|██████████| 100000/100000 [00:38<00:00, 2604.72it/s]


497


100%|██████████| 100000/100000 [00:39<00:00, 2527.45it/s]


498


100%|██████████| 100000/100000 [00:40<00:00, 2443.98it/s]


499


100%|██████████| 100000/100000 [00:42<00:00, 2337.26it/s]


500


100%|██████████| 100000/100000 [00:48<00:00, 2056.74it/s]


501


100%|██████████| 100000/100000 [00:40<00:00, 2490.06it/s]


502


100%|██████████| 100000/100000 [00:41<00:00, 2416.59it/s]


503


100%|██████████| 100000/100000 [00:42<00:00, 2353.23it/s]


504


100%|██████████| 100000/100000 [00:42<00:00, 2336.21it/s]


505


100%|██████████| 100000/100000 [00:42<00:00, 2348.95it/s]


506


100%|██████████| 100000/100000 [00:41<00:00, 2388.85it/s]


507


100%|██████████| 100000/100000 [00:43<00:00, 2322.24it/s]


508


100%|██████████| 100000/100000 [00:43<00:00, 2290.35it/s]


509


100%|██████████| 100000/100000 [00:43<00:00, 2322.33it/s]


510


100%|██████████| 100000/100000 [00:42<00:00, 2365.56it/s]


511


100%|██████████| 100000/100000 [00:42<00:00, 2359.94it/s]


512


100%|██████████| 100000/100000 [00:39<00:00, 2542.40it/s]


513


100%|██████████| 100000/100000 [00:39<00:00, 2555.60it/s]


514


100%|██████████| 100000/100000 [00:38<00:00, 2612.62it/s]


515


100%|██████████| 100000/100000 [00:37<00:00, 2683.54it/s]


516


100%|██████████| 100000/100000 [00:38<00:00, 2586.67it/s]


517


100%|██████████| 100000/100000 [00:40<00:00, 2496.30it/s]


518


100%|██████████| 100000/100000 [00:41<00:00, 2430.78it/s]


519


100%|██████████| 100000/100000 [00:41<00:00, 2425.71it/s]


520


100%|██████████| 100000/100000 [00:40<00:00, 2469.85it/s]


521


100%|██████████| 100000/100000 [00:41<00:00, 2389.10it/s]


522


100%|██████████| 100000/100000 [00:41<00:00, 2420.34it/s]


523


100%|██████████| 100000/100000 [00:39<00:00, 2504.80it/s]


524


100%|██████████| 100000/100000 [00:39<00:00, 2555.10it/s]


525


100%|██████████| 100000/100000 [00:41<00:00, 2406.96it/s]


526


100%|██████████| 100000/100000 [00:41<00:00, 2418.25it/s]


527


100%|██████████| 100000/100000 [00:40<00:00, 2440.42it/s]


528


100%|██████████| 100000/100000 [00:41<00:00, 2437.11it/s]


529


100%|██████████| 100000/100000 [00:37<00:00, 2680.94it/s]


530


100%|██████████| 100000/100000 [00:35<00:00, 2791.63it/s]


531


100%|██████████| 100000/100000 [00:36<00:00, 2753.17it/s]


532


100%|██████████| 100000/100000 [00:38<00:00, 2629.52it/s]


533


100%|██████████| 100000/100000 [00:38<00:00, 2577.38it/s]


534


100%|██████████| 100000/100000 [00:41<00:00, 2395.65it/s]


535


100%|██████████| 100000/100000 [00:41<00:00, 2407.37it/s]


536


100%|██████████| 100000/100000 [00:46<00:00, 2147.68it/s]


537


100%|██████████| 100000/100000 [00:39<00:00, 2547.41it/s]


538


100%|██████████| 100000/100000 [00:40<00:00, 2489.51it/s]


539


100%|██████████| 100000/100000 [00:39<00:00, 2507.03it/s]


540


100%|██████████| 100000/100000 [00:39<00:00, 2513.81it/s]


541


100%|██████████| 100000/100000 [00:39<00:00, 2562.58it/s]


542


100%|██████████| 100000/100000 [00:38<00:00, 2605.04it/s]


543


100%|██████████| 100000/100000 [00:39<00:00, 2505.78it/s]


544


100%|██████████| 100000/100000 [00:37<00:00, 2637.46it/s]


545


100%|██████████| 4410/4410 [00:01<00:00, 2713.37it/s]


number 546 files

Make corpus file

In [None]:
# Drive directory that holds the preprocessed data and receives the corpus file.
path = '/content/drive/MyDrive/KLTN/data/preprogress'

In [None]:
# Build the fastText training corpus: one cleaned comment per line, reddit
# chunks first, then the Facebook comments.
# NOTE(review): the preprocessing output above reports 546 chunk files, but
# range(545) only reads indices 0-544 -- confirm whether the final chunk
# should be included.
with open(f'{path}/cooking.train', 'w', encoding='utf-8') as f:
  # reddit comments
  for index in tqdm(range(545)):
    df = pd.read_csv(f'/content/drive/MyDrive/KLTN/data/reddit/reddit_comments_cleaned_{index}.csv').fillna('no comment')
    # drop_duplicates returns a new frame; the result must be kept for the
    # de-duplication to take effect (it only applies within this chunk).
    df = df.drop_duplicates(subset='body')
    for row in df['body']:
      # f-string formatting tolerates non-string cells (e.g. purely numeric bodies).
      f.write(f'{row}\n')

  # facebook comments; missing values were already replaced by fillna, so no
  # separate dropna step is needed.
  fb = pd.read_csv('/content/drive/MyDrive/KLTN/data/preprogress/fb_comment_cleaned(new).csv').fillna('no comment')
  fb = fb.drop_duplicates(subset='cleaned_text')
  for row in fb['cleaned_text']:
    f.write(f'{row}\n')




100%|██████████| 545/545 [05:02<00:00,  1.80it/s]


train fasttext

install

In [2]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.11.1-py3-none-any.whl (227 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4199672 sha256=5ece642cee8b819265c45b3d694e13740d0b396d77bd07228cd3c2f39504ca36
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttext
Installing collected packages: pybind11, fasttext
Successfully installed fasttext-0.9.2 pybind11-2.11.1


In [None]:

import fasttext

# Train unsupervised fastText word vectors on the corpus file, then persist
# the resulting model to Drive.
training_data_path = '/content/drive/MyDrive/KLTN/data/preprogress/cooking.train'

# Hyperparameters are gathered in one place and unpacked into the trainer call.
# NOTE(review): word_ngrams may only matter for supervised training -- confirm
# it has an effect on train_unsupervised.
train_params = dict(
    lr=0.1,          # learning rate
    dim=100,         # dimension of the word vectors
    epoch=10,        # number of passes over the corpus
    word_ngrams=3,   # max length of word n-gram
    bucket=200000,   # number of hash buckets for n-grams
)

model = fasttext.train_unsupervised(input=training_data_path, **train_params)

model.save_model('/content/drive/MyDrive/KLTN/data/trained_model.bin')