**NLP**

*CC6205-1 - Otoño 2022*

Autor: Felipe Urrutia Vargas

In [1]:
# !pip install --upgrade pandas

import pandas as pd
import numpy as np
# pd.set_option("max_rows", None)
import pickle
from string import punctuation
import re

from sklearn.metrics import cohen_kappa_score
import matplotlib.pyplot as plt
import seaborn as sns
# from astropy.visualization import hist
sns.set_theme(style="whitegrid")
plt.rc('axes', titlesize=14)
plt.rc('legend', fontsize=14)
plt.rc('xtick', labelsize=12)
plt.rc('ytick', labelsize=12)
plt.rcParams.update({'font.size': 16})
plt.rcParams['axes.titlesize'] = 16
plt.rcParams["figure.figsize"] = (10, 6)
plt.rcParams.update({'lines.markeredgewidth': 1})
plt.rcParams.update({'errorbar.capsize': 2})
import random

import plotly.express as px

In [2]:
sentiments = "anger fear joy sadness".split()
intensities = "low medium high".split()

In [3]:
df_train = pickle.load(open("df_train.pickle", "rb"))

In [4]:
#import spacy library
import spacy

!python -m spacy download en_core_web_sm

nlp = spacy.load("en_core_web_sm")

Collecting en-core-web-sm==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)
[+] Download and installation successful
You can now load the package via spacy.load('en_core_web_sm')


In [6]:
vocab_joy = {i: [] for i in intensities}
for i in intensities:
    _sub = df_train[(df_train["sen"] == "joy") & (df_train["int"] == i)]
    for text in _sub["text"]:
        doc = nlp(text)
        for t in doc:
            vocab_joy[i].append(t.text.lower())

In [7]:
import collections

In [9]:
!pip install unidecode
from unidecode import unidecode

!pip install Levenshtein
import Levenshtein as lev



In [10]:
import emojilib

In [11]:
vowel = "aeiou"
digit = "0123456789"

def replace_multiple(string, list_replace, replace_ch):
    for ch in list_replace:
        if ch in string:
            string = string.replace(ch, replace_ch)
    return string

def sim_lev(a, b):
    return 1 - lev.distance(a, b) / max(len(a), len(b)) if len(a) != 0 else 0

In [12]:
tweet = df_train.iloc[123]["text"]
tweet

'@GroupAnon black armed thug with a record carrying gun illegally gets shot by black cop. #outrage   This is a joke.Let em destroy their town'

In [13]:
emo_list = emojilib.emoji_list(tweet)
emo_names = list([d['name'] for d in emo_list if 'name' in d])
emo_names

[]

In [14]:
re.findall(r"\w+", tweet)

['GroupAnon',
 'black',
 'armed',
 'thug',
 'with',
 'a',
 'record',
 'carrying',
 'gun',
 'illegally',
 'gets',
 'shot',
 'by',
 'black',
 'cop',
 'outrage',
 'This',
 'is',
 'a',
 'joke',
 'Let',
 'em',
 'destroy',
 'their',
 'town']

In [15]:
nlp_tweet = nlp(tweet)
for token in nlp_tweet:
    label = "text lemma pos tag dep shape is_alpha is_stop".split()
    vals = [token.text, token.lemma_.lower(), token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop]
    print(dict(zip(label[1:], vals[1:])))

{'lemma': '@groupanon', 'pos': 'PROPN', 'tag': 'NNP', 'dep': 'nmod', 'shape': '@XxxxxXxxx', 'is_alpha': False, 'is_stop': False}
{'lemma': 'black', 'pos': 'ADJ', 'tag': 'JJ', 'dep': 'amod', 'shape': 'xxxx', 'is_alpha': True, 'is_stop': False}
{'lemma': 'armed', 'pos': 'ADJ', 'tag': 'JJ', 'dep': 'amod', 'shape': 'xxxx', 'is_alpha': True, 'is_stop': False}
{'lemma': 'thug', 'pos': 'NOUN', 'tag': 'NN', 'dep': 'nsubjpass', 'shape': 'xxxx', 'is_alpha': True, 'is_stop': False}
{'lemma': 'with', 'pos': 'ADP', 'tag': 'IN', 'dep': 'prep', 'shape': 'xxxx', 'is_alpha': True, 'is_stop': True}
{'lemma': 'a', 'pos': 'DET', 'tag': 'DT', 'dep': 'det', 'shape': 'x', 'is_alpha': True, 'is_stop': True}
{'lemma': 'record', 'pos': 'NOUN', 'tag': 'NN', 'dep': 'compound', 'shape': 'xxxx', 'is_alpha': True, 'is_stop': False}
{'lemma': 'carry', 'pos': 'VERB', 'tag': 'VBG', 'dep': 'compound', 'shape': 'xxxx', 'is_alpha': True, 'is_stop': False}
{'lemma': 'gun', 'pos': 'NOUN', 'tag': 'NN', 'dep': 'pobj', 'shape'

In [16]:
def get_retro_attrib(tweet):
    o = {}
    o["retro<&>num_tokens"] = len(tweet.split())
    o["retro<&>lenght"] = len(" ".join(tweet.split()))
    o["retro<&>num_numbs"] = len(re.findall(r"\d+",tweet))
    o["retro<&>num_alpha"] = len(re.findall(r"\w+", tweet))
    o["retro<&>num_with_uppercase"] = len(re.findall(r"\S*[A-Z]+\S*", tweet))
    o["retro<&>num_tokens_upper"] = sum(int(t.isupper()) for t in tweet.split())
    
    def prop_vowels(w):
        N = len(w.replace(" ", ""))
        if N>0:
            return len(re.findall(r"[aeiou]", tweet)) / N
        else:
            return 0
    
    def len_max_rep_char(w):
        w=w+" "
        c0 = w[0]
        lens = [0]
        clen = 1
        for c in w[1:]:
            if c == c0:
                clen += 1
            else:
                if c0.isalpha():
                    lens.append(clen)
                c0 = c
                clen = 1
        return max(lens)  
    
    o["retro<&>prop_vowels"] = prop_vowels(tweet.lower())
    o["retro<&>len_max_rep_char"] = len_max_rep_char(tweet.lower())
    
    def max_char_fre_per_token(w, c="k"):
        tw = w.split()
        fmax = 0
        for t in tw:
            f = sum(int(ch==c) for ch in t)
            if f>fmax:
                fmax = f
        return fmax
    
    o["retro<&>max_char_fre_per_token(o)"] = max_char_fre_per_token(tweet.lower(), c="o")
    o["retro<&>max_char_fre_per_token(s)"] = max_char_fre_per_token(tweet.lower(), c="s")
    o["retro<&>max_char_fre_per_token(g)"] = max_char_fre_per_token(tweet.lower(), c="g")    
    o["retro<&>max_char_fre_per_token(l)"] = max_char_fre_per_token(tweet.lower(), c="l")    
    
    def max_type_rep_char_per_token(w, t="vowel"):
        w=unidecode(w+" ")
        c0 = w[0]
        lens = [0]
        clen = 1
        for c in w[1:]:
            if (c0.isalpha() and c.isalpha()) and ((c in "aeiou" and c0 in "aeiou") or (c not in "aeiou" and c0 not in "aeiou")):
                clen += 1
            else:
                if t=="vowel":
                    if c0 in "aeiou":
                        lens.append(clen)
                else:
                    if c0 not in "aeiou":
                        lens.append(clen) 
                c0 = c
                clen = 1
        return max(lens) 
    
    o["retro<&>max_type_rep_char_per_token(vowel)"] = max_type_rep_char_per_token(tweet.lower(), t="vowel")
    
    return o

In [17]:
def get_punct_attrib(tweet):
    o = {}
    o["punct<&>[\.]{3}"] = len(re.findall(r"[\.]{3}", tweet))
    o["punct<&>[!]"] = len(re.findall(r"[!]", tweet))
    o["punct<&>[#]"] = len(re.findall(r"[#]", tweet))
    o["punct<&>[#]{1}\S+"] = len(re.findall(r"[#]{1}\S+", tweet))
    o["punct<&>[\*]"] = len(re.findall(r"[\*]", tweet))
    o["punct<&>[@]{1}\S+"] = len(re.findall(r"[@]{1}\S+", tweet))
    o["punct<&>\S*[?]{1}\S*"] = len(re.findall(r"\S*[?]{1}\S*", tweet))
    return o

In [18]:
def get_emojilib_attrib(tweet):
    emo_list = emojilib.emoji_list(tweet)
    emo_names = list([d['name'] for d in emo_list if 'name' in d])
    o = {}
    for emo in emo_names:
        if emo not in o.keys():
            o["emoji<&>"+emo] = 0
        o["emoji<&>"+emo] += 1
    return o

In [19]:
def get_linguistics_attrib(tweet):
    o = {}
    nlp_tweet = nlp(tweet)
    for token in nlp_tweet:
        label = "text lemma pos tag dep shape is_alpha is_stop".split()
        vals = [token.text, token.lemma_.lower(), token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop]
        dict_vals = dict(zip(label[1:], vals[1:]))
        for k, v in dict_vals.items():
            l = f"linguistics<&>{k}<&>{v}"
            if l not in o.keys():
                o[l] = 0
            o[l] += 1
    return o    

In [20]:
get_retro_attrib(tweet), get_punct_attrib(tweet), get_emojilib_attrib(tweet), get_linguistics_attrib(tweet)

({'retro<&>num_tokens': 24,
  'retro<&>lenght': 138,
  'retro<&>num_numbs': 0,
  'retro<&>num_alpha': 25,
  'retro<&>num_with_uppercase': 3,
  'retro<&>num_tokens_upper': 0,
  'retro<&>prop_vowels': 0.3217391304347826,
  'retro<&>len_max_rep_char': 2,
  'retro<&>max_char_fre_per_token(o)': 2,
  'retro<&>max_char_fre_per_token(s)': 1,
  'retro<&>max_char_fre_per_token(g)': 1,
  'retro<&>max_char_fre_per_token(l)': 4,
  'retro<&>max_type_rep_char_per_token(vowel)': 2},
 {'punct<&>[\\.]{3}': 0,
  'punct<&>[!]': 0,
  'punct<&>[#]': 1,
  'punct<&>[#]{1}\\S+': 1,
  'punct<&>[\\*]': 0,
  'punct<&>[@]{1}\\S+': 1,
  'punct<&>\\S*[?]{1}\\S*': 0},
 {},
 {'linguistics<&>lemma<&>@groupanon': 1,
  'linguistics<&>pos<&>PROPN': 1,
  'linguistics<&>tag<&>NNP': 1,
  'linguistics<&>dep<&>nmod': 2,
  'linguistics<&>shape<&>@XxxxxXxxx': 1,
  'linguistics<&>is_alpha<&>False': 5,
  'linguistics<&>is_stop<&>False': 22,
  'linguistics<&>lemma<&>black': 2,
  'linguistics<&>pos<&>ADJ': 3,
  'linguistics<&>tag<&>

In [21]:
import time
import datetime

In [22]:
data_all_attributes = []
times = []
for k, ix in enumerate(df_train.index):
    start_time = time.time()
    tweet = df_train.loc[ix]["text"]
    o = {"id": ix}
    o = {**o, **get_retro_attrib(tweet)}
    o = {**o, **get_punct_attrib(tweet)}
    o = {**o, **get_emojilib_attrib(tweet)}
    o = {**o, **get_linguistics_attrib(tweet)}
    data_all_attributes.append(o)
    dt = time.time()-start_time
    times.append(dt)
    print(f"""
    row: {k+1}, total_rows: {df_train.shape[0]}
    progress: {np.round(100*(k+1)/df_train.shape[0], 3)}%
    wait time: {datetime.timedelta(seconds = np.mean(times)*(df_train.shape[0]-k-1))}s
    """)


    row: 1, total_rows: 3960
    progress: 0.025%
    wait time: 0:00:31.703685s
    

    row: 2, total_rows: 3960
    progress: 0.051%
    wait time: 0:00:33.673118s
    

    row: 3, total_rows: 3960
    progress: 0.076%
    wait time: 0:00:30.365305s
    

    row: 4, total_rows: 3960
    progress: 0.101%
    wait time: 0:00:30.686960s
    

    row: 5, total_rows: 3960
    progress: 0.126%
    wait time: 0:00:29.293544s
    

    row: 6, total_rows: 3960
    progress: 0.152%
    wait time: 0:00:27.043909s
    

    row: 7, total_rows: 3960
    progress: 0.177%
    wait time: 0:00:27.696191s
    

    row: 8, total_rows: 3960
    progress: 0.202%
    wait time: 0:00:27.689202s
    

    row: 9, total_rows: 3960
    progress: 0.227%
    wait time: 0:00:26.803435s
    

    row: 10, total_rows: 3960
    progress: 0.253%
    wait time: 0:00:26.489263s
    

    row: 11, total_rows: 3960
    progress: 0.278%
    wait time: 0:00:25.871547s
    

    row: 12, total_rows: 3960
    progre


    row: 100, total_rows: 3960
    progress: 2.525%
    wait time: 0:00:22.041763s
    

    row: 101, total_rows: 3960
    progress: 2.551%
    wait time: 0:00:22.047333s
    

    row: 102, total_rows: 3960
    progress: 2.576%
    wait time: 0:00:21.976989s
    

    row: 103, total_rows: 3960
    progress: 2.601%
    wait time: 0:00:22.020336s
    

    row: 104, total_rows: 3960
    progress: 2.626%
    wait time: 0:00:21.988496s
    

    row: 105, total_rows: 3960
    progress: 2.652%
    wait time: 0:00:21.920411s
    

    row: 106, total_rows: 3960
    progress: 2.677%
    wait time: 0:00:21.926326s
    

    row: 107, total_rows: 3960
    progress: 2.702%
    wait time: 0:00:21.859961s
    

    row: 108, total_rows: 3960
    progress: 2.727%
    wait time: 0:00:21.830415s
    

    row: 109, total_rows: 3960
    progress: 2.753%
    wait time: 0:00:21.765966s
    

    row: 110, total_rows: 3960
    progress: 2.778%
    wait time: 0:00:21.737639s
    

    row: 111, total_


    row: 203, total_rows: 3960
    progress: 5.126%
    wait time: 0:00:20.756598s
    

    row: 204, total_rows: 3960
    progress: 5.152%
    wait time: 0:00:20.796785s
    

    row: 205, total_rows: 3960
    progress: 5.177%
    wait time: 0:00:20.781497s
    

    row: 206, total_rows: 3960
    progress: 5.202%
    wait time: 0:00:20.784567s
    

    row: 207, total_rows: 3960
    progress: 5.227%
    wait time: 0:00:20.823815s
    

    row: 208, total_rows: 3960
    progress: 5.253%
    wait time: 0:00:20.808442s
    

    row: 209, total_rows: 3960
    progress: 5.278%
    wait time: 0:00:20.757254s
    

    row: 210, total_rows: 3960
    progress: 5.303%
    wait time: 0:00:20.742267s
    

    row: 211, total_rows: 3960
    progress: 5.328%
    wait time: 0:00:20.727383s
    

    row: 212, total_rows: 3960
    progress: 5.354%
    wait time: 0:00:20.730275s
    

    row: 213, total_rows: 3960
    progress: 5.379%
    wait time: 0:00:20.750694s
    

    row: 214, total_


    row: 301, total_rows: 3960
    progress: 7.601%
    wait time: 0:00:20.258691s
    

    row: 302, total_rows: 3960
    progress: 7.626%
    wait time: 0:00:20.270951s
    

    row: 303, total_rows: 3960
    progress: 7.652%
    wait time: 0:00:20.258935s
    

    row: 304, total_rows: 3960
    progress: 7.677%
    wait time: 0:00:20.258991s
    

    row: 305, total_rows: 3960
    progress: 7.702%
    wait time: 0:00:20.259010s
    

    row: 306, total_rows: 3960
    progress: 7.727%
    wait time: 0:00:20.247044s
    

    row: 307, total_rows: 3960
    progress: 7.753%
    wait time: 0:00:20.235111s
    

    row: 308, total_rows: 3960
    progress: 7.778%
    wait time: 0:00:20.223243s
    

    row: 309, total_rows: 3960
    progress: 7.803%
    wait time: 0:00:20.199572s
    

    row: 310, total_rows: 3960
    progress: 7.828%
    wait time: 0:00:20.187823s
    

    row: 311, total_rows: 3960
    progress: 7.854%
    wait time: 0:00:20.187858s
    

    row: 312, total_


    row: 401, total_rows: 3960
    progress: 10.126%
    wait time: 0:00:19.690361s
    

    row: 402, total_rows: 3960
    progress: 10.152%
    wait time: 0:00:19.697875s
    

    row: 403, total_rows: 3960
    progress: 10.177%
    wait time: 0:00:19.678808s
    

    row: 404, total_rows: 3960
    progress: 10.202%
    wait time: 0:00:19.677442s
    

    row: 405, total_rows: 3960
    progress: 10.227%
    wait time: 0:00:19.684835s
    

    row: 406, total_rows: 3960
    progress: 10.253%
    wait time: 0:00:19.674636s
    

    row: 407, total_rows: 3960
    progress: 10.278%
    wait time: 0:00:19.673199s
    

    row: 408, total_rows: 3960
    progress: 10.303%
    wait time: 0:00:19.671736s
    

    row: 409, total_rows: 3960
    progress: 10.328%
    wait time: 0:00:19.644186s
    

    row: 410, total_rows: 3960
    progress: 10.354%
    wait time: 0:00:19.625426s
    

    row: 411, total_rows: 3960
    progress: 10.379%
    wait time: 0:00:19.615365s
    

    row: 


    row: 502, total_rows: 3960
    progress: 12.677%
    wait time: 0:00:19.033130s
    

    row: 503, total_rows: 3960
    progress: 12.702%
    wait time: 0:00:19.024197s
    

    row: 504, total_rows: 3960
    progress: 12.727%
    wait time: 0:00:19.028992s
    

    row: 505, total_rows: 3960
    progress: 12.753%
    wait time: 0:00:19.020059s
    

    row: 506, total_rows: 3960
    progress: 12.778%
    wait time: 0:00:19.011138s
    

    row: 507, total_rows: 3960
    progress: 12.803%
    wait time: 0:00:19.002232s
    

    row: 508, total_rows: 3960
    progress: 12.828%
    wait time: 0:00:19.006944s
    

    row: 509, total_rows: 3960
    progress: 12.854%
    wait time: 0:00:18.991251s
    

    row: 510, total_rows: 3960
    progress: 12.879%
    wait time: 0:00:18.975602s
    

    row: 511, total_rows: 3960
    progress: 12.904%
    wait time: 0:00:18.973513s
    

    row: 512, total_rows: 3960
    progress: 12.929%
    wait time: 0:00:18.957929s
    

    row: 


    row: 608, total_rows: 3960
    progress: 15.354%
    wait time: 0:00:18.207417s
    

    row: 609, total_rows: 3960
    progress: 15.379%
    wait time: 0:00:18.205145s
    

    row: 610, total_rows: 3960
    progress: 15.404%
    wait time: 0:00:18.208351s
    

    row: 611, total_rows: 3960
    progress: 15.429%
    wait time: 0:00:18.200554s
    

    row: 612, total_rows: 3960
    progress: 15.455%
    wait time: 0:00:18.203717s
    

    row: 613, total_rows: 3960
    progress: 15.48%
    wait time: 0:00:18.201385s
    

    row: 614, total_rows: 3960
    progress: 15.505%
    wait time: 0:00:18.193588s
    

    row: 615, total_rows: 3960
    progress: 15.53%
    wait time: 0:00:18.191236s
    

    row: 616, total_rows: 3960
    progress: 15.556%
    wait time: 0:00:18.183442s
    

    row: 617, total_rows: 3960
    progress: 15.581%
    wait time: 0:00:18.175657s
    

    row: 618, total_rows: 3960
    progress: 15.606%
    wait time: 0:00:18.173295s
    

    row: 61


    row: 711, total_rows: 3960
    progress: 17.955%
    wait time: 0:00:17.579488s
    

    row: 712, total_rows: 3960
    progress: 17.98%
    wait time: 0:00:17.576789s
    

    row: 713, total_rows: 3960
    progress: 18.005%
    wait time: 0:00:17.574083s
    

    row: 714, total_rows: 3960
    progress: 18.03%
    wait time: 0:00:17.571368s
    

    row: 715, total_rows: 3960
    progress: 18.056%
    wait time: 0:00:17.564100s
    

    row: 716, total_rows: 3960
    progress: 18.081%
    wait time: 0:00:17.552303s
    

    row: 717, total_rows: 3960
    progress: 18.106%
    wait time: 0:00:17.545056s
    

    row: 718, total_rows: 3960
    progress: 18.131%
    wait time: 0:00:17.533292s
    

    row: 719, total_rows: 3960
    progress: 18.157%
    wait time: 0:00:17.535089s
    

    row: 720, total_rows: 3960
    progress: 18.182%
    wait time: 0:00:17.536860s
    

    row: 721, total_rows: 3960
    progress: 18.207%
    wait time: 0:00:17.529615s
    

    row: 72


    row: 809, total_rows: 3960
    progress: 20.429%
    wait time: 0:00:17.124179s
    

    row: 810, total_rows: 3960
    progress: 20.455%
    wait time: 0:00:17.124856s
    

    row: 811, total_rows: 3960
    progress: 20.48%
    wait time: 0:00:17.129405s
    

    row: 812, total_rows: 3960
    progress: 20.505%
    wait time: 0:00:17.126159s
    

    row: 813, total_rows: 3960
    progress: 20.53%
    wait time: 0:00:17.115162s
    

    row: 814, total_rows: 3960
    progress: 20.556%
    wait time: 0:00:17.108043s
    

    row: 815, total_rows: 3960
    progress: 20.581%
    wait time: 0:00:17.093208s
    

    row: 816, total_rows: 3960
    progress: 20.606%
    wait time: 0:00:17.082260s
    

    row: 817, total_rows: 3960
    progress: 20.631%
    wait time: 0:00:17.075178s
    

    row: 818, total_rows: 3960
    progress: 20.657%
    wait time: 0:00:17.068100s
    

    row: 819, total_rows: 3960
    progress: 20.682%
    wait time: 0:00:17.064866s
    

    row: 82


    row: 908, total_rows: 3960
    progress: 22.929%
    wait time: 0:00:16.581012s
    

    row: 909, total_rows: 3960
    progress: 22.955%
    wait time: 0:00:16.580866s
    

    row: 910, total_rows: 3960
    progress: 22.98%
    wait time: 0:00:16.573989s
    

    row: 911, total_rows: 3960
    progress: 23.005%
    wait time: 0:00:16.573815s
    

    row: 912, total_rows: 3960
    progress: 23.03%
    wait time: 0:00:16.570283s
    

    row: 913, total_rows: 3960
    progress: 23.056%
    wait time: 0:00:16.570083s
    

    row: 914, total_rows: 3960
    progress: 23.081%
    wait time: 0:00:16.563200s
    

    row: 915, total_rows: 3960
    progress: 23.106%
    wait time: 0:00:16.556322s
    

    row: 916, total_rows: 3960
    progress: 23.131%
    wait time: 0:00:16.559424s
    

    row: 917, total_rows: 3960
    progress: 23.157%
    wait time: 0:00:16.555861s
    

    row: 918, total_rows: 3960
    progress: 23.182%
    wait time: 0:00:16.555609s
    

    row: 91


    row: 1002, total_rows: 3960
    progress: 25.303%
    wait time: 0:00:16.167173s
    

    row: 1003, total_rows: 3960
    progress: 25.328%
    wait time: 0:00:16.169200s
    

    row: 1004, total_rows: 3960
    progress: 25.354%
    wait time: 0:00:16.171207s
    

    row: 1005, total_rows: 3960
    progress: 25.379%
    wait time: 0:00:16.167311s
    

    row: 1006, total_rows: 3960
    progress: 25.404%
    wait time: 0:00:16.163411s
    

    row: 1007, total_rows: 3960
    progress: 25.429%
    wait time: 0:00:16.162438s
    

    row: 1008, total_rows: 3960
    progress: 25.455%
    wait time: 0:00:16.155592s
    

    row: 1009, total_rows: 3960
    progress: 25.48%
    wait time: 0:00:16.151680s
    

    row: 1010, total_rows: 3960
    progress: 25.505%
    wait time: 0:00:16.144835s
    

    row: 1011, total_rows: 3960
    progress: 25.53%
    wait time: 0:00:16.137996s
    

    row: 1012, total_rows: 3960
    progress: 25.556%
    wait time: 0:00:16.128248s
    




    row: 1098, total_rows: 3960
    progress: 27.727%
    wait time: 0:00:15.678432s
    

    row: 1099, total_rows: 3960
    progress: 27.753%
    wait time: 0:00:15.671722s
    

    row: 1100, total_rows: 3960
    progress: 27.778%
    wait time: 0:00:15.665014s
    

    row: 1101, total_rows: 3960
    progress: 27.803%
    wait time: 0:00:15.660907s
    

    row: 1102, total_rows: 3960
    progress: 27.828%
    wait time: 0:00:15.654203s
    

    row: 1103, total_rows: 3960
    progress: 27.854%
    wait time: 0:00:15.644909s
    

    row: 1104, total_rows: 3960
    progress: 27.879%
    wait time: 0:00:15.638213s
    

    row: 1105, total_rows: 3960
    progress: 27.904%
    wait time: 0:00:15.631519s
    

    row: 1106, total_rows: 3960
    progress: 27.929%
    wait time: 0:00:15.629997s
    

    row: 1107, total_rows: 3960
    progress: 27.955%
    wait time: 0:00:15.625884s
    

    row: 1108, total_rows: 3960
    progress: 27.98%
    wait time: 0:00:15.616614s
    



    row: 1190, total_rows: 3960
    progress: 30.051%
    wait time: 0:00:15.275725s
    

    row: 1191, total_rows: 3960
    progress: 30.076%
    wait time: 0:00:15.269024s
    

    row: 1192, total_rows: 3960
    progress: 30.101%
    wait time: 0:00:15.266977s
    

    row: 1193, total_rows: 3960
    progress: 30.126%
    wait time: 0:00:15.262599s
    

    row: 1194, total_rows: 3960
    progress: 30.152%
    wait time: 0:00:15.258216s
    

    row: 1195, total_rows: 3960
    progress: 30.177%
    wait time: 0:00:15.249200s
    

    row: 1196, total_rows: 3960
    progress: 30.202%
    wait time: 0:00:15.244817s
    

    row: 1197, total_rows: 3960
    progress: 30.227%
    wait time: 0:00:15.235812s
    

    row: 1198, total_rows: 3960
    progress: 30.253%
    wait time: 0:00:15.236044s
    

    row: 1199, total_rows: 3960
    progress: 30.278%
    wait time: 0:00:15.233958s
    

    row: 1200, total_rows: 3960
    progress: 30.303%
    wait time: 0:00:15.224961s
    


    row: 1287, total_rows: 3960
    progress: 32.5%
    wait time: 0:00:14.746119s
    

    row: 1288, total_rows: 3960
    progress: 32.525%
    wait time: 0:00:14.741617s
    

    row: 1289, total_rows: 3960
    progress: 32.551%
    wait time: 0:00:14.739186s
    

    row: 1290, total_rows: 3960
    progress: 32.576%
    wait time: 0:00:14.734677s
    

    row: 1291, total_rows: 3960
    progress: 32.601%
    wait time: 0:00:14.732235s
    

    row: 1292, total_rows: 3960
    progress: 32.626%
    wait time: 0:00:14.729787s
    

    row: 1293, total_rows: 3960
    progress: 32.652%
    wait time: 0:00:14.721136s
    

    row: 1294, total_rows: 3960
    progress: 32.677%
    wait time: 0:00:14.712491s
    

    row: 1295, total_rows: 3960
    progress: 32.702%
    wait time: 0:00:14.710034s
    

    row: 1296, total_rows: 3960
    progress: 32.727%
    wait time: 0:00:14.705513s
    

    row: 1297, total_rows: 3960
    progress: 32.753%
    wait time: 0:00:14.703045s
    




    row: 1381, total_rows: 3960
    progress: 34.874%
    wait time: 0:00:14.263053s
    

    row: 1382, total_rows: 3960
    progress: 34.899%
    wait time: 0:00:14.258408s
    

    row: 1383, total_rows: 3960
    progress: 34.924%
    wait time: 0:00:14.253762s
    

    row: 1384, total_rows: 3960
    progress: 34.949%
    wait time: 0:00:14.249112s
    

    row: 1385, total_rows: 3960
    progress: 34.975%
    wait time: 0:00:14.242603s
    

    row: 1386, total_rows: 3960
    progress: 35.0%
    wait time: 0:00:14.234235s
    

    row: 1387, total_rows: 3960
    progress: 35.025%
    wait time: 0:00:14.227728s
    

    row: 1388, total_rows: 3960
    progress: 35.051%
    wait time: 0:00:14.223082s
    

    row: 1389, total_rows: 3960
    progress: 35.076%
    wait time: 0:00:14.216580s
    

    row: 1390, total_rows: 3960
    progress: 35.101%
    wait time: 0:00:14.215630s
    

    row: 1391, total_rows: 3960
    progress: 35.126%
    wait time: 0:00:14.212823s
    




    row: 1477, total_rows: 3960
    progress: 37.298%
    wait time: 0:00:13.709510s
    

    row: 1478, total_rows: 3960
    progress: 37.323%
    wait time: 0:00:13.706481s
    

    row: 1479, total_rows: 3960
    progress: 37.348%
    wait time: 0:00:13.700090s
    

    row: 1480, total_rows: 3960
    progress: 37.374%
    wait time: 0:00:13.693701s
    

    row: 1481, total_rows: 3960
    progress: 37.399%
    wait time: 0:00:13.688988s
    

    row: 1482, total_rows: 3960
    progress: 37.424%
    wait time: 0:00:13.680928s
    

    row: 1483, total_rows: 3960
    progress: 37.449%
    wait time: 0:00:13.676217s
    

    row: 1484, total_rows: 3960
    progress: 37.475%
    wait time: 0:00:13.671503s
    

    row: 1485, total_rows: 3960
    progress: 37.5%
    wait time: 0:00:13.663452s
    

    row: 1486, total_rows: 3960
    progress: 37.525%
    wait time: 0:00:13.655405s
    

    row: 1487, total_rows: 3960
    progress: 37.551%
    wait time: 0:00:13.650694s
    




    row: 1576, total_rows: 3960
    progress: 39.798%
    wait time: 0:00:13.161203s
    

    row: 1577, total_rows: 3960
    progress: 39.823%
    wait time: 0:00:13.154902s
    

    row: 1578, total_rows: 3960
    progress: 39.848%
    wait time: 0:00:13.148603s
    

    row: 1579, total_rows: 3960
    progress: 39.874%
    wait time: 0:00:13.143815s
    

    row: 1580, total_rows: 3960
    progress: 39.899%
    wait time: 0:00:13.139025s
    

    row: 1581, total_rows: 3960
    progress: 39.924%
    wait time: 0:00:13.131222s
    

    row: 1582, total_rows: 3960
    progress: 39.949%
    wait time: 0:00:13.127937s
    

    row: 1583, total_rows: 3960
    progress: 39.975%
    wait time: 0:00:13.120139s
    

    row: 1584, total_rows: 3960
    progress: 40.0%
    wait time: 0:00:13.112346s
    

    row: 1585, total_rows: 3960
    progress: 40.025%
    wait time: 0:00:13.107556s
    

    row: 1586, total_rows: 3960
    progress: 40.051%
    wait time: 0:00:13.102766s
    




    row: 1675, total_rows: 3960
    progress: 42.298%
    wait time: 0:00:12.594111s
    

    row: 1676, total_rows: 3960
    progress: 42.323%
    wait time: 0:00:12.590637s
    

    row: 1677, total_rows: 3960
    progress: 42.348%
    wait time: 0:00:12.585795s
    

    row: 1678, total_rows: 3960
    progress: 42.374%
    wait time: 0:00:12.579591s
    

    row: 1679, total_rows: 3960
    progress: 42.399%
    wait time: 0:00:12.574747s
    

    row: 1680, total_rows: 3960
    progress: 42.424%
    wait time: 0:00:12.571263s
    

    row: 1681, total_rows: 3960
    progress: 42.449%
    wait time: 0:00:12.566415s
    

    row: 1682, total_rows: 3960
    progress: 42.475%
    wait time: 0:00:12.558855s
    

    row: 1683, total_rows: 3960
    progress: 42.5%
    wait time: 0:00:12.555361s
    

    row: 1684, total_rows: 3960
    progress: 42.525%
    wait time: 0:00:12.550512s
    

    row: 1685, total_rows: 3960
    progress: 42.551%
    wait time: 0:00:12.545661s
    




    row: 1769, total_rows: 3960
    progress: 44.672%
    wait time: 0:00:12.097553s
    

    row: 1770, total_rows: 3960
    progress: 44.697%
    wait time: 0:00:12.092631s
    

    row: 1771, total_rows: 3960
    progress: 44.722%
    wait time: 0:00:12.088944s
    

    row: 1772, total_rows: 3960
    progress: 44.747%
    wait time: 0:00:12.085254s
    

    row: 1773, total_rows: 3960
    progress: 44.773%
    wait time: 0:00:12.080324s
    

    row: 1774, total_rows: 3960
    progress: 44.798%
    wait time: 0:00:12.077861s
    

    row: 1775, total_rows: 3960
    progress: 44.823%
    wait time: 0:00:12.074159s
    

    row: 1776, total_rows: 3960
    progress: 44.848%
    wait time: 0:00:12.069222s
    

    row: 1777, total_rows: 3960
    progress: 44.874%
    wait time: 0:00:12.061828s
    

    row: 1778, total_rows: 3960
    progress: 44.899%
    wait time: 0:00:12.055662s
    

    row: 1779, total_rows: 3960
    progress: 44.924%
    wait time: 0:00:12.050725s
    


    row: 1871, total_rows: 3960
    progress: 47.247%
    wait time: 0:00:11.501190s
    

    row: 1872, total_rows: 3960
    progress: 47.273%
    wait time: 0:00:11.496242s
    

    row: 1873, total_rows: 3960
    progress: 47.298%
    wait time: 0:00:11.490178s
    

    row: 1874, total_rows: 3960
    progress: 47.323%
    wait time: 0:00:11.486343s
    

    row: 1875, total_rows: 3960
    progress: 47.348%
    wait time: 0:00:11.480279s
    

    row: 1876, total_rows: 3960
    progress: 47.374%
    wait time: 0:00:11.474215s
    

    row: 1877, total_rows: 3960
    progress: 47.399%
    wait time: 0:00:11.465931s
    

    row: 1878, total_rows: 3960
    progress: 47.424%
    wait time: 0:00:11.462092s
    

    row: 1879, total_rows: 3960
    progress: 47.449%
    wait time: 0:00:11.454923s
    

    row: 1880, total_rows: 3960
    progress: 47.475%
    wait time: 0:00:11.449973s
    

    row: 1881, total_rows: 3960
    progress: 47.5%
    wait time: 0:00:11.443916s
    




    row: 1973, total_rows: 3960
    progress: 49.823%
    wait time: 0:00:10.920405s
    

    row: 1974, total_rows: 3960
    progress: 49.848%
    wait time: 0:00:10.914415s
    

    row: 1975, total_rows: 3960
    progress: 49.874%
    wait time: 0:00:10.908426s
    

    row: 1976, total_rows: 3960
    progress: 49.899%
    wait time: 0:00:10.903442s
    

    row: 1977, total_rows: 3960
    progress: 49.924%
    wait time: 0:00:10.898458s
    

    row: 1978, total_rows: 3960
    progress: 49.949%
    wait time: 0:00:10.892469s
    

    row: 1979, total_rows: 3960
    progress: 49.975%
    wait time: 0:00:10.886483s
    

    row: 1980, total_rows: 3960
    progress: 50.0%
    wait time: 0:00:10.882499s
    

    row: 1981, total_rows: 3960
    progress: 50.025%
    wait time: 0:00:10.878511s
    

    row: 1982, total_rows: 3960
    progress: 50.051%
    wait time: 0:00:10.872522s
    

    row: 1983, total_rows: 3960
    progress: 50.076%
    wait time: 0:00:10.868530s
    




    row: 2072, total_rows: 3960
    progress: 52.323%
    wait time: 0:00:10.358426s
    

    row: 2073, total_rows: 3960
    progress: 52.348%
    wait time: 0:00:10.354324s
    

    row: 2074, total_rows: 3960
    progress: 52.374%
    wait time: 0:00:10.348398s
    

    row: 2075, total_rows: 3960
    progress: 52.399%
    wait time: 0:00:10.340654s
    

    row: 2076, total_rows: 3960
    progress: 52.424%
    wait time: 0:00:10.333824s
    

    row: 2077, total_rows: 3960
    progress: 52.449%
    wait time: 0:00:10.327904s
    

    row: 2078, total_rows: 3960
    progress: 52.475%
    wait time: 0:00:10.322891s
    

    row: 2079, total_rows: 3960
    progress: 52.5%
    wait time: 0:00:10.318781s
    

    row: 2080, total_rows: 3960
    progress: 52.525%
    wait time: 0:00:10.312862s
    

    row: 2081, total_rows: 3960
    progress: 52.551%
    wait time: 0:00:10.306943s
    

    row: 2082, total_rows: 3960
    progress: 52.576%
    wait time: 0:00:10.302829s
    




    row: 2167, total_rows: 3960
    progress: 54.722%
    wait time: 0:00:09.840743s
    

    row: 2168, total_rows: 3960
    progress: 54.747%
    wait time: 0:00:09.834854s
    

    row: 2169, total_rows: 3960
    progress: 54.773%
    wait time: 0:00:09.828140s
    

    row: 2170, total_rows: 3960
    progress: 54.798%
    wait time: 0:00:09.820603s
    

    row: 2171, total_rows: 3960
    progress: 54.823%
    wait time: 0:00:09.815544s
    

    row: 2172, total_rows: 3960
    progress: 54.848%
    wait time: 0:00:09.810485s
    

    row: 2173, total_rows: 3960
    progress: 54.874%
    wait time: 0:00:09.802955s
    

    row: 2174, total_rows: 3960
    progress: 54.899%
    wait time: 0:00:09.796252s
    

    row: 2175, total_rows: 3960
    progress: 54.924%
    wait time: 0:00:09.788730s
    

    row: 2176, total_rows: 3960
    progress: 54.949%
    wait time: 0:00:09.782853s
    

    row: 2177, total_rows: 3960
    progress: 54.975%
    wait time: 0:00:09.776157s
    


    row: 2271, total_rows: 3960
    progress: 57.348%
    wait time: 0:00:09.238477s
    

    row: 2272, total_rows: 3960
    progress: 57.374%
    wait time: 0:00:09.232662s
    

    row: 2273, total_rows: 3960
    progress: 57.399%
    wait time: 0:00:09.227590s
    

    row: 2274, total_rows: 3960
    progress: 57.424%
    wait time: 0:00:09.221033s
    

    row: 2275, total_rows: 3960
    progress: 57.449%
    wait time: 0:00:09.215220s
    

    row: 2276, total_rows: 3960
    progress: 57.475%
    wait time: 0:00:09.210149s
    

    row: 2277, total_rows: 3960
    progress: 57.5%
    wait time: 0:00:09.205075s
    

    row: 2278, total_rows: 3960
    progress: 57.525%
    wait time: 0:00:09.200741s
    

    row: 2279, total_rows: 3960
    progress: 57.551%
    wait time: 0:00:09.194189s
    

    row: 2280, total_rows: 3960
    progress: 57.576%
    wait time: 0:00:09.187639s
    

    row: 2281, total_rows: 3960
    progress: 57.601%
    wait time: 0:00:09.184775s
    




    row: 2379, total_rows: 3960
    progress: 60.076%
    wait time: 0:00:08.610034s
    

    row: 2380, total_rows: 3960
    progress: 60.101%
    wait time: 0:00:08.604960s
    

    row: 2381, total_rows: 3960
    progress: 60.126%
    wait time: 0:00:08.599885s
    

    row: 2382, total_rows: 3960
    progress: 60.152%
    wait time: 0:00:08.594808s
    

    row: 2383, total_rows: 3960
    progress: 60.177%
    wait time: 0:00:08.589732s
    

    row: 2384, total_rows: 3960
    progress: 60.202%
    wait time: 0:00:08.583331s
    

    row: 2385, total_rows: 3960
    progress: 60.227%
    wait time: 0:00:08.576932s
    

    row: 2386, total_rows: 3960
    progress: 60.253%
    wait time: 0:00:08.570535s
    

    row: 2387, total_rows: 3960
    progress: 60.278%
    wait time: 0:00:08.564799s
    

    row: 2388, total_rows: 3960
    progress: 60.303%
    wait time: 0:00:08.557747s
    

    row: 2389, total_rows: 3960
    progress: 60.328%
    wait time: 0:00:08.552014s
    


    row: 2485, total_rows: 3960
    progress: 62.753%
    wait time: 0:00:08.001429s
    

    row: 2486, total_rows: 3960
    progress: 62.778%
    wait time: 0:00:07.996348s
    

    row: 2487, total_rows: 3960
    progress: 62.803%
    wait time: 0:00:07.990081s
    

    row: 2488, total_rows: 3960
    progress: 62.828%
    wait time: 0:00:07.985001s
    

    row: 2489, total_rows: 3960
    progress: 62.854%
    wait time: 0:00:07.978736s
    

    row: 2490, total_rows: 3960
    progress: 62.879%
    wait time: 0:00:07.973656s
    

    row: 2491, total_rows: 3960
    progress: 62.904%
    wait time: 0:00:07.966803s
    

    row: 2492, total_rows: 3960
    progress: 62.929%
    wait time: 0:00:07.960544s
    

    row: 2493, total_rows: 3960
    progress: 62.955%
    wait time: 0:00:07.955464s
    

    row: 2494, total_rows: 3960
    progress: 62.98%
    wait time: 0:00:07.950383s
    

    row: 2495, total_rows: 3960
    progress: 63.005%
    wait time: 0:00:07.944714s
    



    row: 2591, total_rows: 3960
    progress: 65.429%
    wait time: 0:00:07.407638s
    

    row: 2592, total_rows: 3960
    progress: 65.455%
    wait time: 0:00:07.402540s
    

    row: 2593, total_rows: 3960
    progress: 65.48%
    wait time: 0:00:07.396915s
    

    row: 2594, total_rows: 3960
    progress: 65.505%
    wait time: 0:00:07.391816s
    

    row: 2595, total_rows: 3960
    progress: 65.53%
    wait time: 0:00:07.386718s
    

    row: 2596, total_rows: 3960
    progress: 65.556%
    wait time: 0:00:07.380567s
    

    row: 2597, total_rows: 3960
    progress: 65.581%
    wait time: 0:00:07.374417s
    

    row: 2598, total_rows: 3960
    progress: 65.606%
    wait time: 0:00:07.368794s
    

    row: 2599, total_rows: 3960
    progress: 65.631%
    wait time: 0:00:07.363695s
    

    row: 2600, total_rows: 3960
    progress: 65.657%
    wait time: 0:00:07.357549s
    

    row: 2601, total_rows: 3960
    progress: 65.682%
    wait time: 0:00:07.351404s
    




    row: 2698, total_rows: 3960
    progress: 68.131%
    wait time: 0:00:06.799892s
    

    row: 2699, total_rows: 3960
    progress: 68.157%
    wait time: 0:00:06.794325s
    

    row: 2700, total_rows: 3960
    progress: 68.182%
    wait time: 0:00:06.789225s
    

    row: 2701, total_rows: 3960
    progress: 68.207%
    wait time: 0:00:06.784124s
    

    row: 2702, total_rows: 3960
    progress: 68.232%
    wait time: 0:00:06.779023s
    

    row: 2703, total_rows: 3960
    progress: 68.258%
    wait time: 0:00:06.773921s
    

    row: 2704, total_rows: 3960
    progress: 68.283%
    wait time: 0:00:06.769283s
    

    row: 2705, total_rows: 3960
    progress: 68.308%
    wait time: 0:00:06.763251s
    

    row: 2706, total_rows: 3960
    progress: 68.333%
    wait time: 0:00:06.756756s
    

    row: 2707, total_rows: 3960
    progress: 68.359%
    wait time: 0:00:06.751653s
    

    row: 2708, total_rows: 3960
    progress: 68.384%
    wait time: 0:00:06.745625s
    


    row: 2800, total_rows: 3960
    progress: 70.707%
    wait time: 0:00:06.244452s
    

    row: 2801, total_rows: 3960
    progress: 70.732%
    wait time: 0:00:06.238913s
    

    row: 2802, total_rows: 3960
    progress: 70.758%
    wait time: 0:00:06.233787s
    

    row: 2803, total_rows: 3960
    progress: 70.783%
    wait time: 0:00:06.228661s
    

    row: 2804, total_rows: 3960
    progress: 70.808%
    wait time: 0:00:06.223534s
    

    row: 2805, total_rows: 3960
    progress: 70.833%
    wait time: 0:00:06.217994s
    

    row: 2806, total_rows: 3960
    progress: 70.859%
    wait time: 0:00:06.211631s
    

    row: 2807, total_rows: 3960
    progress: 70.884%
    wait time: 0:00:06.206093s
    

    row: 2808, total_rows: 3960
    progress: 70.909%
    wait time: 0:00:06.200971s
    

    row: 2809, total_rows: 3960
    progress: 70.934%
    wait time: 0:00:06.195432s
    

    row: 2810, total_rows: 3960
    progress: 70.96%
    wait time: 0:00:06.189893s
    



    row: 2898, total_rows: 3960
    progress: 73.182%
    wait time: 0:00:05.715857s
    

    row: 2899, total_rows: 3960
    progress: 73.207%
    wait time: 0:00:05.711069s
    

    row: 2900, total_rows: 3960
    progress: 73.232%
    wait time: 0:00:05.705549s
    

    row: 2901, total_rows: 3960
    progress: 73.258%
    wait time: 0:00:05.699662s
    

    row: 2902, total_rows: 3960
    progress: 73.283%
    wait time: 0:00:05.693778s
    

    row: 2903, total_rows: 3960
    progress: 73.308%
    wait time: 0:00:05.687895s
    

    row: 2904, total_rows: 3960
    progress: 73.333%
    wait time: 0:00:05.681648s
    

    row: 2905, total_rows: 3960
    progress: 73.359%
    wait time: 0:00:05.676131s
    

    row: 2906, total_rows: 3960
    progress: 73.384%
    wait time: 0:00:05.670252s
    

    row: 2907, total_rows: 3960
    progress: 73.409%
    wait time: 0:00:05.664736s
    

    row: 2908, total_rows: 3960
    progress: 73.434%
    wait time: 0:00:05.658859s
    


    row: 3000, total_rows: 3960
    progress: 75.758%
    wait time: 0:00:05.161596s
    

    row: 3001, total_rows: 3960
    progress: 75.783%
    wait time: 0:00:05.156101s
    

    row: 3002, total_rows: 3960
    progress: 75.808%
    wait time: 0:00:05.151563s
    

    row: 3003, total_rows: 3960
    progress: 75.833%
    wait time: 0:00:05.146386s
    

    row: 3004, total_rows: 3960
    progress: 75.859%
    wait time: 0:00:05.141209s
    

    row: 3005, total_rows: 3960
    progress: 75.884%
    wait time: 0:00:05.136030s
    

    row: 3006, total_rows: 3960
    progress: 75.909%
    wait time: 0:00:05.130851s
    

    row: 3007, total_rows: 3960
    progress: 75.934%
    wait time: 0:00:05.125037s
    

    row: 3008, total_rows: 3960
    progress: 75.96%
    wait time: 0:00:05.119541s
    

    row: 3009, total_rows: 3960
    progress: 75.985%
    wait time: 0:00:05.114362s
    

    row: 3010, total_rows: 3960
    progress: 76.01%
    wait time: 0:00:05.108551s
    




    row: 3102, total_rows: 3960
    progress: 78.333%
    wait time: 0:00:04.604340s
    

    row: 3103, total_rows: 3960
    progress: 78.359%
    wait time: 0:00:04.598321s
    

    row: 3104, total_rows: 3960
    progress: 78.384%
    wait time: 0:00:04.593408s
    

    row: 3105, total_rows: 3960
    progress: 78.409%
    wait time: 0:00:04.588218s
    

    row: 3106, total_rows: 3960
    progress: 78.434%
    wait time: 0:00:04.583027s
    

    row: 3107, total_rows: 3960
    progress: 78.46%
    wait time: 0:00:04.577012s
    

    row: 3108, total_rows: 3960
    progress: 78.485%
    wait time: 0:00:04.570998s
    

    row: 3109, total_rows: 3960
    progress: 78.51%
    wait time: 0:00:04.565534s
    

    row: 3110, total_rows: 3960
    progress: 78.535%
    wait time: 0:00:04.559524s
    

    row: 3111, total_rows: 3960
    progress: 78.561%
    wait time: 0:00:04.553788s
    

    row: 3112, total_rows: 3960
    progress: 78.586%
    wait time: 0:00:04.548600s
    




    row: 3207, total_rows: 3960
    progress: 80.985%
    wait time: 0:00:04.032421s
    

    row: 3208, total_rows: 3960
    progress: 81.01%
    wait time: 0:00:04.026984s
    

    row: 3209, total_rows: 3960
    progress: 81.035%
    wait time: 0:00:04.021547s
    

    row: 3210, total_rows: 3960
    progress: 81.061%
    wait time: 0:00:04.016110s
    

    row: 3211, total_rows: 3960
    progress: 81.086%
    wait time: 0:00:04.010440s
    

    row: 3212, total_rows: 3960
    progress: 81.111%
    wait time: 0:00:04.004771s
    

    row: 3213, total_rows: 3960
    progress: 81.136%
    wait time: 0:00:03.999568s
    

    row: 3214, total_rows: 3960
    progress: 81.162%
    wait time: 0:00:03.994598s
    

    row: 3215, total_rows: 3960
    progress: 81.187%
    wait time: 0:00:03.988930s
    

    row: 3216, total_rows: 3960
    progress: 81.212%
    wait time: 0:00:03.983263s
    

    row: 3217, total_rows: 3960
    progress: 81.237%
    wait time: 0:00:03.978060s
    



    row: 3305, total_rows: 3960
    progress: 83.46%
    wait time: 0:00:03.508345s
    

    row: 3306, total_rows: 3960
    progress: 83.485%
    wait time: 0:00:03.502919s
    

    row: 3307, total_rows: 3960
    progress: 83.51%
    wait time: 0:00:03.497691s
    

    row: 3308, total_rows: 3960
    progress: 83.535%
    wait time: 0:00:03.492463s
    

    row: 3309, total_rows: 3960
    progress: 83.561%
    wait time: 0:00:03.486840s
    

    row: 3310, total_rows: 3960
    progress: 83.586%
    wait time: 0:00:03.481218s
    

    row: 3311, total_rows: 3960
    progress: 83.611%
    wait time: 0:00:03.475598s
    

    row: 3312, total_rows: 3960
    progress: 83.636%
    wait time: 0:00:03.470369s
    

    row: 3313, total_rows: 3960
    progress: 83.662%
    wait time: 0:00:03.465336s
    

    row: 3314, total_rows: 3960
    progress: 83.687%
    wait time: 0:00:03.459912s
    

    row: 3315, total_rows: 3960
    progress: 83.712%
    wait time: 0:00:03.454682s
    




    row: 3402, total_rows: 3960
    progress: 85.909%
    wait time: 0:00:02.988612s
    

    row: 3403, total_rows: 3960
    progress: 85.934%
    wait time: 0:00:02.983363s
    

    row: 3404, total_rows: 3960
    progress: 85.96%
    wait time: 0:00:02.977949s
    

    row: 3405, total_rows: 3960
    progress: 85.985%
    wait time: 0:00:02.972536s
    

    row: 3406, total_rows: 3960
    progress: 86.01%
    wait time: 0:00:02.967448s
    

    row: 3407, total_rows: 3960
    progress: 86.035%
    wait time: 0:00:02.962360s
    

    row: 3408, total_rows: 3960
    progress: 86.061%
    wait time: 0:00:02.957108s
    

    row: 3409, total_rows: 3960
    progress: 86.086%
    wait time: 0:00:02.951694s
    

    row: 3410, total_rows: 3960
    progress: 86.111%
    wait time: 0:00:02.946280s
    

    row: 3411, total_rows: 3960
    progress: 86.136%
    wait time: 0:00:02.941027s
    

    row: 3412, total_rows: 3960
    progress: 86.162%
    wait time: 0:00:02.935774s
    




    row: 3497, total_rows: 3960
    progress: 88.308%
    wait time: 0:00:02.482004s
    

    row: 3498, total_rows: 3960
    progress: 88.333%
    wait time: 0:00:02.476597s
    

    row: 3499, total_rows: 3960
    progress: 88.359%
    wait time: 0:00:02.471321s
    

    row: 3500, total_rows: 3960
    progress: 88.384%
    wait time: 0:00:02.466176s
    

    row: 3501, total_rows: 3960
    progress: 88.409%
    wait time: 0:00:02.460900s
    

    row: 3502, total_rows: 3960
    progress: 88.434%
    wait time: 0:00:02.455622s
    

    row: 3503, total_rows: 3960
    progress: 88.46%
    wait time: 0:00:02.450084s
    

    row: 3504, total_rows: 3960
    progress: 88.485%
    wait time: 0:00:02.444676s
    

    row: 3505, total_rows: 3960
    progress: 88.51%
    wait time: 0:00:02.439269s
    

    row: 3506, total_rows: 3960
    progress: 88.535%
    wait time: 0:00:02.433991s
    

    row: 3507, total_rows: 3960
    progress: 88.561%
    wait time: 0:00:02.428713s
    




    row: 3590, total_rows: 3960
    progress: 90.657%
    wait time: 0:00:01.986854s
    

    row: 3591, total_rows: 3960
    progress: 90.682%
    wait time: 0:00:01.981549s
    

    row: 3592, total_rows: 3960
    progress: 90.707%
    wait time: 0:00:01.976142s
    

    row: 3593, total_rows: 3960
    progress: 90.732%
    wait time: 0:00:01.970837s
    

    row: 3594, total_rows: 3960
    progress: 90.758%
    wait time: 0:00:01.965531s
    

    row: 3595, total_rows: 3960
    progress: 90.783%
    wait time: 0:00:01.960226s
    

    row: 3596, total_rows: 3960
    progress: 90.808%
    wait time: 0:00:01.954919s
    

    row: 3597, total_rows: 3960
    progress: 90.833%
    wait time: 0:00:01.949512s
    

    row: 3598, total_rows: 3960
    progress: 90.859%
    wait time: 0:00:01.944205s
    

    row: 3599, total_rows: 3960
    progress: 90.884%
    wait time: 0:00:01.938697s
    

    row: 3600, total_rows: 3960
    progress: 90.909%
    wait time: 0:00:01.933491s
    


    row: 3684, total_rows: 3960
    progress: 93.03%
    wait time: 0:00:01.483562s
    

    row: 3685, total_rows: 3960
    progress: 93.056%
    wait time: 0:00:01.478309s
    

    row: 3686, total_rows: 3960
    progress: 93.081%
    wait time: 0:00:01.472831s
    

    row: 3687, total_rows: 3960
    progress: 93.106%
    wait time: 0:00:01.467577s
    

    row: 3688, total_rows: 3960
    progress: 93.131%
    wait time: 0:00:01.462247s
    

    row: 3689, total_rows: 3960
    progress: 93.157%
    wait time: 0:00:01.456918s
    

    row: 3690, total_rows: 3960
    progress: 93.182%
    wait time: 0:00:01.451514s
    

    row: 3691, total_rows: 3960
    progress: 93.207%
    wait time: 0:00:01.446038s
    

    row: 3692, total_rows: 3960
    progress: 93.232%
    wait time: 0:00:01.440709s
    

    row: 3693, total_rows: 3960
    progress: 93.258%
    wait time: 0:00:01.435306s
    

    row: 3694, total_rows: 3960
    progress: 93.283%
    wait time: 0:00:01.430048s
    



    row: 3779, total_rows: 3960
    progress: 95.429%
    wait time: 0:00:00.973339s
    

    row: 3780, total_rows: 3960
    progress: 95.455%
    wait time: 0:00:00.968039s
    

    row: 3781, total_rows: 3960
    progress: 95.48%
    wait time: 0:00:00.962596s
    

    row: 3782, total_rows: 3960
    progress: 95.505%
    wait time: 0:00:00.957248s
    

    row: 3783, total_rows: 3960
    progress: 95.53%
    wait time: 0:00:00.951806s
    

    row: 3784, total_rows: 3960
    progress: 95.556%
    wait time: 0:00:00.946411s
    

    row: 3785, total_rows: 3960
    progress: 95.581%
    wait time: 0:00:00.941109s
    

    row: 3786, total_rows: 3960
    progress: 95.606%
    wait time: 0:00:00.935760s
    

    row: 3787, total_rows: 3960
    progress: 95.631%
    wait time: 0:00:00.930365s
    

    row: 3788, total_rows: 3960
    progress: 95.657%
    wait time: 0:00:00.925015s
    

    row: 3789, total_rows: 3960
    progress: 95.682%
    wait time: 0:00:00.919575s
    




    row: 3874, total_rows: 3960
    progress: 97.828%
    wait time: 0:00:00.462795s
    

    row: 3875, total_rows: 3960
    progress: 97.854%
    wait time: 0:00:00.457405s
    

    row: 3876, total_rows: 3960
    progress: 97.879%
    wait time: 0:00:00.451972s
    

    row: 3877, total_rows: 3960
    progress: 97.904%
    wait time: 0:00:00.446669s
    

    row: 3878, total_rows: 3960
    progress: 97.929%
    wait time: 0:00:00.441280s
    

    row: 3879, total_rows: 3960
    progress: 97.955%
    wait time: 0:00:00.435932s
    

    row: 3880, total_rows: 3960
    progress: 97.98%
    wait time: 0:00:00.430563s
    

    row: 3881, total_rows: 3960
    progress: 98.005%
    wait time: 0:00:00.425194s
    

    row: 3882, total_rows: 3960
    progress: 98.03%
    wait time: 0:00:00.419824s
    

    row: 3883, total_rows: 3960
    progress: 98.056%
    wait time: 0:00:00.414414s
    

    row: 3884, total_rows: 3960
    progress: 98.081%
    wait time: 0:00:00.409045s
    



In [23]:
df_representation = pd.DataFrame(data_all_attributes).set_index("id").fillna(0)
df_representation

Unnamed: 0_level_0,retro<&>num_tokens,retro<&>lenght,retro<&>num_numbs,retro<&>num_alpha,retro<&>num_with_uppercase,retro<&>num_tokens_upper,retro<&>prop_vowels,retro<&>len_max_rep_char,retro<&>max_char_fre_per_token(o),retro<&>max_char_fre_per_token(s),...,linguistics<&>lemma<&>scritche,"linguistics<&>lemma<&>loud,\n#amndbot",linguistics<&>lemma<&>randomly,linguistics<&>lemma<&>essay,linguistics<&>lemma<&>concession,linguistics<&>lemma<&>shortsightedness,linguistics<&>lemma<&>@keitholbermann,linguistics<&>lemma<&>despicable,linguistics<&>lemma<&>rancour,linguistics<&>lemma<&>immoral
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10000,18,96,0,18,3,1,0.253165,2,2,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10001,23,119,0,25,7,2,0.329897,2,2,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10002,19,108,0,19,2,2,0.333333,2,3,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10003,24,134,0,23,1,1,0.315315,2,1,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10004,24,125,0,25,3,0,0.352941,2,1,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40855,24,112,0,23,2,1,0.325843,2,2,2,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
40856,20,121,0,21,1,0,0.352941,2,1,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40857,23,139,0,23,2,1,0.316239,2,2,4,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
40858,14,83,2,15,5,1,0.285714,2,1,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
pickle.dump(df_representation, open("df_representation_v1.pickle", "wb"))

In [25]:
partition_attrib = {
    type_attrib: [c for c in df_representation.columns if type_attrib+"<&>" in c]
    for type_attrib in "retro punct emoji linguistics".split()
}

In [26]:
"summary type attrib", {k: len(v) for k, v in partition_cols.items()}

NameError: name 'partition_cols' is not defined

In [None]:
sen = sentiments[0]
sen

In [None]:
explorer = partition_attrib["emoji"][12:12+5]
colors_int = {"low": "tab:blue", "high": "tab:red", "medium": "tab:green"}
fig, ax = plt.subplots(1, len(explorer), figsize=(len(explorer)*8, 1*6))
for k, col in enumerate(explorer):      
    axi = ax[k]
    for int_ in intensities:
        _sub = df_representation[(df_train["sen"] == sen) & (df_train["int"] == int_)]
        sns.distplot(_sub[col], ax=axi , hist=False, kde_kws={"shade": True, "label": int_}, color=colors_int[int_])
        if "emoji<&>" in col:
            print(emojilib.emojize(col.replace("emoji<&>", "")))
            axi.set_title(emojilib.emojize(col.replace("emoji<&>", "")))
    if k == 0:
        axi.legend(loc='upper left')

In [None]:
emojilib.emojize("_hugging_face_")

In [None]:
df_describe_cols = df_representation.describe().T

In [None]:
df_describe_cols.shape

In [None]:
rel_cols = list(df_describe_cols[df_describe_cols["std"]>[0.0762, 0.0710, 0.0635, 0.0573, 0.0527, 0.0503, 0.0477, 0.0390, 0.0318, 0.0276, 0.0225, 0.0159][0]].index)

In [350]:
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [None]:
indexs = df_train[df_train["sen"] == sentiments[3]].index
X = df_representation.loc[indexs][rel_cols]
y = df_train.loc[X.index]["int"]

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = make_pipeline(StandardScaler(), SVC(gamma='auto', class_weight="balanced"))
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

In [420]:
indexs = df_train[df_train["sen"] == sentiments[3]].index
indexsLH = df_train.loc[indexs][(df_train.loc[indexs]["int"].isin(["low", "high"]))].index

X = df_representation.loc[indexsLH]#[rel_cols]
y = df_train.loc[X.index]["int"]

dic_label_count = y.value_counts().to_dict()
min_label = min(dic_label_count.items(), key=lambda x: x[1])[0]
max_label = max(dic_label_count.items(), key=lambda x: x[1])[0]
index_label_1 = y[y==min_label].index
oversampling_steps = int(dic_label_count[max_label] / dic_label_count[min_label]) - 1
X_res, y_res = X.copy(), y.copy()

for step in range(oversampling_steps):
    new_indexs = [f"{ix}+{step + 1}" for ix in index_label_1]
    copied_sub_X = pd.DataFrame(X.loc[index_label_1].values, columns=X.columns, index=new_indexs)
    copied_sub_y = pd.Series(y.loc[index_label_1].values, index=new_indexs)
    X_res = pd.concat([X_res, copied_sub_X], axis=0)
    y_res = pd.concat([y_res, copied_sub_y], axis=0)

X_res = pd.DataFrame(StandardScaler().fit_transform(X_res), columns=X_res.columns, index=X_res.index)
selector = SelectKBest(chi2, k=X.shape[1])
X_res_ = X_res - X_res.min()
selector.fit(X_res_, y_res)
scores_selector = {col: selector.scores_[i] if str(selector.scores_[i]) != "nan" else 0 for i, col in enumerate(X.columns.tolist())}
ranked_cols = [x[0] for x in sorted(scores_selector.items(), key=lambda x: x[1], reverse=True)]
ranked_cols


['linguistics<&>lemma<&>depression',
 'linguistics<&>lemma<&>depress',
 'linguistics<&>lemma<&>sadness',
 'linguistics<&>lemma<&>bad',
 'linguistics<&>lemma<&>sad',
 'linguistics<&>lemma<&>depressing',
 'linguistics<&>lemma<&>serious',
 'linguistics<&>lemma<&>feel',
 'linguistics<&>lemma<&>unhappy',
 'linguistics<&>lemma<&>pine',
 'linguistics<&>lemma<&>sober',
 'linguistics<&>lemma<&>dull',
 'linguistics<&>lemma<&>blue',
 'emoji<&>_loudly_crying_face_',
 'linguistics<&>tag<&>RBS',
 'linguistics<&>lemma<&>hard',
 'linguistics<&>lemma<&>grim',
 'linguistics<&>lemma<&>frown',
 'linguistics<&>lemma<&>service',
 'linguistics<&>lemma<&>😭',
 'linguistics<&>shape<&>😭',
 'linguistics<&>lemma<&>despair',
 'linguistics<&>lemma<&>️',
 'linguistics<&>shape<&>️',
 'linguistics<&>lemma<&>ever',
 'linguistics<&>lemma<&>fucking',
 'linguistics<&>lemma<&>sadly',
 'linguistics<&>lemma<&>dishearten',
 'linguistics<&>lemma<&>fan',
 'linguistics<&>lemma<&>fret',
 'linguistics<&>lemma<&>guy',
 'linguistics<

In [457]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import precision_recall_fscore_support

In [490]:
sen = sentiments[3]

indexs = df_train[df_train["sen"] == sen].index
indexsLH = df_train.loc[indexs][(df_train.loc[indexs]["int"].isin(["low", "high"]))].index

X = df_representation.loc[indexsLH]#[rel_cols]
y = df_train.loc[X.index]["int"]

dic_label_count = y.value_counts().to_dict()
min_label = min(dic_label_count.items(), key=lambda x: x[1])[0]
max_label = max(dic_label_count.items(), key=lambda x: x[1])[0]
index_label_1 = y[y==min_label].index
oversampling_steps = int(dic_label_count[max_label] / dic_label_count[min_label]) - 1
X_res, y_res = X.copy(), y.copy()

for step in range(oversampling_steps):
    new_indexs = [f"{ix}+{step + 1}" for ix in index_label_1]
    copied_sub_X = pd.DataFrame(X.loc[index_label_1].values, columns=X.columns, index=new_indexs)
    copied_sub_y = pd.Series(y.loc[index_label_1].values, index=new_indexs)
    X_res = pd.concat([X_res, copied_sub_X], axis=0)
    y_res = pd.concat([y_res, copied_sub_y], axis=0)

X_res = pd.DataFrame(StandardScaler().fit_transform(X_res), columns=X_res.columns, index=X_res.index)
selector = SelectKBest(chi2, k=X.shape[1])
X_res_ = X_res - X_res.min()
selector.fit(X_res_, y_res)
scores_selector = {col: selector.scores_[i] if str(selector.scores_[i]) != "nan" else 0 for i, col in enumerate(X.columns.tolist())}
ranked_cols = [x[0] for x in sorted(scores_selector.items(), key=lambda x: x[1], reverse=True)]
ranked_cols


f1_weight = []

for num_cols in range(1, len(ranked_cols), 10):
    X = df_representation.loc[indexs][ranked_cols[:num_cols+1]]
    y = df_train.loc[X.index]["int"]

    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    clf = make_pipeline(StandardScaler(), SVC(kernel="rbf", gamma='auto', class_weight="balanced"))
    # clf.fit(X_train, y_train)
    cv_results = cross_validate(clf, X, y, cv=5, scoring="f1_weighted")
    test_score = cv_results["test_score"]
    f1_weight.append([num_cols, np.mean(test_score), np.std(test_score)])
    print(f1_weight[-1])

[1, 0.17764500176218634, 0.04267409206149184]
[11, 0.3024626291701063, 0.11194946412399774]
[21, 0.3789822004596333, 0.12318989310995017]
[31, 0.3701253003455773, 0.11069502685570087]
[41, 0.5140830478624945, 0.017767922025634833]
[51, 0.5342248667751726, 0.03551424167580498]
[61, 0.5491272393246589, 0.030019183298879884]
[71, 0.5457643682281283, 0.03447586913309776]
[81, 0.5489638582045673, 0.03671751504099164]
[91, 0.5520267521245351, 0.0369083094955541]
[101, 0.5483413541151839, 0.03554506473054729]
[111, 0.5406813100478531, 0.03556012609921711]
[121, 0.545480283305222, 0.025609896525082942]
[131, 0.5430986284055243, 0.02283672535416118]
[141, 0.5497817170239718, 0.020301535161532727]
[151, 0.5466863699520738, 0.021681248098248722]
[161, 0.5498641938086111, 0.024186356456579877]
[171, 0.5504741999475588, 0.02943339321641828]
[181, 0.541437834710182, 0.026221993648884345]
[191, 0.5426160508667537, 0.03308127191841251]
[201, 0.5472075820086931, 0.03797134153686845]
[211, 0.54852956744

KeyboardInterrupt: 

In [491]:
best_f1 = sorted(f1_weight, key=lambda x: x[1], reverse=True)[0][0]
best_f1

411

In [492]:
fine_f1_weight = []

for num_cols in range(best_f1-10, best_f1+10, 1):
    X = df_representation.loc[indexs][ranked_cols[:num_cols+1]]
    y = df_train.loc[X.index]["int"]

    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    clf = make_pipeline(StandardScaler(), SVC(kernel="rbf", gamma='auto', class_weight="balanced"))
    # clf.fit(X_train, y_train)
    cv_results = cross_validate(clf, X, y, cv=5, scoring="f1_weighted")
    test_score = cv_results["test_score"]
    fine_f1_weight.append([num_cols, np.mean(test_score), np.std(test_score)])
    print(fine_f1_weight[-1])

[401, 0.570114121731599, 0.05907171316101098]
[402, 0.5712222690419162, 0.05923443886487164]
[403, 0.5725630121704057, 0.05995403527922426]
[404, 0.5739216678031865, 0.05837815516438582]
[405, 0.5748030859662495, 0.05745062043931183]
[406, 0.5748030859662495, 0.05745062043931183]
[407, 0.5752528637574006, 0.05942996348227247]
[408, 0.5764656712182863, 0.060260561111525085]
[409, 0.5776576842811842, 0.06106224580367734]
[410, 0.5754051067666254, 0.060563936265593706]
[411, 0.5766974194155757, 0.06076214922222044]
[412, 0.5753265314269608, 0.0596831636912677]
[413, 0.5764329741509286, 0.0599693219218823]
[414, 0.5763298330915798, 0.059542580170901666]
[415, 0.5763298330915798, 0.059542580170901666]
[416, 0.5763298330915798, 0.059542580170901666]
[417, 0.5763298330915798, 0.059542580170901666]
[418, 0.5749267655920647, 0.061409203480120626]
[419, 0.5763298330915798, 0.059542580170901666]
[420, 0.5763298330915798, 0.059542580170901666]


In [494]:
fine_best_f1 = sorted(fine_f1_weight, key=lambda x: x[1], reverse=True)[0][0]
fine_best_f1

409

In [495]:
pickle.dump(ranked_cols[:fine_best_f1+1], open(f"cols_selected_{sen}_v1.pickle", "wb"))

In [27]:
!pip install torchvision



In [28]:
!pip install transformers



In [29]:
import torch
from transformers import AutoModel, AutoTokenizer

bertweet = AutoModel.from_pretrained("vinai/bertweet-base")

# For transformers v4.x+:
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False, normalization=True)

# For transformers v3.x:
# tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base")


Some weights of the model checkpoint at vinai/bertweet-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
emoji is not installed, thus not converting emoticons or emojis into text. Please install emoji: pip3 install emoji
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
data_bertweet = []
col_names = [f"BERTweet_{i}" for i in range(768)]
for k, ix in enumerate(df_train.index):
    print(k+1, df_train.shape[0], 100*(k+1)/df_train.shape[0])
    tweet = df_train.loc[ix]["text"]
    input_ids = torch.tensor([tokenizer.encode(tweet)])
    with torch.no_grad():
        outputs = bertweet(input_ids)
        hidden_states = outputs[0]

    token_embeddings = np.array([ll.numpy() for ll in hidden_states[0]])
    sentence_embedding = np.mean(token_embeddings, axis=0)
    o = {"id": ix}
    o = {**o, **dict(zip(col_names, sentence_embedding))}
    data_bertweet.append(o)

1 3960 0.025252525252525252
2 3960 0.050505050505050504
3 3960 0.07575757575757576
4 3960 0.10101010101010101
5 3960 0.12626262626262627
6 3960 0.15151515151515152
7 3960 0.17676767676767677
8 3960 0.20202020202020202
9 3960 0.22727272727272727
10 3960 0.25252525252525254
11 3960 0.2777777777777778
12 3960 0.30303030303030304
13 3960 0.3282828282828283
14 3960 0.35353535353535354
15 3960 0.3787878787878788
16 3960 0.40404040404040403
17 3960 0.4292929292929293
18 3960 0.45454545454545453
19 3960 0.4797979797979798
20 3960 0.5050505050505051
21 3960 0.5303030303030303
22 3960 0.5555555555555556
23 3960 0.5808080808080808
24 3960 0.6060606060606061
25 3960 0.6313131313131313
26 3960 0.6565656565656566
27 3960 0.6818181818181818
28 3960 0.7070707070707071
29 3960 0.7323232323232324
30 3960 0.7575757575757576
31 3960 0.7828282828282829
32 3960 0.8080808080808081
33 3960 0.8333333333333334
34 3960 0.8585858585858586
35 3960 0.8838383838383839
36 3960 0.9090909090909091
37 3960 0.93434343434

307 3960 7.752525252525253
308 3960 7.777777777777778
309 3960 7.803030303030303
310 3960 7.828282828282828
311 3960 7.853535353535354
312 3960 7.878787878787879
313 3960 7.904040404040404
314 3960 7.929292929292929
315 3960 7.954545454545454
316 3960 7.97979797979798
317 3960 8.005050505050505
318 3960 8.030303030303031
319 3960 8.055555555555555
320 3960 8.080808080808081
321 3960 8.106060606060606
322 3960 8.131313131313131
323 3960 8.156565656565656
324 3960 8.181818181818182
325 3960 8.207070707070708
326 3960 8.232323232323232
327 3960 8.257575757575758
328 3960 8.282828282828282
329 3960 8.308080808080808
330 3960 8.333333333333334
331 3960 8.358585858585858
332 3960 8.383838383838384
333 3960 8.409090909090908
334 3960 8.434343434343434
335 3960 8.45959595959596
336 3960 8.484848484848484
337 3960 8.51010101010101
338 3960 8.535353535353535
339 3960 8.56060606060606
340 3960 8.585858585858587
341 3960 8.61111111111111
342 3960 8.636363636363637
343 3960 8.66161616161616
344 396

In [None]:
pickle.dump(pd.DataFrame(data_bertweet).set_index("id"), open("df_representation_v2.pickle", "wb"))

In [None]:
df_representation_v2 = pickle.load(open("df_representation_v2.pickle", "rb"))

In [None]:
df_representation_v2

In [613]:
sen = sentiments[3]

indexs = df_train[df_train["sen"] == sen].index
indexsLH = df_train.loc[indexs][(df_train.loc[indexs]["int"].isin(["low", "high"]))].index

X = df_representation_v2.loc[indexsLH]#[rel_cols]
y = df_train.loc[X.index]["int"]

dic_label_count = y.value_counts().to_dict()
min_label = min(dic_label_count.items(), key=lambda x: x[1])[0]
max_label = max(dic_label_count.items(), key=lambda x: x[1])[0]
index_label_1 = y[y==min_label].index
oversampling_steps = int(dic_label_count[max_label] / dic_label_count[min_label]) - 1

X_res, y_res = X.copy(), y.copy()
for step in range(oversampling_steps):
    new_indexs = [f"{ix}+{step + 1}" for ix in index_label_1]
    copied_sub_X = pd.DataFrame(X.loc[index_label_1].values, columns=X.columns, index=new_indexs)
    copied_sub_y = pd.Series(y.loc[index_label_1].values, index=new_indexs)
    X_res = pd.concat([X_res, copied_sub_X], axis=0)
    y_res = pd.concat([y_res, copied_sub_y], axis=0)

X_res = pd.DataFrame(StandardScaler().fit_transform(X_res), columns=X_res.columns, index=X_res.index)
selector = SelectKBest(chi2, k=X.shape[1])
X_res_ = X_res - X_res.min()
selector.fit(X_res_, y_res)
scores_selector = {col: selector.scores_[i] if str(selector.scores_[i]) != "nan" else 0 for i, col in enumerate(X.columns.tolist())}
ranked_cols = [x[0] for x in sorted(scores_selector.items(), key=lambda x: x[1], reverse=True)]
print(ranked_cols)

f1_weight = []

for num_cols in range(1, len(ranked_cols)+1, 10):
    X = df_representation_v2.loc[indexs][ranked_cols[:num_cols+1]]
    y = df_train.loc[X.index]["int"]

    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    clf = make_pipeline(StandardScaler(), SVC(kernel="rbf", gamma='auto', class_weight="balanced"))
    # clf.fit(X_train, y_train)
    cv_results = cross_validate(clf, X, y, cv=5, scoring="f1_weighted")
    test_score = cv_results["test_score"]
    f1_weight.append([num_cols, np.mean(test_score), np.std(test_score)])
    print(f1_weight[-1])

['BERTweet_422', 'BERTweet_491', 'BERTweet_68', 'BERTweet_550', 'BERTweet_190', 'BERTweet_251', 'BERTweet_546', 'BERTweet_344', 'BERTweet_200', 'BERTweet_205', 'BERTweet_213', 'BERTweet_102', 'BERTweet_221', 'BERTweet_745', 'BERTweet_99', 'BERTweet_677', 'BERTweet_139', 'BERTweet_746', 'BERTweet_648', 'BERTweet_303', 'BERTweet_162', 'BERTweet_455', 'BERTweet_307', 'BERTweet_286', 'BERTweet_332', 'BERTweet_540', 'BERTweet_590', 'BERTweet_566', 'BERTweet_714', 'BERTweet_122', 'BERTweet_760', 'BERTweet_58', 'BERTweet_144', 'BERTweet_442', 'BERTweet_440', 'BERTweet_452', 'BERTweet_723', 'BERTweet_164', 'BERTweet_732', 'BERTweet_626', 'BERTweet_301', 'BERTweet_86', 'BERTweet_161', 'BERTweet_753', 'BERTweet_136', 'BERTweet_675', 'BERTweet_226', 'BERTweet_187', 'BERTweet_310', 'BERTweet_531', 'BERTweet_31', 'BERTweet_445', 'BERTweet_130', 'BERTweet_627', 'BERTweet_697', 'BERTweet_276', 'BERTweet_184', 'BERTweet_79', 'BERTweet_755', 'BERTweet_142', 'BERTweet_707', 'BERTweet_588', 'BERTweet_766

[1, 0.42526762847144806, 0.01708408372717207]
[11, 0.523759038606341, 0.03348429040490501]
[21, 0.5747636447637632, 0.06602933326941429]
[31, 0.576223254221679, 0.04766115157377913]
[41, 0.5808773517443296, 0.06388626326451052]
[51, 0.6090467545604787, 0.04696588688232197]
[61, 0.6081209998301078, 0.044473571761780614]
[71, 0.5982765784014975, 0.06347640473692785]
[81, 0.5868008695366169, 0.05969350639021546]
[91, 0.5858505240586893, 0.05143799274850169]
[101, 0.5914875629020632, 0.06022011685352232]
[111, 0.5917913603219469, 0.04710115388414654]
[121, 0.5956057058893638, 0.047567050981943804]
[131, 0.5923111938484314, 0.045337790613449946]
[141, 0.6042347259243508, 0.04268177776103945]
[151, 0.5929107201320485, 0.04616234066886808]
[161, 0.5950814142317332, 0.04183897491925079]
[171, 0.5865037723473224, 0.047702280963444116]
[181, 0.5899003095471291, 0.04400093312539821]
[191, 0.5880695086518672, 0.05615953140596373]
[201, 0.5966941314775722, 0.047280879419035846]
[211, 0.606476745692

In [614]:
best_f1 = sorted(f1_weight, key=lambda x: x[1], reverse=True)[0][0]
best_f1

411

In [615]:
fine_f1_weight = []

for num_cols in range(best_f1-10, best_f1+10, 1):
    X = df_representation_v2.loc[indexs][ranked_cols[:num_cols+1]]
    y = df_train.loc[X.index]["int"]

    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    clf = make_pipeline(StandardScaler(), SVC(kernel="rbf", gamma='auto', class_weight="balanced"))
    # clf.fit(X_train, y_train)
    cv_results = cross_validate(clf, X, y, cv=5, scoring="f1_weighted")
    test_score = cv_results["test_score"]
    fine_f1_weight.append([num_cols, np.mean(test_score), np.std(test_score)])
    print(fine_f1_weight[-1])

[401, 0.6260232879350924, 0.04980420841408524]
[402, 0.6260232879350924, 0.04980420841408524]
[403, 0.6260232879350924, 0.04980420841408524]
[404, 0.6283425491517305, 0.04874116164664491]
[405, 0.6296290233579253, 0.04688143574556793]
[406, 0.6286699800029222, 0.04830895805472933]
[407, 0.6286319803098179, 0.047983079671125195]
[408, 0.6296821538402746, 0.0467536803984406]
[409, 0.6296821538402746, 0.0467536803984406]
[410, 0.6296821538402746, 0.0467536803984406]
[411, 0.6296579544159963, 0.04671499360793837]
[412, 0.6251670491899469, 0.04384903353107423]
[413, 0.6251670491899469, 0.04384903353107423]
[414, 0.6263376272562879, 0.04567871088941871]
[415, 0.6265167083347271, 0.044432151023578566]
[416, 0.6277412455315968, 0.047762393326056383]
[417, 0.6288149541120509, 0.04682118135950047]
[418, 0.6244563857169697, 0.04517239817305424]
[419, 0.6255300942974238, 0.04425580559325953]
[420, 0.626509218817694, 0.04459999122576923]


In [616]:
fine_best_f1 = sorted(fine_f1_weight, key=lambda x: x[1], reverse=True)[0][0]
fine_best_f1

408

In [618]:
pickle.dump(ranked_cols[:fine_best_f1+1], open(f"cols_selected_{sen}_v2.pickle", "wb"))

In [None]:
df_representation_v3 = df_representation
df_representation_v3 = pd.concat([df_representation_v3, df_representation_v2], axis=1)
df_representation_v3

In [None]:
pickle.dump(df_representation_v3, open("df_representation_v3.pickle", "wb"))

In [656]:
sen = sentiments[3]

indexs = df_train[df_train["sen"] == sen].index
indexsLH = df_train.loc[indexs][(df_train.loc[indexs]["int"].isin(["low", "high"]))].index

X = df_representation_v3.loc[indexsLH]#[rel_cols]
y = df_train.loc[X.index]["int"]

dic_label_count = y.value_counts().to_dict()
min_label = min(dic_label_count.items(), key=lambda x: x[1])[0]
max_label = max(dic_label_count.items(), key=lambda x: x[1])[0]
index_label_1 = y[y==min_label].index
oversampling_steps = int(dic_label_count[max_label] / dic_label_count[min_label]) - 1

X_res, y_res = X.copy(), y.copy()
for step in range(oversampling_steps):
    new_indexs = [f"{ix}+{step + 1}" for ix in index_label_1]
    copied_sub_X = pd.DataFrame(X.loc[index_label_1].values, columns=X.columns, index=new_indexs)
    copied_sub_y = pd.Series(y.loc[index_label_1].values, index=new_indexs)
    X_res = pd.concat([X_res, copied_sub_X], axis=0)
    y_res = pd.concat([y_res, copied_sub_y], axis=0)

X_res = pd.DataFrame(StandardScaler().fit_transform(X_res), columns=X_res.columns, index=X_res.index)
selector = SelectKBest(chi2, k=X.shape[1])
X_res_ = X_res - X_res.min()
selector.fit(X_res_, y_res)
scores_selector = {col: selector.scores_[i] if str(selector.scores_[i]) != "nan" else 0 for i, col in enumerate(X.columns.tolist())}
ranked_cols = [x[0] for x in sorted(scores_selector.items(), key=lambda x: x[1], reverse=True)]
print(ranked_cols)

f1_weight = []

for num_cols in range(1, len(ranked_cols)+1, 10):
    X = df_representation_v3.loc[indexs][ranked_cols[:num_cols+1]]
    y = df_train.loc[X.index]["int"]

    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    clf = make_pipeline(StandardScaler(), SVC(kernel="rbf", gamma='auto', class_weight="balanced"))
    # clf.fit(X_train, y_train)
    cv_results = cross_validate(clf, X, y, cv=5, scoring="f1_weighted")
    test_score = cv_results["test_score"]
    f1_weight.append([num_cols, np.mean(test_score), np.std(test_score)])
    print(f1_weight[-1])

['linguistics<&>lemma<&>depression', 'linguistics<&>lemma<&>depress', 'linguistics<&>lemma<&>sadness', 'linguistics<&>lemma<&>bad', 'linguistics<&>lemma<&>sad', 'linguistics<&>lemma<&>depressing', 'linguistics<&>lemma<&>serious', 'linguistics<&>lemma<&>feel', 'linguistics<&>lemma<&>unhappy', 'linguistics<&>lemma<&>pine', 'linguistics<&>lemma<&>sober', 'linguistics<&>lemma<&>dull', 'linguistics<&>lemma<&>blue', 'emoji<&>_loudly_crying_face_', 'linguistics<&>tag<&>RBS', 'linguistics<&>lemma<&>hard', 'linguistics<&>lemma<&>grim', 'linguistics<&>lemma<&>frown', 'linguistics<&>lemma<&>service', 'linguistics<&>lemma<&>😭', 'linguistics<&>shape<&>😭', 'linguistics<&>lemma<&>despair', 'linguistics<&>lemma<&>️', 'linguistics<&>shape<&>️', 'linguistics<&>lemma<&>ever', 'linguistics<&>lemma<&>fucking', 'linguistics<&>lemma<&>sadly', 'linguistics<&>lemma<&>dishearten', 'linguistics<&>lemma<&>fan', 'linguistics<&>lemma<&>fret', 'linguistics<&>lemma<&>guy', 'linguistics<&>lemma<&>into', 'linguistics<&

[11, 0.3024626291701063, 0.11194946412399774]
[21, 0.3789822004596333, 0.12318989310995017]
[31, 0.3701253003455773, 0.11069502685570087]
[41, 0.5140830478624945, 0.017767922025634833]
[51, 0.5342248667751726, 0.03551424167580498]
[61, 0.5491272393246589, 0.030019183298879884]
[71, 0.5457643682281283, 0.03447586913309776]
[81, 0.5489638582045673, 0.03671751504099164]
[91, 0.5520267521245351, 0.0369083094955541]
[101, 0.5483413541151839, 0.03554506473054729]
[111, 0.5406813100478531, 0.03556012609921711]
[121, 0.545480283305222, 0.025609896525082942]
[131, 0.5430986284055243, 0.02283672535416118]
[141, 0.5497817170239718, 0.020301535161532727]
[151, 0.5466863699520738, 0.021681248098248722]
[161, 0.5498641938086111, 0.024186356456579877]
[171, 0.5504741999475588, 0.02943339321641828]
[181, 0.541437834710182, 0.026221993648884345]
[191, 0.5426160508667537, 0.03308127191841251]
[201, 0.5472075820086931, 0.03797134153686845]
[211, 0.5485295674491542, 0.03109521831367633]
[221, 0.5680717802

KeyboardInterrupt: 

In [657]:
best_f1 = sorted(f1_weight, key=lambda x: x[1], reverse=True)[0][0]
best_f1

411

In [658]:
fine_f1_weight = []

for num_cols in range(best_f1-10, best_f1+10, 1):
    X = df_representation_v3.loc[indexs][ranked_cols[:num_cols+1]]
    y = df_train.loc[X.index]["int"]

    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    clf = make_pipeline(StandardScaler(), SVC(kernel="rbf", gamma='auto', class_weight="balanced"))
    # clf.fit(X_train, y_train)
    cv_results = cross_validate(clf, X, y, cv=5, scoring="f1_weighted")
    test_score = cv_results["test_score"]
    fine_f1_weight.append([num_cols, np.mean(test_score), np.std(test_score)])
    print(fine_f1_weight[-1])

[401, 0.570114121731599, 0.05907171316101098]
[402, 0.5712222690419162, 0.05923443886487164]
[403, 0.5725630121704057, 0.05995403527922426]
[404, 0.5739216678031865, 0.05837815516438582]
[405, 0.5748030859662495, 0.05745062043931183]
[406, 0.5748030859662495, 0.05745062043931183]
[407, 0.5752528637574006, 0.05942996348227247]
[408, 0.5764656712182863, 0.060260561111525085]
[409, 0.5776576842811842, 0.06106224580367734]
[410, 0.5754051067666254, 0.060563936265593706]
[411, 0.5766974194155757, 0.06076214922222044]
[412, 0.5753265314269608, 0.0596831636912677]
[413, 0.5764329741509286, 0.0599693219218823]
[414, 0.5763298330915798, 0.059542580170901666]
[415, 0.5763298330915798, 0.059542580170901666]
[416, 0.5763298330915798, 0.059542580170901666]
[417, 0.5763298330915798, 0.059542580170901666]
[418, 0.5749267655920647, 0.061409203480120626]
[419, 0.5763298330915798, 0.059542580170901666]
[420, 0.5763298330915798, 0.059542580170901666]


In [659]:
fine_best_f1 = sorted(fine_f1_weight, key=lambda x: x[1], reverse=True)[0][0]
fine_best_f1

409

In [660]:
pickle.dump(ranked_cols[:fine_best_f1+1], open(f"cols_selected_{sen}_v3.pickle", "wb"))

In [694]:
import nltk
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('opinion_lexicon')
# nltk.download("sentiwordnet")
# nltk.download('omw-1.4')

from nltk.corpus import opinion_lexicon
from nltk.corpus import sentiwordnet as swn
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [710]:
def get_sentiwordnet_sent(tweet):
    def penn_to_wn(tag):
        if tag.startswith('J'):
            return wn.ADJ
        elif tag.startswith('N'):
            return wn.NOUN
        elif tag.startswith('R'):
            return wn.ADV
        elif tag.startswith('V'):
            return wn.VERB
        return None

    def get_sentiment(word,tag):
        wn_tag = penn_to_wn(tag)
        if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
            return [0,0,0]

        lemma = lemmatizer.lemmatize(word, pos=wn_tag)
        if not lemma:
            return [0,0,0]

        synsets = wn.synsets(word, pos=wn_tag)
        if not synsets:
            return [0,0,0]

        synset = synsets[0]
        swn_synset = swn.senti_synset(synset.name())

        return [swn_synset.pos_score(),swn_synset.neg_score(),swn_synset.obj_score()]

    words_data = tweet.lower().strip().split()

    pos_val = nltk.pos_tag(words_data)
    senti_val = [get_sentiment(x,y) for (x,y) in pos_val]
    return dict(zip("+ - o".split(), np.sum(senti_val, axis=0)))

In [662]:
print(f"Cantidad de palabras positivas: {len(opinion_lexicon.positive())}")
print(f"Cantidad de palabras negativas: {len(opinion_lexicon.negative())}")

Cantidad de palabras positivas: 2006
Cantidad de palabras positivas: 4783


In [712]:
def get_lexicon_attrib(tweet):
    o = {}
    o["lexicon<&>LiuHu<&>+"] = sum(int(t.lower() in opinion_lexicon.positive()) for t in tweet.split())
    o["lexicon<&>LiuHu<&>-"] = sum(int(t.lower() in opinion_lexicon.negative()) for t in tweet.split())
    o_sentiwordnet = get_sentiwordnet_sent(tweet)
    o["lexicon<&>sentiwordnet<&>+"] = o_sentiwordnet["+"]
    o["lexicon<&>sentiwordnet<&>-"] = o_sentiwordnet["-"]
    o["lexicon<&>sentiwordnet<&>o"] = o_sentiwordnet["o"]
    return o

In [713]:
get_lexicon_attrib(tweet)

{'lexicon<&>LiuHu<&>+': 0,
 'lexicon<&>LiuHu<&>-': 6,
 'lexicon<&>sentiwordnet<&>+': 0.375,
 'lexicon<&>sentiwordnet<&>-': 3.5,
 'lexicon<&>sentiwordnet<&>o': 3.125}

In [669]:
data_lexicon_attributes = []
times = []
for k, ix in enumerate(df_train.index):
    start_time = time.time()
    tweet = df_train.loc[ix]["text"]
    o = {"id": ix}
    o = {**o, **get_lexicon_attrib(tweet)}
    data_lexicon_attributes.append(o)
    dt = time.time()-start_time
    times.append(dt)
    print(f"""
    row: {k+1}, total_rows: {df_train.shape[0]}
    progress: {np.round(100*(k+1)/df_train.shape[0], 3)}%
    wait time: {datetime.timedelta(seconds = np.median(times)*(df_train.shape[0]-k-1))}s
    """)


    row: 1, total_rows: 3960
    progress: 0.025%
    wait time: 0:13:56.109647s
    

    row: 2, total_rows: 3960
    progress: 0.051%
    wait time: 0:15:50.783487s
    

    row: 3, total_rows: 3960
    progress: 0.076%
    wait time: 0:15:28.099567s
    

    row: 4, total_rows: 3960
    progress: 0.101%
    wait time: 0:16:15.050782s
    

    row: 5, total_rows: 3960
    progress: 0.126%
    wait time: 0:16:43.898985s
    

    row: 6, total_rows: 3960
    progress: 0.152%
    wait time: 0:14:27.372041s
    

    row: 7, total_rows: 3960
    progress: 0.177%
    wait time: 0:15:09.450331s
    

    row: 8, total_rows: 3960
    progress: 0.202%
    wait time: 0:15:23.135576s
    

    row: 9, total_rows: 3960
    progress: 0.227%
    wait time: 0:14:53.737124s
    

    row: 10, total_rows: 3960
    progress: 0.253%
    wait time: 0:14:44.022115s
    

    row: 11, total_rows: 3960
    progress: 0.278%
    wait time: 0:14:05.134906s
    

    row: 12, total_rows: 3960
    progre


    row: 95, total_rows: 3960
    progress: 2.399%
    wait time: 0:12:20.473808s
    

    row: 96, total_rows: 3960
    progress: 2.424%
    wait time: 0:12:23.408039s
    

    row: 97, total_rows: 3960
    progress: 2.449%
    wait time: 0:12:27.711201s
    

    row: 98, total_rows: 3960
    progress: 2.475%
    wait time: 0:12:29.593108s
    

    row: 99, total_rows: 3960
    progress: 2.5%
    wait time: 0:12:30.495198s
    

    row: 100, total_rows: 3960
    progress: 2.525%
    wait time: 0:12:25.540909s
    

    row: 101, total_rows: 3960
    progress: 2.551%
    wait time: 0:12:24.201626s
    

    row: 102, total_rows: 3960
    progress: 2.576%
    wait time: 0:12:22.923252s
    

    row: 103, total_rows: 3960
    progress: 2.601%
    wait time: 0:12:25.264672s
    

    row: 104, total_rows: 3960
    progress: 2.626%
    wait time: 0:12:23.102775s
    

    row: 105, total_rows: 3960
    progress: 2.652%
    wait time: 0:12:18.407062s
    

    row: 106, total_rows: 3


    row: 189, total_rows: 3960
    progress: 4.773%
    wait time: 0:12:04.059774s
    

    row: 190, total_rows: 3960
    progress: 4.798%
    wait time: 0:12:02.163114s
    

    row: 191, total_rows: 3960
    progress: 4.823%
    wait time: 0:12:03.682352s
    

    row: 192, total_rows: 3960
    progress: 4.848%
    wait time: 0:12:04.790017s
    

    row: 193, total_rows: 3960
    progress: 4.874%
    wait time: 0:12:02.660098s
    

    row: 194, total_rows: 3960
    progress: 4.899%
    wait time: 0:12:02.144447s
    

    row: 195, total_rows: 3960
    progress: 4.924%
    wait time: 0:12:00.492093s
    

    row: 196, total_rows: 3960
    progress: 4.949%
    wait time: 0:12:01.066073s
    

    row: 197, total_rows: 3960
    progress: 4.975%
    wait time: 0:12:01.497881s
    

    row: 198, total_rows: 3960
    progress: 5.0%
    wait time: 0:12:01.656797s
    

    row: 199, total_rows: 3960
    progress: 5.025%
    wait time: 0:12:03.665845s
    

    row: 200, total_ro


    row: 283, total_rows: 3960
    progress: 7.146%
    wait time: 0:11:49.317130s
    

    row: 284, total_rows: 3960
    progress: 7.172%
    wait time: 0:11:50.086412s
    

    row: 285, total_rows: 3960
    progress: 7.197%
    wait time: 0:11:49.738444s
    

    row: 286, total_rows: 3960
    progress: 7.222%
    wait time: 0:11:48.067297s
    

    row: 287, total_rows: 3960
    progress: 7.247%
    wait time: 0:11:48.303070s
    

    row: 288, total_rows: 3960
    progress: 7.273%
    wait time: 0:11:47.157382s
    

    row: 289, total_rows: 3960
    progress: 7.298%
    wait time: 0:11:48.269182s
    

    row: 290, total_rows: 3960
    progress: 7.323%
    wait time: 0:11:46.217272s
    

    row: 291, total_rows: 3960
    progress: 7.348%
    wait time: 0:11:46.665222s
    

    row: 292, total_rows: 3960
    progress: 7.374%
    wait time: 0:11:45.222480s
    

    row: 293, total_rows: 3960
    progress: 7.399%
    wait time: 0:11:46.100171s
    

    row: 294, total_


    row: 376, total_rows: 3960
    progress: 9.495%
    wait time: 0:11:23.973449s
    

    row: 377, total_rows: 3960
    progress: 9.52%
    wait time: 0:11:22.939135s
    

    row: 378, total_rows: 3960
    progress: 9.545%
    wait time: 0:11:23.531673s
    

    row: 379, total_rows: 3960
    progress: 9.571%
    wait time: 0:11:22.805092s
    

    row: 380, total_rows: 3960
    progress: 9.596%
    wait time: 0:11:22.977451s
    

    row: 381, total_rows: 3960
    progress: 9.621%
    wait time: 0:11:24.341773s
    

    row: 382, total_rows: 3960
    progress: 9.646%
    wait time: 0:11:23.278345s
    

    row: 383, total_rows: 3960
    progress: 9.672%
    wait time: 0:11:22.799527s
    

    row: 384, total_rows: 3960
    progress: 9.697%
    wait time: 0:11:23.235824s
    

    row: 385, total_rows: 3960
    progress: 9.722%
    wait time: 0:11:23.092268s
    

    row: 386, total_rows: 3960
    progress: 9.747%
    wait time: 0:11:21.780740s
    

    row: 387, total_r


    row: 468, total_rows: 3960
    progress: 11.818%
    wait time: 0:11:07.036582s
    

    row: 469, total_rows: 3960
    progress: 11.843%
    wait time: 0:11:06.191097s
    

    row: 470, total_rows: 3960
    progress: 11.869%
    wait time: 0:11:06.248073s
    

    row: 471, total_rows: 3960
    progress: 11.894%
    wait time: 0:11:06.459558s
    

    row: 472, total_rows: 3960
    progress: 11.919%
    wait time: 0:11:05.722350s
    

    row: 473, total_rows: 3960
    progress: 11.944%
    wait time: 0:11:06.131475s
    

    row: 474, total_rows: 3960
    progress: 11.97%
    wait time: 0:11:06.015086s
    

    row: 475, total_rows: 3960
    progress: 11.995%
    wait time: 0:11:05.538513s
    

    row: 476, total_rows: 3960
    progress: 12.02%
    wait time: 0:11:05.773930s
    

    row: 477, total_rows: 3960
    progress: 12.045%
    wait time: 0:11:05.619952s
    

    row: 478, total_rows: 3960
    progress: 12.071%
    wait time: 0:11:05.225185s
    

    row: 47


    row: 560, total_rows: 3960
    progress: 14.141%
    wait time: 0:10:55.321180s
    

    row: 561, total_rows: 3960
    progress: 14.167%
    wait time: 0:10:55.792078s
    

    row: 562, total_rows: 3960
    progress: 14.192%
    wait time: 0:10:55.485599s
    

    row: 563, total_rows: 3960
    progress: 14.217%
    wait time: 0:10:56.031119s
    

    row: 564, total_rows: 3960
    progress: 14.242%
    wait time: 0:10:56.344577s
    

    row: 565, total_rows: 3960
    progress: 14.268%
    wait time: 0:10:55.615470s
    

    row: 566, total_rows: 3960
    progress: 14.293%
    wait time: 0:10:55.716839s
    

    row: 567, total_rows: 3960
    progress: 14.318%
    wait time: 0:10:54.990426s
    

    row: 568, total_rows: 3960
    progress: 14.343%
    wait time: 0:10:54.998503s
    

    row: 569, total_rows: 3960
    progress: 14.369%
    wait time: 0:10:54.889356s
    

    row: 570, total_rows: 3960
    progress: 14.394%
    wait time: 0:10:54.232206s
    

    row: 


    row: 652, total_rows: 3960
    progress: 16.465%
    wait time: 0:10:31.314302s
    

    row: 653, total_rows: 3960
    progress: 16.49%
    wait time: 0:10:31.728323s
    

    row: 654, total_rows: 3960
    progress: 16.515%
    wait time: 0:10:31.871970s
    

    row: 655, total_rows: 3960
    progress: 16.54%
    wait time: 0:10:31.711369s
    

    row: 656, total_rows: 3960
    progress: 16.566%
    wait time: 0:10:31.036456s
    

    row: 657, total_rows: 3960
    progress: 16.591%
    wait time: 0:10:30.177131s
    

    row: 658, total_rows: 3960
    progress: 16.616%
    wait time: 0:10:29.958132s
    

    row: 659, total_rows: 3960
    progress: 16.641%
    wait time: 0:10:30.208037s
    

    row: 660, total_rows: 3960
    progress: 16.667%
    wait time: 0:10:29.527969s
    

    row: 661, total_rows: 3960
    progress: 16.692%
    wait time: 0:10:29.853766s
    

    row: 662, total_rows: 3960
    progress: 16.717%
    wait time: 0:10:29.803717s
    

    row: 66


    row: 745, total_rows: 3960
    progress: 18.813%
    wait time: 0:10:10.920474s
    

    row: 746, total_rows: 3960
    progress: 18.838%
    wait time: 0:10:10.468054s
    

    row: 747, total_rows: 3960
    progress: 18.864%
    wait time: 0:10:10.468539s
    

    row: 748, total_rows: 3960
    progress: 18.889%
    wait time: 0:10:10.277151s
    

    row: 749, total_rows: 3960
    progress: 18.914%
    wait time: 0:10:10.173714s
    

    row: 750, total_rows: 3960
    progress: 18.939%
    wait time: 0:10:09.971463s
    

    row: 751, total_rows: 3960
    progress: 18.965%
    wait time: 0:10:09.970265s
    

    row: 752, total_rows: 3960
    progress: 18.99%
    wait time: 0:10:09.178529s
    

    row: 753, total_rows: 3960
    progress: 19.015%
    wait time: 0:10:08.674373s
    

    row: 754, total_rows: 3960
    progress: 19.04%
    wait time: 0:10:08.881975s
    

    row: 755, total_rows: 3960
    progress: 19.066%
    wait time: 0:10:08.472187s
    

    row: 75


    row: 837, total_rows: 3960
    progress: 21.136%
    wait time: 0:09:42.349968s
    

    row: 838, total_rows: 3960
    progress: 21.162%
    wait time: 0:09:41.815581s
    

    row: 839, total_rows: 3960
    progress: 21.187%
    wait time: 0:09:41.669463s
    

    row: 840, total_rows: 3960
    progress: 21.212%
    wait time: 0:09:41.835553s
    

    row: 841, total_rows: 3960
    progress: 21.237%
    wait time: 0:09:41.491988s
    

    row: 842, total_rows: 3960
    progress: 21.263%
    wait time: 0:09:41.434291s
    

    row: 843, total_rows: 3960
    progress: 21.288%
    wait time: 0:09:40.945095s
    

    row: 844, total_rows: 3960
    progress: 21.313%
    wait time: 0:09:40.883578s
    

    row: 845, total_rows: 3960
    progress: 21.338%
    wait time: 0:09:40.655648s
    

    row: 846, total_rows: 3960
    progress: 21.364%
    wait time: 0:09:40.763103s
    

    row: 847, total_rows: 3960
    progress: 21.389%
    wait time: 0:09:40.104514s
    

    row: 


    row: 929, total_rows: 3960
    progress: 23.46%
    wait time: 0:09:26.745879s
    

    row: 930, total_rows: 3960
    progress: 23.485%
    wait time: 0:09:26.259489s
    

    row: 931, total_rows: 3960
    progress: 23.51%
    wait time: 0:09:26.373130s
    

    row: 932, total_rows: 3960
    progress: 23.535%
    wait time: 0:09:25.881075s
    

    row: 933, total_rows: 3960
    progress: 23.561%
    wait time: 0:09:25.165808s
    

    row: 934, total_rows: 3960
    progress: 23.586%
    wait time: 0:09:25.165433s
    

    row: 935, total_rows: 3960
    progress: 23.611%
    wait time: 0:09:25.035009s
    

    row: 936, total_rows: 3960
    progress: 23.636%
    wait time: 0:09:24.885237s
    

    row: 937, total_rows: 3960
    progress: 23.662%
    wait time: 0:09:24.899847s
    

    row: 938, total_rows: 3960
    progress: 23.687%
    wait time: 0:09:25.096473s
    

    row: 939, total_rows: 3960
    progress: 23.712%
    wait time: 0:09:24.581584s
    

    row: 94


    row: 1021, total_rows: 3960
    progress: 25.783%
    wait time: 0:09:12.059567s
    

    row: 1022, total_rows: 3960
    progress: 25.808%
    wait time: 0:09:11.797867s
    

    row: 1023, total_rows: 3960
    progress: 25.833%
    wait time: 0:09:11.576597s
    

    row: 1024, total_rows: 3960
    progress: 25.859%
    wait time: 0:09:11.681231s
    

    row: 1025, total_rows: 3960
    progress: 25.884%
    wait time: 0:09:11.491233s
    

    row: 1026, total_rows: 3960
    progress: 25.909%
    wait time: 0:09:11.570298s
    

    row: 1027, total_rows: 3960
    progress: 25.934%
    wait time: 0:09:11.045512s
    

    row: 1028, total_rows: 3960
    progress: 25.96%
    wait time: 0:09:10.984079s
    

    row: 1029, total_rows: 3960
    progress: 25.985%
    wait time: 0:09:10.622960s
    

    row: 1030, total_rows: 3960
    progress: 26.01%
    wait time: 0:09:10.262295s
    

    row: 1031, total_rows: 3960
    progress: 26.035%
    wait time: 0:09:09.939051s
    




    row: 1113, total_rows: 3960
    progress: 28.106%
    wait time: 0:08:56.373608s
    

    row: 1114, total_rows: 3960
    progress: 28.131%
    wait time: 0:08:56.442890s
    

    row: 1115, total_rows: 3960
    progress: 28.157%
    wait time: 0:08:56.355740s
    

    row: 1116, total_rows: 3960
    progress: 28.182%
    wait time: 0:08:56.372917s
    

    row: 1117, total_rows: 3960
    progress: 28.207%
    wait time: 0:08:56.272393s
    

    row: 1118, total_rows: 3960
    progress: 28.232%
    wait time: 0:08:55.782366s
    

    row: 1119, total_rows: 3960
    progress: 28.258%
    wait time: 0:08:55.557372s
    

    row: 1120, total_rows: 3960
    progress: 28.283%
    wait time: 0:08:55.416224s
    

    row: 1121, total_rows: 3960
    progress: 28.308%
    wait time: 0:08:55.482817s
    

    row: 1122, total_rows: 3960
    progress: 28.333%
    wait time: 0:08:55.143701s
    

    row: 1123, total_rows: 3960
    progress: 28.359%
    wait time: 0:08:54.949089s
    


    row: 1204, total_rows: 3960
    progress: 30.404%
    wait time: 0:08:42.111353s
    

    row: 1205, total_rows: 3960
    progress: 30.429%
    wait time: 0:08:41.916704s
    

    row: 1206, total_rows: 3960
    progress: 30.455%
    wait time: 0:08:41.692355s
    

    row: 1207, total_rows: 3960
    progress: 30.48%
    wait time: 0:08:41.490918s
    

    row: 1208, total_rows: 3960
    progress: 30.505%
    wait time: 0:08:41.107094s
    

    row: 1209, total_rows: 3960
    progress: 30.53%
    wait time: 0:08:40.751062s
    

    row: 1210, total_rows: 3960
    progress: 30.556%
    wait time: 0:08:40.763943s
    

    row: 1211, total_rows: 3960
    progress: 30.581%
    wait time: 0:08:40.699093s
    

    row: 1212, total_rows: 3960
    progress: 30.606%
    wait time: 0:08:40.687333s
    

    row: 1213, total_rows: 3960
    progress: 30.631%
    wait time: 0:08:40.619560s
    

    row: 1214, total_rows: 3960
    progress: 30.657%
    wait time: 0:08:40.261707s
    




    row: 1295, total_rows: 3960
    progress: 32.702%
    wait time: 0:08:24.465765s
    

    row: 1296, total_rows: 3960
    progress: 32.727%
    wait time: 0:08:24.529286s
    

    row: 1297, total_rows: 3960
    progress: 32.753%
    wait time: 0:08:24.473033s
    

    row: 1298, total_rows: 3960
    progress: 32.778%
    wait time: 0:08:24.443160s
    

    row: 1299, total_rows: 3960
    progress: 32.803%
    wait time: 0:08:24.009001s
    

    row: 1300, total_rows: 3960
    progress: 32.828%
    wait time: 0:08:23.550828s
    

    row: 1301, total_rows: 3960
    progress: 32.854%
    wait time: 0:08:23.332612s
    

    row: 1302, total_rows: 3960
    progress: 32.879%
    wait time: 0:08:23.247277s
    

    row: 1303, total_rows: 3960
    progress: 32.904%
    wait time: 0:08:22.908622s
    

    row: 1304, total_rows: 3960
    progress: 32.929%
    wait time: 0:08:22.572347s
    

    row: 1305, total_rows: 3960
    progress: 32.955%
    wait time: 0:08:22.586656s
    


    row: 1387, total_rows: 3960
    progress: 35.025%
    wait time: 0:08:08.797388s
    

    row: 1388, total_rows: 3960
    progress: 35.051%
    wait time: 0:08:08.745041s
    

    row: 1389, total_rows: 3960
    progress: 35.076%
    wait time: 0:08:08.444128s
    

    row: 1390, total_rows: 3960
    progress: 35.101%
    wait time: 0:08:08.398844s
    

    row: 1391, total_rows: 3960
    progress: 35.126%
    wait time: 0:08:08.305177s
    

    row: 1392, total_rows: 3960
    progress: 35.152%
    wait time: 0:08:08.281464s
    

    row: 1393, total_rows: 3960
    progress: 35.177%
    wait time: 0:08:07.934603s
    

    row: 1394, total_rows: 3960
    progress: 35.202%
    wait time: 0:08:07.893931s
    

    row: 1395, total_rows: 3960
    progress: 35.227%
    wait time: 0:08:07.587913s
    

    row: 1396, total_rows: 3960
    progress: 35.253%
    wait time: 0:08:07.475177s
    

    row: 1397, total_rows: 3960
    progress: 35.278%
    wait time: 0:08:07.369616s
    


    row: 1478, total_rows: 3960
    progress: 37.323%
    wait time: 0:07:52.918256s
    

    row: 1479, total_rows: 3960
    progress: 37.348%
    wait time: 0:07:52.774114s
    

    row: 1480, total_rows: 3960
    progress: 37.374%
    wait time: 0:07:52.557757s
    

    row: 1481, total_rows: 3960
    progress: 37.399%
    wait time: 0:07:52.519042s
    

    row: 1482, total_rows: 3960
    progress: 37.424%
    wait time: 0:07:52.146957s
    

    row: 1483, total_rows: 3960
    progress: 37.449%
    wait time: 0:07:51.976733s
    

    row: 1484, total_rows: 3960
    progress: 37.475%
    wait time: 0:07:51.849030s
    

    row: 1485, total_rows: 3960
    progress: 37.5%
    wait time: 0:07:51.557711s
    

    row: 1486, total_rows: 3960
    progress: 37.525%
    wait time: 0:07:51.224947s
    

    row: 1487, total_rows: 3960
    progress: 37.551%
    wait time: 0:07:51.065608s
    

    row: 1488, total_rows: 3960
    progress: 37.576%
    wait time: 0:07:50.794793s
    




    row: 1569, total_rows: 3960
    progress: 39.621%
    wait time: 0:07:35.148631s
    

    row: 1570, total_rows: 3960
    progress: 39.646%
    wait time: 0:07:34.775146s
    

    row: 1571, total_rows: 3960
    progress: 39.672%
    wait time: 0:07:34.685153s
    

    row: 1572, total_rows: 3960
    progress: 39.697%
    wait time: 0:07:34.385124s
    

    row: 1573, total_rows: 3960
    progress: 39.722%
    wait time: 0:07:34.193166s
    

    row: 1574, total_rows: 3960
    progress: 39.747%
    wait time: 0:07:34.102868s
    

    row: 1575, total_rows: 3960
    progress: 39.773%
    wait time: 0:07:33.980531s
    

    row: 1576, total_rows: 3960
    progress: 39.798%
    wait time: 0:07:33.904985s
    

    row: 1577, total_rows: 3960
    progress: 39.823%
    wait time: 0:07:33.567541s
    

    row: 1578, total_rows: 3960
    progress: 39.848%
    wait time: 0:07:33.355809s
    

    row: 1579, total_rows: 3960
    progress: 39.874%
    wait time: 0:07:33.234681s
    


    row: 1660, total_rows: 3960
    progress: 41.919%
    wait time: 0:07:18.908198s
    

    row: 1661, total_rows: 3960
    progress: 41.944%
    wait time: 0:07:18.647192s
    

    row: 1662, total_rows: 3960
    progress: 41.97%
    wait time: 0:07:18.315748s
    

    row: 1663, total_rows: 3960
    progress: 41.995%
    wait time: 0:07:18.158863s
    

    row: 1664, total_rows: 3960
    progress: 42.02%
    wait time: 0:07:18.003217s
    

    row: 1665, total_rows: 3960
    progress: 42.045%
    wait time: 0:07:17.630896s
    

    row: 1666, total_rows: 3960
    progress: 42.071%
    wait time: 0:07:17.356805s
    

    row: 1667, total_rows: 3960
    progress: 42.096%
    wait time: 0:07:17.133826s
    

    row: 1668, total_rows: 3960
    progress: 42.121%
    wait time: 0:07:16.805012s
    

    row: 1669, total_rows: 3960
    progress: 42.146%
    wait time: 0:07:16.653723s
    

    row: 1670, total_rows: 3960
    progress: 42.172%
    wait time: 0:07:16.380198s
    




    row: 1751, total_rows: 3960
    progress: 44.217%
    wait time: 0:07:03.671303s
    

    row: 1752, total_rows: 3960
    progress: 44.242%
    wait time: 0:07:03.558215s
    

    row: 1753, total_rows: 3960
    progress: 44.268%
    wait time: 0:07:03.242903s
    

    row: 1754, total_rows: 3960
    progress: 44.293%
    wait time: 0:07:02.884210s
    

    row: 1755, total_rows: 3960
    progress: 44.318%
    wait time: 0:07:02.671734s
    

    row: 1756, total_rows: 3960
    progress: 44.343%
    wait time: 0:07:02.427894s
    

    row: 1757, total_rows: 3960
    progress: 44.369%
    wait time: 0:07:02.273263s
    

    row: 1758, total_rows: 3960
    progress: 44.394%
    wait time: 0:07:02.074695s
    

    row: 1759, total_rows: 3960
    progress: 44.419%
    wait time: 0:07:01.965051s
    

    row: 1760, total_rows: 3960
    progress: 44.444%
    wait time: 0:07:01.708853s
    

    row: 1761, total_rows: 3960
    progress: 44.47%
    wait time: 0:07:01.380296s
    



    row: 1843, total_rows: 3960
    progress: 46.54%
    wait time: 0:06:44.199280s
    

    row: 1844, total_rows: 3960
    progress: 46.566%
    wait time: 0:06:44.085583s
    

    row: 1845, total_rows: 3960
    progress: 46.591%
    wait time: 0:06:43.835189s
    

    row: 1846, total_rows: 3960
    progress: 46.616%
    wait time: 0:06:43.827768s
    

    row: 1847, total_rows: 3960
    progress: 46.641%
    wait time: 0:06:43.775465s
    

    row: 1848, total_rows: 3960
    progress: 46.667%
    wait time: 0:06:43.409452s
    

    row: 1849, total_rows: 3960
    progress: 46.692%
    wait time: 0:06:43.328334s
    

    row: 1850, total_rows: 3960
    progress: 46.717%
    wait time: 0:06:43.017537s
    

    row: 1851, total_rows: 3960
    progress: 46.742%
    wait time: 0:06:42.836991s
    

    row: 1852, total_rows: 3960
    progress: 46.768%
    wait time: 0:06:42.745286s
    

    row: 1853, total_rows: 3960
    progress: 46.793%
    wait time: 0:06:42.645413s
    



    row: 1935, total_rows: 3960
    progress: 48.864%
    wait time: 0:06:25.961682s
    

    row: 1936, total_rows: 3960
    progress: 48.889%
    wait time: 0:06:25.816681s
    

    row: 1937, total_rows: 3960
    progress: 48.914%
    wait time: 0:06:25.699811s
    

    row: 1938, total_rows: 3960
    progress: 48.939%
    wait time: 0:06:25.396909s
    

    row: 1939, total_rows: 3960
    progress: 48.965%
    wait time: 0:06:25.167260s
    

    row: 1940, total_rows: 3960
    progress: 48.99%
    wait time: 0:06:25.074215s
    

    row: 1941, total_rows: 3960
    progress: 49.015%
    wait time: 0:06:24.857080s
    

    row: 1942, total_rows: 3960
    progress: 49.04%
    wait time: 0:06:24.639997s
    

    row: 1943, total_rows: 3960
    progress: 49.066%
    wait time: 0:06:24.456218s
    

    row: 1944, total_rows: 3960
    progress: 49.091%
    wait time: 0:06:24.141639s
    

    row: 1945, total_rows: 3960
    progress: 49.116%
    wait time: 0:06:24.010847s
    




    row: 2026, total_rows: 3960
    progress: 51.162%
    wait time: 0:06:07.947059s
    

    row: 2027, total_rows: 3960
    progress: 51.187%
    wait time: 0:06:07.687054s
    

    row: 2028, total_rows: 3960
    progress: 51.212%
    wait time: 0:06:07.428143s
    

    row: 2029, total_rows: 3960
    progress: 51.237%
    wait time: 0:06:07.336070s
    

    row: 2030, total_rows: 3960
    progress: 51.263%
    wait time: 0:06:07.234282s
    

    row: 2031, total_rows: 3960
    progress: 51.288%
    wait time: 0:06:06.942187s
    

    row: 2032, total_rows: 3960
    progress: 51.313%
    wait time: 0:06:06.793704s
    

    row: 2033, total_rows: 3960
    progress: 51.338%
    wait time: 0:06:06.491439s
    

    row: 2034, total_rows: 3960
    progress: 51.364%
    wait time: 0:06:06.333856s
    

    row: 2035, total_rows: 3960
    progress: 51.389%
    wait time: 0:06:06.151196s
    

    row: 2036, total_rows: 3960
    progress: 51.414%
    wait time: 0:06:06.043243s
    


    row: 2117, total_rows: 3960
    progress: 53.46%
    wait time: 0:05:49.224421s
    

    row: 2118, total_rows: 3960
    progress: 53.485%
    wait time: 0:05:49.116409s
    

    row: 2119, total_rows: 3960
    progress: 53.51%
    wait time: 0:05:49.045701s
    

    row: 2120, total_rows: 3960
    progress: 53.535%
    wait time: 0:05:48.959116s
    

    row: 2121, total_rows: 3960
    progress: 53.561%
    wait time: 0:05:48.668380s
    

    row: 2122, total_rows: 3960
    progress: 53.586%
    wait time: 0:05:48.529564s
    

    row: 2123, total_rows: 3960
    progress: 53.611%
    wait time: 0:05:48.297977s
    

    row: 2124, total_rows: 3960
    progress: 53.636%
    wait time: 0:05:48.128768s
    

    row: 2125, total_rows: 3960
    progress: 53.662%
    wait time: 0:05:47.878273s
    

    row: 2126, total_rows: 3960
    progress: 53.687%
    wait time: 0:05:47.615814s
    

    row: 2127, total_rows: 3960
    progress: 53.712%
    wait time: 0:05:47.413881s
    




    row: 2209, total_rows: 3960
    progress: 55.783%
    wait time: 0:05:32.139388s
    

    row: 2210, total_rows: 3960
    progress: 55.808%
    wait time: 0:05:31.854186s
    

    row: 2211, total_rows: 3960
    progress: 55.833%
    wait time: 0:05:31.551762s
    

    row: 2212, total_rows: 3960
    progress: 55.859%
    wait time: 0:05:31.380076s
    

    row: 2213, total_rows: 3960
    progress: 55.884%
    wait time: 0:05:31.235620s
    

    row: 2214, total_rows: 3960
    progress: 55.909%
    wait time: 0:05:30.962799s
    

    row: 2215, total_rows: 3960
    progress: 55.934%
    wait time: 0:05:30.807292s
    

    row: 2216, total_rows: 3960
    progress: 55.96%
    wait time: 0:05:30.586679s
    

    row: 2217, total_rows: 3960
    progress: 55.985%
    wait time: 0:05:30.449544s
    

    row: 2218, total_rows: 3960
    progress: 56.01%
    wait time: 0:05:30.307956s
    

    row: 2219, total_rows: 3960
    progress: 56.035%
    wait time: 0:05:30.080303s
    




    row: 2300, total_rows: 3960
    progress: 58.081%
    wait time: 0:05:14.345943s
    

    row: 2301, total_rows: 3960
    progress: 58.106%
    wait time: 0:05:14.179901s
    

    row: 2302, total_rows: 3960
    progress: 58.131%
    wait time: 0:05:13.937747s
    

    row: 2303, total_rows: 3960
    progress: 58.157%
    wait time: 0:05:13.807326s
    

    row: 2304, total_rows: 3960
    progress: 58.182%
    wait time: 0:05:13.658441s
    

    row: 2305, total_rows: 3960
    progress: 58.207%
    wait time: 0:05:13.492929s
    

    row: 2306, total_rows: 3960
    progress: 58.232%
    wait time: 0:05:13.283945s
    

    row: 2307, total_rows: 3960
    progress: 58.258%
    wait time: 0:05:13.117314s
    

    row: 2308, total_rows: 3960
    progress: 58.283%
    wait time: 0:05:12.946010s
    

    row: 2309, total_rows: 3960
    progress: 58.308%
    wait time: 0:05:12.795404s
    

    row: 2310, total_rows: 3960
    progress: 58.333%
    wait time: 0:05:12.620922s
    


    row: 2391, total_rows: 3960
    progress: 60.379%
    wait time: 0:04:56.286417s
    

    row: 2392, total_rows: 3960
    progress: 60.404%
    wait time: 0:04:56.137165s
    

    row: 2393, total_rows: 3960
    progress: 60.429%
    wait time: 0:04:55.932774s
    

    row: 2394, total_rows: 3960
    progress: 60.455%
    wait time: 0:04:55.674073s
    

    row: 2395, total_rows: 3960
    progress: 60.48%
    wait time: 0:04:55.514280s
    

    row: 2396, total_rows: 3960
    progress: 60.505%
    wait time: 0:04:55.270797s
    

    row: 2397, total_rows: 3960
    progress: 60.53%
    wait time: 0:04:55.072462s
    

    row: 2398, total_rows: 3960
    progress: 60.556%
    wait time: 0:04:54.966728s
    

    row: 2399, total_rows: 3960
    progress: 60.581%
    wait time: 0:04:54.794387s
    

    row: 2400, total_rows: 3960
    progress: 60.606%
    wait time: 0:04:54.552076s
    

    row: 2401, total_rows: 3960
    progress: 60.631%
    wait time: 0:04:54.368693s
    




    row: 2482, total_rows: 3960
    progress: 62.677%
    wait time: 0:04:38.735193s
    

    row: 2483, total_rows: 3960
    progress: 62.702%
    wait time: 0:04:38.490389s
    

    row: 2484, total_rows: 3960
    progress: 62.727%
    wait time: 0:04:38.308748s
    

    row: 2485, total_rows: 3960
    progress: 62.753%
    wait time: 0:04:38.091152s
    

    row: 2486, total_rows: 3960
    progress: 62.778%
    wait time: 0:04:37.968251s
    

    row: 2487, total_rows: 3960
    progress: 62.803%
    wait time: 0:04:37.731409s
    

    row: 2488, total_rows: 3960
    progress: 62.828%
    wait time: 0:04:37.621989s
    

    row: 2489, total_rows: 3960
    progress: 62.854%
    wait time: 0:04:37.385810s
    

    row: 2490, total_rows: 3960
    progress: 62.879%
    wait time: 0:04:37.279822s
    

    row: 2491, total_rows: 3960
    progress: 62.904%
    wait time: 0:04:37.036624s
    

    row: 2492, total_rows: 3960
    progress: 62.929%
    wait time: 0:04:36.808874s
    


    row: 2573, total_rows: 3960
    progress: 64.975%
    wait time: 0:04:21.126320s
    

    row: 2574, total_rows: 3960
    progress: 65.0%
    wait time: 0:04:20.895437s
    

    row: 2575, total_rows: 3960
    progress: 65.025%
    wait time: 0:04:20.645789s
    

    row: 2576, total_rows: 3960
    progress: 65.051%
    wait time: 0:04:20.500987s
    

    row: 2577, total_rows: 3960
    progress: 65.076%
    wait time: 0:04:20.349969s
    

    row: 2578, total_rows: 3960
    progress: 65.101%
    wait time: 0:04:20.117891s
    

    row: 2579, total_rows: 3960
    progress: 65.126%
    wait time: 0:04:19.958053s
    

    row: 2580, total_rows: 3960
    progress: 65.152%
    wait time: 0:04:19.807253s
    

    row: 2581, total_rows: 3960
    progress: 65.177%
    wait time: 0:04:19.647814s
    

    row: 2582, total_rows: 3960
    progress: 65.202%
    wait time: 0:04:19.422184s
    

    row: 2583, total_rows: 3960
    progress: 65.227%
    wait time: 0:04:19.190657s
    




    row: 2665, total_rows: 3960
    progress: 67.298%
    wait time: 0:04:02.980390s
    

    row: 2666, total_rows: 3960
    progress: 67.323%
    wait time: 0:04:02.794855s
    

    row: 2667, total_rows: 3960
    progress: 67.348%
    wait time: 0:04:02.567694s
    

    row: 2668, total_rows: 3960
    progress: 67.374%
    wait time: 0:04:02.334808s
    

    row: 2669, total_rows: 3960
    progress: 67.399%
    wait time: 0:04:02.096216s
    

    row: 2670, total_rows: 3960
    progress: 67.424%
    wait time: 0:04:01.884822s
    

    row: 2671, total_rows: 3960
    progress: 67.449%
    wait time: 0:04:01.680728s
    

    row: 2672, total_rows: 3960
    progress: 67.475%
    wait time: 0:04:01.449172s
    

    row: 2673, total_rows: 3960
    progress: 67.5%
    wait time: 0:04:01.221091s
    

    row: 2674, total_rows: 3960
    progress: 67.525%
    wait time: 0:04:01.005136s
    

    row: 2675, total_rows: 3960
    progress: 67.551%
    wait time: 0:04:00.817616s
    




    row: 2757, total_rows: 3960
    progress: 69.621%
    wait time: 0:03:45.579698s
    

    row: 2758, total_rows: 3960
    progress: 69.646%
    wait time: 0:03:45.442199s
    

    row: 2759, total_rows: 3960
    progress: 69.672%
    wait time: 0:03:45.294559s
    

    row: 2760, total_rows: 3960
    progress: 69.697%
    wait time: 0:03:45.168583s
    

    row: 2761, total_rows: 3960
    progress: 69.722%
    wait time: 0:03:45.016380s
    

    row: 2762, total_rows: 3960
    progress: 69.747%
    wait time: 0:03:44.762504s
    

    row: 2763, total_rows: 3960
    progress: 69.773%
    wait time: 0:03:44.620660s
    

    row: 2764, total_rows: 3960
    progress: 69.798%
    wait time: 0:03:44.408545s
    

    row: 2765, total_rows: 3960
    progress: 69.823%
    wait time: 0:03:44.265269s
    

    row: 2766, total_rows: 3960
    progress: 69.848%
    wait time: 0:03:44.022943s
    

    row: 2767, total_rows: 3960
    progress: 69.874%
    wait time: 0:03:43.869647s
    


    row: 2850, total_rows: 3960
    progress: 71.97%
    wait time: 0:03:28.248242s
    

    row: 2851, total_rows: 3960
    progress: 71.995%
    wait time: 0:03:28.091607s
    

    row: 2852, total_rows: 3960
    progress: 72.02%
    wait time: 0:03:27.898342s
    

    row: 2853, total_rows: 3960
    progress: 72.045%
    wait time: 0:03:27.701207s
    

    row: 2854, total_rows: 3960
    progress: 72.071%
    wait time: 0:03:27.503320s
    

    row: 2855, total_rows: 3960
    progress: 72.096%
    wait time: 0:03:27.270207s
    

    row: 2856, total_rows: 3960
    progress: 72.121%
    wait time: 0:03:27.065064s
    

    row: 2857, total_rows: 3960
    progress: 72.146%
    wait time: 0:03:26.868081s
    

    row: 2858, total_rows: 3960
    progress: 72.172%
    wait time: 0:03:26.683858s
    

    row: 2859, total_rows: 3960
    progress: 72.197%
    wait time: 0:03:26.504251s
    

    row: 2860, total_rows: 3960
    progress: 72.222%
    wait time: 0:03:26.275734s
    




    row: 2941, total_rows: 3960
    progress: 74.268%
    wait time: 0:03:11.185895s
    

    row: 2942, total_rows: 3960
    progress: 74.293%
    wait time: 0:03:10.977999s
    

    row: 2943, total_rows: 3960
    progress: 74.318%
    wait time: 0:03:10.774685s
    

    row: 2944, total_rows: 3960
    progress: 74.343%
    wait time: 0:03:10.554832s
    

    row: 2945, total_rows: 3960
    progress: 74.369%
    wait time: 0:03:10.330924s
    

    row: 2946, total_rows: 3960
    progress: 74.394%
    wait time: 0:03:10.143050s
    

    row: 2947, total_rows: 3960
    progress: 74.419%
    wait time: 0:03:09.923071s
    

    row: 2948, total_rows: 3960
    progress: 74.444%
    wait time: 0:03:09.679471s
    

    row: 2949, total_rows: 3960
    progress: 74.47%
    wait time: 0:03:09.480286s
    

    row: 2950, total_rows: 3960
    progress: 74.495%
    wait time: 0:03:09.315742s
    

    row: 2951, total_rows: 3960
    progress: 74.52%
    wait time: 0:03:09.115887s
    




    row: 3033, total_rows: 3960
    progress: 76.591%
    wait time: 0:02:54.211443s
    

    row: 3034, total_rows: 3960
    progress: 76.616%
    wait time: 0:02:54.045657s
    

    row: 3035, total_rows: 3960
    progress: 76.641%
    wait time: 0:02:53.886444s
    

    row: 3036, total_rows: 3960
    progress: 76.667%
    wait time: 0:02:53.694555s
    

    row: 3037, total_rows: 3960
    progress: 76.692%
    wait time: 0:02:53.545264s
    

    row: 3038, total_rows: 3960
    progress: 76.717%
    wait time: 0:02:53.374541s
    

    row: 3039, total_rows: 3960
    progress: 76.742%
    wait time: 0:02:53.162205s
    

    row: 3040, total_rows: 3960
    progress: 76.768%
    wait time: 0:02:52.999076s
    

    row: 3041, total_rows: 3960
    progress: 76.793%
    wait time: 0:02:52.821055s
    

    row: 3042, total_rows: 3960
    progress: 76.818%
    wait time: 0:02:52.621559s
    

    row: 3043, total_rows: 3960
    progress: 76.843%
    wait time: 0:02:52.415766s
    


    row: 3125, total_rows: 3960
    progress: 78.914%
    wait time: 0:02:36.387193s
    

    row: 3126, total_rows: 3960
    progress: 78.939%
    wait time: 0:02:36.188389s
    

    row: 3127, total_rows: 3960
    progress: 78.965%
    wait time: 0:02:36.018415s
    

    row: 3128, total_rows: 3960
    progress: 78.99%
    wait time: 0:02:35.822565s
    

    row: 3129, total_rows: 3960
    progress: 79.015%
    wait time: 0:02:35.641627s
    

    row: 3130, total_rows: 3960
    progress: 79.04%
    wait time: 0:02:35.450849s
    

    row: 3131, total_rows: 3960
    progress: 79.066%
    wait time: 0:02:35.284729s
    

    row: 3132, total_rows: 3960
    progress: 79.091%
    wait time: 0:02:35.066944s
    

    row: 3133, total_rows: 3960
    progress: 79.116%
    wait time: 0:02:34.904798s
    

    row: 3134, total_rows: 3960
    progress: 79.141%
    wait time: 0:02:34.745333s
    

    row: 3135, total_rows: 3960
    progress: 79.167%
    wait time: 0:02:34.576383s
    




    row: 3217, total_rows: 3960
    progress: 81.237%
    wait time: 0:02:19.075889s
    

    row: 3218, total_rows: 3960
    progress: 81.263%
    wait time: 0:02:18.864934s
    

    row: 3219, total_rows: 3960
    progress: 81.288%
    wait time: 0:02:18.670877s
    

    row: 3220, total_rows: 3960
    progress: 81.313%
    wait time: 0:02:18.471092s
    

    row: 3221, total_rows: 3960
    progress: 81.338%
    wait time: 0:02:18.265608s
    

    row: 3222, total_rows: 3960
    progress: 81.364%
    wait time: 0:02:18.089531s
    

    row: 3223, total_rows: 3960
    progress: 81.389%
    wait time: 0:02:17.888927s
    

    row: 3224, total_rows: 3960
    progress: 81.414%
    wait time: 0:02:17.707333s
    

    row: 3225, total_rows: 3960
    progress: 81.439%
    wait time: 0:02:17.539180s
    

    row: 3226, total_rows: 3960
    progress: 81.465%
    wait time: 0:02:17.357299s
    

    row: 3227, total_rows: 3960
    progress: 81.49%
    wait time: 0:02:17.162442s
    



    row: 3309, total_rows: 3960
    progress: 83.561%
    wait time: 0:02:01.815947s
    

    row: 3310, total_rows: 3960
    progress: 83.586%
    wait time: 0:02:01.608197s
    

    row: 3311, total_rows: 3960
    progress: 83.611%
    wait time: 0:02:01.407782s
    

    row: 3312, total_rows: 3960
    progress: 83.636%
    wait time: 0:02:01.228370s
    

    row: 3313, total_rows: 3960
    progress: 83.662%
    wait time: 0:02:01.066131s
    

    row: 3314, total_rows: 3960
    progress: 83.687%
    wait time: 0:02:00.867510s
    

    row: 3315, total_rows: 3960
    progress: 83.712%
    wait time: 0:02:00.700871s
    

    row: 3316, total_rows: 3960
    progress: 83.737%
    wait time: 0:02:00.523269s
    

    row: 3317, total_rows: 3960
    progress: 83.763%
    wait time: 0:02:00.327395s
    

    row: 3318, total_rows: 3960
    progress: 83.788%
    wait time: 0:02:00.147433s
    

    row: 3319, total_rows: 3960
    progress: 83.813%
    wait time: 0:01:59.951593s
    


    row: 3401, total_rows: 3960
    progress: 85.884%
    wait time: 0:01:44.825925s
    

    row: 3402, total_rows: 3960
    progress: 85.909%
    wait time: 0:01:44.617329s
    

    row: 3403, total_rows: 3960
    progress: 85.934%
    wait time: 0:01:44.446665s
    

    row: 3404, total_rows: 3960
    progress: 85.96%
    wait time: 0:01:44.243888s
    

    row: 3405, total_rows: 3960
    progress: 85.985%
    wait time: 0:01:44.060262s
    

    row: 3406, total_rows: 3960
    progress: 86.01%
    wait time: 0:01:43.888017s
    

    row: 3407, total_rows: 3960
    progress: 86.035%
    wait time: 0:01:43.707909s
    

    row: 3408, total_rows: 3960
    progress: 86.061%
    wait time: 0:01:43.533445s
    

    row: 3409, total_rows: 3960
    progress: 86.086%
    wait time: 0:01:43.345821s
    

    row: 3410, total_rows: 3960
    progress: 86.111%
    wait time: 0:01:43.173534s
    

    row: 3411, total_rows: 3960
    progress: 86.136%
    wait time: 0:01:42.990550s
    




    row: 3493, total_rows: 3960
    progress: 88.207%
    wait time: 0:01:27.745236s
    

    row: 3494, total_rows: 3960
    progress: 88.232%
    wait time: 0:01:27.569931s
    

    row: 3495, total_rows: 3960
    progress: 88.258%
    wait time: 0:01:27.386707s
    

    row: 3496, total_rows: 3960
    progress: 88.283%
    wait time: 0:01:27.211032s
    

    row: 3497, total_rows: 3960
    progress: 88.308%
    wait time: 0:01:27.035497s
    

    row: 3498, total_rows: 3960
    progress: 88.333%
    wait time: 0:01:26.853357s
    

    row: 3499, total_rows: 3960
    progress: 88.359%
    wait time: 0:01:26.674221s
    

    row: 3500, total_rows: 3960
    progress: 88.384%
    wait time: 0:01:26.498200s
    

    row: 3501, total_rows: 3960
    progress: 88.409%
    wait time: 0:01:26.313064s
    

    row: 3502, total_rows: 3960
    progress: 88.434%
    wait time: 0:01:26.130925s
    

    row: 3503, total_rows: 3960
    progress: 88.46%
    wait time: 0:01:25.930476s
    



    row: 3585, total_rows: 3960
    progress: 90.53%
    wait time: 0:01:10.751156s
    

    row: 3586, total_rows: 3960
    progress: 90.556%
    wait time: 0:01:10.570368s
    

    row: 3587, total_rows: 3960
    progress: 90.581%
    wait time: 0:01:10.392344s
    

    row: 3588, total_rows: 3960
    progress: 90.606%
    wait time: 0:01:10.223180s
    

    row: 3589, total_rows: 3960
    progress: 90.631%
    wait time: 0:01:10.043037s
    

    row: 3590, total_rows: 3960
    progress: 90.657%
    wait time: 0:01:09.857994s
    

    row: 3591, total_rows: 3960
    progress: 90.682%
    wait time: 0:01:09.674678s
    

    row: 3592, total_rows: 3960
    progress: 90.707%
    wait time: 0:01:09.478613s
    

    row: 3593, total_rows: 3960
    progress: 90.732%
    wait time: 0:01:09.292816s
    

    row: 3594, total_rows: 3960
    progress: 90.758%
    wait time: 0:01:09.111994s
    

    row: 3595, total_rows: 3960
    progress: 90.783%
    wait time: 0:01:08.928076s
    



    row: 3676, total_rows: 3960
    progress: 92.828%
    wait time: 0:00:53.674421s
    

    row: 3677, total_rows: 3960
    progress: 92.854%
    wait time: 0:00:53.487212s
    

    row: 3678, total_rows: 3960
    progress: 92.879%
    wait time: 0:00:53.309205s
    

    row: 3679, total_rows: 3960
    progress: 92.904%
    wait time: 0:00:53.111001s
    

    row: 3680, total_rows: 3960
    progress: 92.929%
    wait time: 0:00:52.925281s
    

    row: 3681, total_rows: 3960
    progress: 92.955%
    wait time: 0:00:52.737791s
    

    row: 3682, total_rows: 3960
    progress: 92.98%
    wait time: 0:00:52.538878s
    

    row: 3683, total_rows: 3960
    progress: 93.005%
    wait time: 0:00:52.345311s
    

    row: 3684, total_rows: 3960
    progress: 93.03%
    wait time: 0:00:52.168037s
    

    row: 3685, total_rows: 3960
    progress: 93.056%
    wait time: 0:00:51.984785s
    

    row: 3686, total_rows: 3960
    progress: 93.081%
    wait time: 0:00:51.786906s
    




    row: 3767, total_rows: 3960
    progress: 95.126%
    wait time: 0:00:36.488668s
    

    row: 3768, total_rows: 3960
    progress: 95.152%
    wait time: 0:00:36.291759s
    

    row: 3769, total_rows: 3960
    progress: 95.177%
    wait time: 0:00:36.101327s
    

    row: 3770, total_rows: 3960
    progress: 95.202%
    wait time: 0:00:35.913181s
    

    row: 3771, total_rows: 3960
    progress: 95.227%
    wait time: 0:00:35.721098s
    

    row: 3772, total_rows: 3960
    progress: 95.253%
    wait time: 0:00:35.529052s
    

    row: 3773, total_rows: 3960
    progress: 95.278%
    wait time: 0:00:35.341708s
    

    row: 3774, total_rows: 3960
    progress: 95.303%
    wait time: 0:00:35.151483s
    

    row: 3775, total_rows: 3960
    progress: 95.328%
    wait time: 0:00:34.961224s
    

    row: 3776, total_rows: 3960
    progress: 95.354%
    wait time: 0:00:34.765864s
    

    row: 3777, total_rows: 3960
    progress: 95.379%
    wait time: 0:00:34.579491s
    


    row: 3859, total_rows: 3960
    progress: 97.449%
    wait time: 0:00:19.100182s
    

    row: 3860, total_rows: 3960
    progress: 97.475%
    wait time: 0:00:18.911851s
    

    row: 3861, total_rows: 3960
    progress: 97.5%
    wait time: 0:00:18.723811s
    

    row: 3862, total_rows: 3960
    progress: 97.525%
    wait time: 0:00:18.530492s
    

    row: 3863, total_rows: 3960
    progress: 97.551%
    wait time: 0:00:18.342991s
    

    row: 3864, total_rows: 3960
    progress: 97.576%
    wait time: 0:00:18.152099s
    

    row: 3865, total_rows: 3960
    progress: 97.601%
    wait time: 0:00:17.965773s
    

    row: 3866, total_rows: 3960
    progress: 97.626%
    wait time: 0:00:17.777074s
    

    row: 3867, total_rows: 3960
    progress: 97.652%
    wait time: 0:00:17.587561s
    

    row: 3868, total_rows: 3960
    progress: 97.677%
    wait time: 0:00:17.399169s
    

    row: 3869, total_rows: 3960
    progress: 97.702%
    wait time: 0:00:17.209951s
    




    row: 3951, total_rows: 3960
    progress: 99.773%
    wait time: 0:00:01.703423s
    

    row: 3952, total_rows: 3960
    progress: 99.798%
    wait time: 0:00:01.514287s
    

    row: 3953, total_rows: 3960
    progress: 99.823%
    wait time: 0:00:01.325114s
    

    row: 3954, total_rows: 3960
    progress: 99.848%
    wait time: 0:00:01.135861s
    

    row: 3955, total_rows: 3960
    progress: 99.874%
    wait time: 0:00:00.946529s
    

    row: 3956, total_rows: 3960
    progress: 99.899%
    wait time: 0:00:00.757312s
    

    row: 3957, total_rows: 3960
    progress: 99.924%
    wait time: 0:00:00.568015s
    

    row: 3958, total_rows: 3960
    progress: 99.949%
    wait time: 0:00:00.378715s
    

    row: 3959, total_rows: 3960
    progress: 99.975%
    wait time: 0:00:00.189351s
    

    row: 3960, total_rows: 3960
    progress: 100.0%
    wait time: 0:00:00s
    


In [682]:
df_representation_v4 = pd.DataFrame(data_lexicon_attributes).set_index("id")
df_representation_v4 = pd.concat([df_representation_v4, df_representation], axis=1)
df_representation_v4

Unnamed: 0_level_0,lexicon<&>+,lexicon<&>-,retro<&>num_tokens,retro<&>lenght,retro<&>num_numbs,retro<&>num_alpha,retro<&>num_with_uppercase,retro<&>num_tokens_upper,retro<&>prop_vowels,retro<&>len_max_rep_char,...,linguistics<&>lemma<&>scritche,"linguistics<&>lemma<&>loud,\n#amndbot",linguistics<&>lemma<&>randomly,linguistics<&>lemma<&>essay,linguistics<&>lemma<&>concession,linguistics<&>lemma<&>shortsightedness,linguistics<&>lemma<&>@keitholbermann,linguistics<&>lemma<&>despicable,linguistics<&>lemma<&>rancour,linguistics<&>lemma<&>immoral
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10000,0,1,18,96,0,18,3,1,0.253165,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10001,0,0,23,119,0,25,7,2,0.329897,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10002,0,0,19,108,0,19,2,2,0.333333,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10003,0,1,24,134,0,23,1,1,0.315315,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10004,0,0,24,125,0,25,3,0,0.352941,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
40855,0,2,24,112,0,23,2,1,0.325843,2,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
40856,1,0,20,121,0,21,1,0,0.352941,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
40857,0,0,23,139,0,23,2,1,0.316239,2,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0
40858,0,0,14,83,2,15,5,1,0.285714,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [683]:
pickle.dump(df_representation_v4, open("df_representation_v4.pickle", "wb"))

In [685]:
sen = sentiments[0]

indexs = df_train[df_train["sen"] == sen].index
indexsLH = df_train.loc[indexs][(df_train.loc[indexs]["int"].isin(["low", "high"]))].index

X = df_representation_v4.loc[indexsLH]#[rel_cols]
y = df_train.loc[X.index]["int"]

dic_label_count = y.value_counts().to_dict()
min_label = min(dic_label_count.items(), key=lambda x: x[1])[0]
max_label = max(dic_label_count.items(), key=lambda x: x[1])[0]
index_label_1 = y[y==min_label].index
oversampling_steps = int(dic_label_count[max_label] / dic_label_count[min_label]) - 1

X_res, y_res = X.copy(), y.copy()
for step in range(oversampling_steps):
    new_indexs = [f"{ix}+{step + 1}" for ix in index_label_1]
    copied_sub_X = pd.DataFrame(X.loc[index_label_1].values, columns=X.columns, index=new_indexs)
    copied_sub_y = pd.Series(y.loc[index_label_1].values, index=new_indexs)
    X_res = pd.concat([X_res, copied_sub_X], axis=0)
    y_res = pd.concat([y_res, copied_sub_y], axis=0)

X_res = pd.DataFrame(StandardScaler().fit_transform(X_res), columns=X_res.columns, index=X_res.index)
selector = SelectKBest(chi2, k=X.shape[1])
X_res_ = X_res - X_res.min()
selector.fit(X_res_, y_res)
scores_selector = {col: selector.scores_[i] if str(selector.scores_[i]) != "nan" else 0 for i, col in enumerate(X.columns.tolist())}
ranked_cols = [x[0] for x in sorted(scores_selector.items(), key=lambda x: x[1], reverse=True)]
print(ranked_cols)

f1_weight = []

for num_cols in range(1, len(ranked_cols)+1, 1):
    X = df_representation_v4.loc[indexs][ranked_cols[:num_cols+1]]
    y = df_train.loc[X.index]["int"]

    # X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    clf = make_pipeline(StandardScaler(), SVC(kernel="rbf", gamma='auto', class_weight="balanced"))
    # clf.fit(X_train, y_train)
    cv_results = cross_validate(clf, X, y, cv=5, scoring="f1_weighted")
    test_score = cv_results["test_score"]
    f1_weight.append([num_cols, np.mean(test_score), np.std(test_score)])
    print(f1_weight[-1])

['linguistics<&>lemma<&>fume', 'linguistics<&>lemma<&>fuck', 'linguistics<&>lemma<&>angry', 'linguistics<&>lemma<&>snap', 'linguistics<&>lemma<&>so', 'linguistics<&>lemma<&>:', 'linguistics<&>shape<&>:', "linguistics<&>lemma<&>'s", 'linguistics<&>lemma<&>fucking', 'linguistics<&>tag<&>JJR', 'linguistics<&>lemma<&>well', 'linguistics<&>lemma<&>relentless', 'linguistics<&>lemma<&>frown', 'linguistics<&>lemma<&>wtf', 'linguistics<&>lemma<&>irritate', 'linguistics<&>lemma<&>boil', 'linguistics<&>lemma<&>incense', 'linguistics<&>lemma<&>piss', 'linguistics<&>lemma<&>blood', 'linguistics<&>lemma<&>again', 'linguistics<&>lemma<&>then', 'emoji<&>_pouting_face_', 'linguistics<&>lemma<&>outrage', 'linguistics<&>lemma<&>why', 'linguistics<&>lemma<&>right', 'linguistics<&>lemma<&>offense', 'linguistics<&>lemma<&>laugh', 'linguistics<&>lemma<&>pic', 'linguistics<&>lemma<&>burst', 'linguistics<&>lemma<&>shoot', 'linguistics<&>lemma<&>move', 'linguistics<&>lemma<&>hell', 'linguistics<&>dep<&>agent', 

[1, 0.10722920733970236, 0.016683552738673963]
[2, 0.126973552422759, 0.010344127296425039]
[3, 0.41021076060251066, 0.2376088996248618]
[4, 0.13216938990052995, 0.01289986942377626]
[5, 0.13216938990052995, 0.01289986942377626]
[6, 0.13216938990052995, 0.01289986942377626]
[7, 0.31670029429282015, 0.2211776004064279]
[8, 0.23149368012398183, 0.1704424808998914]
[9, 0.4138532375929329, 0.22741492913142267]
[10, 0.49902045181502, 0.18102437308958125]
[11, 0.5870662164050036, 0.009452745427872654]
[12, 0.5907965664959758, 0.0101197367837813]
[13, 0.5941208356115116, 0.011753973392230133]
[14, 0.5975933249128916, 0.01355009486233394]
[15, 0.5980103708225781, 0.017174520712786637]
[16, 0.6054784102315744, 0.0151791353821705]
[17, 0.6047712393909619, 0.015584556059913754]
[18, 0.6088474930429287, 0.014296679905311003]
[19, 0.610797811715364, 0.013338075151880959]
[20, 0.6093158956832087, 0.010724790259922857]
[21, 0.6117225895395404, 0.012316797372359998]
[22, 0.5161612315603071, 0.16565950

[177, 0.6356286916606998, 0.03686390692380396]
[178, 0.6356286916606998, 0.03686390692380396]
[179, 0.6378495957558898, 0.03639495007936231]
[180, 0.6389129994341035, 0.037009022873224315]
[181, 0.6390276434593424, 0.03859287444876934]
[182, 0.6390276434593424, 0.03859287444876934]
[183, 0.6386313132924375, 0.03588945035497212]
[184, 0.6405982363132715, 0.038013277882206604]
[185, 0.6377485038926554, 0.0403045794809684]
[186, 0.6405032515297292, 0.03889525234543259]
[187, 0.6413337827140321, 0.03821611705587485]
[188, 0.6404389340604302, 0.03905366977273297]
[189, 0.6405439397341275, 0.03917678069184365]
[190, 0.6423304640371388, 0.037571417262071784]
[191, 0.6423304640371388, 0.037571417262071784]
[192, 0.6433842592681828, 0.03778657086304954]
[193, 0.6421735636445239, 0.037504523396681214]
[194, 0.6443514313897334, 0.03859712329148512]
[195, 0.6464942932851718, 0.03929679708573715]
[196, 0.6464942932851718, 0.03929679708573715]
[197, 0.6507105704019445, 0.040136900648746844]
[198, 0.

[351, 0.6208281911184776, 0.03894211913698201]
[352, 0.6190415500756937, 0.03747987455879015]
[353, 0.6199940510213577, 0.03823491026492489]
[354, 0.6219354385387789, 0.03594612028958478]
[355, 0.6201289028269207, 0.03295739301897371]
[356, 0.6220076522793884, 0.0308026806990369]
[357, 0.6190631311433078, 0.028932548952881387]
[358, 0.6171336512818474, 0.028786656851904794]
[359, 0.6191403505565617, 0.028794325313326312]
[360, 0.6191403505565617, 0.028794325313326312]
[361, 0.6191403505565617, 0.028794325313326312]
[362, 0.6191403505565617, 0.028794325313326312]
[363, 0.6196374301380805, 0.026755320009399428]
[364, 0.6204282095341486, 0.029506728057217015]
[365, 0.6213226548825885, 0.03022103230832267]
[366, 0.6193346166484409, 0.03134382275940507]
[367, 0.6193488322805203, 0.03294009592159789]
[368, 0.6202382332053535, 0.03170777914249353]
[369, 0.6200095012429592, 0.02880757716301869]
[370, 0.61746403184909, 0.029120533837570246]
[371, 0.61746403184909, 0.029120533837570246]
[372, 0.

KeyboardInterrupt: 