In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
TRAIN_FILE = '../data/tsd_train.csv'
TRIAL_FILE = '../data/tsd_trial.csv'
TEST_FILE = '../data/tsd_test_spans.csv'

In [3]:
from transformers import BertTokenizer

In [4]:
train = pd.read_csv(TRAIN_FILE)
trial = pd.read_csv(TRIAL_FILE)
test = pd.read_csv(TEST_FILE)

In [5]:
train.head()

Unnamed: 0,spans,text
0,"[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...",Another violent and aggressive immigrant killi...
1,"[33, 34, 35, 36, 37, 38, 39]","I am 56 years old, I am not your fucking junio..."
2,"[0, 1, 2, 3]","Damn, a whole family. Sad indeed."
3,"[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]",What a knucklehead. How can anyone not know th...
4,"[32, 33, 34, 35, 36, 37, 38]","""who do you think should do the killing?""\n\nA..."


In [6]:
train.shape

(7939, 2)

In [7]:
train.isnull().sum()

spans    0
text     0
dtype: int64

In [8]:
train.duplicated().sum()

0

In [9]:
trial.head()

Unnamed: 0,spans,text
0,"[15, 16, 17, 18, 19, 27, 28, 29, 30, 31]",Because he's a moron and a bigot. It's not any...
1,"[29, 30, 31, 32, 33, 34]",How about we stop protecting idiots and let na...
2,"[166, 167, 168, 169, 170, 171]","If people were smart, they would Boycott th..."
3,"[87, 88, 89, 90, 91, 92]",Trump Claimed that Russia will never invade th...
4,[],As long as your willing to pay a lot more for ...


In [10]:
trial.shape

(690, 2)

In [11]:
trial.isnull().sum()

spans    0
text     0
dtype: int64

In [12]:
trial.duplicated().sum()

0

In [13]:
combo = pd.concat([train,trial])

In [14]:
combo.head()

Unnamed: 0,spans,text
0,"[8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,...",Another violent and aggressive immigrant killi...
1,"[33, 34, 35, 36, 37, 38, 39]","I am 56 years old, I am not your fucking junio..."
2,"[0, 1, 2, 3]","Damn, a whole family. Sad indeed."
3,"[7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]",What a knucklehead. How can anyone not know th...
4,"[32, 33, 34, 35, 36, 37, 38]","""who do you think should do the killing?""\n\nA..."


In [15]:
combo.isnull().sum()

spans    0
text     0
dtype: int64

In [16]:
combo.duplicated().sum()

5

In [17]:
combo[combo.duplicated()]

Unnamed: 0,spans,text
100,"[6, 7, 8, 9, 10]",Trump troll!
224,"[0, 1, 2, 3, 4]",Idiot!
276,"[0, 1, 2, 3, 4]",Idiot
363,"[14, 15, 16, 17, 18, 19]",You can't fix stupid
588,"[11, 12, 13, 14, 15]",You are an idiot.


In [18]:
# merged_df = train.merge(trial, how='inner',indicator=False)
# new_train = train[~train.text.isin(merged_df['text'].values)]
# new_train.to_csv('../data/modified_train.csv',index=False)

In [19]:
def get_continuous_spans(spans):
    """Group a list of integer offsets into runs of consecutive values.

    Args:
        spans: Sequence of integers, read in the order given.

    Returns:
        A list of lists. Each inner list holds offsets in which every value
        is exactly one greater than the value before it; any other value
        (a gap, a repeat, a smaller number) opens a new inner list.
        An empty input yields an empty list.
    """
    runs = []
    for offset in spans:
        # Extend the most recent run only when this offset directly follows it.
        if runs and offset == runs[-1][-1] + 1:
            runs[-1].append(offset)
        else:
            runs.append([offset])
    return runs

Check whether any space characters are included in the annotated toxic spans

In [20]:
def get_space_count(df):
    """Count, per row, how many span offsets point at a ' ' character.

    Adds (or overwrites) a 'space_count' column on ``df`` in place.

    Args:
        df: DataFrame with a 'text' column (string) and a 'spans' column
            holding the string form of a list of character offsets.

    Returns:
        Tuple ``(sum, mean, std)`` of the 'space_count' column.
    """
    def _spaces_in_row(row):
        text = row['text']
        # NOTE(review): eval() executes whatever expression the 'spans' cell
        # contains; ast.literal_eval would be a safer parser if the CSV is
        # not fully trusted.
        offsets = eval(row['spans'])
        return np.sum([1 if text[offset] == ' ' else 0 for offset in offsets])

    df['space_count'] = df.apply(_spaces_in_row, axis=1)
    counts = df['space_count']
    return counts.sum(), counts.mean(), counts.std()

In [21]:
print(['%.2f'% i for i in get_space_count(train)])

['13278.00', '1.67', '7.72']


In [22]:
print(['%.2f'% i for i in get_space_count(trial)])

['830.00', '1.20', '4.48']


In [23]:
print(['%.2f'% i for i in get_space_count(test)])

['575.00', '0.29', '3.19']


Check if any words are cut across spans

In [24]:
def check_if_word_cut(text,contiguous_spans):
    """Count, for each span, how many of its two edges fall inside a word.

    An edge is counted when the span's boundary character is not a plain
    space and the character just outside the span is alphanumeric. The left
    edge is skipped for a span starting at offset 0, and the right edge is
    skipped for a span ending at the last character of ``text``.

    Args:
        text: The full string the spans index into.
        contiguous_spans: Iterable of non-empty lists of offsets; only the
            first and last element of each list are used.

    Returns:
        A list with one integer (0, 1 or 2) per span, in input order.
    """
    # NOTE(review): the boundary test is "!= ' '", so punctuation or a
    # newline at the edge next to a letter is also counted as a cut word.
    final_offset = len(text) - 1
    words_cuts = []
    for span in contiguous_spans:
        first, last = span[0], span[-1]
        cut_edges = 0
        # Left edge: only meaningful when a character precedes the span.
        if first != 0 and text[first] != ' ' and text[first - 1].isalnum():
            cut_edges += 1
        # Right edge: only meaningful when a character follows the span.
        if last != final_offset and text[last] != ' ' and text[last + 1].isalnum():
            cut_edges += 1
        words_cuts.append(cut_edges)
    return words_cuts

In [25]:
def words_cut_count(df):
    """Annotate ``df`` with contiguous spans and per-row cut-word counts.

    Adds (or overwrites) two columns on ``df`` in place:
    'contiguous_spans', from ``get_continuous_spans`` applied to the parsed
    'spans' cell, and 'words_cut', the sum of ``check_if_word_cut`` over
    those spans.

    Args:
        df: DataFrame with a 'text' column and a 'spans' column holding the
            string form of a list of character offsets.

    Returns:
        Tuple ``(sum, mean, std)`` of the 'words_cut' column.
    """
    def _row_to_runs(row):
        # NOTE(review): eval() executes whatever expression the 'spans' cell
        # contains; ast.literal_eval would be a safer parser if the CSV is
        # not fully trusted.
        return get_continuous_spans(eval(row['spans']))

    def _row_to_cut_total(row):
        return np.sum(check_if_word_cut(row['text'], row['contiguous_spans']))

    df['contiguous_spans'] = df.apply(_row_to_runs, axis=1)
    df['words_cut'] = df.apply(_row_to_cut_total, axis=1)
    cuts = df['words_cut']
    return cuts.sum(), cuts.mean(), cuts.std()

In [26]:
print(['%.2f'% i for i in words_cut_count(train)])

['263.00', '0.03', '0.20']


In [27]:
print(['%.2f'% i for i in words_cut_count(trial)])

['26.00', '0.04', '0.23']


In [28]:
print(['%.2f'% i for i in words_cut_count(test)])

['8.00', '0.00', '0.06']


In [29]:
def print_contiguous_spans(text,contiguous_spans):
    """Return the substring of ``text`` covered by each span.

    Despite the name, nothing is printed; the substrings are returned.

    Args:
        text: The full string the spans index into.
        contiguous_spans: Iterable of non-empty offset lists; only the first
            and last element of each list are used, both inclusive.

    Returns:
        A list of substrings, one per span, in input order.
    """
    pieces = []
    for span in contiguous_spans:
        first, last = span[0], span[-1]
        pieces.append(text[first:last + 1])
    return pieces

In [30]:
# train[train['words_cut']>0].apply(lambda x: (x['text'],len(x['text']),x['spans'],x['contiguous_spans'],print_contiguous_spans(x['text'],x['contiguous_spans']),x['words_cut']),axis=1).values

Check whether any spans start or end with whitespace

In [31]:
import string
def check_if_start_or_end_with_whitespace(text,contiguous_spans):
    """Count, for each span, how many of its two end characters are whitespace.

    Whitespace means membership in ``string.whitespace``.

    Args:
        text: The full string the spans index into.
        contiguous_spans: Iterable of non-empty offset lists; only the first
            and last element of each list are inspected.

    Returns:
        A list with one integer (0, 1 or 2) per span, in input order. A
        single-character whitespace span scores 2 because its first and last
        offsets coincide.
    """
    blanks = string.whitespace
    return [
        int(text[span[0]] in blanks) + int(text[span[-1]] in blanks)
        for span in contiguous_spans
    ]

In [32]:
def start_end_with_space(df):
    """Annotate ``df`` with per-row counts of whitespace at span boundaries.

    Adds (or overwrites) a 'start_or_end_with_space' column on ``df`` in
    place, holding the sum of ``check_if_start_or_end_with_whitespace`` over
    the row's spans.

    Args:
        df: DataFrame with a 'text' column and a 'contiguous_spans' column.
            The latter is not created here; in this notebook it is added by
            ``words_cut_count``, which therefore has to run first.

    Returns:
        Tuple ``(sum, mean, std)`` of the 'start_or_end_with_space' column.
    """
    def _boundary_whitespace_total(row):
        per_span = check_if_start_or_end_with_whitespace(row['text'], row['contiguous_spans'])
        return np.sum(per_span)

    df['start_or_end_with_space'] = df.apply(_boundary_whitespace_total, axis=1)
    flagged = df['start_or_end_with_space']
    return flagged.sum(), flagged.mean(), flagged.std()
    

In [33]:
print(['%.2f'% i for i in start_end_with_space(train)])

['22.00', '0.00', '0.05']


In [34]:
print(['%.2f'% i for i in start_end_with_space(trial)])

['1.00', '0.00', '0.04']


In [35]:
print(['%.2f'% i for i in start_end_with_space(test)])

['1.00', '0.00', '0.02']


In [36]:
# train[train['start_or_end_with_space']>0].apply(lambda x: (x['text'],len(x['text']),x['start_or_end_with_space'],print_contiguous_spans(x['text'],x['contiguous_spans'])),axis=1).values

Clean up spans that cut through words, and strip leading and trailing whitespace from spans

In [37]:
def find_word_by_character_index(text, idx):
    """Locate the alphanumeric run that contains the character at ``idx``.

    Args:
        text: The string to search.
        idx: Offset of a character in ``text``.

    Returns:
        Tuple ``(word, start, end)`` where ``word`` is the maximal run of
        alphanumeric characters around ``idx`` and ``start``/``end`` are its
        inclusive offsets. When ``text[idx]`` is not alphanumeric, the tuple
        is ``(text[idx], idx, idx)``.
    """
    if not text[idx].isalnum():
        return text[idx], idx, idx

    # Walk left until just before the run begins (or past the string start).
    left = idx
    while left > -1 and text[left].isalnum():
        left -= 1

    # Walk right until just after the run ends (or past the string end).
    right = idx
    while right < len(text) and text[right].isalnum():
        right += 1

    return text[left + 1:right], left + 1, right - 1

In [38]:
train.loc[20,'spans']

'[0, 1, 2, 3, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71]'

In [39]:
train.loc[20,'text']

'Kill some more kids and then complain about guns, LOL the left is a joke'

In [40]:
find_word_by_character_index(train.loc[20,'text'],22)

('and', 20, 22)

In [41]:
# def clean_text(text,contiguous_spans):
#     new_contiguous_spans = []

#     for i in contiguous_spans:
#         start = i[0] 
#         end = i[-1]

#         if start==0 and end==len(text)-1:
#             new_contiguous_spans.append([start,end])

#         elif start==0:
#             if text[end].isalnum() and text[end+1].isalnum():

#                 full_word,full_start,full_end = find_word_by_character_index(text,end)
#                 cut_word_len = end-full_start+1
#                 if(cut_word_len*2>=len(full_word)):
#                     new_contiguous_spans.append([start,full_end])
#                 else:
#                     new_contiguous_spans.append([start,full_start-1])


#         elif i[-1]==len(text)-1:
#             if text[start].isalnum() and text[start-1].isalnum():
#                 full_word, full_start,full_end = find_word_by_character_index(text,start)
#                 cut_word_len = full_end-start+1
#                 if(cut_word_len*2>=len(full_word)):
#                     new_contiguous_spans.append([full_start,end])
#                 else:
#                     new_contiguous_spans.append([full_end+1,end])
                
                
#         else:
#             new_start = start
#             new_end = end
           
#             if text[start].isalnum() and text[start-1].isalnum():
#                 full_word, full_start,full_end = find_word_by_character_index(text,start)
#                 cut_word_len = full_end-start+1
#                 if(cut_word_len*2>=len(full_word)):
#                     new_start = full_start
#                 else:
#                     new_start = full_end+1

#             if text[end].isalnum() and text[end+1].isalnum():
#                 full_word, full_start,full_end = find_word_by_character_index(text, end)
#                 cut_word_len = end-full_start+1
#                 if(cut_word_len*2>=len(full_word)):
#                     new_end = full_end
#                 else:
#                     new_end = full_start-1
#             new_contiguous_spans.append([new_start,new_end])
#     ## Remove Spaces from span beginning and end

#     newest_contiguous_spans = []
#     for i in new_contiguous_spans:
#         start = i[0]
#         end = i[-1]
#         while start<=end:
#             if(not (text[start].isalnum()) or not (text[end].isalnum())):
#                 if not (text[start].isalnum()):
#                     start+=1
#                 if not (text[end].isalnum()):
#                     end-=1
#             else:
#                 break
#         if(start<=end):
#             newest_contiguous_spans.append([start,end])
#     return newest_contiguous_spans


In [42]:
text = "Hello How are you sir, My name is Gunjan Chhablani?"

In [43]:
contiguous_spans = get_continuous_spans([1,2,3,4,6,7,11,12,13,14,15,16,17, 18, 19, 22,23,24,25,30,31,32,33,34,35,36, 41,42,43,44])

In [44]:
print_contiguous_spans(text,contiguous_spans)

['ello', 'Ho', 're you si', ' My ', ' is Gun', 'Chha']

In [45]:
# new_contiguous_spans = clean_text(text,contiguous_spans)
#print_contiguous_spans(text,new_contiguous_spans)

In [46]:
# clean_train = new_train.copy()
# clean_train['contiguous_spans'] = clean_train.apply(lambda x: clean_text(x['text'],x['contiguous_spans']),axis=1)
# clean_trial = trial.copy()
# clean_trial['contiguous_spans'] = clean_trial.apply(lambda x:clean_text(x['text'],x['contiguous_spans']),axis=1)

In [47]:
#start_end_with_space(clean_train)

In [48]:
#start_end_with_space(clean_trial)

In [49]:
# def words_cut_count_clean(df):
#     df['words_cut'] = df.apply(lambda x: np.sum(check_if_word_cut(x['text'],x['contiguous_spans'])),axis=1)
#     return df['words_cut'].sum(),df['words_cut'].mean(),df['words_cut'].std()

In [50]:
# words_cut_count_clean(clean_train)

In [51]:
# words_cut_count_clean(clean_trial)

In [52]:
# clean_train.to_csv('../data/clean_train.csv',index=False)

In [53]:
# clean_trial.to_csv('../data/clean_trial.csv',index=False)

In [54]:
# clean_train = pd.read_csv('../data/clean_train.csv')

In [55]:
# clean_trial = pd.read_csv('../data/clean_trial.csv')

In [56]:
# def get_spans_from_contiguous(contiguous_spans):
#     spans = []
#     for i in eval(contiguous_spans):
#         spans+=list(range(i[0],i[-1]+1))
#     return spans

In [57]:
# clean_train['spans'] = clean_train['contiguous_spans'].apply(get_spans_from_contiguous)

In [58]:
# clean_trial['spans'] = clean_trial['contiguous_spans'].apply(get_spans_from_contiguous)

In [59]:
# clean_train[['spans','text']].to_csv('../data/clean_train.csv',index=False)

In [60]:
# clean_trial[['spans','text']].to_csv('../data/clean_trial.csv',index=False)

In [61]:
# new_train.to_csv('../data/modified_train.csv',index=False)

Token, word, and character counts

In [62]:
# clean_train = pd.read_csv('../data/clean_train.csv')

In [63]:
# clean_trial = pd.read_csv('../data/clean_trial.csv')

In [64]:
# reduced_train = pd.read_csv('../data/modified_train.csv')

In [65]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
def tokenize_text(df):
    """Add token, word and character counts for the 'text' column of ``df``.

    Adds (or overwrites) three columns on ``df`` in place:
    '#tokens' (length of ``tokenizer.tokenize(text)``), '#words' (length of
    ``text.split()``) and '#chars' (``len(text)``).

    Relies on the module-level ``tokenizer`` object defined in this cell.

    Args:
        df: DataFrame with a string 'text' column.

    Returns:
        None; the frame is modified in place.
    """
    texts = df['text']
    # Each statistic depends on the text alone, so map over that column
    # instead of building a row Series per record with apply(axis=1).
    df['#tokens'] = texts.map(lambda text: len(tokenizer.tokenize(text)))
    df['#words'] = texts.map(lambda text: len(text.split()))
    df['#chars'] = texts.map(len)
    

In [66]:
# One row of formatted statistics per dataset (train, trial, test), ordered as
# mean/std/max/min for '#tokens', then '#words', then '#chars'.
lists = []
for df in (train, trial, test):
    tokenize_text(df)  # adds the '#tokens', '#words' and '#chars' columns in place
    lists.append([
        '%.2f' % stat
        for column in ('#tokens', '#words', '#chars')
        for stat in (df[column].mean(), df[column].std(), df[column].max(), df[column].min())
    ])

In [67]:
print(pd.DataFrame(lists, columns=["#Tokens(mean)","#Tokens(std)","#Tokens(max)","#Tokens(min)","#Words(mean)","#Words(std)","#Words(max)","#Words(min)", "#Chars(mean)","#Chars(std)","#Chars(max)","#Chars(min)"]).T.to_markdown())

|               |       0 |      1 |       2 |
|:--------------|--------:|-------:|--------:|
| #Tokens(mean) |   47.5  |  46.1  |   43.12 |
| #Tokens(std)  |   45.46 |  43.82 |   39.88 |
| #Tokens(max)  |  335    | 234    |  291    |
| #Tokens(min)  |    1    |   1    |    2    |
| #Words(mean)  |   35.95 |  35.01 |   32.86 |
| #Words(std)   |   34.97 |  34.42 |   31.01 |
| #Words(max)   |  192    | 182    |  186    |
| #Words(min)   |    1    |   1    |    1    |
| #Chars(mean)  |  204.57 | 199.47 |  186.41 |
| #Chars(std)   |  201.37 | 196.63 |  178.76 |
| #Chars(max)   | 1000    | 998    | 1000    |
| #Chars(min)   |    4    |   5    |    6    |
