In [7]:
import sys
sys.path.append('../src/')

In [8]:
import numpy as np
import time
from pprint import pprint
import io
import pandas as pd
from argparse import Namespace

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from collections import Counter

from tqdm import tqdm
import random


In [9]:
import utils.hasoc2019 as hasoc_utils
import utils.preprocessing as preprocessor

### Opening the dataset

In [10]:
# Notebook configuration: path of the dataset file that the next cell loads.
args = Namespace(data_file='../data/train/english_dataset.tsv')

In [11]:
# Read the file configured in `args.data_file` through the project's HASOC
# helper; the displayed result below has id, text, task_1, task_2 and task_3
# columns.
data_df = hasoc_utils.open_data_as_df(args.data_file)

In [12]:
# Show the loaded frame (pandas truncates the middle rows in the rendered output).
data_df

Unnamed: 0,id,text,task_1,task_2,task_3
0,hasoc_en_1,#DhoniKeepsTheGlove | WATCH: Sports Minister K...,NOT,NONE,NONE
1,hasoc_en_2,@politico No. We should remember very clearly ...,HOF,HATE,TIN
2,hasoc_en_3,@cricketworldcup Guess who would be the winner...,NOT,NONE,NONE
3,hasoc_en_4,Corbyn is too politically intellectual for #Bo...,NOT,NONE,NONE
4,hasoc_en_5,All the best to #TeamIndia for another swimmin...,NOT,NONE,NONE
...,...,...,...,...,...
5847,hasoc_en_5848,"@davidfrum @trueblueusa1 That's cute and all, ...",HOF,PRFN,UNT
5848,hasoc_en_5849,a recession issa comin' #maga #magamyass #fuck...,NOT,NONE,NONE
5849,hasoc_en_5850,#DoctorsFightBack Will 'The Mad n Irrational ...,HOF,OFFN,TIN
5850,hasoc_en_5851,#ShiningIndia #educatedindia or more like RUND...,HOF,OFFN,UNT


# Data preprocessing

In [None]:
# Add a `clean_text` column by applying the project preprocessor to each text.
# NOTE(review): the sample call `preprocessor.preprocess('I am ok')` recorded in
# this notebook returns a list of single characters, which suggests `preprocess`
# may expect a collection of texts rather than one string — confirm before
# mapping it row by row.
data_df['clean_text'] = data_df.text.map(preprocessor.preprocess)

In [85]:
# Quick check of the preprocessor on a single string; the recorded output is
# one list element per character of the input.
preprocessor.preprocess('I am ok')

['I', ' ', 'a', 'm', ' ', 'o', 'k']

### Examples from preprocessed data

In [5]:
# Preview the first five preprocessed texts.
# NOTE(review): `proprocessed_data` is not assigned in any cell visible here
# (and reads like a typo of `preprocessed_data`) — confirm where it is created
# so this cell still works after Restart Kernel -> Run All.
proprocessed_data[:5]

["dhoni keeps the glove | WATCH: Sports Minister Kiren Rijiju issues statement backing MS Dhoni over 'Balidaan Badge', tells BCCI to take up the matter with ICC and keep government in the know as nation's pride is involved ",
 'politico No. We should remember very clearly that individual 1 just admitted to treason . trump is a traitor mccain sa hero john mccain day',
 'cricketworldcup Guess who would be the winner of this cwc19? Team who gets maximum points from the abandoned matches 😄 shame on icc wiv seng ICC',
 "Corbyn is too politically intellectual for boris johnson should not be pm Can't wait general election now ",
 'All the best to team india for another swimming competition on Sunday against pakistan. in dv pak shame on icc cwc19 cwc19rains ☔☔ ']

In [7]:
# Report the dataset size and the class distribution.
# NOTE(review): `proprocessed_data` and `labels` are not assigned in any cell
# visible here — confirm they are defined earlier in the notebook.
print('Number of samples:',len(proprocessed_data))
print('Number of samples per label: ',Counter(labels)) # labels are encoded as HOF -> 1 and NOT -> 0

Number of samples: 5852
Number of samples per label:  Counter({0: 3591, 1: 2261})


### Balancing the dataset using interweaving

In [83]:
def interweave( df, primary_label, secondary_label ):
    """Balance two classes by alternating majority and minority samples.

    Every sample of the primary (majority) class is emitted once, in its
    original order, and each one is immediately followed by a sample of the
    secondary (minority) class. Secondary samples are used in their original
    order until they run out; from then on a secondary sample is drawn at
    random, with replacement, for every remaining primary sample. The result
    therefore contains ``len(primary)`` rows of each class. If the secondary
    class is actually the larger one, its surplus samples are dropped.

    The random draws use the global ``random`` module state, so call
    ``random.seed(...)`` beforehand when a reproducible result is needed.

    Args:
        df (pd.DataFrame): frame with a ``text`` column holding the samples
            and a ``label`` column holding their labels.
        primary_label (str): label of the majority samples.
        secondary_label (str): label of the minority samples.

    Returns:
        pd.DataFrame: frame with ``text`` and ``label`` columns whose rows
        alternate primary, secondary, primary, secondary, ...

    Raises:
        ValueError: if there are primary samples but no secondary samples
            to pair them with.
    """
    primary = df.text[df.label == primary_label].tolist()
    secondary = df.text[df.label == secondary_label].tolist()

    # Fail early with a clear message instead of letting the random draw
    # below blow up on an empty range.
    if primary and not secondary:
        raise ValueError(
            f"cannot interweave: no samples with label {secondary_label!r} "
            f"to pair with {len(primary)} samples labelled {primary_label!r}"
        )

    weave = []
    for i, primary_text in enumerate(primary):
        weave.append(primary_text)
        if i < len(secondary):
            # Minority samples are still available: keep their original order.
            weave.append(secondary[i])
        else:
            # Minority class exhausted: oversample it with replacement.
            r = random.randint(0, len(secondary) - 1)
            weave.append(secondary[r])

    # Each primary sample contributes exactly one (primary, secondary) pair.
    weave_labels = [primary_label, secondary_label] * len(primary)

    balanced_df = pd.DataFrame(
        {'text': weave, 'label': weave_labels}
    )

    return balanced_df

In [77]:
# Count the samples per task_1 label to see which class is the minority
# (the one that has to be oversampled when balancing).
data_df.task_1.value_counts()

NOT    3591
HOF    2261
Name: task_1, dtype: int64

In [78]:
# Show the two-column (text, label) frame used for task A.
# NOTE(review): in the cells visible here `task_a_df` is only assigned in the
# cell that follows this one, so a top-to-bottom run on a fresh kernel would
# raise NameError — consider moving this display after the assignment.
task_a_df

Unnamed: 0,text,label
0,#DhoniKeepsTheGlove | WATCH: Sports Minister K...,NOT
1,@politico No. We should remember very clearly ...,HOF
2,@cricketworldcup Guess who would be the winner...,NOT
3,Corbyn is too politically intellectual for #Bo...,NOT
4,All the best to #TeamIndia for another swimmin...,NOT
...,...,...
5847,"@davidfrum @trueblueusa1 That's cute and all, ...",HOF
5848,a recession issa comin' #maga #magamyass #fuck...,NOT
5849,#DoctorsFightBack Will 'The Mad n Irrational ...,HOF
5850,#ShiningIndia #educatedindia or more like RUND...,HOF


In [79]:
# Task A frame: keep only the text and the task_1 label, exposing the latter
# under the `label` column name that `interweave` reads.
task_a_df = (
    data_df[['text', 'task_1']]
    .rename(columns={'task_1': 'label'})
)

# NOT is passed as the primary (majority) class and HOF as the secondary one.
balanced_data_df = interweave(task_a_df, 'NOT', 'HOF')

In [82]:
# Check the balancing: both labels should now have the same number of rows.
balanced_data_df.label.value_counts()

HOF    3591
NOT    3591
Name: label, dtype: int64