In [1]:
!pip install ekphrasis scikit-learn pandas numpy

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import logging
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from sklearn.preprocessing import LabelEncoder
import re
import pandas as pd
import os
from tqdm import tqdm
import signal
import numpy as np

In [3]:
# download the training set if it does not exist
train_file = 'nlbse23-issue-classification-train.csv'
test_file = 'nlbse23-issue-classification-test.csv'

if not os.path.isfile(train_file):
  !curl "https://tickettagger.blob.core.windows.net/datasets/{train_file}.tar.gz" | tar -xz


if not os.path.isfile(test_file):
  !curl "https://tickettagger.blob.core.windows.net/datasets/{test_file}.tar.gz" | tar -xz

In [4]:
image_regex = re.compile('!\[(.*)\]\(.*\)')
link_regex_1 = re.compile('\[(.*)\]\(.*\)')
link_regex_2 = re.compile('\[(.*)\]: [^\s]+')
code_regex = re.compile('(:?`[^`]+`|```[^`]*```)')

# Define label mapping
label2int = {
    "bug": 0,
    "documentation" : 1,
    "feature" : 2,
    "question" : 3,
}

def preprocess_raw(output_filepath=''):
    """ preprocesses NLBSE23 raw data (data/raw) and saves it (data/processed)
    """
    logger = logging.getLogger(__name__)
    logger.info('preprocessing data set from raw data')
    
    train_set = pd.read_csv("./nlbse23-issue-classification-train.csv")
    test_set = pd.read_csv("./nlbse23-issue-classification-test.csv")

    train_set.drop_duplicates(subset=['id'], inplace=True)
    test_set.drop_duplicates(subset=['id'], inplace=True)

    lenc = lambda x: label2int[x]

    train_set = preprocess_rows(train_set, lenc)
    test_set = preprocess_rows(test_set, lenc)

    train_set.replace({pd.NA: np.nan, '': np.nan}, inplace=True)
    test_set.replace({pd.NA: np.nan, '': np.nan}, inplace=True)

    train_set.dropna(subset=['text'], inplace=True)
    test_set.dropna(subset=['text'], inplace=True)

    train_set.to_csv(os.path.join(output_filepath, 'train_set.csv'), index=False)
    test_set.to_csv(os.path.join(output_filepath, 'test_set.csv'), index=False)


def get_ekphrasis_preprocessor():
    return TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'emphasis', 'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
    )

def preprocess_rows(df, label_encoder):
    logger = logging.getLogger(__name__)
    logger.info('started preprocessing rows')
    df = df.fillna({
                        'title': '',
                        'body': ''                  
                   })
    df['text'] = df['title'] + df['body']
    df['label'] = [label_encoder(x) for x in df['labels']]
    df = df.filter(['id', 'text', 'label'])
    text_processor = get_ekphrasis_preprocessor()
    df['text'] = [clean_text(text, text_processor) for text in tqdm(df['text'])]
    return df

class TimeoutException(Exception):   # Custom exception class
    pass

def timeout_handler(signum, frame):   # Custom signal handler
    raise TimeoutException

def clean_text(text, text_processor):
    
    #bar.set_description('regex')
    cleaned = re.sub(image_regex, r'\1 <img>', text)
    cleaned = re.sub(link_regex_1, r'\1 <url>', cleaned)
    cleaned = re.sub(link_regex_2, r'\1 <url>', cleaned)
    cleaned = re.sub(code_regex, '<code>', cleaned)
    #bar.set_description('ekph')
    signal.signal(signal.SIGALRM, timeout_handler)
    
    signal.alarm(5)
    
    try:
        cleaned = " ".join(text_processor.pre_process_doc(cleaned))
    except (RecursionError, TimeoutException):
        cleaned = pd.NA
    else:
        signal.alarm(0)
    #bar.set_description('end')
    return cleaned

In [5]:
log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=log_fmt)

logger = logging.getLogger(__name__)
logger.info('making final data set from raw data')

preprocess_raw('.')





  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Reading twitter - 1grams ...
Reading twitter - 2grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


Reading twitter - 1grams ...


100%|██████████| 99726/99726 [04:03<00:00, 409.25it/s] 


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


100%|██████████| 99718/99718 [03:48<00:00, 437.29it/s]


In [6]:
!curl -LJO "https://zenodo.org/record/7628150/files/test_set_r.csv"
!curl -LJO "https://zenodo.org/record/7628150/files/train_set_r.csv"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0  3801    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
curl: (23) Failed writing header
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0  3833    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
curl: (23) Failed writing header


In [10]:
import pandas as pd


# Preprocess dataset

train_set = pd.read_csv("./train_set.csv")
test_set = pd.read_csv("./test_set.csv")

train_set.drop_duplicates(subset=['id'], inplace=True)
test_set.drop_duplicates(subset=['id'], inplace=True)

# -- Open df
train_sample = pd.read_csv('./train_set_r.csv')
test_sample = pd.read_csv('./test_set_r.csv')

train_df = pd.merge(train_sample, train_set, on=['id'], how='inner')
test_df = pd.merge(test_sample, train_set, on=['id'], how='inner')

def filter_df(df):
    df = df[~df["new_label"].isin(["unknown", "discard"])]
    df["label"] = df["new_label"].map(label2int).tolist()
    df = df.drop(["new_label"], axis=1)
    return df

train_df = filter_df(train_df)
test_df = filter_df(test_df)

# -- Plot distribution
def plot_dist(df):
    print(df["label"].value_counts())
    
plot_dist(train_df)
plot_dist(test_df)

train_df.to_csv("train_set_hand.csv", index=False)
test_df.to_csv("test_set_hand.csv", index=False)

2    5
1    4
0    2
3    2
Name: label, dtype: int64
Series([], Name: label, dtype: int64)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["label"] = labels
