In [1]:
import logging
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from sklearn.preprocessing import LabelEncoder
import re
import pandas as pd
import os
from tqdm import tqdm
import signal
import numpy as np
import glob
from pathlib import Path

In [2]:
# Define label mapping
label2int = {
    "bug": 0,
    "documentation" : 1,
    "docs" : 1, 
    "enhancement" : 2,
    "feature" : 2, 
    "question" : 3,
}

In [3]:
image_regex = re.compile('!\[(.*)\]\(.*\)')
link_regex_1 = re.compile('\[(.*)\]\(.*\)')
link_regex_2 = re.compile('\[(.*)\]: [^\s]+')
code_regex = re.compile('(:?`[^`]+`|```[^`]*```)')

def preprocess_raw(directory = '.', output_filepath=''):
    """ preprocesses defect report raw data (data/raw) and saves it (data/processed)
    """
    logger = logging.getLogger(__name__)
    logger.info('preprocessing data set from raw data')
    
    unlabeled_df_all = None
    labeled_df_all = None
    
    for file in glob.glob(os.path.join(directory, '*')):
    
        df = pd.read_csv(file)

        unlabeled_df, labeled_df = preprocess_rows(df)

        unlabeled_df.replace({pd.NA: np.nan, '': np.nan}, inplace=True)
        labeled_df.replace({pd.NA: np.nan, '': np.nan}, inplace=True)

        unlabeled_df.dropna(subset=['text'], inplace=True)
        labeled_df.dropna(subset=['text'], inplace=True)

        unlabeled_df.to_csv(os.path.join(output_filepath + '_unlabeled', Path(file).stem) + '.csv', index=False)
            
        unlabeled_df_all = unlabeled_df if unlabeled_df_all is None else pd.concat([unlabeled_df_all, unlabeled_df], ignore_index=True)
        
        labeled_df.to_csv(os.path.join(output_filepath + '_labeled', Path(file).stem + '_labeled.csv'), index=False)
        
        labeled_df_all = labeled_df if labeled_df_all is None else pd.concat([labeled_df_all, labeled_df], ignore_index=True)
         
    unlabeled_df_all.to_csv(os.path.join(output_filepath + '_unlabeled', 'all_unlabeled.csv'), index=False)
        
    labeled_df_all.to_csv(os.path.join(output_filepath + '_labeled', 'all_labeled.csv'), index=False)
    
        
    
    


def get_ekphrasis_preprocessor():
    return TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
    )

def preprocess_rows(df):
    logger = logging.getLogger(__name__)
    logger.info('started preprocessing rows')

    df = df.fillna({
                        'Title': '',
                        'Body': '',
                        'Labels': ''
                   })
    df['text'] = df['Title'] + ' ' + df['Body']
    
    unlabeled_df = df[~df["Labels"].str.contains("bug|documentation|docs|question|enhancement|feature", regex=True)]
    labeled_df = df[df["Labels"].str.contains("bug|documentation|docs|question|enhancement|feature", regex=True)]
    
    
    labels = []
    for i, label in enumerate(labeled_df['Labels']):
#         appear = False
        if "bug" in label:
            labels.append("bug")
#             if appear is False:
#                 appear = True
#             else: raise Exception(label + str(i))
        
        elif "doc" in label:
            labels.append("documentation")
                
        elif "question" in label:
            labels.append("question")
                
        elif "enhancement" in label:
            labels.append("enhancement")
                
        elif "feature" in label:
            labels.append("feature")
        
    labeled_df['Labels'] = labels    
    labeled_df["label"] = labeled_df['Labels'].map(label2int).tolist()
    
    unlabeled_df = unlabeled_df.drop(["Labels"], axis=1)
    labeled_df = labeled_df.drop(["Labels"], axis=1)
    
    unlabeled_df = unlabeled_df.filter(['text', 'label'])
    labeled_df = labeled_df.filter(['text', 'label'])
    text_processor = get_ekphrasis_preprocessor()
    unlabeled_df['text'] = [clean_text(text, text_processor) for text in tqdm(unlabeled_df['text'])]
    labeled_df['text'] = [clean_text(text, text_processor) for text in tqdm(labeled_df['text'])]
    return unlabeled_df, labeled_df

class TimeoutException(Exception):   # Custom exception class
    pass

def timeout_handler(signum, frame):   # Custom signal handler
    raise TimeoutException

def clean_text(text, text_processor):
    
    #bar.set_description('regex')
    cleaned = text
    cleaned = cleaned.replace('**Checklist**', ' ')
    cleaned = re.sub(r"\* \[x\] .+\s", " ", cleaned)
    cleaned = re.sub(r"\*{2}Checklist.+\*{2}", " ", cleaned)
    cleaned = re.sub(r"\*{2}.+\*{2}","<section>", cleaned)
    cleaned = re.sub(image_regex, r'\1 <img>', cleaned)
    cleaned = re.sub(link_regex_1, r'\1 <url>', cleaned)
    cleaned = re.sub(link_regex_2, r'\1 <url>', cleaned)
    cleaned = re.sub(code_regex, '<code>', cleaned)
    
    
#     cleaned = cleaned.replace('**Describe the contribution**', '')
#     cleaned = cleaned.replace('**Checklist**', '')
#     cleaned = cleaned.replace('**Testing Performed**', '')
#     cleaned = cleaned.replace('**Expected Behavior Changes**', '')
#     cleaned = cleaned.replace('**Contributor Info**', '')
    
#     cleaned = cleaned.replace('**Describe the bug**', '')
#     cleaned = cleaned.replace('**Expected Behavior**', '')
#     cleaned = cleaned.replace('**Reporter Info**', '')
#     cleaned = cleaned.replace('**Checklist (Please check before submitting)**', '')
#     cleaned = cleaned.replace('**System(s) tested on**', '')
#     cleaned = cleaned.replace('**Additional context**', '')
#     cleaned = cleaned.replace('**Contributor Info**', '')
    
    #bar.set_description('ekph')
    signal.signal(signal.SIGALRM, timeout_handler)
    
    signal.alarm(5)
    
    try:
        cleaned = " ".join(text_processor.pre_process_doc(cleaned))
    except (RecursionError, TimeoutException):
        cleaned = pd.NA
    else:
        signal.alarm(0)
    #bar.set_description('end')
    return cleaned

In [4]:
log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=log_fmt)

logger = logging.getLogger(__name__)
logger.info('making final data set from raw data')



preprocess_raw(directory='./issues', output_filepath='./data')





2023-08-15 19:04:32,417 - __main__ - INFO - making final data set from raw data
2023-08-15 19:04:32,419 - __main__ - INFO - preprocessing data set from raw data
2023-08-15 19:04:32,429 - __main__ - INFO - started preprocessing rows
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df['Labels'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df["label"] = labeled_df['Labels'].map(label2int).tolist()
  self.tok = re.compile(r"({})".format("|".join(pipeline)))


Reading twitter - 1grams ...
Reading twitter - 2grams ...


  regexes = {k.lower(): re.compile(self.expressions[k]) for k, v in


Reading twitter - 1grams ...


100%|█████████████████████████████████████████| 64/64 [00:00<00:00, 1475.30it/s]
100%|█████████████████████████████████████████| 30/30 [00:00<00:00, 1467.86it/s]
2023-08-15 19:04:34,596 - __main__ - INFO - started preprocessing rows
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df['Labels'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df["label"] = labeled_df['Labels'].map(label2int).tolist()


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


100%|█████████████████████████████████████████| 46/46 [00:00<00:00, 1440.64it/s]
100%|█████████████████████████████████████████| 12/12 [00:00<00:00, 2037.72it/s]
2023-08-15 19:04:36,942 - __main__ - INFO - started preprocessing rows


Reading twitter - 1grams ...
Reading twitter - 2grams ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df['Labels'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df["label"] = labeled_df['Labels'].map(label2int).tolist()


KeyboardInterrupt: 

In [None]:
# About 60 with multiple labels in NASA#LC

## Checking Preprocessing 

In [5]:
data = pd.read_csv('./data_unlabeled/nasa#LC.csv')

In [6]:
data1 = pd.read_csv('./issues/nasa#LC.csv')

In [7]:
i=0
print(data1['Title'].iloc[i] + '\n' + data1['Body'].iloc[i] + '\n \n' + data['text'].iloc[i])

Fix #92, Refactor `LC_CreateTaskCDS()` to remove multiple returns
**Checklist**
* [x] I reviewed the [Contributing Guide](https://github.com/nasa/osal/blob/main/CONTRIBUTING.md).
* [x] I signed and emailed the appropriate [Contributor License Agreement](https://github.com/nasa/cFS/blob/main/CONTRIBUTING.md#contributor-license-agreement-cla) to GSFC-SoftwareRelease@mail.nasa.gov and copied cfs-program@lists.nasa.gov.

**Describe the contribution**
- Fixes #92 
  - Simple refactor removes multiple returns from `LC_CreateTaskCDS()`

**Testing performed**
GitHub CI actions all passing successfully (incl. Build + Run, Unit/Coverage Tests etc.).

**Expected behavior changes**
No change to logic/behavior

**Contributor Info**
Avi Weiss @thnkslprpt
 
fix # <number> , refactor <code> to remove multiple returns <section> - fixes # <number> - simple refactor removes multiple returns from <code> <section> github ci actions all passing successfully ( incl . build + run , unit / cover