In [8]:
import logging
from ekphrasis.classes.preprocessor import TextPreProcessor
from ekphrasis.classes.tokenizer import SocialTokenizer
from ekphrasis.dicts.emoticons import emoticons
from sklearn.preprocessing import LabelEncoder
import re
import pandas as pd
import os
from tqdm import tqdm
import signal
import numpy as np
import glob
from pathlib import Path

In [9]:
test_dir = '/Users/jchang15/scrape_issues/issues'

# Define label mapping
label2int = {
    "bug": 0,
    "documentation" : 1,
    "docs" : 1, 
    "enhancement" : 2,
    "feature" : 2, 
    "question" : 3,
}

In [10]:
image_regex = re.compile('!\[(.*)\]\(.*\)')
link_regex_1 = re.compile('\[(.*)\]\(.*\)')
link_regex_2 = re.compile('\[(.*)\]: [^\s]+')
code_regex = re.compile('(:?`[^`]+`|```[^`]*```)')

def preprocess_raw(directory = '.', output_filepath=''):
    """ preprocesses defect report raw data (data/raw) and saves it (data/processed)
    """
    logger = logging.getLogger(__name__)
    logger.info('preprocessing data set from raw data')
    
    unlabeled_df_all = None
    labeled_df_all = None
    
    for file in glob.glob(os.path.join(directory, '*')):
    
        df = pd.read_csv(file)

        unlabeled_df, labeled_df = preprocess_rows(df)

        unlabeled_df.replace({pd.NA: np.nan, '': np.nan}, inplace=True)
        labeled_df.replace({pd.NA: np.nan, '': np.nan}, inplace=True)

        unlabeled_df.dropna(subset=['text'], inplace=True)
        labeled_df.dropna(subset=['text'], inplace=True)

        unlabeled_df.to_csv(os.path.join(output_filepath + '_unlabeled', Path(file).stem) + '.csv', index=False)
            
        unlabeled_df_all = unlabeled_df if unlabeled_df_all is None else pd.concat([unlabeled_df_all, unlabeled_df], ignore_index=True)
        
        labeled_df.to_csv(os.path.join(output_filepath + '_labeled', Path(file).stem + '_labeled.csv'), index=False)
        
        labeled_df_all = labeled_df if labeled_df_all is None else pd.concat([labeled_df_all, labeled_df], ignore_index=True)
         
    unlabeled_df_all.to_csv(os.path.join(output_filepath + '_unlabeled', 'all_unlabeled.csv'), index=False)
        
    labeled_df_all.to_csv(os.path.join(output_filepath + '_labeled', 'all_labeled.csv'), index=False)
    
        
    
    


def get_ekphrasis_preprocessor():
    return TextPreProcessor(
    # terms that will be normalized
    normalize=['url', 'email', 'percent', 'money', 'phone', 'user',
        'time', 'url', 'date', 'number'],
    # terms that will be annotated
    annotate={"hashtag", "allcaps", "elongated", "repeated",
        'censored'},
    fix_html=True,  # fix HTML tokens
    
    # corpus from which the word statistics are going to be used 
    # for word segmentation 
    segmenter="twitter", 
    
    # corpus from which the word statistics are going to be used 
    # for spell correction
    corrector="twitter", 
    
    unpack_hashtags=True,  # perform word segmentation on hashtags
    unpack_contractions=True,  # Unpack contractions (can't -> can not)
    spell_correct_elong=False,  # spell correction for elongated words
    
    # select a tokenizer. You can use SocialTokenizer, or pass your own
    # the tokenizer, should take as input a string and return a list of tokens
    tokenizer=SocialTokenizer(lowercase=True).tokenize,
    
    # list of dictionaries, for replacing tokens extracted from the text,
    # with other expressions. You can pass more than one dictionaries.
    dicts=[emoticons]
    )

def preprocess_rows(df):
    logger = logging.getLogger(__name__)
    logger.info('started preprocessing rows')

    df = df.fillna({
                        'Title': '',
                        'Body': '',
                        'Labels': ''
                   })
    df['text'] = df['Title'] + ' ' + df['Body']
    
    unlabeled_df = df[~df["Labels"].str.contains("bug|documentation|docs|question|enhancement|feature", regex=True)]
    labeled_df = df[df["Labels"].str.contains("bug|documentation|docs|question|enhancement|feature", regex=True)]
    
    
    labels = []
    for i, label in enumerate(labeled_df['Labels']):
#         appear = False
        if "bug" in label:
            labels.append("bug")
#             if appear is False:
#                 appear = True
#             else: raise Exception(label + str(i))
        
        elif "doc" in label:
            labels.append("documentation")
                
        elif "question" in label:
            labels.append("question")
                
        elif "enhancement" in label:
            labels.append("enhancement")
                
        elif "feature" in label:
            labels.append("feature")
        
    labeled_df['Labels'] = labels    
    labeled_df["label"] = labeled_df['Labels'].map(label2int).tolist()
    
    unlabeled_df = unlabeled_df.drop(["Labels"], axis=1)
    labeled_df = labeled_df.drop(["Labels"], axis=1)
    
    unlabeled_df = unlabeled_df.filter(['text', 'label'])
    labeled_df = labeled_df.filter(['text', 'label'])
    text_processor = get_ekphrasis_preprocessor()
    unlabeled_df['text'] = [clean_text(text, text_processor) for text in tqdm(unlabeled_df['text'])]
    labeled_df['text'] = [clean_text(text, text_processor) for text in tqdm(labeled_df['text'])]
    return unlabeled_df, labeled_df

class TimeoutException(Exception):   # Custom exception class
    pass

def timeout_handler(signum, frame):   # Custom signal handler
    raise TimeoutException

def clean_text(text, text_processor):
    
    #bar.set_description('regex')
    cleaned = text
    cleaned = cleaned.replace('**Checklist**', ' ')
    cleaned = re.sub(r"\* \[x\] .+\s", " ", cleaned)
    cleaned = re.sub(r"\*{2}Checklist.+\*{2}", " ", cleaned)
    cleaned = re.sub(r"\*{2}.+\*{2}","<section>", cleaned)
    cleaned = re.sub(image_regex, r'\1 <img>', cleaned)
    cleaned = re.sub(link_regex_1, r'\1 <url>', cleaned)
    cleaned = re.sub(link_regex_2, r'\1 <url>', cleaned)
    cleaned = re.sub(code_regex, '<code>', cleaned)
    
    
#     cleaned = cleaned.replace('**Describe the contribution**', '')
#     cleaned = cleaned.replace('**Checklist**', '')
#     cleaned = cleaned.replace('**Testing Performed**', '')
#     cleaned = cleaned.replace('**Expected Behavior Changes**', '')
#     cleaned = cleaned.replace('**Contributor Info**', '')
    
#     cleaned = cleaned.replace('**Describe the bug**', '')
#     cleaned = cleaned.replace('**Expected Behavior**', '')
#     cleaned = cleaned.replace('**Reporter Info**', '')
#     cleaned = cleaned.replace('**Checklist (Please check before submitting)**', '')
#     cleaned = cleaned.replace('**System(s) tested on**', '')
#     cleaned = cleaned.replace('**Additional context**', '')
#     cleaned = cleaned.replace('**Contributor Info**', '')
    
    #bar.set_description('ekph')
    signal.signal(signal.SIGALRM, timeout_handler)
    
    signal.alarm(5)
    
    try:
        cleaned = " ".join(text_processor.pre_process_doc(cleaned))
    except (RecursionError, TimeoutException):
        cleaned = pd.NA
    else:
        signal.alarm(0)
    #bar.set_description('end')
    return cleaned

In [11]:
log_fmt = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
logging.basicConfig(level=logging.INFO, format=log_fmt)

logger = logging.getLogger(__name__)
logger.info('making final data set from raw data')



preprocess_raw(test_dir, './data')





2023-08-14 18:15:03,005 - __main__ - INFO - making final data set from raw data
2023-08-14 18:15:03,008 - __main__ - INFO - preprocessing data set from raw data
2023-08-14 18:15:03,021 - __main__ - INFO - started preprocessing rows
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df['Labels'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df["label"] = labeled_df['Labels'].map(label2int).tolist()


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


100%|█████████████████████████████████████████| 64/64 [00:00<00:00, 1496.07it/s]
100%|█████████████████████████████████████████| 30/30 [00:00<00:00, 1532.97it/s]
2023-08-14 18:15:05,202 - __main__ - INFO - started preprocessing rows


Reading twitter - 1grams ...
Reading twitter - 2grams ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df['Labels'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df["label"] = labeled_df['Labels'].map(label2int).tolist()


Reading twitter - 1grams ...


100%|█████████████████████████████████████████| 46/46 [00:00<00:00, 1471.71it/s]
100%|█████████████████████████████████████████| 12/12 [00:00<00:00, 1932.34it/s]
2023-08-14 18:15:07,452 - __main__ - INFO - started preprocessing rows


Reading twitter - 1grams ...
Reading twitter - 2grams ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df['Labels'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df["label"] = labeled_df['Labels'].map(label2int).tolist()


Reading twitter - 1grams ...


100%|█████████████████████████████████████████| 36/36 [00:00<00:00, 2432.97it/s]
100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 407.83it/s]
2023-08-14 18:15:09,674 - __main__ - INFO - started preprocessing rows


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


100%|████████████████████████████████████████| 102/102 [00:00<00:00, 838.82it/s]
0it [00:00, ?it/s]
2023-08-14 18:15:11,974 - __main__ - INFO - started preprocessing rows


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 2730.67it/s]
0it [00:00, ?it/s]
2023-08-14 18:15:14,155 - __main__ - INFO - started preprocessing rows


Reading twitter - 1grams ...
Reading twitter - 2grams ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df['Labels'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df["label"] = labeled_df['Labels'].map(label2int).tolist()


Reading twitter - 1grams ...


100%|██████████████████████████████████████████| 61/61 [00:00<00:00, 284.87it/s]
100%|█████████████████████████████████████████| 39/39 [00:00<00:00, 1535.38it/s]
2023-08-14 18:15:16,652 - __main__ - INFO - started preprocessing rows
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df['Labels'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df["label"] = labeled_df['Labels'].map(label2int).tolist()


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


100%|████████████████████████████████████████| 338/338 [00:00<00:00, 954.02it/s]
100%|███████████████████████████████████████| 168/168 [00:00<00:00, 1290.13it/s]
2023-08-14 18:15:19,379 - __main__ - INFO - started preprocessing rows
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df['Labels'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df["label"] = labeled_df['Labels'].map(label2int).tolist()


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 1995.39it/s]
100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 5184.55it/s]
2023-08-14 18:15:21,657 - __main__ - INFO - started preprocessing rows


Reading twitter - 1grams ...
Reading twitter - 2grams ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df['Labels'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df["label"] = labeled_df['Labels'].map(label2int).tolist()


Reading twitter - 1grams ...


100%|█████████████████████████████████████████| 46/46 [00:00<00:00, 1186.70it/s]
100%|█████████████████████████████████████████| 10/10 [00:00<00:00, 1886.01it/s]
2023-08-14 18:15:23,829 - __main__ - INFO - started preprocessing rows


Reading twitter - 1grams ...
Reading twitter - 2grams ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df['Labels'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df["label"] = labeled_df['Labels'].map(label2int).tolist()


Reading twitter - 1grams ...


100%|█████████████████████████████████████████| 53/53 [00:00<00:00, 1665.84it/s]
100%|██████████████████████████████████████████| 25/25 [00:00<00:00, 511.69it/s]
2023-08-14 18:15:26,041 - __main__ - INFO - started preprocessing rows


Reading twitter - 1grams ...
Reading twitter - 2grams ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df['Labels'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df["label"] = labeled_df['Labels'].map(label2int).tolist()


Reading twitter - 1grams ...


100%|█████████████████████████████████████| 1161/1161 [00:01<00:00, 1030.24it/s]
100%|█████████████████████████████████████| 1249/1249 [00:01<00:00, 1147.67it/s]
2023-08-14 18:15:30,377 - __main__ - INFO - started preprocessing rows
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df['Labels'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df["label"] = labeled_df['Labels'].map(label2int).tolist()


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


100%|█████████████████████████████████████████| 55/55 [00:00<00:00, 1125.62it/s]
100%|█████████████████████████████████████████| 38/38 [00:00<00:00, 1686.26it/s]
2023-08-14 18:15:32,781 - __main__ - INFO - started preprocessing rows


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


100%|███████████████████████████████████████████| 1/1 [00:00<00:00, 3132.42it/s]
0it [00:00, ?it/s]
2023-08-14 18:15:35,032 - __main__ - INFO - started preprocessing rows


Reading twitter - 1grams ...
Reading twitter - 2grams ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df['Labels'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df["label"] = labeled_df['Labels'].map(label2int).tolist()


Reading twitter - 1grams ...


100%|███████████████████████████████████████| 198/198 [00:00<00:00, 1175.25it/s]
100%|███████████████████████████████████████| 203/203 [00:00<00:00, 1156.49it/s]
2023-08-14 18:15:37,414 - __main__ - INFO - started preprocessing rows


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


100%|█████████████████████████████████████████| 26/26 [00:00<00:00, 1056.99it/s]
0it [00:00, ?it/s]
2023-08-14 18:15:39,534 - __main__ - INFO - started preprocessing rows


Reading twitter - 1grams ...
Reading twitter - 2grams ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df['Labels'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df["label"] = labeled_df['Labels'].map(label2int).tolist()


Reading twitter - 1grams ...


100%|█████████████████████████████████████████| 82/82 [00:00<00:00, 1559.11it/s]
100%|█████████████████████████████████████████| 50/50 [00:00<00:00, 1701.31it/s]
2023-08-14 18:15:41,739 - __main__ - INFO - started preprocessing rows


Reading twitter - 1grams ...
Reading twitter - 2grams ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df['Labels'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df["label"] = labeled_df['Labels'].map(label2int).tolist()


Reading twitter - 1grams ...


100%|█████████████████████████████████████████| 83/83 [00:00<00:00, 1536.29it/s]
100%|█████████████████████████████████████████| 32/32 [00:00<00:00, 1338.90it/s]
2023-08-14 18:15:43,929 - __main__ - INFO - started preprocessing rows


Reading twitter - 1grams ...
Reading twitter - 2grams ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df['Labels'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df["label"] = labeled_df['Labels'].map(label2int).tolist()


Reading twitter - 1grams ...


100%|███████████████████████████████████████| 106/106 [00:00<00:00, 1024.01it/s]
100%|█████████████████████████████████████████| 52/52 [00:00<00:00, 1529.99it/s]
2023-08-14 18:15:46,305 - __main__ - INFO - started preprocessing rows


Reading twitter - 1grams ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df['Labels'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df["label"] = labeled_df['Labels'].map(label2int).tolist()


Reading twitter - 2grams ...
Reading twitter - 1grams ...


100%|███████████████████████████████████████| 102/102 [00:00<00:00, 1381.48it/s]
100%|█████████████████████████████████████████| 44/44 [00:00<00:00, 1748.11it/s]
2023-08-14 18:15:48,580 - __main__ - INFO - started preprocessing rows


Reading twitter - 1grams ...
Reading twitter - 2grams ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df['Labels'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df["label"] = labeled_df['Labels'].map(label2int).tolist()


Reading twitter - 1grams ...


100%|██████████████████████████████████████████| 77/77 [00:00<00:00, 222.54it/s]
100%|██████████████████████████████████████████| 33/33 [00:00<00:00, 978.09it/s]
2023-08-14 18:15:51,072 - __main__ - INFO - started preprocessing rows
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df['Labels'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df["label"] = labeled_df['Labels'].map(label2int).tolist()


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


100%|█████████████████████████████████████████| 83/83 [00:00<00:00, 1437.08it/s]
100%|█████████████████████████████████████████| 26/26 [00:00<00:00, 1447.00it/s]
2023-08-14 18:15:53,504 - __main__ - INFO - started preprocessing rows


Reading twitter - 1grams ...
Reading twitter - 2grams ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df['Labels'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df["label"] = labeled_df['Labels'].map(label2int).tolist()


Reading twitter - 1grams ...


100%|█████████████████████████████████████████| 78/78 [00:00<00:00, 1381.02it/s]
100%|█████████████████████████████████████████| 59/59 [00:00<00:00, 1084.26it/s]
2023-08-14 18:15:55,782 - __main__ - INFO - started preprocessing rows


Reading twitter - 1grams ...
Reading twitter - 2grams ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df['Labels'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df["label"] = labeled_df['Labels'].map(label2int).tolist()


Reading twitter - 1grams ...


100%|███████████████████████████████████████| 709/709 [00:00<00:00, 1045.53it/s]
100%|███████████████████████████████████████| 687/687 [00:00<00:00, 1019.30it/s]
2023-08-14 18:15:59,268 - __main__ - INFO - started preprocessing rows
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df['Labels'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df["label"] = labeled_df['Labels'].map(label2int).tolist()


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


100%|█████████████████████████████████████████| 51/51 [00:00<00:00, 1547.11it/s]
100%|██████████████████████████████████████████| 37/37 [00:00<00:00, 948.98it/s]
2023-08-14 18:16:01,531 - __main__ - INFO - started preprocessing rows


Reading twitter - 1grams ...
Reading twitter - 2grams ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df['Labels'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df["label"] = labeled_df['Labels'].map(label2int).tolist()


Reading twitter - 1grams ...


100%|███████████████████████████████████████| 276/276 [00:00<00:00, 1158.47it/s]
100%|████████████████████████████████████████| 124/124 [00:00<00:00, 923.60it/s]
2023-08-14 18:16:04,149 - __main__ - INFO - started preprocessing rows
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df['Labels'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df["label"] = labeled_df['Labels'].map(label2int).tolist()


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


100%|█████████████████████████████████████████| 44/44 [00:00<00:00, 1011.38it/s]
100%|█████████████████████████████████████████| 37/37 [00:00<00:00, 1481.01it/s]
2023-08-14 18:16:06,418 - __main__ - INFO - started preprocessing rows


Reading twitter - 1grams ...
Reading twitter - 2grams ...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df['Labels'] = labels
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  labeled_df["label"] = labeled_df['Labels'].map(label2int).tolist()


Reading twitter - 1grams ...


100%|███████████████████████████████████████| 121/121 [00:00<00:00, 1259.88it/s]
100%|█████████████████████████████████████████| 83/83 [00:00<00:00, 1501.44it/s]
2023-08-14 18:16:08,831 - __main__ - INFO - started preprocessing rows


Reading twitter - 1grams ...
Reading twitter - 2grams ...
Reading twitter - 1grams ...


0it [00:00, ?it/s]
100%|███████████████████████████████████████████| 2/2 [00:00<00:00, 3930.93it/s]


In [None]:
# About 60 with multiple labels

In [90]:
data = pd.read_csv('./data/nasa#LC.csv')

In [91]:
data['text'][0]

'fix # <number> , refactor <code> to remove multiple returns <section> - fixes # <number> - simple refactor removes multiple returns from <code> <section> github ci actions all passing successfully ( incl . build + run , unit / coverage tests etc . ) . <section> no change to logic / behavior <section> avi weiss <user>'

In [5]:
data1 = pd.read_csv('/Users/jchang15/scrape_issues/issues/nasa#LC.csv')

In [99]:
i=17
print(data1['Title'].iloc[i] + '\n' + data1['Body'].iloc[i] + '\n \n' + data['text'].iloc[i])

Add break; for switch default case in LC_VerifyMsgLength()
**Checklist**
* [x] I reviewed the [Contributing Guide](https://github.com/nasa/cFS/blob/main/CONTRIBUTING.md).
* [x] I performed a cursory search to see if the bug report is relevant, not redundant, nor in conflict with other tickets.

**Describe the bug**
`default` case of the `switch` block in `LC_VerifyMsgLength()` is missing a `break;`.
Purely a style/guidelines issue for consistency and future maintenance.

**Code snips**
https://github.com/nasa/LC/blob/2f177ae83a24445d6ab6997682a2ffa71dacbd31/fsw/src/lc_utils.c#L83-L92

**Expected behavior**
All switch cases (including `default`) should be terminated by an unconditional `break` statement.

**Reporter Info**
Avi Weiss @thnkslprpt
 
add break ; for switch default case in lc_verifymsglength ( ) <section> <code> case of the <code> block in <code> is missing a <code> . purely a style / guidelines issue for consistency and future maintenance . <section> <url> <s