In [None]:
import pandas as pd
from collections import Counter

# Ratio helpers used by lexicon_preprocess: r1 is the Severe share of a word's total count, r2 the NonSevere share
def get_r1(ns, s):
    """Return the Severe share s / (s + ns), or 0 when the combined count is not positive."""
    total = s + ns
    if total > 0:
        return s / total
    return 0

def get_r2(ns, s):
    """Return the NonSevere share ns / (s + ns), or 0 when the combined count is not positive."""
    total = s + ns
    # Guard first: nothing to divide by when the word has no (positive) total count
    if not (total > 0):
        return 0
    return ns / total

def lexicon_preprocess(severe_word_counts, nonsevere_word_counts):
    """
    Compute per-word severity ratios from two word-count mappings.

    Every word present in either mapping gets one row. A word missing from one
    mapping is treated as having a count of 0 there. ``r1`` comes from
    ``get_r1(ns, s)`` and ``r2`` from ``get_r2(ns, s)``.

    Args:
        severe_word_counts: Dictionary containing word counts for Severe category.
        nonsevere_word_counts: Dictionary containing word counts for NonSevere category.

    Returns:
        DataFrame: indexed by word, with float columns ``r1`` and ``r2`` (ratios only,
        the raw counts are not included). Rows follow first-seen order: Severe keys
        first, then keys that only occur in NonSevere. Empty input yields an empty
        frame that still carries both columns.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order. Iterating a set of
    # strings instead would make the row order vary between kernel runs.
    words = list(dict.fromkeys([*severe_word_counts, *nonsevere_word_counts]))

    r1_values = []
    r2_values = []
    for word in words:
        ns = nonsevere_word_counts.get(word, 0)
        s = severe_word_counts.get(word, 0)
        r1_values.append(get_r1(ns, s))
        r2_values.append(get_r2(ns, s))

    # Building from named columns guarantees 'r1' and 'r2' exist even with zero rows
    return pd.DataFrame({'r1': r1_values, 'r2': r2_values}, index=words, dtype=float)




# Example usage
import json  # required by the json.dumps call at the end of this cell

# Word counts observed in Severe bug summaries
severe_word_counts = Counter({
    'jspindexmanager': 1, 'variety': 1, 'imodule': 1, 'iarchive': 1, 'javaartifacteditmodel': 1,
    'aresource': 1, 'previous': 1, 'containers': 1, 'not_initialize': 1, 'inialize': 1, 'circumstances': 1,
    'g': 1, 'match': 1, 'universalpathtransformer': 1, 'w': 1, 'publishers': 1, 'clear': 1, 'transaction': 1,
    'not_ejbdoclet': 1, 'validatorstrategy': 1, 'unwanted': 1, 'activation': 1, 'serviceref': 1, 'dom': 1,
    'choke': 1, 'earartifactedit': 1, 'getearartifacteditforread': 1, 'healess': 1, 'publisherdelegate': 1,
    'delta': 1, 'kind': 1, 'doesnt': 1, 'erors': 1
})
# Word counts observed in NonSevere bug summaries
nonsevere_word_counts = Counter({
    'update': 46, 'need': 38, 'xml': 35, 'component': 23, 'file': 22, 'wst': 20, 'version': 18, 'jst': 18,
    'wtp': 18, 'feature': 17, 'project': 16, 'java': 14, 'eclipse': 14, 'org': 14, 'jsp': 13, 'bundle': 12,
    'change': 11, 'validation': 11, 'web': 11, 'page': 11, 'doc': 10, 'source': 10, 'build': 10, 'html': 10,
    'remove': 10, 'number': 9, 'editor': 9, 'use': 9, 'api': 9, 'ee': 8, 'ui': 8, 'incorrect': 8, 'server': 8,
    'add': 8, 'miss': 8, 'j': 7, 'type': 7, 'attribute': 7, 'service': 7, 'jar': 6, 'runtime': 6, 'string': 6,
    'webtools': 6, 'plugin': 6, 'wizard': 6, 'common': 6, 'facet': 6, 'view': 5, 'jee': 5, 'delete': 5, 'edit': 5,
    'new': 5, 'name': 5, 'plugins': 5, 'errors': 5, 'tag': 5, 'ear': 5, 'class': 5, 'wsdl': 5, 'invalid': 5,
    'map': 5, 'cannot': 4, 'ws': 4, 'pom': 4, 'model': 4, 'move': 4, 'user': 4, 'npe': 4, 'not': 4
})

# Process the data
payload_train_df = lexicon_preprocess(severe_word_counts, nonsevere_word_counts)
print(payload_train_df)

# Optionally, convert to dictionary for further processing or saving to JSON
payload_train_dict = payload_train_df.to_dict(orient='index')
print(json.dumps(payload_train_dict, indent=2))


In [None]:
def dictionary_onthresholds(severe_threshold, nonsevere_threshold, payload_train):
    """
    Create dictionaries on each combination of severe and nonsevere threshold

    A word enters the severe dictionary when its ``r1`` value is >= severe_threshold
    and the nonsevere dictionary when its ``r2`` value is >= nonsevere_threshold.
    A word can land in both, or in neither. Both dictionaries are also printed.

    Note: this notebook defines this function again in a later cell with the same
    signature; when both cells run, the later definition is the one in effect.

    Args:
        severe_threshold: threshold set manually for severe from 0.1 to 1.0
        nonsevere_threshold: threshold set manually for nonsevere from 0.1 to 1.0
        payload_train: DataFrame indexed by word with ratio columns 'r1' and 'r2'.
            A missing column simply produces an empty dictionary for that side.

    Returns: severe_dictionary, nonsevere_dictionary, severe_threshold, nonsevere_threshold
        Each dictionary maps word -> {'ratio': float}, in the frame's row order.
    """
    def _select(column, threshold):
        # One column-existence check per call instead of one per row
        if column not in payload_train.columns:
            return {}
        ratios = payload_train[column]
        # Vectorised filter; also tolerates repeated index labels (last one wins),
        # where a per-row .at lookup would return a Series and fail the comparison
        kept = ratios[ratios >= threshold]
        return {word: {'ratio': float(value)} for word, value in kept.items()}

    severe_dictionary = _select('r1', severe_threshold)
    nonsevere_dictionary = _select('r2', nonsevere_threshold)

    print("severe_dictionary inside dictionary_onthresholds function", severe_dictionary)
    print("nonsevere_dictionary inside dictionary_onthresholds function", nonsevere_dictionary)

    return severe_dictionary, nonsevere_dictionary, severe_threshold, nonsevere_threshold

# Example usage with given data
import numpy as np

# Thresholds to apply, held as numpy float64 scalars under descriptive keys
winning_threshold = {
    'severe threshold': np.float64(0.1),
    'non severe threshold': np.float64(0.8),
}

# Create example DataFrame for payload_train: one row per word, values are [r1, r2]
payload_train = pd.DataFrame.from_dict(
    {
        'axisclientgenerator': [1.0, 0.0],
        'activation': [0.5, 0.5],
        'ejbbean': [1.0, 0.0],
        'synchhelpers': [1.0, 0.0],
        'toggle': [0.0, 1.0],
    },
    orient='index',
    columns=['r1', 'r2'],
)

severe_threshold = winning_threshold['severe threshold']
nonsevere_threshold = winning_threshold['non severe threshold']

# The function returns the thresholds alongside the two dictionaries
(
    severe_dictionary,
    nonsevere_dictionary,
    severe_threshold,
    nonsevere_threshold,
) = dictionary_onthresholds(severe_threshold, nonsevere_threshold, payload_train)

print("Severe Dictionary:", severe_dictionary)
print("Non-Severe Dictionary:", nonsevere_dictionary)


In [None]:
# # Sample data
# data = {
#     'Summary': [
#         'Invalid version of EAR project not created when creating a new Application Client Project',
#         'NullPointerException when accessing user Error details on the main dashboard',
#         'UI freezes during file upload after recent update',
#         'Crash occurs when opening large projects in Eclipse',
#         'Validation fails for XML schema in version 1.5.2',
#         'Application throws SAXParseException on parsing malformed XML',
#         'Update Manager fails to detect new updates on server',
#         'Error 500: Internal Server Error when accessing the Reports module',
#         'Configuration settings not saved after restart',
#         'Plugin dependency fails issues causing build failures'
#     ],
#     'Severity': [
#         'Severe',
#         'Severe',
#         'NonSevere',
#         'Severe',
#         'NonSevere',
#         'Severe',
#         'Severe',
#         'NonSevere',
#         'NonSevere',
#         'Severe'
#     ]
# }

# # Creating DataFrame
# bugs_df = pd.DataFrame(data)
# print(bugs_df)

In [10]:
def dictionary_onthresholds(severe_threshold, nonsevere_threshold, payload_train):
    """
    Create dictionaries on each combination of severe and nonsevere threshold

    A word enters the severe dictionary when its ``r1`` value is >= severe_threshold
    and the nonsevere dictionary when its ``r2`` value is >= nonsevere_threshold.
    A word can land in both, or in neither. Both dictionaries are also printed.

    Note: this cell redefines a function that an earlier cell of this notebook
    already defines with the same signature; this later definition is the one in
    effect once both cells have run.

    Args:
        severe_threshold: threshold set manually for severe from 0.1 to 1.0
        nonsevere_threshold: threshold set manually for nonsevere from 0.1 to 1.0
        payload_train: DataFrame indexed by word with ratio columns 'r1' and 'r2'.
            A missing column simply produces an empty dictionary for that side.

    Returns: severe_dictionary, nonsevere_dictionary, severe_threshold, nonsevere_threshold
        Each dictionary maps word -> {'ratio': float}, in the frame's row order.
    """
    def _select(column, threshold):
        # One column-existence check per call instead of one per row
        if column not in payload_train.columns:
            return {}
        ratios = payload_train[column]
        # Vectorised filter; also tolerates repeated index labels (last one wins),
        # where a per-row .at lookup would return a Series and fail the comparison
        kept = ratios[ratios >= threshold]
        return {word: {'ratio': float(value)} for word, value in kept.items()}

    severe_dictionary = _select('r1', severe_threshold)
    nonsevere_dictionary = _select('r2', nonsevere_threshold)

    print("severe_dictionary inside dictionary_onthresholds function", severe_dictionary)
    print("nonsevere_dictionary inside dictionary_onthresholds function", nonsevere_dictionary)

    return severe_dictionary, nonsevere_dictionary, severe_threshold, nonsevere_threshold

# Example usage with sample data
import numpy as np
import pandas as pd

# Sample DataFrame for payload_train: one row per word, values are [r1, r2]
payload_train = pd.DataFrame.from_dict(
    {
        'report': [0.0, 1.0],
        'nullpointerexception': [1.0, 0.0],
        'schema': [0.0, 1.0],
        'access': [0.5, 0.5],
        'fail': [0.5, 0.5],
        'open': [1.0, 0.0],
        'issue': [1.0, 0.0],
        'internal': [0.0, 1.0],
        'dashboard': [1.0, 0.0],
        'error': [0.333333, 0.666667],
    },
    orient='index',
    columns=['r1', 'r2'],
)

# Cut-offs for this demonstration
severe_threshold = 0.5
nonsevere_threshold = 0.8

# The function returns the thresholds alongside the two dictionaries
(
    severe_dictionary,
    nonsevere_dictionary,
    severe_threshold,
    nonsevere_threshold,
) = dictionary_onthresholds(severe_threshold, nonsevere_threshold, payload_train)

print("Severe Dictionary:", severe_dictionary)
print("Non-Severe Dictionary:", nonsevere_dictionary)


severe_dictionary inside dictionary_onthresholds function {'nullpointerexception': {'ratio': 1.0}, 'access': {'ratio': 0.5}, 'fail': {'ratio': 0.5}, 'open': {'ratio': 1.0}, 'issue': {'ratio': 1.0}, 'dashboard': {'ratio': 1.0}}
nonsevere_dictionary inside dictionary_onthresholds function {'report': {'ratio': 1.0}, 'schema': {'ratio': 1.0}, 'internal': {'ratio': 1.0}}
Severe Dictionary: {'nullpointerexception': {'ratio': 1.0}, 'access': {'ratio': 0.5}, 'fail': {'ratio': 0.5}, 'open': {'ratio': 1.0}, 'issue': {'ratio': 1.0}, 'dashboard': {'ratio': 1.0}}
Non-Severe Dictionary: {'report': {'ratio': 1.0}, 'schema': {'ratio': 1.0}, 'internal': {'ratio': 1.0}}


In [None]:
def nlpsteps(x):
    """
    Turn a bug summary into a list of cleaned tokens.

    The input is converted with ``str``, every character that is not an ASCII
    letter becomes a space, and the text is lowercased and split on whitespace.
    English stopwords are then dropped, except that 'not' is kept.

    Negation handling: a 'not' that is followed by another word is emitted as the
    single token ``'not_<next word>'``. The following word is used verbatim
    (it is neither lemmatized nor checked against the stopword list) and is
    consumed by the merge. A 'not' in last position is kept as an ordinary token.

    Args:
        x (str): The summary text to be processed.

    Returns:
        list: The processed tokens, in their original order.
    """
    # NOTE(review): re, WordNetLemmatizer and stopwords are not imported in the visible
    # cells; presumably they are imported elsewhere in the notebook. Please confirm.

    # Non-letters -> spaces, lowercase, whitespace split
    words = re.sub('[^a-zA-Z]', ' ', str(x)).lower().split()

    lemmatizer = WordNetLemmatizer()

    # 'not' is taken out of the stopword set so negations survive the filter
    all_stopwords = set(stopwords.words('english'))
    all_stopwords.remove('not')

    tokens = []
    skip_next = False
    for position, word in enumerate(words):
        if skip_next:
            # This word was already merged into the preceding 'not_' token
            skip_next = False
            continue

        has_follower = position + 1 < len(words)
        if word == 'not' and has_follower:
            tokens.append('not_' + words[position + 1])
            skip_next = True
        elif word not in all_stopwords:
            tokens.append(lemmatizer.lemmatize(word))

    return tokens


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

import pandas as pd

# Sample data: a single, already tokenised document
trainingdata_tokenised = ['jsp file not_indexed jsp model plugin not_activated']

# Learn the vocabulary from the sample and build the dense count matrix
cv = CountVectorizer()
X_train = cv.fit_transform(trainingdata_tokenised).toarray()

# Label the matrix columns with the learned vocabulary terms
feature_names = cv.get_feature_names_out()
df = pd.DataFrame(data=X_train, columns=feature_names)

# Show the vocabulary, a blank separator line, then the document-term matrix
print("Feature Names:", feature_names)
print('')
print(df)
