In [None]:
import pandas as pd
from gensim import corpora
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
import string
import nltk

In [None]:
# Download required NLTK data files
# (three packages are requested: 'punkt', 'stopwords' and
# 'averaged_perceptron_tagger'; preprocess() in this cell calls
# word_tokenize, stopwords.words and pos_tag)
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

# Sample data
# A one-element list holding a single issue text; the string embeds literal
# "\n" escapes and a Markdown ```text fence as part of the sample.
issues = [
    "Seems like this particular input is confusing the sentence boundary detection (full stop prediction) head. Note how the first instance of `thank` was capitalized after the comma, too. The model seems confused as to where the sentences end (likely due to repeated texts which are unusual of the training data). I would recommend the better model, This is the output I get with that one: ```text After all, isn't that what we're here for? \n That's why we're here for. \n Well, Rob, you are an inspiration for the nation. \n Thank you for coming on, thank you for having me. \n Thank you so much for watching this week's episode. ```"
]

# Preprocessing function to extract meaningful key phrases
def preprocess(text):
    """Return up to two noun/verb tokens found in ``text``.

    The text is lower-cased and tokenized with ``word_tokenize``. Tokens that
    are not purely alphabetic, or that appear in NLTK's English stop-word
    list, are discarded. The survivors are POS-tagged and only tokens whose
    tag starts with ``NN`` or ``VB`` are kept.

    Note: tagging runs on the filtered, lower-cased token list, not on the
    original sentence.

    Args:
        text: the raw text to analyse.

    Returns:
        A list with the first two selected tokens in order of appearance
        (shorter if fewer than two were selected).
    """
    lowered_words = word_tokenize(text.lower())
    english_stops = set(stopwords.words('english'))
    candidates = [
        w for w in lowered_words
        if w.isalpha() and w not in english_stops
    ]

    selected = []
    for word, tag in pos_tag(candidates):
        # str.startswith accepts a tuple: matches either tag family
        if tag.startswith(('NN', 'VB')):
            selected.append(word)

    return selected[:2]

# Run every issue text through preprocess(): one token list per issue
processed_issues = list(map(preprocess, issues))

# Build the gensim vocabulary, then encode each token list with doc2bow
dictionary = corpora.Dictionary(processed_issues)
corpus = list(map(dictionary.doc2bow, processed_issues))

# Show the intermediate results
print("Processed Issues (Meaningful key phrases):", processed_issues)
print("Dictionary:", dictionary)
print("Corpus:", corpus)


In [None]:
import pandas as pd
from gensim import corpora
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import nltk
import spacy

# Download required NLTK data files
# NOTE(review): preprocess() in this cell only uses the spaCy pipeline `nlp`;
# nothing in this cell's code calls into NLTK, so these downloads look like a
# leftover from the previous cell - confirm before removing.
nltk.download('punkt')
nltk.download('stopwords')

# Sample data
# A one-element list holding a single issue text; the string embeds literal
# "\n" escapes and a Markdown ```text fence as part of the sample.
issues = [
    "Seems like this particular input is confusing the sentence boundary detection (full stop prediction) head. Note how the first instance of `thank` was capitalized after the comma, too. The model seems confused as to where the sentences end (likely due to repeated texts which are unusual of the training data). I would recommend the better model, This is the output I get with that one: ```text After all, isn't that what we're here for? \n That's why we're here for. \n Well, Rob, you are an inspiration for the nation. \n Thank you for coming on, thank you for having me. \n Thank you so much for watching this week's episode. ```"
]

# Load English language model in spaCy
# (assumes the 'en_core_web_sm' model package is already installed in this
# environment - it is not installed anywhere in this cell)
nlp = spacy.load('en_core_web_sm')

# Preprocessing function using Dependency Parsing
def preprocess(text):
    """Return up to two tokens of ``text`` chosen by dependency label.

    The text is parsed with the module-level spaCy pipeline ``nlp``. A token
    is selected when its dependency label is ``nsubj`` (subject), ``dobj``
    (direct object) or ``pobj`` (object of a preposition).

    Args:
        text: the raw text to analyse.

    Returns:
        A list with the surface text of the first two selected tokens, in
        document order (shorter if fewer than two were selected).
    """
    wanted_labels = ['nsubj', 'dobj', 'pobj']
    parsed = nlp(text)
    selected = [tok.text for tok in parsed if tok.dep_ in wanted_labels]
    return selected[:2]

# Run every issue text through the dependency-based preprocess():
# one token list per issue
processed_issues = list(map(preprocess, issues))

# Build the gensim vocabulary, then encode each token list with doc2bow
dictionary = corpora.Dictionary(processed_issues)
corpus = list(map(dictionary.doc2bow, processed_issues))

# Show the intermediate results
print("Processed Issues (Meaningful key phrases):", processed_issues)
print("Dictionary:", dictionary)
print("Corpus:", corpus)


In [7]:
import pandas as pd
from gensim import corpora
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import nltk
import re

# Download required NLTK data files
nltk.download('punkt')
nltk.download('stopwords')
# preprocess() in this cell calls nltk.pos_tag, which needs the tagger model;
# without this line the cell only works if another cell fetched it earlier.
nltk.download('averaged_perceptron_tagger')

def clean_commit_message(commit_message):
    """Strip non-ASCII runs and double-quoted spans from a message.

    Steps, in order:
      1. Every run of non-ASCII characters (emoji, accented letters, ...) is
         replaced by a single space.
      2. Every ``"..."`` span on a single line is deleted *together with its
         content*; the pattern's ``.`` does not match newlines, so a quoted
         span that crosses a line break is left as is.
      3. Leading and trailing whitespace is trimmed.

    Args:
        commit_message: the raw message text.

    Returns:
        The cleaned string.
    """
    ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', commit_message)
    without_quoted = re.sub(r'"(.*?)"', '', ascii_only)
    return without_quoted.strip()

# Example commit message
# The sample contains two double-quoted spans and ends with non-ASCII
# characters; clean_commit_message() removes both kinds of content.
commit_message = """
I am looking for this model to fine-tune my own data (such as medical science) and after the training, I want it to be able to answer the questions. Then I am not looking for the "extractive answers" where it returns the start and end sequence (which is pretty much related the given context scenario) but a "generative case" where I train the model with my data and then answer the question, and from its (the model's) own understanding for my data, it should be able to give me the answers. If anybody knows how to achieve that with this model, please let me know! Thank you so much ü§ó
"""

# Clean the commit message
cleaned_message = clean_commit_message(commit_message)

# Preprocessing function using POS tagging and refined heuristic approach
def preprocess(text):
    """Extract up to two two-word key phrases from ``text``.

    The text is lower-cased and tokenized with ``word_tokenize``. Tokens that
    are not purely alphabetic, or that appear in NLTK's English stop-word
    list, are discarded, and the remaining tokens are POS-tagged. Each pair
    of adjacent remaining tokens becomes a phrase when the first tag starts
    with ``NN`` or ``VB`` and the second tag starts with ``NN``
    (noun-noun or verb-noun).

    Note: adjacency is measured after filtering, so the two words of a
    phrase need not be neighbours in the original text.

    Args:
        text: the raw text to analyse.

    Returns:
        A list with the first two phrases (``"word1 word2"`` strings) in
        order of appearance; an empty list when no pair matches.
    """
    # Tokenize, then keep alphabetic non-stop-word tokens only
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]

    tagged_tokens = nltk.pos_tag(tokens)

    # Walk over adjacent (current, next) pairs; both accepted patterns end in
    # a noun, so a single condition covers noun-noun and verb-noun.
    meaningful_tokens = [
        word + ' ' + next_word
        for (word, tag), (next_word, next_tag) in zip(tagged_tokens, tagged_tokens[1:])
        if tag.startswith(('NN', 'VB')) and next_tag.startswith('NN')
    ]

    # Slicing an empty list already yields [], so no special case is needed
    return meaningful_tokens[:2]

# Preprocess cleaned commit message to extract meaningful key phrases using POS tagging and refined heuristic approach
# (the result is a flat list of phrase strings for this one message)
processed_issues = preprocess(cleaned_message)

# Create dictionary and corpus for gensim (for demonstration)
# The single phrase list is wrapped in an outer list so it is passed as one
# document; doc2bow then encodes that document as (token_id, count) pairs.
dictionary = corpora.Dictionary([processed_issues])
corpus = [dictionary.doc2bow(processed_issues)]

# Print processed data
print("Processed Issues (Meaningful key phrases):", processed_issues)
print("Dictionary:", dictionary)
print("Corpus:", corpus)


Processed Issues (Meaningful key phrases): ['looking model', 'model data']
Dictionary: Dictionary<2 unique tokens: ['looking model', 'model data']>
Corpus: [[(0, 1), (1, 1)]]


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adekunleajibode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adekunleajibode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
pip install summa

Collecting summa
  Downloading summa-1.2.0.tar.gz (54 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m54.9/54.9 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: summa
  Building wheel for summa (setup.py) ... [?25ldone
[?25h  Created wheel for summa: filename=summa-1.2.0-py3-none-any.whl size=54388 sha256=6939ccd536201a226c410d52a9b42fd497dd234c5589be3298bdf16ac2e5195a
  Stored in directory: /Users/adekunleajibode/Library/Caches/pip/wheels/10/2d/7a/abce87c4ea233f8dcca0d99b740ac0257eced1f99a124a0e1f
Successfully built summa
Installing collected packages: summa
Successfully installed summa-1.2.0
Note: you may need to restart the kernel to use updated packages.


In [27]:
import pandas as pd
from gensim import corpora
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import nltk
import re

# Make sure the NLTK data packages this cell relies on are available locally
for resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(resource)

def clean_commit_message(commit_message):
    """Strip non-ASCII runs and double-quoted spans from a message.

    The substitutions are applied in this order:
      * each run of non-ASCII characters becomes one space;
      * each single-line ``"..."`` span is removed, content included
        (``.`` does not match newlines, so multi-line spans are kept).
    The result is then trimmed of leading/trailing whitespace.

    Args:
        commit_message: the raw message text.

    Returns:
        The cleaned string.
    """
    substitutions = (
        (r'[^\x00-\x7F]+', ' '),   # emojis and other non-ASCII symbols
        (r'"(.*?)"', ''),          # quoted spans, quotes and content
    )
    result = commit_message
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.strip()

# Example commit message
# (the triple-quoted sample starts with a newline and ends with blank lines;
# clean_commit_message() strips that surrounding whitespace)
commit_message = """
Would be great for this model to be citable ! Side question: was this trained from scratch by HF? Is there an original paper to be cited if we use this in a publication?


"""

# Clean the commit message
cleaned_message = clean_commit_message(commit_message)

# Function to preprocess and extract meaningful key phrases using NLTK
def preprocess(text):
    """Extract up to two noun-noun key phrases from ``text``.

    The text is lower-cased and tokenized with ``word_tokenize``. Tokens that
    are not purely alphabetic, or that appear in NLTK's English stop-word
    list, are discarded, and the remaining tokens are POS-tagged. Each pair
    of adjacent remaining tokens whose tags both start with ``NN`` becomes a
    phrase.

    Note: adjacency is measured after filtering, so the two words of a
    phrase need not be neighbours in the original text.

    NOTE(review): the previous condition tested ``startswith('NN')`` twice on
    each side of an ``or``; the duplicate suggests a second tag prefix may
    have been intended. Behaviour is kept as noun-noun only - confirm.

    Args:
        text: the raw text to analyse.

    Returns:
        A list with the first two phrases (``"word1 word2"`` strings) in
        order of appearance; an empty list when no pair matches.
    """
    # Tokenize, then keep alphabetic non-stop-word tokens only
    tokens = word_tokenize(text.lower())
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]

    tagged_tokens = nltk.pos_tag(tokens)

    # Walk over adjacent (current, next) pairs and keep noun-noun bigrams
    meaningful_tokens = [
        word + ' ' + next_word
        for (word, tag), (next_word, next_tag) in zip(tagged_tokens, tagged_tokens[1:])
        if tag.startswith('NN') and next_tag.startswith('NN')
    ]

    return meaningful_tokens[:2]

# Preprocess and extract meaningful key phrases using NLTK
# (the result is a flat list of phrase strings for this one message)
processed_issues = preprocess(cleaned_message)

# Create dictionary and corpus for gensim (for demonstration)
# The single phrase list is wrapped in an outer list so it is passed as one
# document; doc2bow then encodes that document as (token_id, count) pairs.
dictionary = corpora.Dictionary([processed_issues])
corpus = [dictionary.doc2bow(processed_issues)]

# Print processed data
print("Processed Issues (Meaningful key phrases):", processed_issues)
print("Dictionary:", dictionary)
print("Corpus:", corpus)


Processed Issues (Meaningful key phrases): ['scratch hf']
Dictionary: Dictionary<1 unique tokens: ['scratch hf']>
Corpus: [[(0, 1)]]


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adekunleajibode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adekunleajibode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/adekunleajibode/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [46]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim import corpora

# Make sure the NLTK data packages this cell relies on are available locally
for resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(resource)

def clean_commit_message(commit_message):
    """Strip non-ASCII runs and double-quoted spans from a message.

    Steps, in order:
      1. Every run of non-ASCII characters (emoji, accented letters, ...) is
         replaced by a single space.
      2. Every ``"..."`` span on a single line is deleted *together with its
         content*; the pattern's ``.`` does not match newlines, so a quoted
         span that crosses a line break is left as is.
      3. Leading and trailing whitespace is trimmed.

    Args:
        commit_message: the raw message text.

    Returns:
        The cleaned string.
    """
    return re.sub(
        r'"(.*?)"',
        '',
        re.sub(r'[^\x00-\x7F]+', ' ', commit_message),
    ).strip()

# Example commit message
# (the triple-quoted sample starts with a newline and ends with a blank line;
# clean_commit_message() strips that surrounding whitespace)
commit_message = """
Would be great for this model to be citable ! Side question: was this trained from scratch by HF? Is there an original paper to be cited if we use this in a publication?

"""

# Clean the commit message
cleaned_message = clean_commit_message(commit_message)

# Function to preprocess and extract meaningful key phrases using NLTK
def preprocess(text):
    """Extract up to two two-word key phrases from ``text``.

    The text is lower-cased and tokenized with ``word_tokenize``. Tokens that
    are not purely alphabetic, or that appear in NLTK's English stop-word
    list, are discarded, and the remaining tokens are POS-tagged. A pair of
    adjacent remaining tokens becomes a phrase when it matches one of:

      * Noun-Noun: both tags start with ``NN``;
      * Verb-Noun: first tag starts with ``VB``, second with ``NN``.

    Adjacency is measured after filtering, so the two words of a phrase need
    not be neighbours in the original text.

    Args:
        text: the raw text to analyse.

    Returns:
        A list with the first two phrases (``"word1 word2"`` strings) in
        order of appearance (shorter if fewer than two pairs match).
    """
    words = word_tokenize(text.lower())
    english_stops = set(stopwords.words('english'))
    words = [w for w in words if w.isalpha() and w not in english_stops]

    tagged = nltk.pos_tag(words)

    phrases = []
    for (first, first_tag), (second, second_tag) in zip(tagged, tagged[1:]):
        # Both accepted patterns end in a noun; the head is a noun or a verb
        if first_tag.startswith(('NN', 'VB')) and second_tag.startswith('NN'):
            phrases.append(first + ' ' + second)

    return phrases[:2]

# Preprocess and extract meaningful key phrases using NLTK
# (the result is a flat list of phrase strings for this one message)
processed_issues = preprocess(cleaned_message)

# Create dictionary and corpus for gensim (for demonstration)
# The single phrase list is wrapped in an outer list so it is passed as one
# document; doc2bow then encodes that document as (token_id, count) pairs.
dictionary = corpora.Dictionary([processed_issues])
corpus = [dictionary.doc2bow(processed_issues)]

# Print processed data
print("Processed Issues (Meaningful key phrases):", processed_issues)
print("Dictionary:", dictionary)
print("Corpus:", corpus)


Processed Issues (Meaningful key phrases): ['trained scratch', 'scratch hf']
Dictionary: Dictionary<2 unique tokens: ['scratch hf', 'trained scratch']>
Corpus: [[(0, 1), (1, 1)]]


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adekunleajibode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adekunleajibode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/adekunleajibode/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [47]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from gensim import corpora

# Make sure the NLTK data packages this cell relies on are available locally
for resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(resource)

def clean_commit_message(commit_message):
    """Strip non-ASCII runs and double-quoted spans from a message.

    The substitutions are applied in this order:
      * each run of non-ASCII characters becomes one space;
      * each single-line ``"..."`` span is removed, content included
        (``.`` does not match newlines, so multi-line spans are kept).
    The result is then trimmed of leading/trailing whitespace.

    Args:
        commit_message: the raw message text.

    Returns:
        The cleaned string.
    """
    substitutions = (
        (r'[^\x00-\x7F]+', ' '),   # emojis and other non-ASCII symbols
        (r'"(.*?)"', ''),          # quoted spans, quotes and content
    )
    result = commit_message
    for pattern, replacement in substitutions:
        result = re.sub(pattern, replacement, result)
    return result.strip()

# Example commit message
# (the triple-quoted sample starts with a newline and ends with a blank line;
# clean_commit_message() strips that surrounding whitespace)
commit_message = """
Would be great for this model to be citable ! Side question: was this trained from scratch by HF? Is there an original paper to be cited if we use this in a publication?

"""

# Clean the commit message
cleaned_message = clean_commit_message(commit_message)

# Function to preprocess and extract meaningful key phrases using NLTK
def preprocess(text):
    """Extract up to two two-word key phrases from ``text``.

    Pipeline: lower-case and tokenize with ``word_tokenize``; drop tokens that
    are not purely alphabetic; drop NLTK English stop words; POS-tag what is
    left. Adjacent tagged tokens form a phrase when the second tag starts
    with ``NN`` and the first starts with ``NN`` (Noun-Noun) or ``VB``
    (Verb-Noun).

    Adjacency is measured after filtering, so the two words of a phrase need
    not be neighbours in the original text.

    Args:
        text: the raw text to analyse.

    Returns:
        A list with the first two phrases (``"word1 word2"`` strings) in
        order of appearance (shorter if fewer than two pairs match).
    """
    alpha_tokens = [t for t in word_tokenize(text.lower()) if t.isalpha()]
    stop_set = set(stopwords.words('english'))
    tagged = nltk.pos_tag([t for t in alpha_tokens if t not in stop_set])

    head_prefixes = ('NN', 'VB')  # allowed tag families for the first word
    bigrams = [
        ' '.join((left[0], right[0]))
        for left, right in zip(tagged, tagged[1:])
        if left[1].startswith(head_prefixes) and right[1].startswith('NN')
    ]
    return bigrams[:2]

# Preprocess and extract meaningful key phrases using NLTK
# (the result is a flat list of phrase strings for this one message)
processed_issues = preprocess(cleaned_message)

# Create dictionary and corpus for gensim (for demonstration)
# The single phrase list is wrapped in an outer list so it is passed as one
# document; doc2bow then encodes that document as (token_id, count) pairs.
dictionary = corpora.Dictionary([processed_issues])
corpus = [dictionary.doc2bow(processed_issues)]

# Print processed data
print("Processed Issues (Meaningful key phrases):", processed_issues)
print("Dictionary:", dictionary)
print("Corpus:", corpus)


Processed Issues (Meaningful key phrases): ['trained scratch', 'scratch hf']
Dictionary: Dictionary<2 unique tokens: ['scratch hf', 'trained scratch']>
Corpus: [[(0, 1), (1, 1)]]


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/adekunleajibode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/adekunleajibode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/adekunleajibode/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
