In [1]:
from sqlalchemy import create_engine
import pandas as pd
import os
import urllib.parse
import json

# Build the SQLAlchemy connection URL, format: mysql+pymysql://user:password@host/database
# The password comes from the MYSQL_PASSWORD environment variable (a KeyError is
# raised if it is unset) and is percent-encoded with quote_plus so that special
# characters in it cannot break the URL. User, host and database are fixed to
# root / localhost / tawos.
password = urllib.parse.quote_plus(os.environ['MYSQL_PASSWORD'])
engine = create_engine(
    f"mysql+pymysql://root:{password}@localhost/tawos"
)

# Single SELECT that flattens project, repository, sprint and issue columns into
# one wide row, with every column aliased by its source table (project_*,
# repository_*, sprint_*, issue_*).
# - All joins are LEFT JOINs starting from Project, so a project without issues,
#   or an issue without a sprint, still yields a row with NULLs in the joined
#   columns.
# - `comments` and `change_logs` are correlated subqueries that collect the
#   Comment / Change_Log rows of the current issue into one JSON array each.
#   JSON_ARRAYAGG yields NULL when there are no matching rows, so these columns
#   can be NULL.
query = """
SELECT
    p.Name AS project_name,
    p.Description AS project_description,

    r.Name AS repository_name,
    r.Description AS repository_description,

    s.ID AS sprint_id,
    s.JiraID AS sprint_jira_id,
    s.Name AS sprint_name,
    s.State AS sprint_state,
    s.Start_Date AS sprint_start_date,
    s.End_Date AS sprint_end_date,
    s.Activated_Date AS sprint_activated_date,
    s.Complete_Date AS sprint_complete_date,
    s.Project_ID AS sprint_project_id,

    i.ID AS issue_id,
    i.Jira_ID AS issue_jira_id,
    i.Issue_Key AS issue_issue_key,
    i.URL AS issue_url,
    i.Title AS issue_title,
    i.Description AS issue_description,
    i.Description_Text AS issue_description_text,
    i.Description_Code AS issue_description_code,
    i.Type AS issue_type,
    i.Priority AS issue_priority,
    i.Status AS issue_status,
    i.Resolution AS issue_resolution,
    i.Creation_Date AS issue_creation_date,
    i.Estimation_Date AS issue_estimation_date,
    i.Resolution_Date AS issue_resolution_date,
    i.Last_Updated AS issue_last_updated,
    i.Story_Point AS issue_story_point,
    i.Timespent AS issue_timespent,
    i.In_Progress_Minutes AS issue_in_progress_minutes,
    i.Total_Effort_Minutes AS issue_total_effort_minutes,
    i.Resolution_Time_Minutes AS issue_resolution_time_minutes,
    i.Title_Changed_After_Estimation AS issue_title_changed_after_estimation,
    i.Description_Changed_After_Estimation AS issue_description_changed_after_estimation,
    i.Story_Point_Changed_After_Estimation AS issue_story_point_changed_after_estimation,
    i.Pull_Request_URL AS issue_pull_request_url,
    i.Creator_ID AS issue_creator_id,
    i.Reporter_ID AS issue_reporter_id,
    i.Assignee_ID AS issue_assignee_id,
    i.Project_ID AS issue_project_id,
    i.Sprint_ID AS issue_sprint_id,

    -- JSON_ARRAYAGG for comments
    (
      SELECT JSON_ARRAYAGG(
        JSON_OBJECT(
          'ID', c.ID,
          'Comment', c.Comment,
          'Comment_Text', c.Comment_Text,
          'Comment_Code', c.Comment_Code,
          'Creation_Date', c.Creation_Date,
          'Author_ID', c.Author_ID
        )
      )
      FROM Comment c
      WHERE c.Issue_ID = i.ID
    ) AS comments,

    -- JSON_ARRAYAGG for change logs
    (
      SELECT JSON_ARRAYAGG(
        JSON_OBJECT(
          'ID', ch.ID,
          'Field', ch.Field,
          'From_Value', ch.From_Value,
          'To_Value', ch.To_Value,
          'From_String', ch.From_String,
          'To_String', ch.To_String,
          'Change_Type', ch.Change_Type,
          'Creation_Date', ch.Creation_Date,
          'Author_ID', ch.Author_ID
        )
      )
      FROM Change_Log ch
      WHERE ch.Issue_ID = i.ID
    ) AS change_logs

FROM Project p
LEFT JOIN Repository r
    ON p.Repository_ID = r.ID
LEFT JOIN Issue i
    ON i.Project_ID = p.ID
LEFT JOIN Sprint s
    ON i.Sprint_ID = s.ID
;
"""

# Execute the query and load the complete result set into a DataFrame; the JSON
# array columns are not decoded at this point. The trailing expression shows
# the first five rows as the cell output.
df_query = pd.read_sql(query, con=engine)
df_query.head()


Unnamed: 0,project_name,project_description,repository_name,repository_description,sprint_id,sprint_jira_id,sprint_name,sprint_state,sprint_start_date,sprint_end_date,...,issue_description_changed_after_estimation,issue_story_point_changed_after_estimation,issue_pull_request_url,issue_creator_id,issue_reporter_id,issue_assignee_id,issue_project_id,issue_sprint_id,comments,change_logs
0,Spring XD,Spring XD makes it easy to solve common big da...,Spring,The Spring Framework is an application framewo...,,,,,,,...,0,0,,68.0,68.0,,1,,,
1,Spring XD,Spring XD makes it easy to solve common big da...,Spring,The Spring Framework is an application framewo...,,,,,,,...,0,0,,69.0,69.0,,1,,,
2,Spring XD,Spring XD makes it easy to solve common big da...,Spring,The Spring Framework is an application framewo...,,,,,,,...,0,0,,70.0,70.0,,1,,,
3,Spring XD,Spring XD makes it easy to solve common big da...,Spring,The Spring Framework is an application framewo...,,,,,,,...,0,0,,72.0,72.0,71.0,1,,"[{""ID"": 4441, ""Comment"": ""See https://github.c...","[{""ID"": 6, ""Field"": ""Fix Version"", ""To_Value"":..."
4,Spring XD,Spring XD makes it easy to solve common big da...,Spring,The Spring Framework is an application framewo...,,,,,,,...,0,0,,73.0,73.0,,1,,,


In [2]:
# Work on a copy so the raw query result in `df_query` is never modified.
# Aliasing it (df = df_query) made this cell non-idempotent: a second run fed
# the already-decoded lists back into json.loads and raised TypeError.
df = df_query.copy()


def _parse_json_array(value):
    """Decode one JSON-array cell; return [] when the cell holds nothing to decode."""
    if isinstance(value, list):
        # Already decoded.
        return value
    # SQL NULL shows up as None (falsy) or as a float NaN (truthy, hence the
    # explicit self-inequality check); empty strings are falsy as well.
    if not value or (isinstance(value, float) and value != value):
        return []
    return json.loads(value)


# Convert JSON columns to Python objects
df['comments'] = df['comments'].apply(_parse_json_array)
df['change_logs'] = df['change_logs'].apply(_parse_json_array)

# Columns that are primary/foreign keys (every column whose name ends with '_id')
columns_to_drop = [col for col in df.columns if col.endswith('_id')]

# Keep only rows that belong to a sprint (sprint_name is not null)
df = df.dropna(subset=['sprint_name'])

# Exploration frame: sprint-bound rows without the key columns
df_explore = df.drop(columns=columns_to_drop)

# Print the remaining columns and a few rows of data for exploration
print("Remaining columns:")
print(df_explore.columns.tolist())
print("\nSample data:")
print(df_explore.head())

Remaining columns:
['project_name', 'project_description', 'repository_name', 'repository_description', 'sprint_name', 'sprint_state', 'sprint_start_date', 'sprint_end_date', 'sprint_activated_date', 'sprint_complete_date', 'issue_issue_key', 'issue_url', 'issue_title', 'issue_description', 'issue_description_text', 'issue_description_code', 'issue_type', 'issue_priority', 'issue_status', 'issue_resolution', 'issue_creation_date', 'issue_estimation_date', 'issue_resolution_date', 'issue_last_updated', 'issue_story_point', 'issue_timespent', 'issue_in_progress_minutes', 'issue_total_effort_minutes', 'issue_resolution_time_minutes', 'issue_title_changed_after_estimation', 'issue_description_changed_after_estimation', 'issue_story_point_changed_after_estimation', 'issue_pull_request_url', 'comments', 'change_logs']

Sample data:
   project_name                                project_description  \
24    Spring XD  Spring XD makes it easy to solve common big da...   
25    Spring XD  Sprin

In [19]:
# Generate synthetic sprint goals from issue descriptions
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Check if GPU is available
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Seed torch's random number generators: generation below samples tokens
# (do_sample=True), so without a seed a fresh "Restart & Run All" would not
# reproduce the same sprint goals.
torch.manual_seed(42)

# Use the Qwen2.5-1.5B-Instruct model
model_name = "Qwen/Qwen2.5-1.5B-Instruct"

# Set up the model and tokenizer.
# trust_remote_code is deliberately not passed: it would allow Python code from
# the model repository to run locally. NOTE(review): this assumes the installed
# transformers release supports this checkpoint natively - confirm.
print(f"Loading {model_name}...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # Let the library place the weights on the GPU when one is available;
    # otherwise the model stays on the CPU.
    device_map="auto" if device == "cuda" else None,
)
print(f"Successfully loaded {model_name} for text generation")

def generate_sprint_goal(issue_descriptions, max_length=150):
    """
    Generate a synthetic sprint goal based on a collection of issue descriptions.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        issue_descriptions (str): Combined issue descriptions from a sprint.
            Only the first 2000 characters are sent to the model; longer input
            is cut and suffixed with "...".
        max_length (int): Maximum number of newly generated tokens (forwarded
            to ``model.generate`` as ``max_new_tokens``).

    Returns:
        str: The generated text with everything up to and including the first
        "**Sprint Goal:**" marker removed when that marker is present. If any
        exception occurs during generation, the error is printed and a generic
        fallback sentence is returned instead.
    """
    # Character budget for the issue text, to keep the prompt within token limits
    max_chars = 2000
    if len(issue_descriptions) > max_chars:
        truncated_descriptions = issue_descriptions[:max_chars] + "..."
    else:
        truncated_descriptions = issue_descriptions

    prompt = f"""Based on the following issues in our sprint backlog, create a clear and concise sprint goal.

    FORMAT YOUR RESPONSE EXACTLY AS:
    **Sprint Goal:** [your concise sprint goal here]

    DO NOT include any explanations, introductions, conclusions or additional notes.
    Issues:
    {truncated_descriptions}"""

    # Chat-style message list for the instruct model
    messages = [
        {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You help create precise sprint goals for development teams."},
        {"role": "user", "content": prompt}
    ]

    try:
        # add_generation_prompt=True appends the assistant-turn header so the
        # model answers the request instead of continuing the user message.
        # return_dict=True also yields the attention mask that generate() expects.
        inputs = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_dict=True,
            return_tensors="pt",
        )

        # The model may live on a different device than the freshly built
        # tensors; follow the device of its first parameter.
        model_device = next(model.parameters()).device
        inputs = {name: tensor.to(model_device) for name, tensor in inputs.items()}
        prompt_length = inputs["input_ids"].shape[1]

        outputs = model.generate(
            **inputs,
            max_new_tokens=max_length,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
        )

        # Decode only the newly generated tokens (not the prompt)
        generated_text = tokenizer.decode(
            outputs[0][prompt_length:], skip_special_tokens=True
        ).strip()

        # Keep only the content after "**Sprint Goal:**" if the model used it
        sprint_goal = generated_text
        sprint_goal_marker = "**Sprint Goal:**"
        if sprint_goal_marker in sprint_goal:
            sprint_goal = sprint_goal.split(sprint_goal_marker, 1)[1].strip()

        return sprint_goal
    except Exception as e:
        # Best effort: report the failure and hand back a generic goal so a
        # batch run keeps going. Callers cannot tell this apart from a real
        # generation other than by the printed message.
        print(f"Error generating sprint goal: {e}")
        return "Improve system functionality and resolve critical issues."

Using device: cuda
Loading Qwen/Qwen2.5-1.5B-Instruct...
Successfully loaded Qwen/Qwen2.5-1.5B-Instruct for text generation
Successfully loaded Qwen/Qwen2.5-1.5B-Instruct for text generation


In [24]:
# Collapse the issue-level rows into one summary record per sprint name.
# NOTE(review): sprints are keyed by name only; if two projects share a sprint
# name their issues end up in the same record - confirm names are unique.

# Bucket the exploration rows by the sprint they are attached to
sprint_groups = df_explore.groupby('sprint_name')


def _sprint_record(name, issues):
    """Build the summary dict for one sprint from its issue rows."""
    # Sprint-level attributes are read from the first issue row of the bucket
    head = issues.iloc[0]
    # One "Issue: <title>" line per issue, newline separated
    title_lines = [f"Issue: {title}" for title in issues['issue_title']]
    return {
        'sprint_name': name,
        'project_name': head['project_name'],
        'sprint_state': head['sprint_state'],
        'sprint_start_date': head['sprint_start_date'],
        'sprint_end_date': head['sprint_end_date'],
        'issue_count': len(issues),
        'aggregated_issue_titles': "\n".join(title_lines),
    }


# One record per sprint, in the order the groupby yields them
sprint_data = [_sprint_record(name, issues) for name, issues in sprint_groups]

# Assemble the records into a DataFrame
sprint_df = pd.DataFrame(sprint_data)

# Quick overview of what was collected
print(f"Total number of sprints: {len(sprint_df)}")
print("\nSample of sprints:")
print(sprint_df.head())

Total number of sprints: 2945

Sample of sprints:
                     sprint_name                 project_name sprint_state  \
0       $.krypton - 6.3.1 part 2     Atlassian Software Cloud       CLOSED   
1  0 Blast off - Mar 13-17 - SYD  Atlassian Confluence Server       FUTURE   
2                             10        Hyperledger Indy Node       CLOSED   
3            10-Annamite range 2   Atlassian Confluence Cloud       CLOSED   
4                       10tative     Atlassian Software Cloud       CLOSED   

     sprint_start_date      sprint_end_date  issue_count  \
0  2013-08-26 00:37:12  2013-09-09 00:37:00           10   
1                 None                 None            1   
2  2017-08-03 08:00:32  2017-08-16 08:00:00           29   
3  2015-05-08 03:12:00  2015-05-15 03:12:00            3   
4  2012-06-18 01:51:56  2012-07-02 01:51:56           36   

                             aggregated_issue_titles  
0  Issue: "ConfigurationAction.doSetCardColor Per...  
1  Issue: 

In [27]:
# Produce synthetic goals for a handful of sprints only, since every call runs
# the language model. Raise sample_size when more compute is available.

sample_size = 10  # small on purpose for a first test run
if len(sprint_df) > sample_size:
    # Fixed random_state so the same sprints are picked on every run
    sprints_for_goals = sprint_df.sample(sample_size, random_state=42)
else:
    sprints_for_goals = sprint_df

# Walk over (index, sprint name, aggregated titles) of the selected sprints
selected = sprints_for_goals[['sprint_name', 'aggregated_issue_titles']]
for idx, sprint_name, issue_titles in selected.itertuples(name=None):
    print(f"\nGenerating sprint goal for: {sprint_name}")
    goal = generate_sprint_goal(issue_titles)

    # Store the goal on the matching row of the full sprint table
    sprint_df.loc[idx, 'synthetic_sprint_goal'] = goal
    print(f"Generated goal: {goal}")

    # List the issue titles of this sprint underneath the goal for comparison
    print(f"Issues for sprint '{sprint_name}':")
    sprint_issues = df_explore.loc[df_explore['sprint_name'] == sprint_name]
    for title in sprint_issues['issue_title']:
        print(f"- {title}")



Generating sprint goal for: 3.10-m3
Generated goal: Address critical performance bottlenecks, update outdated software components, resolve user-reported errors, enhance security measures, and improve documentation across all project areas.
Issues for sprint '3.10-m3':
- "Update Java version bundled found in the installer to a version >= 1.8u51"
- "SVN operations can hang in some cases when using svnkit with file:// protocol and long commit messages "
- "Invisible error messages in admin pages"
- "Add a link to include/exclude and patterns CAC page "
- "OutOfMemoryError when Start Revision, Initial Import: ""No initial import"" and repository path are set"
- "Dead Link for Allow 2-Legged OAuth in FishEye Crucible"

Generating sprint goal for: ESB Dolomite 2
Generated goal: Address critical performance bottlenecks, update outdated software components, resolve user-reported errors, enhance security measures, and improve documentation across all project areas.
Issues for sprint '3.10-m3':

In [22]:
# Create a dataset for fine-tuning the reasoning model:
# one training example per (sprint goal, issue) pair.

# Fixed column order; also keeps the frame well-formed when no example exists
training_columns = [
    'sprint_name',
    'sprint_goal',
    'issue_title',
    'issue_description',
    'issue_type',
    'issue_priority',
    'story_points',
]

training_data = []

# Process only sprints that have synthetic goals
sprints_with_goals = sprint_df[sprint_df['synthetic_sprint_goal'].notna()]
for idx, sprint_row in sprints_with_goals.iterrows():
    sprint_name = sprint_row['sprint_name']
    sprint_goal = sprint_row['synthetic_sprint_goal']

    # Get all issues for this sprint.
    # NOTE(review): matching is by sprint name only - confirm that sprint names
    # are unique across projects, otherwise issues of other projects are paired
    # with this goal.
    sprint_issues = df_explore[df_explore['sprint_name'] == sprint_name]

    for _, issue_row in sprint_issues.iterrows():
        # Create a training example with the sprint goal and issue
        training_data.append({
            'sprint_name': sprint_name,
            'sprint_goal': sprint_goal,
            'issue_title': issue_row['issue_title'],
            'issue_description': issue_row['issue_description'],
            'issue_type': issue_row['issue_type'],
            'issue_priority': issue_row['issue_priority'],
            'story_points': issue_row.get('issue_story_point', None)
        })

# Convert to DataFrame; naming the columns explicitly means the preview below
# no longer raises KeyError when training_data is empty.
training_df = pd.DataFrame(training_data, columns=training_columns)

# Preview the training data
print(f"Total training examples: {len(training_df)}")
print("\nSample of training data:")
print(training_df[['sprint_name', 'sprint_goal', 'issue_title']].head())

# Save the training data to a file for later use with train.py
output_path = '../data/synthetic_sprint_training_data.jsonl'

# to_json does not create missing directories, so make sure the target exists
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Convert DataFrame to JSONL format (one JSON object per line)
training_df.to_json(output_path, orient='records', lines=True)
print(f"\nTraining data saved to {output_path}")

Total training examples: 262

Sample of training data:
  sprint_name                                        sprint_goal  \
0     3.10-m3  Enhance the FishEye application's performance ...   
1     3.10-m3  Enhance the FishEye application's performance ...   
2     3.10-m3  Enhance the FishEye application's performance ...   
3     3.10-m3  Enhance the FishEye application's performance ...   
4     3.10-m3  Enhance the FishEye application's performance ...   

                                         issue_title  
0  "Update Java version bundled found in the inst...  
1  "SVN operations can hang in some cases when us...  
2          "Invisible error messages in admin pages"  
4  "Add a link to include/exclude and patterns CA...  

Training data saved to ../data/synthetic_sprint_training_data.jsonl


In [23]:
# Show every (sprint goal, issue title) pair of the training set, one pair per
# block, with a dashed rule after each block
separator = "-" * 50
for goal, title in zip(training_df['sprint_goal'], training_df['issue_title']):
    print(f"Sprint Goal: {goal}")
    print(f"Issue Title: {title}")
    print(separator)

Sprint Goal: Enhance the FishEye application's performance and usability by addressing critical bugs and improving configuration settings. Specifically, we will:

1. Update the bundled Java version to ensure compatibility and security updates.
2. Optimize SVN operations for better performance, especially when using the file:// protocol and large commit messages.
3. Resolve invisible error messages on admin pages.
5. Add a link to the Include/Exclude Patterns guide within the FishEye application.
6. Prevent OutOfMemoryErrors related to initial imports in large repositories.
Issue Title: "Update Java version bundled found in the installer to a version >= 1.8u51"
--------------------------------------------------
Sprint Goal: Enhance the FishEye application's performance and usability by addressing critical bugs and improving configuration settings. Specifically, we will:

1. Update the bundled Java version to ensure compatibility and security updates.
2. Optimize SVN operations for bette