In [2]:
from sqlalchemy import create_engine
import pandas as pd
import os
import urllib.parse
import json

# SQLAlchemy URL format: mysql+pymysql://user:password@host/database
# The password is read from the MYSQL_PASSWORD environment variable and
# URL-quoted so that reserved characters survive being embedded in the URL.
password = urllib.parse.quote_plus(os.environ['MYSQL_PASSWORD'])
engine = create_engine(f"mysql+pymysql://root:{password}@localhost/tawos")

# Flatten the `tawos` tables into one wide result set.
# The query starts from Project and LEFT JOINs Repository, Issue and Sprint,
# so issues without a sprint (and projects without issues) are kept, with
# NULLs in the unmatched columns.
# Output columns are aliased with a project_/repository_/sprint_/issue_ prefix.
# Two correlated subqueries collect each issue's Comment and Change_Log rows
# into JSON arrays, exposed as the `comments` and `change_logs` columns.
query = """
SELECT
    p.Name AS project_name,
    p.Description AS project_description,

    r.Name AS repository_name,
    r.Description AS repository_description,

    s.ID AS sprint_id,
    s.JiraID AS sprint_jira_id,
    s.Name AS sprint_name,
    s.State AS sprint_state,
    s.Start_Date AS sprint_start_date,
    s.End_Date AS sprint_end_date,
    s.Activated_Date AS sprint_activated_date,
    s.Complete_Date AS sprint_complete_date,
    s.Project_ID AS sprint_project_id,

    i.ID AS issue_id,
    i.Jira_ID AS issue_jira_id,
    i.Issue_Key AS issue_issue_key,
    i.URL AS issue_url,
    i.Title AS issue_title,
    i.Description AS issue_description,
    i.Description_Text AS issue_description_text,
    i.Description_Code AS issue_description_code,
    i.Type AS issue_type,
    i.Priority AS issue_priority,
    i.Status AS issue_status,
    i.Resolution AS issue_resolution,
    i.Creation_Date AS issue_creation_date,
    i.Estimation_Date AS issue_estimation_date,
    i.Resolution_Date AS issue_resolution_date,
    i.Last_Updated AS issue_last_updated,
    i.Story_Point AS issue_story_point,
    i.Timespent AS issue_timespent,
    i.In_Progress_Minutes AS issue_in_progress_minutes,
    i.Total_Effort_Minutes AS issue_total_effort_minutes,
    i.Resolution_Time_Minutes AS issue_resolution_time_minutes,
    i.Title_Changed_After_Estimation AS issue_title_changed_after_estimation,
    i.Description_Changed_After_Estimation AS issue_description_changed_after_estimation,
    i.Story_Point_Changed_After_Estimation AS issue_story_point_changed_after_estimation,
    i.Pull_Request_URL AS issue_pull_request_url,
    i.Creator_ID AS issue_creator_id,
    i.Reporter_ID AS issue_reporter_id,
    i.Assignee_ID AS issue_assignee_id,
    i.Project_ID AS issue_project_id,
    i.Sprint_ID AS issue_sprint_id,

    -- JSON_ARRAYAGG for comments
    (
      SELECT JSON_ARRAYAGG(
        JSON_OBJECT(
          'ID', c.ID,
          'Comment', c.Comment,
          'Comment_Text', c.Comment_Text,
          'Comment_Code', c.Comment_Code,
          'Creation_Date', c.Creation_Date,
          'Author_ID', c.Author_ID
        )
      )
      FROM Comment c
      WHERE c.Issue_ID = i.ID
    ) AS comments,

    -- JSON_ARRAYAGG for change logs
    (
      SELECT JSON_ARRAYAGG(
        JSON_OBJECT(
          'ID', ch.ID,
          'Field', ch.Field,
          'From_Value', ch.From_Value,
          'To_Value', ch.To_Value,
          'From_String', ch.From_String,
          'To_String', ch.To_String,
          'Change_Type', ch.Change_Type,
          'Creation_Date', ch.Creation_Date,
          'Author_ID', ch.Author_ID
        )
      )
      FROM Change_Log ch
      WHERE ch.Issue_ID = i.ID
    ) AS change_logs

FROM Project p
LEFT JOIN Repository r
    ON p.Repository_ID = r.ID
LEFT JOIN Issue i
    ON i.Project_ID = p.ID
LEFT JOIN Sprint s
    ON i.Sprint_ID = s.ID
;
"""

# Execute the query through the SQLAlchemy engine and load the whole result
# set into a DataFrame.
df_query = pd.read_sql(sql=query, con=engine)

# Preview the first rows as the cell's output.
df_query.head()


Unnamed: 0,project_name,project_description,repository_name,repository_description,sprint_id,sprint_jira_id,sprint_name,sprint_state,sprint_start_date,sprint_end_date,...,issue_description_changed_after_estimation,issue_story_point_changed_after_estimation,issue_pull_request_url,issue_creator_id,issue_reporter_id,issue_assignee_id,issue_project_id,issue_sprint_id,comments,change_logs
0,Spring XD,Spring XD makes it easy to solve common big da...,Spring,The Spring Framework is an application framewo...,,,,,,,...,0,0,,68.0,68.0,,1,,,
1,Spring XD,Spring XD makes it easy to solve common big da...,Spring,The Spring Framework is an application framewo...,,,,,,,...,0,0,,69.0,69.0,,1,,,
2,Spring XD,Spring XD makes it easy to solve common big da...,Spring,The Spring Framework is an application framewo...,,,,,,,...,0,0,,70.0,70.0,,1,,,
3,Spring XD,Spring XD makes it easy to solve common big da...,Spring,The Spring Framework is an application framewo...,,,,,,,...,0,0,,72.0,72.0,71.0,1,,"[{""ID"": 4441, ""Comment"": ""See https://github.c...","[{""ID"": 6, ""Field"": ""Fix Version"", ""To_Value"":..."
4,Spring XD,Spring XD makes it easy to solve common big da...,Spring,The Spring Framework is an application framewo...,,,,,,,...,0,0,,73.0,73.0,,1,,,


In [3]:
def _parse_json_array(value):
    """Decode one JSON-array cell into a Python list.

    Non-empty strings/bytes are decoded with ``json.loads``; values that are
    already lists are returned unchanged; anything else (None, NaN, empty
    string) becomes an empty list.
    """
    if isinstance(value, list):
        return value
    if isinstance(value, (str, bytes, bytearray)) and value:
        return json.loads(value)
    return []


# Build a NEW frame instead of aliasing df_query: assigning into an alias would
# overwrite df_query's JSON strings with lists, and re-running this cell would
# then hand those lists to json.loads and raise TypeError.
df = df_query.assign(
    comments=df_query['comments'].map(_parse_json_array),
    change_logs=df_query['change_logs'].map(_parse_json_array),
)

# Columns that are primary/foreign keys (names ending with '_id')
columns_to_drop = [col for col in df.columns if col.endswith('_id')]

# Keep only rows that belong to a sprint (sprint_name is present)
df = df.dropna(subset=['sprint_name'])

# Exploration view: sprint-assigned rows without the key columns
df_explore = df.drop(columns=columns_to_drop)

# Print the remaining columns and a few rows of data for exploration
print("Remaining columns:")
print(df_explore.columns.tolist())
print("\nSample data:")
print(df_explore.head())

Remaining columns:
['project_name', 'project_description', 'repository_name', 'repository_description', 'sprint_name', 'sprint_state', 'sprint_start_date', 'sprint_end_date', 'sprint_activated_date', 'sprint_complete_date', 'issue_issue_key', 'issue_url', 'issue_title', 'issue_description', 'issue_description_text', 'issue_description_code', 'issue_type', 'issue_priority', 'issue_status', 'issue_resolution', 'issue_creation_date', 'issue_estimation_date', 'issue_resolution_date', 'issue_last_updated', 'issue_story_point', 'issue_timespent', 'issue_in_progress_minutes', 'issue_total_effort_minutes', 'issue_resolution_time_minutes', 'issue_title_changed_after_estimation', 'issue_description_changed_after_estimation', 'issue_story_point_changed_after_estimation', 'issue_pull_request_url', 'comments', 'change_logs']

Sample data:
   project_name                                project_description  \
24    Spring XD  Spring XD makes it easy to solve common big da...   
25    Spring XD  Sprin

In [None]:
# Generate synthetic sprint goals from issue descriptions
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline

# Use CUDA when PyTorch reports it as available, otherwise stay on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Causal language model used to write the sprint goals; swap in another
# checkpoint here if your resources allow.
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"

# The pipeline is given device index 0 when CUDA is available and -1 otherwise.
pipeline_device = 0 if device == "cuda" else -1

# Build the text-generation pipeline; if the preferred model cannot be loaded,
# report the error and fall back to distilgpt2.
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    generator = pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
        device=pipeline_device,
    )
    print(f"Successfully loaded {model_name} for text generation")
except Exception as load_error:
    print(f"Error loading model: {load_error}")
    print("Falling back to smaller model for text generation")
    generator = pipeline("text-generation", model="distilgpt2", device=pipeline_device)

def _sample_goal_line(prompt, max_new_tokens, temperature, top_p):
    """Sample one completion for ``prompt`` and return its first line, stripped.

    Uses the module-level ``generator`` text-generation pipeline.
    """
    outputs = generator(
        prompt,
        # Budget for NEW tokens only, so a long prompt cannot use up the limit.
        max_new_tokens=max_new_tokens,
        num_return_sequences=1,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        # Ask for the completion alone instead of slicing the prompt off by length.
        return_full_text=False,
    )
    completion = outputs[0]['generated_text']
    # Keep just the first paragraph of the completion.
    return completion.strip().split('\n')[0].strip()


def generate_sprint_goal(issue_descriptions, max_length=150):
    """
    Generate a synthetic sprint goal based on a collection of issue descriptions.

    Args:
        issue_descriptions (str): Combined issue descriptions from a sprint.
            Only the first 2000 characters are sent to the model (followed by
            "..." when the text was cut).
        max_length (int): Maximum number of tokens to generate for the goal
            (the prompt is not counted against this limit).

    Returns:
        str: The first line of the generated text. If that line is shorter
        than 10 characters, generation is retried once with a higher
        temperature/top_p and the retry's first line is returned as is.
        If generation raises, the error is printed and a generic fallback
        goal is returned.
    """
    # Truncate issue descriptions if they are too long to avoid token limits
    if len(issue_descriptions) > 2000:
        truncated_descriptions = issue_descriptions[:2000] + "..."
    else:
        truncated_descriptions = issue_descriptions

    # Same prompt text as before, spelled out line by line so the
    # whitespace-only separator lines are explicit.
    prompt = (
        "Based on the following issues in our sprint backlog, create a clear and concise sprint goal that captures the main objectives:\n"
        "    \n"
        "Issues:\n"
        f"{truncated_descriptions}\n"
        "    \n"
        "Sprint Goal:"
    )

    try:
        sprint_goal = _sample_goal_line(prompt, max_length, temperature=0.7, top_p=0.9)

        # If the sprint goal is too short, try once more with looser sampling
        if len(sprint_goal) < 10:
            sprint_goal = _sample_goal_line(prompt, max_length, temperature=0.9, top_p=0.95)

        return sprint_goal
    except Exception as e:
        # Best-effort: one failed generation must not abort a long batch run.
        print(f"Error generating sprint goal: {e}")
        return "Improve system functionality and resolve critical issues."

In [None]:
# Group issues by sprint and collect, per sprint, its metadata and the
# combined text of its issues (the input for synthetic goal generation).

def _text_or_blank(value):
    """Return ``value`` as text, turning missing values (None/NaN) into ''."""
    return "" if pd.isna(value) else str(value)


# NOTE(review): sprints are keyed by name only. If two projects reuse the same
# sprint name their issues end up in one group - confirm names are unique, or
# group on a sprint identifier instead.
sprint_groups = df_explore.groupby('sprint_name')

sprint_columns = [
    'sprint_name',
    'project_name',
    'sprint_state',
    'sprint_start_date',
    'sprint_end_date',
    'issue_count',
    'aggregated_issue_descriptions',
]
sprint_data = []

for sprint_name, group in sprint_groups:
    # Concatenate title + description of every issue in this sprint. Missing
    # values are written as empty text rather than the literal "None"/"nan".
    issue_descriptions = "\n".join(
        f"Issue: {_text_or_blank(title)}\nDescription: {_text_or_blank(description)}"
        for title, description in zip(group['issue_title'], group['issue_description_text'])
    )

    # Sprint-level fields are taken from the first row of the group
    first_row = group.iloc[0]
    sprint_data.append({
        'sprint_name': sprint_name,
        'project_name': first_row['project_name'],
        'sprint_state': first_row['sprint_state'],
        'sprint_start_date': first_row['sprint_start_date'],
        'sprint_end_date': first_row['sprint_end_date'],
        'issue_count': len(group),
        'aggregated_issue_descriptions': issue_descriptions,
    })

# Passing the columns explicitly keeps the frame well-formed (and the column
# selection below valid) even when there are no sprints at all.
sprint_df = pd.DataFrame(sprint_data, columns=sprint_columns)

# Display some statistics about the sprints
print(f"Total number of sprints: {len(sprint_df)}")
print("\nSample of sprints:")
print(sprint_df[['sprint_name', 'project_name', 'issue_count']].head())

In [None]:
# Generate synthetic sprint goals for a subset of sprints (to avoid long runtimes)
# You can adjust the sample size based on your computational resources

# Create the target column up front (keeping any goals from an earlier run):
# - the filter at the bottom then works even when no sprint was processed;
# - the frame is not given a new column while its rows are being iterated
#   (when there are few sprints, sprints_for_goals IS sprint_df).
if 'synthetic_sprint_goal' not in sprint_df.columns:
    sprint_df['synthetic_sprint_goal'] = None

# Take a fixed-seed sample of sprints to generate goals for (adjust as needed)
sample_size = 10  # Start with a small number to test
sprints_for_goals = sprint_df.sample(sample_size, random_state=42) if len(sprint_df) > sample_size else sprint_df

# Generate sprint goals for the sampled sprints
for idx, row in sprints_for_goals.iterrows():
    sprint_name = row['sprint_name']
    issue_descriptions = row['aggregated_issue_descriptions']

    print(f"\nGenerating sprint goal for: {sprint_name}")
    goal = generate_sprint_goal(issue_descriptions)

    # Store the goal on the full sprint table; idx is the row's label in sprint_df
    sprint_df.loc[idx, 'synthetic_sprint_goal'] = goal
    print(f"Generated goal: {goal}")

# Display the sprints with their synthetic goals
print("\nSprints with synthetic goals:")
print(sprint_df[sprint_df['synthetic_sprint_goal'].notna()][['sprint_name', 'project_name', 'synthetic_sprint_goal']].head(10))

In [None]:
# Create a dataset for fine-tuning the reasoning model:
# one training example per issue, paired with its sprint's synthetic goal.

training_columns = [
    'sprint_name',
    'sprint_goal',
    'issue_title',
    'issue_description',
    'issue_type',
    'issue_priority',
    'story_points',
]
training_data = []

# Process only sprints that have synthetic goals
sprints_with_goals = sprint_df[sprint_df['synthetic_sprint_goal'].notna()]

for _, sprint_row in sprints_with_goals.iterrows():
    sprint_name = sprint_row['sprint_name']
    sprint_goal = sprint_row['synthetic_sprint_goal']

    # Get all issues for this sprint (matched by sprint name)
    sprint_issues = df_explore[df_explore['sprint_name'] == sprint_name]

    for _, issue_row in sprint_issues.iterrows():
        # Each example must be stored; previously it was built and then dropped
        training_data.append({
            'sprint_name': sprint_name,
            'sprint_goal': sprint_goal,
            'issue_title': issue_row['issue_title'],
            'issue_description': issue_row['issue_description'],
            'issue_type': issue_row['issue_type'],
            'issue_priority': issue_row['issue_priority'],
            'story_points': issue_row.get('issue_story_point', None),
        })

# Explicit columns keep the frame (and the preview below) valid even when no
# examples were produced.
training_df = pd.DataFrame(training_data, columns=training_columns)

print("\nSample of training data:")
print(training_df[['sprint_name', 'sprint_goal', 'issue_title']].head())

# Save the training data to a file for later use with train.py
output_path = '../data/synthetic_sprint_training_data.jsonl'

# Make sure the target directory exists before writing
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# Convert DataFrame to JSONL format (one JSON object per line)
training_df.to_json(output_path, orient='records', lines=True)
print(f"\nTraining data saved to {output_path}")