In [448]:
import datetime
from icalendar import Calendar
import pandas as pd

## Datat preprocessing

In [449]:
def ics_to_dataframe(file_path):
    # Open and parse the .ics file
    with open(file_path, 'rb') as f:
        cal = Calendar.from_ical(f.read())

    # Prepare a list to hold event data
    events = []

    for component in cal.walk():
        if component.name == "VEVENT":
            # Extract relevant data from each event
            try:
                title, authors = component.get('summary').split(" - ", 1)  # Expecting exactly 2 items
                authors = authors.split('\,')
            except ValueError:  # If not exactly 2 items, then it's a ValueError
                title = component.get('summary')  # Take the whole summary as title
                authors = None  # Default, no authors found
                
            start_dt = component.get('dtstart').dt
            end_dt = component.get('dtend').dt
            title = component.get('summary')
            description = component.get('description')
            
            if isinstance(start_dt, datetime.datetime) and isinstance(end_dt, datetime.datetime):
                # Calculate duration
                duration = (end_dt - start_dt).total_seconds() / 60  # duration in minutes
                # Formatting date and start time
                date = start_dt.strftime('%Y-%m-%d')
                start_time = start_dt.strftime('%H:%M')
                

                # Append event info to the list
                events.append([date, start_time, duration, title, authors, description])

    # Create a DataFrame
    df = pd.DataFrame(events, columns=['date', 'start time', 'duration_mins', 'title', 'authors', 'description'])

    return df

file_path = "data/event-calendar-2022.ics"
df = ics_to_dataframe(file_path)
df.head()

Unnamed: 0,date,start time,duration_mins,title,authors,description
0,2022-05-18,02:00,4.0,[MSR Technical Papers] An Empirical Evaluation...,"[Nhan Nguyen, Sarah Nadi]",GitHub and OpenAI recently launched GitHub Cop...
1,2022-05-18,02:04,4.0,[MSR Technical Papers] Comments on Comments: W...,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince...",An important function of code review is to inc...
2,2022-05-18,02:08,7.0,[MSR Technical Papers] Does This Apply to Me? ...,"[Akalanka Galappaththi, Sarah Nadi, Christoph ...",Stack Overflow has become an essential technic...
3,2022-05-18,02:15,7.0,[MSR Technical Papers] Towards Reliable Agile ...,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan...","In agile iterative development, an agile team ..."
4,2022-05-18,02:22,7.0,[MSR Technical Papers] BotHunter: An Approach ...,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein...",Bots have become popular in software projects ...


In [450]:
def parse_sessions(file_path):
    with open(file_path, 'rb') as f:
        cal = Calendar.from_ical(f.read())

    sessions = {}
    
    for component in cal.walk('vevent'):
        start_dt = component.get('dtstart').dt
        end_dt = component.get('dtend').dt
        summary = str(component.get('summary'))
        session_name = summary.split(': ')[-1]  # Assuming session name is after last colon
        
        # Key by start date, end date, and location for simplicity; tweak as needed
        key = (start_dt, end_dt)
        sessions[key] = session_name
    
    return sessions

# Function to find a session for a given event
def find_session_for_event(sessions, event_start, event_end):
    # Note: This simplistic approach assumes exact match of start/end times. 
    # Consider more complex logic if sessions/events don't align exactly.
    key = (event_start, event_end)
    return sessions.get(key, "Unknown Session")

# Load sessions
session_calendar_path = "data/session-calendar-2022.ics"
sessions = parse_sessions(session_calendar_path)

# Map events to sessions and add a new column
df['session_name'] = df.apply(lambda row: find_session_for_event(sessions, 
                                             datetime.datetime.strptime(row['date'] + ' ' + row['start time'], '%Y-%m-%d %H:%M'),
                                             datetime.datetime.strptime(row['date'] + ' ' + row['start time'], '%Y-%m-%d %H:%M') + datetime.timedelta(minutes=row['duration_mins'])),
                                             axis=1)

df.head()

Unnamed: 0,date,start time,duration_mins,title,authors,description,session_name
0,2022-05-18,02:00,4.0,[MSR Technical Papers] An Empirical Evaluation...,"[Nhan Nguyen, Sarah Nadi]",GitHub and OpenAI recently launched GitHub Cop...,Unknown Session
1,2022-05-18,02:04,4.0,[MSR Technical Papers] Comments on Comments: W...,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince...",An important function of code review is to inc...,Unknown Session
2,2022-05-18,02:08,7.0,[MSR Technical Papers] Does This Apply to Me? ...,"[Akalanka Galappaththi, Sarah Nadi, Christoph ...",Stack Overflow has become an essential technic...,Unknown Session
3,2022-05-18,02:15,7.0,[MSR Technical Papers] Towards Reliable Agile ...,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan...","In agile iterative development, an agile team ...",Unknown Session
4,2022-05-18,02:22,7.0,[MSR Technical Papers] BotHunter: An Approach ...,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein...",Bots have become popular in software projects ...,Unknown Session


In [451]:
import pandas as pd
import re
from bs4 import BeautifulSoup

# Without error capture to ensure the output (whether success or failure) is reported back.
# Attempt to read tables from the HTML file using pandas
file_path = "data/Program - MSR 2022.html"

In [452]:
with open(file_path, 'r') as file:
    html_content = file.read()

In [453]:
soup = BeautifulSoup(html_content, 'lxml')

In [454]:
rows = soup.find_all('tr', class_='hidable')

In [455]:
date_pattern = re.compile(r"\b(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)\s\d{1,2}\s(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)")

In [456]:
extracted_rows = []

# Loop through each row to extract required details
for row in rows:
    temp_row = []
    
    # Extract session details from parent div if not already extracted
    session_div = row.find_previous('div', class_='session-info-in-table')
    session = session_div.get_text(strip=True) if session_div else ''
    
    datetime_info = row.find('td', class_='text-right')
    talk_info = row.find_all('td')[-1]
    
    if datetime_info and talk_info:
        time = datetime_info.find('div', class_='start-time').get_text(strip=True) if datetime_info.find('div', class_='start-time') else ''
        duration = datetime_info.find('strong').get_text(strip=True) if datetime_info.find('strong') else ''
        
        # Calculate end time based on start time and duration if needed
        
        temp_row.append(time)
        # temp_row.append(end_time)  # Calculate and append end time if needed
        temp_row.append(duration)
        
        talk_title = talk_info.find('strong')
        if talk_title:
            temp_row.append(talk_title.get_text(strip=True))
            
            link = talk_title.find('a', href=True)
            temp_row.append(link['href'] if link else '')
        else:
            temp_row.extend(['', ''])
            
        # Extract authors
        authors_div = talk_info.find('div', class_='performers')
        authors = [author.get_text(strip=True) for author in authors_div.find_all('a')] if authors_div else []
        temp_row.append(authors)
        
        # Append session information
        temp_row.append(session)
        
        # Locate date information
        # Tip: You may need to adjust how you locate the 'date information' based on your HTML structure
        date_info = row.find_previous('div', class_='day-wrapper')
        if date_info:
            date_text = date_info.get_text(strip=True)
            matched_date = date_pattern.search(date_text)
            date = matched_date.group(0) if matched_date else "Date Not Found"
        else:
            date = None
        temp_row.append(date)
        
        extracted_rows.append(temp_row)

columns = ['Time', 'Duration', 'Talk Title', 'Link', 'Authors', 'Session', 'Date']
df_1 = pd.DataFrame(extracted_rows, columns=columns)

df_1.head()

Unnamed: 0,Time,Duration,Talk Title,Link,Authors,Session,Date
0,22:00,4m,An Empirical Evaluation of GitHub Copilot’s Co...,#,"[Nhan Nguyen, Sarah Nadi]",Session 1Technical Papers/Registered Reportsat...,Tue 17 May
1,22:04,4m,Comments on Comments: Where Code Review and Do...,#,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May
2,22:08,7m,Does This Apply to Me? An Empirical Study of T...,#,"[Akalanka Galappaththi, Sarah Nadi, Christoph ...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May
3,22:15,7m,Towards Reliable Agile Iterative Planning via ...,#,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May
4,22:22,7m,BotHunter: An Approach to Detect Software Bots...,#,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May


In [457]:
df.columns

Index(['date', 'start time', 'duration_mins', 'title', 'authors',
       'description', 'session_name'],
      dtype='object')

In [458]:
df_merged = pd.concat([df_1, df[['description']]], axis=1)

In [459]:
df_merged.head(3)

Unnamed: 0,Time,Duration,Talk Title,Link,Authors,Session,Date,description
0,22:00,4m,An Empirical Evaluation of GitHub Copilot’s Co...,#,"[Nhan Nguyen, Sarah Nadi]",Session 1Technical Papers/Registered Reportsat...,Tue 17 May,GitHub and OpenAI recently launched GitHub Cop...
1,22:04,4m,Comments on Comments: Where Code Review and Do...,#,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May,An important function of code review is to inc...
2,22:08,7m,Does This Apply to Me? An Empirical Study of T...,#,"[Akalanka Galappaththi, Sarah Nadi, Christoph ...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May,Stack Overflow has become an essential technic...


In [810]:
df_merged.columns

Index(['Time', 'Duration', 'Talk Title', 'Link', 'Authors', 'Session', 'Date',
       'description', 'preprocessed_description'],
      dtype='object')

In [811]:
df_input = df_merged[['Time', 'Duration', 'Talk Title', 'Link', 'Authors', 'Session', 'Date',
       'description']]
df_input = df_input[(df_input['Talk Title']!="Discussions and Q&A")&(df_input.description!="")].reset_index(drop=True)
df_input = df_input.reset_index()
df_input = df_input.rename(columns={
    'Talk Title': 'title',
    'description': 'abstract',
    'index': 'paper_number'})
df_input_no_duplication = df_input[~df_input.Session.str.startswith("Blended")]

In [812]:
df_input_no_duplication.head(3)

Unnamed: 0,paper_number,Time,Duration,title,Link,Authors,Session,Date,abstract
0,0,22:00,4m,An Empirical Evaluation of GitHub Copilot’s Co...,#,"[Nhan Nguyen, Sarah Nadi]",Session 1Technical Papers/Registered Reportsat...,Tue 17 May,GitHub and OpenAI recently launched GitHub Cop...
1,1,22:04,4m,Comments on Comments: Where Code Review and Do...,#,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May,An important function of code review is to inc...
2,2,22:08,7m,Does This Apply to Me? An Empirical Study of T...,#,"[Akalanka Galappaththi, Sarah Nadi, Christoph ...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May,Stack Overflow has become an essential technic...


## LLM generate clustering result

In [836]:
df_llm_cluster_input = df_input_no_duplication.copy()

In [837]:
# Convert df_input to a string representation
SAMPLE_SIZE = len(df_llm_cluster_input)
df_input_str = df_llm_cluster_input[['title']].iloc[0: SAMPLE_SIZE].to_string(index=False).strip()
df_input_example_str = df_output_example.to_string(index=False).strip()


# Construct the prompt
prompt = f"""
Please give the clustering result of those papers given the title.
the clustering is based on if they are from the same topic. 
There are {SAMPLE_SIZE} papers in total. I'd like to cluster these papers into 5 common topics (you can use number to represent differnt topics). 
Output constraint will be:
1. The output should contain the clustering label for each paper in the form of the csv representation of a data frame. 
2. Index are the "title" and the value will be the clustering label (from 0 to 4).
3. This csv representation should be in three quotes (```) on both sides so that I can easily extract it from your result and make a data frame.
4. Please add "" for the title in the output
Input includes one column: title.
{df_input_str}

"""

# Print the prompt to verify
print(prompt[:1000])


Please give the clustering result of those papers given the title.
the clustering is based on if they are from the same topic. 
There are 117 papers in total. I'd like to cluster these papers into 5 common topics (you can use number to represent differnt topics). 
Output constraint will be:
1. The output should contain the clustering label for each paper in the form of the csv representation of a data frame. 
2. Index are the "title" and the value will be the clustering label (from 0 to 4).
3. This csv representation should be in three quotes (```) on both sides so that I can easily extract it from your result and make a data frame.
4. Please add "" for the title in the output
Input includes one column: title.
title
                                                                                       An Empirical Evaluation of GitHub Copilot’s Code Suggestions
                                                                                     Comments on Comments: Where Code Review 

In [838]:
len(prompt)

18044

In [839]:
import pyperclip
with open("prompt.txt", "w") as f:
    f.write(prompt)
    f.close()

In [840]:
print(len(prompt))

18044


In [1]:
import openai
from io import StringIO

with open("../api_key", 'r') as f:
    api_key = f.read().strip()

client = openai.OpenAI(api_key = api_key)

In [1211]:
score_result = []
cluster_results = []
for i in range(5):
    response = client.chat.completions.create(
        model= "gpt-4-0125-preview", #"gpt-3.5-turbo-0125",  # You can switch this to "gpt-4-turbo-preview", "gpt-3.5-turbo-0125"
        messages=[
            {
                "role": "user",
                "content": prompt
            },
        ],
        temperature=0.5,
        # max_tokens=256,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )
    ans_string = response.choices[0].message.content
    
    match = re.search("```(.*?)```", ans_string, re.DOTALL)
    if match:
        csv_string = match.group(1)  # Extract the actual CSV data
        # print("Extracted CSV:\n", csv_string)

        # Convert the CSV string to a DataFrame
        # StringIO is used to convert the string to a file-like object
        df_llm_clustering_results = pd.read_csv(StringIO(csv_string))

        # Display the DataFrame
        print("Got the df!")
    else:
        print("No CSV data found in the string.")
    cluster_results.append(df_llm_clustering_results)
    cluster_ids, cluster_sizes = np.unique(df_llm_clustering_results.cluster, return_counts=True) 
    df_llm_cluster_res = pd.merge(df_llm_cluster_input, df_llm_clustering_results, on='title', how='left')
    print(f"Number of elements assigned to each cluster: {cluster_sizes}")
    completeness_score = metrics.completeness_score(df_llm_cluster_res['Session'], df_llm_cluster_res['cluster'])
    homegeneity_score = metrics.homogeneity_score(df_llm_cluster_res['Session'],df_llm_clustering_results['cluster'])
    score_result.append({
        'iteration': i,
        'completeness_score': completeness_score,
        'homegeneity_score': homegeneity_score
    print(f"completeness score: {completeness_score}")
    print(f"homogeneity score: {homegeneity_score}") 

Got the df!
Number of elements assigned to each cluster: [24 13 20 46 14]
Number of elements assigned to each cluster: [24 13 20 46 14]
completeness score: 0.3936026962224098
homogeneity score: 0.20177177366231341
Got the df!
Number of elements assigned to each cluster: [15 14 29 42 17]
Number of elements assigned to each cluster: [15 14 29 42 17]
completeness score: 0.40805638762343416
homogeneity score: 0.21185399253629883
Got the df!
Number of elements assigned to each cluster: [32 14 29 10 32]
Number of elements assigned to each cluster: [32 14 29 10 32]
completeness score: 0.40592373706860296
homogeneity score: 0.2118607093410775
Got the df!
Number of elements assigned to each cluster: [31 14 27 16 29]
Number of elements assigned to each cluster: [31 14 27 16 29]
completeness score: 0.37824579799056307
homogeneity score: 0.20300204083406098
Got the df!
Number of elements assigned to each cluster: [19 19 22 27 30]
Number of elements assigned to each cluster: [19 19 22 27 30]
comple

In [1213]:
# pd.merge(df_tfidf_input, df_llm_clustering_results, on='title', how='left')

In [1216]:
pd.DataFrame(score_result).mean()

iteration             2.000000
completeness_score    0.415895
homegeneity_score     0.219696
dtype: float64

In [846]:
from io import StringIO
match = re.search("```(.*?)```", ans_string, re.DOTALL)
if match:
    csv_string = match.group(1)  # Extract the actual CSV data
    # print("Extracted CSV:\n", csv_string)

    # Convert the CSV string to a DataFrame
    # StringIO is used to convert the string to a file-like object
    df_llm_clustering_results = pd.read_csv(StringIO(csv_string))

    # Display the DataFrame
    print("Got the df!")
else:
    print("No CSV data found in the string.")
    

Got the df!


In [847]:
df_llm_clustering_results.shape

(117, 2)

In [848]:
df_llm_clustering_results.head(5)

Unnamed: 0,title,cluster
0,An Empirical Evaluation of GitHub Copilot’s Co...,0
1,Comments on Comments: Where Code Review and Do...,1
2,Does This Apply to Me? An Empirical Study of T...,2
3,Towards Reliable Agile Iterative Planning via ...,1
4,BotHunter: An Approach to Detect Software Bots...,3


In [751]:
# df_clustering_results = df_clustering_results.drop('title').rename(columns={'csv': 'clustering_label'})

In [758]:
# df_clustering_results = df_clustering_results.reset_index().rename(columns={'index': 'title'})

In [None]:
df_clustering_results.title.value_counts()

In [761]:
# df_clustering_results[df_clustering_results.title=='Finding the Fun in Fundraising: Public Issues and Pull Requests in VC-backed Open-Core Companies']

In [1217]:
cluster_ids, cluster_sizes = np.unique(df_llm_clustering_results.cluster, return_counts=True)
print(f"Number of elements assigned to each cluster: {cluster_sizes}")

Number of elements assigned to each cluster: [19 19 22 27 30]


In [1265]:
metrics.completeness_score(df_llm_cluster_input['Session'], df_llm_clustering_results['cluster'])

0.4936476397153556

In [1266]:
df_llm_cluster_output = pd.merge(df_llm_cluster_input, df_llm_clustering_results, on='title', how='left')

In [1267]:
df_llm_cluster_output.cluster.isna().sum()

0

In [1221]:
df = df_llm_cluster_output.copy()

df.columns = [col.lower().replace(' ', '_') for col in df.columns]
df = df[['date'] + [col for col in df.columns if col != 'date']]
df['start_time'] = pd.to_datetime("2022 " + df['date'] + ' ' + df['time'], format='%Y %a %d %b %H:%M')
df['duration'] = df['duration'].str.extract('(\d+)').astype(int)  # Extract duration in minutes
df['end_time'] = df['start_time'] + pd.to_timedelta(df['duration'], unit='m')

# Delete the Link column
df = df.drop(['link', 'time'], axis=1)

# Move Authors column to the last position
df = df[[col for col in df.columns if col != 'authors'] + ['authors']]

df = df[['session', 'date', 'start_time', 'end_time', 'duration', 'title', 'authors', 'cluster']]

# Display the final data frame
df.head()

Unnamed: 0,session,date,start_time,end_time,duration,title,authors,cluster
0,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,2022-05-17 22:00:00,2022-05-17 22:04:00,4,An Empirical Evaluation of GitHub Copilot’s Co...,"[Nhan Nguyen, Sarah Nadi]",2
1,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,2022-05-17 22:04:00,2022-05-17 22:08:00,4,Comments on Comments: Where Code Review and Do...,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince...",3
2,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,2022-05-17 22:08:00,2022-05-17 22:15:00,7,Does This Apply to Me? An Empirical Study of T...,"[Akalanka Galappaththi, Sarah Nadi, Christoph ...",0
3,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,2022-05-17 22:15:00,2022-05-17 22:22:00,7,Towards Reliable Agile Iterative Planning via ...,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan...",3
4,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,2022-05-17 22:22:00,2022-05-17 22:29:00,7,BotHunter: An Approach to Detect Software Bots...,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein...",4


In [1222]:
df.date.value_counts()

date
Wed 18 May    47
Thu 19 May    38
Fri 20 May    26
Tue 17 May     6
Name: count, dtype: int64

In [1223]:
# df = df[df.date == 'Fri 20 May']

In [1224]:
df

Unnamed: 0,session,date,start_time,end_time,duration,title,authors,cluster
0,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,2022-05-17 22:00:00,2022-05-17 22:04:00,4,An Empirical Evaluation of GitHub Copilot’s Co...,"[Nhan Nguyen, Sarah Nadi]",2
1,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,2022-05-17 22:04:00,2022-05-17 22:08:00,4,Comments on Comments: Where Code Review and Do...,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince...",3
2,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,2022-05-17 22:08:00,2022-05-17 22:15:00,7,Does This Apply to Me? An Empirical Study of T...,"[Akalanka Galappaththi, Sarah Nadi, Christoph ...",0
3,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,2022-05-17 22:15:00,2022-05-17 22:22:00,7,Towards Reliable Agile Iterative Planning via ...,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan...",3
4,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,2022-05-17 22:22:00,2022-05-17 22:29:00,7,BotHunter: An Approach to Detect Software Bots...,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein...",4
...,...,...,...,...,...,...,...,...
112,Session 16: Non-functional Properties (Availab...,Fri 20 May,2022-05-20 14:07:00,2022-05-20 14:11:00,4,A Large-scale Dataset of (Open Source) License...,[Stefano Zacchiroli],4
113,Session 16: Non-functional Properties (Availab...,Fri 20 May,2022-05-20 14:11:00,2022-05-20 14:18:00,7,SECOM: Towards a convention for security commi...,"[Sofia Reis, Rui Abreu, Hakan Erdogmus, Corina...",4
114,Session 16: Non-functional Properties (Availab...,Fri 20 May,2022-05-20 14:18:00,2022-05-20 14:25:00,7,Varangian: A Git Bot for Augmented Static Anal...,"[Saurabh Pujar, Yunhui Zheng, Luca Buratti, Bu...",4
115,Session 16: Non-functional Properties (Availab...,Fri 20 May,2022-05-20 14:32:00,2022-05-20 14:36:00,4,Is GitHub's Copilot as Bad As Humans at Introd...,"[Owura Asare, Mei Nagappan, N. Asokan]",2


In [1225]:
df['start_time'] = pd.to_datetime(df['start_time']).astype(int) // 10**9 // 60
df['end_time'] = pd.to_datetime(df['end_time']).astype(int) // 10**9 // 60

df.head()

Unnamed: 0,session,date,start_time,end_time,duration,title,authors,cluster
0,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547080,27547084,4,An Empirical Evaluation of GitHub Copilot’s Co...,"[Nhan Nguyen, Sarah Nadi]",2
1,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547084,27547088,4,Comments on Comments: Where Code Review and Do...,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince...",3
2,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547088,27547095,7,Does This Apply to Me? An Empirical Study of T...,"[Akalanka Galappaththi, Sarah Nadi, Christoph ...",0
3,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547095,27547102,7,Towards Reliable Agile Iterative Planning via ...,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan...",3
4,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547102,27547109,7,BotHunter: An Approach to Detect Software Bots...,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein...",4


In [1226]:
df.dtypes

session       object
date          object
start_time     int64
end_time       int64
duration       int64
title         object
authors       object
cluster        int64
dtype: object

In [1227]:
print(df["session"].nunique())
df["session"].unique()[:3]

19


array(['Session 1Technical Papers/Registered ReportsatMSR Main room - even hoursChair(s):Hongyu ZhangUniversity of Newcastle,Masud RahmanDalhousie University',
       'Session 2: Maintenance (Issues & Smells)Technical Papers/Registered Reports/Data and Tool Showcase Track/Industry TrackatMSR Main room - odd hoursChair(s):Alessio FerrariCNR-ISTI',
       'Session 3: Introspection, Vision, and Human AspectsTechnical Papers/Data and Tool Showcase Track/Industry Track/Registered ReportsatMSR Main room - odd hoursChair(s):Alexander SerebrenikEindhoven University of Technology,Sebastian BaltesSAP SE & University of Adelaide'],
      dtype=object)

In [1228]:
# df[df["session"]=='Session 1Technical Papers/Registered ReportsatMSR Main room - even hoursChair(s):Hongyu ZhangUniversity of Newcastle,Masud RahmanDalhousie University']

In [1229]:
def merge_intervals(intervals):
    if not intervals:
        return 0
    
    # Sort intervals by the start time
    intervals.sort(key=lambda x: x[0])
    
    merged = [intervals[0]]
    for current_start, current_end in intervals[1:]:
        last_end = merged[-1][1]
        
        if current_start <= last_end:
            # There's an overlap, extend the previous interval
            merged[-1] = (merged[-1][0], max(last_end, current_end))
        else:
            # No overlap, add this interval as is
            merged.append((current_start, current_end))
    
    # Compute total duration in minutes (or another unit as desired)
    total_duration_minutes = sum((end - start) for start, end in merged)   # convert seconds to minutes
    return total_duration_minutes

total_durations = []

for date, group in df.groupby('date'):
    intervals = list(zip(group['start_time'], group['end_time']))
    total_duration = merge_intervals(intervals)
    total_durations.append({'date': date, 'total_duration': total_duration})

# Converting the result into a DataFrame
total_duration_df = pd.DataFrame(total_durations)

total_duration_df

Unnamed: 0,date,total_duration
0,Fri 20 May,140
1,Thu 19 May,212
2,Tue 17 May,36
3,Wed 18 May,252


In [1230]:
dates = df.groupby("date")["duration"].sum().reset_index()

dates


Unnamed: 0,date,duration
0,Fri 20 May,140
1,Thu 19 May,212
2,Tue 17 May,36
3,Wed 18 May,288


In [1231]:
df.head(5)

Unnamed: 0,session,date,start_time,end_time,duration,title,authors,cluster
0,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547080,27547084,4,An Empirical Evaluation of GitHub Copilot’s Co...,"[Nhan Nguyen, Sarah Nadi]",2
1,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547084,27547088,4,Comments on Comments: Where Code Review and Do...,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince...",3
2,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547088,27547095,7,Does This Apply to Me? An Empirical Study of T...,"[Akalanka Galappaththi, Sarah Nadi, Christoph ...",0
3,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547095,27547102,7,Towards Reliable Agile Iterative Planning via ...,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan...",3
4,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547102,27547109,7,BotHunter: An Approach to Detect Software Bots...,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein...",4


In [1232]:
df.cluster.value_counts()

cluster
4    30
3    27
2    22
0    19
1    19
Name: count, dtype: int64

In [1233]:
df.iloc[0]['cluster']

2

In [1234]:
# common_cluster_matrix = pd.DataFrame(np.zeros((len(df), len(df)), dtype=int), index=df['title'], columns=df['title'])
common_cluster_pairs = {}

# Populate the matrix
for i in range(len(df)):
    for j in range(len(df)):
        # Intersect authors lists, if not empty set cell to 1
        if i!=j and df.iloc[i]['cluster'] == df.iloc[j]['cluster']:
            common_cluster_pairs[(i,j)] = 1

# common_cluster_pairs

In [1235]:
common_author_matrix = pd.DataFrame(np.zeros((len(df), len(df)), dtype=int), index=df['title'], columns=df['title'])

# Populate the matrix
for i in range(len(df)):
    for j in range(len(df)):
        # Intersect authors lists, if not empty set cell to 1
        if set(df.iloc[i]['authors']) & set(df.iloc[j]['authors']):
            common_author_matrix.iloc[i, j] = 1

# common_author_matrix.head()

In [1236]:
papers = df[["title","authors","duration"]].reset_index(drop=True)
papers.head()

Unnamed: 0,title,authors,duration
0,An Empirical Evaluation of GitHub Copilot’s Co...,"[Nhan Nguyen, Sarah Nadi]",4
1,Comments on Comments: Where Code Review and Do...,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince...",4
2,Does This Apply to Me? An Empirical Study of T...,"[Akalanka Galappaththi, Sarah Nadi, Christoph ...",7
3,Towards Reliable Agile Iterative Planning via ...,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan...",7
4,BotHunter: An Approach to Detect Software Bots...,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein...",7


In [1237]:
df.head()

Unnamed: 0,session,date,start_time,end_time,duration,title,authors,cluster
0,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547080,27547084,4,An Empirical Evaluation of GitHub Copilot’s Co...,"[Nhan Nguyen, Sarah Nadi]",2
1,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547084,27547088,4,Comments on Comments: Where Code Review and Do...,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince...",3
2,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547088,27547095,7,Does This Apply to Me? An Empirical Study of T...,"[Akalanka Galappaththi, Sarah Nadi, Christoph ...",0
3,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547095,27547102,7,Towards Reliable Agile Iterative Planning via ...,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan...",3
4,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547102,27547109,7,BotHunter: An Approach to Detect Software Bots...,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein...",4


In [1238]:
sessions = df.groupby("session")["duration"].sum()

sessions.head()


session
HackathonHackathon/Technical PapersatMSR Main room - odd hoursChair(s):Gregorio RoblesUniversidad Rey Juan Carlos,Jesus M. Gonzalez-BarahonaUniversidad Rey Juan Carlos,Maëlick ClaesUniversity of Oulu    30
Mining ChallengeMining Challenge/Technical PapersatMSR Main room - even hoursChair(s):Steffen HerboldTU Clausthal                                                                                          28
Session 10: SecurityTechnical Papers/Data and Tool Showcase Track/Registered ReportsatMSR Main room - odd hoursChair(s):Triet Le Huynh MinhThe University of Adelaide                                      34
Session 11: Machine Learning & Information RetrievalTechnical PapersatMSR Main room - odd hoursChair(s):Phuong T. NguyenUniversity of L’Aquila                                                             39
Session 12: Integration & Large-Scale MiningTechnical Papers/Data and Tool Showcase TrackatMSR Main room - even hoursChair(s):Jin L.C. GuoMcGill University,Amjed TahirM

In [1239]:
df.groupby("session")["duration"].sum().values

array([30, 28, 34, 39, 36, 36, 36, 38, 36, 36, 33, 33, 36, 34, 37, 37, 37,
       30, 50])

In [1240]:
df.groupby("session").size().values

array([6, 7, 7, 6, 6, 6, 6, 8, 6, 6, 6, 6, 6, 7, 7, 7, 7, 6, 1])

In [1241]:
# df.session.value_counts()

In [1242]:
df[df.session=='Tutorial: Empirical Standards for Repository MiningTutorialsatMSR Tutorials room']

Unnamed: 0,session,date,start_time,end_time,duration,title,authors,cluster
31,Tutorial: Empirical Standards for Repository M...,Wed 18 May,27547980,27548030,50,Empirical Standards for Repository Mining,"[Paul Ralph, Tushar Sharma, Preetha Chatterjee]",3


In [1243]:
# Assumption of variables for illustration
num_sessions = len(sessions)  # Number of sessions as defined by PC chairs
session_lengths = sessions.values  # Length of each session in minutes
# num_tracks = 1 # FOR NOW, ASSUME WE ONLY HAVE A SINGLE TRACK

In [1244]:
len(papers)

117

In [1245]:
# Initiate the problem


from pulp import *
import numpy as np


prob = LpProblem("Conference_Schedule_Optimization", LpMaximize)

# Decision variables
x = LpVariable.dicts("schedule", 
                            ((i, j) 
                             for i in range(len(papers)) # paper number
                             for j in range(num_sessions) # session number
                            ) 
                            ,cat='Binary' # to make it efficient
                       )
z = LpVariable.dicts('product', ((i,j,m,n)
                                  for i in range(len(papers))
                                  for j in range(len(papers))
                                  for m in range(num_sessions)
                                  for n in range(num_sessions)
                                 
                                )
                                 , 0, 1, cat='Continuous')

obj = lpSum(common_cluster_pairs[pair] * z[(pair[0], pair[1],j, j)]
             for pair in common_cluster_pairs
             for j in range(num_sessions)
             )

prob += obj


for pair in common_cluster_pairs:
    for j in range(num_sessions):
        prob += z[(pair[0], pair[1], j, j)] >= x[(pair[0],j)] + x[(pair[1],j)] - 1
        prob += z[(pair[0], pair[1], j, j)] <= x[(pair[0],j)]
        prob += z[(pair[0], pair[1], j, j)] <=  x[(pair[1],j)]

# start_times = LpVariable.dicts("start_times", 
#                             (i 
#                              for i in range(len(papers)) # paper number
#                             ) # start time
#                             ,cat='Continuous'
#                        )
# end_times = LpVariable.dicts("end_times", 
#                             (i 
#                              for i in range(len(papers)) # paper number
#                             ) # end time
#                             ,cat='Continuous'
#                        )

# Objective function: For the basic structure, we make it a dummy one as our main focus is on satisfying constraints
# prob += 0, "ArbitraryObjective"


In [1246]:
len(papers)

117

In [1247]:
# give some flexibility by adding a few mins as a buffer since we removed all QA sessions
session_lengths_extended = [i + 10 for i in session_lengths]

In [1248]:
session_lengths_extended

[40, 38, 44, 49, 46, 46, 46, 48, 46, 46, 43, 43, 46, 44, 47, 47, 47, 40, 60]

In [1249]:
for i in range(len(papers)):
    for j in range(num_sessions):
            prob += x[(i, j)] >= 0
            prob += x[(i, j)] <= 1


# Ensure each paper is scheduled exactly once
for i in range(len(papers)):
    prob += lpSum(x[(i, j)] for j in range(num_sessions)) == 1, f"One_placement_paper_{i}"

# Do not exceed session length
for j in range(num_sessions):
    prob += lpSum(x[(i, j)] * papers.loc[i]["duration"] for i in range(len(papers))) <= session_lengths_extended[j], f"Session_length_limit_{j}"


In [1250]:
listSolvers(onlyAvailable=True)

['PULP_CBC_CMD']

In [1251]:
# Choose the solver and set the time limit
# solver = GLPK(msg=False, timeLimit=600)  # Set the time limit to 600 seconds
prob.solve(PULP_CBC_CMD(timeLimit=2000))

Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /opt/conda/lib/python3.10/site-packages/pulp/solverdir/cbc/linux/64/cbc /var/tmp/537b156849274e5e8d4c4037102dd7a9-pulp.mps -max -sec 2000 -timeMode elapsed -branch -printingOptions all -solution /var/tmp/537b156849274e5e8d4c4037102dd7a9-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 159513 COLUMNS
At line 585988 RHS
At line 745497 BOUNDS
At line 799363 ENDATA
Problem MODEL has 159508 rows, 53865 columns and 370386 elements
Coin0008I MODEL read with 0 errors
seconds was changed from 1e+100 to 2000
Option for timeMode changed from cpu to elapsed
Continuous objective value is 2718 - 65.34 seconds
Cgl0003I 1794 fixed, 0 tightened bounds, 4587 strengthened rows, 7346 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 14072 strengthened rows, 2087 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 5791 strengthened rows, 3302 substitutions
Cgl0003I 0 fixed, 0 tighte

1

In [1252]:
# Initialize an empty list to hold the results
results = []

# Iterate over each session and track
for j in range(num_sessions):
    session_number = j + 1
    # Iterate over each paper
    for i in range(len(papers)):
        if x[(i, j)].varValue is not None and  x[(i, j)].varValue== 1:
            # Prepare a dict with the required information
            paper_info = {
                'optimized_session': session_number,
                'title': papers.loc[i]['title'],
                'duration': papers.loc[i]['duration'],
                'validity': 1
            }
            # Append to the results list
            results.append(paper_info)
        elif x[(i, j)].varValue is not None and  x[(i, j)].varValue > 0:
            # Prepare a dict with the required information
            paper_info = {
                'optimized_session': session_number,
                'title': papers.loc[i]['title'],
                'duration': papers.loc[i]['duration'],
                'validity': 0
            }
            # Append to the results list
            results.append(paper_info)

# Convert the list of dictionaries into a DataFrame
df_results = pd.DataFrame(results)

# If you need to see the first few rows of the DataFrame to ensure it's correct
df_results.head()


Unnamed: 0,optimized_session,title,duration,validity
0,1,Does This Apply to Me? An Empirical Study of T...,7,1
1,1,Is Surprisal in Issue Trackers Actionable?,4,1
2,1,Exploring Apache Incubator Project Trajectorie...,4,1
3,1,A Culture of Productivity: Maximizing Producti...,7,1
4,1,DaSEA – A Dataset for Software Ecosystem Analysis,4,1


In [1253]:
df_results["validity"].mean()

1.0

In [1254]:
df_merged_result = pd.merge(df[['title', 'session', 'cluster']], df_results[['title', 'optimized_session']], on='title', how='left')

In [1255]:
df_merged_result

Unnamed: 0,title,session,cluster,optimized_session
0,An Empirical Evaluation of GitHub Copilot’s Co...,Session 1Technical Papers/Registered Reportsat...,2,13
1,Comments on Comments: Where Code Review and Do...,Session 1Technical Papers/Registered Reportsat...,3,18
2,Does This Apply to Me? An Empirical Study of T...,Session 1Technical Papers/Registered Reportsat...,0,1
3,Towards Reliable Agile Iterative Planning via ...,Session 1Technical Papers/Registered Reportsat...,3,4
4,BotHunter: An Approach to Detect Software Bots...,Session 1Technical Papers/Registered Reportsat...,4,6
...,...,...,...,...
112,A Large-scale Dataset of (Open Source) License...,Session 16: Non-functional Properties (Availab...,4,3
113,SECOM: Towards a convention for security commi...,Session 16: Non-functional Properties (Availab...,4,14
114,Varangian: A Git Bot for Augmented Static Anal...,Session 16: Non-functional Properties (Availab...,4,8
115,Is GitHub's Copilot as Bad As Humans at Introd...,Session 16: Non-functional Properties (Availab...,2,12


In [1256]:
df_merged_result.sort_values('optimized_session')

Unnamed: 0,title,session,cluster,optimized_session
58,DaSEA – A Dataset for Software Ecosystem Analysis,Session 8: Large-Scale Mining & Software Ecosy...,3,1
79,Is Open Source Eating the World’s Software? Me...,Session 12: Integration & Large-Scale MiningTe...,3,1
2,Does This Apply to Me? An Empirical Study of T...,Session 1Technical Papers/Registered Reportsat...,0,1
65,Toward Granular Automatic Unit Test Case Gener...,Session 9: Scaling & CloudIndustry Track/Regis...,2,1
60,SniP: An Efficient Stack Tracing Framework for...,Session 9: Scaling & CloudIndustry Track/Regis...,2,1
...,...,...,...,...
94,Using Bandit Algorithms for Selecting Feature ...,Session 14: Software QualityTechnical Papers/I...,0,18
53,An Empirical Study on the Survival Rate of Git...,Session 8: Large-Scale Mining & Software Ecosy...,4,18
44,npm-filter: Automating the mining of dynamic i...,Session 6: Maintenance & TestingData and Tool ...,1,19
90,Evaluating few shot and Contrastive learning M...,Session 13: Security & QualityTechnical Papers...,2,19


In [1257]:
new_session_lengths = (df_results.groupby("optimized_session")["duration"].sum() + pd.Series(index=range(1,1+num_sessions), data = 0)).fillna(0)
new_session_lengths

optimized_session
1     38
2     32
3     40
4     47
5     40
6     33
7     33
8     30
9     21
10    29
11    24
12    38
13    28
14    37
15    42
16    44
17    33
18    29
19    58
dtype: int64

In [1258]:
sorted(df.groupby("session")["duration"].sum().values)

[28, 30, 30, 33, 33, 34, 34, 36, 36, 36, 36, 36, 36, 37, 37, 37, 38, 39, 50]

In [1262]:
completeness_score = metrics.completeness_score(df_merged_result['session'], df_merged_result['optimized_session'])
homegeneity_score = metrics.homogeneity_score(df_merged_result['session'], df_merged_result['optimized_session'])
print(f"completeness score: {completeness_score}")
print(f"homogeneity score: {homegeneity_score}")  

completeness score: 0.41009481310511414
homogeneity score: 0.40999795364801067


In [1263]:
metrics.completeness_score(df_merged_result['session'], df_merged_result['cluster'])

0.4936476397153556

In [1264]:
metrics.homogeneity_score(df_merged_result['session'], df_merged_result['cluster'])

0.2699903612450044