In [1]:
import numpy as np
from io import StringIO

In [2]:
import openai

with open("secret/secret_key_file.txt", 'r') as file:
    api_key = file.read().strip()

client = openai.OpenAI(api_key = api_key)

In [3]:
import pandas as pd
import re
from bs4 import BeautifulSoup

# Without error capture to ensure the output (whether success or failure) is reported back.
# Attempt to read tables from the HTML file using pandas
file_path = "data/Program - MSR 2022.html"

In [4]:
with open(file_path, 'r') as file:
    html_content = file.read()

In [5]:
soup = BeautifulSoup(html_content, 'lxml')

In [6]:
rows = soup.find_all('tr', class_='hidable')

In [7]:
# Regex to match date pattern (e.g., "Tue 16 May")
date_pattern = re.compile(r"\b(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)\s\d{1,2}\s(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)")

In [8]:
extracted_rows = []

# Loop through each row to extract required details
for row in rows:
    temp_row = []
    
    # Extract session details from parent div if not already extracted
    session_div = row.find_previous('div', class_='session-info-in-table')
    session = session_div.get_text(strip=True) if session_div else ''
    
    datetime_info = row.find('td', class_='text-right')
    talk_info = row.find_all('td')[-1]
    
    if datetime_info and talk_info:
        time = datetime_info.find('div', class_='start-time').get_text(strip=True) if datetime_info.find('div', class_='start-time') else ''
        duration = datetime_info.find('strong').get_text(strip=True) if datetime_info.find('strong') else ''
        
        # Calculate end time based on start time and duration if needed
        
        temp_row.append(time)
        # temp_row.append(end_time)  # Calculate and append end time if needed
        temp_row.append(duration)
        
        talk_title = talk_info.find('strong')
        if talk_title:
            temp_row.append(talk_title.get_text(strip=True))
            
            link = talk_title.find('a', href=True)
            temp_row.append(link['href'] if link else '')
        else:
            temp_row.extend(['', ''])
            
        # Extract authors
        authors_div = talk_info.find('div', class_='performers')
        authors = [author.get_text(strip=True) for author in authors_div.find_all('a')] if authors_div else []
        temp_row.append(authors)
        
        # Append session information
        temp_row.append(session)
        
        # Locate date information
        # Tip: You may need to adjust how you locate the 'date information' based on your HTML structure
        date_info = row.find_previous('div', class_='day-wrapper')
        if date_info:
            date_text = date_info.get_text(strip=True)
            matched_date = date_pattern.search(date_text)
            date = matched_date.group(0) if matched_date else "Date Not Found"
        else:
            date = None
        temp_row.append(date)
        
        extracted_rows.append(temp_row)

columns = ['Time', 'Duration', 'Talk Title', 'Link', 'Authors', 'Session', 'Date']
df = pd.DataFrame(extracted_rows, columns=columns)

df.head()

Unnamed: 0,Time,Duration,Talk Title,Link,Authors,Session,Date
0,22:00,4m,An Empirical Evaluation of GitHub Copilot’s Co...,#,"[Nhan Nguyen, Sarah Nadi]",Session 1Technical Papers/Registered Reportsat...,Tue 17 May
1,22:04,4m,Comments on Comments: Where Code Review and Do...,#,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May
2,22:08,7m,Does This Apply to Me? An Empirical Study of T...,#,"[Akalanka Galappaththi, Sarah Nadi, Christoph ...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May
3,22:15,7m,Towards Reliable Agile Iterative Planning via ...,#,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May
4,22:22,7m,BotHunter: An Approach to Detect Software Bots...,#,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May


In [9]:
# df = df.sample(n=50, random_state=101)

In [10]:
# Convert all column names to lowercase and replace spaces with hyphens
df.columns = [col.lower().replace(' ', '_') for col in df.columns]

# Put date first
df = df[['date'] + [col for col in df.columns if col != 'date']]

# Convert Time to start_time as datetime and calculate end_time
df['start_time'] = pd.to_datetime(df['date'] + ' ' + df['time'], format='%a %d %b %H:%M')
df['duration'] = df['duration'].str.extract('(\d+)').astype(int)  # Extract duration in minutes
df['end_time'] = df['start_time'] + pd.to_timedelta(df['duration'], unit='m')

# Delete the Link column
df = df.drop(['link', 'time'], axis=1)

# Move Authors column to the last position
df = df[[col for col in df.columns if col != 'authors'] + ['authors']]

df = df[['session', 'date', 'start_time', 'end_time', 'duration', 'talk_title','authors']]

# Display the final data frame
df.head()

Unnamed: 0,session,date,start_time,end_time,duration,talk_title,authors
0,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,1900-05-17 22:00:00,1900-05-17 22:04:00,4,An Empirical Evaluation of GitHub Copilot’s Co...,"[Nhan Nguyen, Sarah Nadi]"
1,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,1900-05-17 22:04:00,1900-05-17 22:08:00,4,Comments on Comments: Where Code Review and Do...,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince..."
2,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,1900-05-17 22:08:00,1900-05-17 22:15:00,7,Does This Apply to Me? An Empirical Study of T...,"[Akalanka Galappaththi, Sarah Nadi, Christoph ..."
3,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,1900-05-17 22:15:00,1900-05-17 22:22:00,7,Towards Reliable Agile Iterative Planning via ...,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan..."
4,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,1900-05-17 22:22:00,1900-05-17 22:29:00,7,BotHunter: An Approach to Detect Software Bots...,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein..."


In [11]:
df['start_time'] = pd.to_datetime(df['start_time']).astype(int) // 10**9 // 60
df['end_time'] = pd.to_datetime(df['end_time']).astype(int) // 10**9 // 60

df.head()

Unnamed: 0,session,date,start_time,end_time,duration,talk_title,authors
0,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,-36619320,-36619316,4,An Empirical Evaluation of GitHub Copilot’s Co...,"[Nhan Nguyen, Sarah Nadi]"
1,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,-36619316,-36619312,4,Comments on Comments: Where Code Review and Do...,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince..."
2,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,-36619312,-36619305,7,Does This Apply to Me? An Empirical Study of T...,"[Akalanka Galappaththi, Sarah Nadi, Christoph ..."
3,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,-36619305,-36619298,7,Towards Reliable Agile Iterative Planning via ...,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan..."
4,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,-36619298,-36619291,7,BotHunter: An Approach to Detect Software Bots...,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein..."


In [12]:
df = df[df["date"]=="Fri 20 May"]

In [13]:
df.dtypes

session       object
date          object
start_time     int64
end_time       int64
duration       int64
talk_title    object
authors       object
dtype: object

In [14]:
len(df)

33

In [15]:
df["session"].unique()[:3]

array(['Session 14: Software QualityTechnical Papers/Industry Track/Data and Tool Showcase TrackatMSR Main room - even hoursChair(s):Kla TantithamthavornMonash University,Simone ScalabrinoUniversity of Molise',
       'Session 15: Collaboration & Open SourceRegistered Reports/Data and Tool Showcase Track/Technical Papers/Industry TrackatMSR Main room - odd hoursChair(s):Massimiliano Di PentaUniversity of Sannio, Italy,Fiorella ZampettiUniversity of Sannio, Italy',
       'Tutorial: Using Datalore for Reproducible ResearchTutorialsatMSR Main room - odd hours'],
      dtype=object)

In [16]:
df[df["session"]=='Session 1Technical Papers/Registered ReportsatMSR Main room - even hoursChair(s):Hongyu ZhangUniversity of Newcastle,Masud RahmanDalhousie University']

Unnamed: 0,session,date,start_time,end_time,duration,talk_title,authors


In [17]:
def merge_intervals(intervals):
    if not intervals:
        return 0
    
    # Sort intervals by the start time
    intervals.sort(key=lambda x: x[0])
    
    merged = [intervals[0]]
    for current_start, current_end in intervals[1:]:
        last_end = merged[-1][1]
        
        if current_start <= last_end:
            # There's an overlap, extend the previous interval
            merged[-1] = (merged[-1][0], max(last_end, current_end))
        else:
            # No overlap, add this interval as is
            merged.append((current_start, current_end))
    
    # Compute total duration in minutes (or another unit as desired)
    total_duration_minutes = sum((end - start) for start, end in merged)   # convert seconds to minutes
    return total_duration_minutes

total_durations = []

for date, group in df.groupby('date'):
    intervals = list(zip(group['start_time'], group['end_time']))
    total_duration = merge_intervals(intervals)
    total_durations.append({'date': date, 'total_duration': total_duration})

# Converting the result into a DataFrame
total_duration_df = pd.DataFrame(total_durations)

total_duration_df

Unnamed: 0,date,total_duration
0,Fri 20 May,260


In [18]:
dates = df.groupby("date")["duration"].sum().reset_index()


dates

Unnamed: 0,date,duration
0,Fri 20 May,310


## Perform linear programming

In [19]:
df_input = df[['talk_title', 'duration']].copy()

# Display the head of df_input to verify
df_input.head()

Unnamed: 0,talk_title,duration
116,Evaluating the effectiveness of local explanat...,4
117,Problems and Solutions in Applying Continuous ...,7
118,To Type or Not to Type? A Systematic Compariso...,7
119,Using Bandit Algorithms for Selecting Feature ...,7
120,Constructing Dataset of Functionally Equivalen...,4


In [20]:
sessions = df.groupby("session")["duration"].sum()

sessions.head()

session
HackathonHackathon/Technical PapersatMSR Main room - odd hoursChair(s):Gregorio RoblesUniversidad Rey Juan Carlos,Jesus M. Gonzalez-BarahonaUniversidad Rey Juan Carlos,Maëlick ClaesUniversity of Oulu                                                                                  50
Session 14: Software QualityTechnical Papers/Industry Track/Data and Tool Showcase TrackatMSR Main room - even hoursChair(s):Kla TantithamthavornMonash University,Simone ScalabrinoUniversity of Molise                                                                                 50
Session 15: Collaboration & Open SourceRegistered Reports/Data and Tool Showcase Track/Technical Papers/Industry TrackatMSR Main room - odd hoursChair(s):Massimiliano Di PentaUniversity of Sannio, Italy,Fiorella ZampettiUniversity of Sannio, Italy                                  50
Session 16: Non-functional Properties (Availability, Security, Legal Aspects)Industry Track/Technical Papers/Registered Reports/Data and Too

In [21]:
# Convert df_input to a string representation
df_input_str = df_input.sample(frac=1).to_string(index=False)

# Construct the prompt
prompt = f"""
Program creation is the process of taking all the accepted papers to a conference and allocating a presentation slot for each paper with parallel sessions. 
The PC chairs of a conference typically do this manually. 
Assign sessions to the following papers based on the following constraints:
1. The total length of all paper presentations within a session should be smaller than the length of the session they are in. 
2. No new sessions should be added. '
3. The output should contain the schedule in the form of the csv representation of a data frame. This csv representation should be in three quotes (```) on both sides so that I can easily extract it from your result and make a data frame. 

Example output (use as many rows as the actual number of papers):

```
session@talk_title@duration
231@An Empirical Study on Maintainable Method Size in Java@7
223@Improve Quality of Cloud Serverless Architectures through Software Repository Mining@7
15@Extracting corrective actions from code repositories@7
15@How to Improve Deep Learning for Software Analytics (a case study with code smell detection)@7
11@ReCover: a Curated Dataset for Regression Testing Research@4
```

The list of paper and session titles are below: 


Session Lengths:
{sessions.to_string(index=False)}

Paper durations:
{df_input_str}


Make sure all the papers are included in the schedule. 

"""

# Print the prompt to verify
print(prompt[:500])


Program creation is the process of taking all the accepted papers to a conference and allocating a presentation slot for each paper with parallel sessions. 
The PC chairs of a conference typically do this manually. 
Assign sessions to the following papers based on the following constraints:
1. The total length of all paper presentations within a session should be smaller than the length of the session they are in. 
2. No new sessions should be added. '
3. The output should contain the schedule 


In [22]:
len(prompt)

6480

In [23]:
response = client.chat.completions.create(
    model= "gpt-4-0125-preview", #"gpt-3.5-turbo-0125",  # You can switch this to "gpt-4-turbo-preview", "gpt-3.5-turbo-0125"
    messages=[
        {
            "role": "user",
            "content": prompt
        },
    ],
    temperature=0.8,
    # max_tokens=256,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0
)
ans_string = response.choices[0].message.content

In [24]:
len(ans_string)

3025

In [25]:
print(ans_string)

To distribute the papers into the sessions while adhering to the constraints, we first note the session lengths and the sum of paper durations to ensure we do not exceed the session capacity. We aim to distribute longer discussions and Q&A sessions across different session slots since their lengths vary significantly, ensuring that no session is underutilized or exceeds its limit.

Given the session lengths are all 50 minutes, except one session which is 60 minutes, we can distribute the papers as follows, making sure to distribute the discussions and Q&A papers (14, 12, 20, 17 minutes) across different sessions to balance the time:

```
session@talk_title@duration
1@Quid Pro Quo: An Exploration of Reciprocity in Code Review@5
1@FixJS: A Dataset of Bug-fixing JavaScript Commits@4
1@Bot Detection in GitHub Repositories@5
1@Can instability variations warn developers when open-source projects boost?@4
1@A Time Series-Based Dataset of Open-Source Software Evolution@4
1@Constructing Dataset

In [26]:
# Use regular expressions to find the CSV string
# The pattern looks for a string enclosed in triple quotes
match = re.search("```\n(.*?)\n```", ans_string, re.DOTALL)
if match:
    csv_string = match.group(1)  # Extract the actual CSV data
    # print("Extracted CSV:\n", csv_string)

    # Convert the CSV string to a DataFrame
    # StringIO is used to convert the string to a file-like object
    df_results = pd.read_csv(StringIO(csv_string), sep="@")

    # Display the DataFrame
    print("Got the df!")
else:
    print("No CSV data found in the string.")
    
df_results.head()

Got the df!


Unnamed: 0,session,talk_title,duration
0,1,Quid Pro Quo: An Exploration of Reciprocity in...,5
1,1,FixJS: A Dataset of Bug-fixing JavaScript Commits,4
2,1,Bot Detection in GitHub Repositories,5
3,1,Can instability variations warn developers whe...,4
4,1,A Time Series-Based Dataset of Open-Source Sof...,4


In [27]:
df_results

Unnamed: 0,session,talk_title,duration
0,1,Quid Pro Quo: An Exploration of Reciprocity in...,5
1,1,FixJS: A Dataset of Bug-fixing JavaScript Commits,4
2,1,Bot Detection in GitHub Repositories,5
3,1,Can instability variations warn developers whe...,4
4,1,A Time Series-Based Dataset of Open-Source Sof...,4
5,1,Constructing Dataset of Functionally Equivalen...,4
6,1,A Versatile Dataset of Agile Open Source Softw...,4
7,1,A Large-scale Dataset of (Open Source) License...,4
8,1,Evaluating the effectiveness of local explanat...,4
9,1,Towards Understanding Barriers and Mitigation ...,4


In [28]:
df_results.shape

(33, 3)

## Check if the constraints are met

### Session Constraints

In [29]:
df_results.groupby("session")["duration"].sum().sort_values()

session
5     20
4     26
3     41
1     50
2     56
6    117
Name: duration, dtype: int64

In [30]:
np.array(sorted(sessions.values))

array([50, 50, 50, 50, 50, 60])

In [31]:
# Predictions, Trues. 
print("Checking the extent to which the constraints are met. If the numbers are different, then the ")
len(df_results.groupby("session")["duration"].sum().sort_values()), len(np.array(sorted(sessions.values)))

Checking the extent to which the constraints are met. If the numbers are different, then the 


(6, 6)

In [32]:
print("Checking the extent to which the constraints are met. Negative numbers are violations where the planned session is too long.")
session_count = 0
for truth, pred in zip(np.array(sorted(sessions.values))[::-1], df_results.groupby("session")["duration"].sum().sort_values().values[::-1]):
    session_count += 1
    print("Session ",session_count)
    if not truth:
        print(-pred)
    elif not pred:
        print(truth)
    else:
        print(truth-pred)

Checking the extent to which the constraints are met. Negative numbers are violations where the planned session is too long.
Session  1
-57
Session  2
-6
Session  3
0
Session  4
9
Session  5
24
Session  6
30


### Paper constraints

In [33]:
df_results.shape, df.shape

((33, 3), (33, 7))