In [1]:
import numpy as np
from io import StringIO

In [2]:
import openai

with open("secret/secret_key_file.txt", 'r') as file:
    api_key = file.read().strip()

client = openai.OpenAI(api_key = api_key)

In [3]:
import pandas as pd
import re
from bs4 import BeautifulSoup

# Without error capture to ensure the output (whether success or failure) is reported back.
# Attempt to read tables from the HTML file using pandas
file_path = "data/Program - MSR 2022.html"

In [4]:
with open(file_path, 'r') as file:
    html_content = file.read()

In [5]:
soup = BeautifulSoup(html_content, 'lxml')

In [6]:
rows = soup.find_all('tr', class_='hidable')

In [7]:
# Regex to match date pattern (e.g., "Tue 16 May")
date_pattern = re.compile(r"\b(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)\s\d{1,2}\s(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)")

In [8]:
extracted_rows = []

# Loop through each row to extract required details
for row in rows:
    temp_row = []
    
    # Extract session details from parent div if not already extracted
    session_div = row.find_previous('div', class_='session-info-in-table')
    session = session_div.get_text(strip=True) if session_div else ''
    
    datetime_info = row.find('td', class_='text-right')
    talk_info = row.find_all('td')[-1]
    
    if datetime_info and talk_info:
        time = datetime_info.find('div', class_='start-time').get_text(strip=True) if datetime_info.find('div', class_='start-time') else ''
        duration = datetime_info.find('strong').get_text(strip=True) if datetime_info.find('strong') else ''
        
        # Calculate end time based on start time and duration if needed
        
        temp_row.append(time)
        # temp_row.append(end_time)  # Calculate and append end time if needed
        temp_row.append(duration)
        
        talk_title = talk_info.find('strong')
        if talk_title:
            temp_row.append(talk_title.get_text(strip=True))
            
            link = talk_title.find('a', href=True)
            temp_row.append(link['href'] if link else '')
        else:
            temp_row.extend(['', ''])
            
        # Extract authors
        authors_div = talk_info.find('div', class_='performers')
        authors = [author.get_text(strip=True) for author in authors_div.find_all('a')] if authors_div else []
        temp_row.append(authors)
        
        # Append session information
        temp_row.append(session)
        
        # Locate date information
        # Tip: You may need to adjust how you locate the 'date information' based on your HTML structure
        date_info = row.find_previous('div', class_='day-wrapper')
        if date_info:
            date_text = date_info.get_text(strip=True)
            matched_date = date_pattern.search(date_text)
            date = matched_date.group(0) if matched_date else "Date Not Found"
        else:
            date = None
        temp_row.append(date)
        
        extracted_rows.append(temp_row)

columns = ['Time', 'Duration', 'Talk Title', 'Link', 'Authors', 'Session', 'Date']
df = pd.DataFrame(extracted_rows, columns=columns)

df.head()

Unnamed: 0,Time,Duration,Talk Title,Link,Authors,Session,Date
0,22:00,4m,An Empirical Evaluation of GitHub Copilot’s Co...,#,"[Nhan Nguyen, Sarah Nadi]",Session 1Technical Papers/Registered Reportsat...,Tue 17 May
1,22:04,4m,Comments on Comments: Where Code Review and Do...,#,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May
2,22:08,7m,Does This Apply to Me? An Empirical Study of T...,#,"[Akalanka Galappaththi, Sarah Nadi, Christoph ...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May
3,22:15,7m,Towards Reliable Agile Iterative Planning via ...,#,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May
4,22:22,7m,BotHunter: An Approach to Detect Software Bots...,#,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May


In [9]:
df = df.sample(n=10)

In [10]:
# Convert all column names to lowercase and replace spaces with hyphens
df.columns = [col.lower().replace(' ', '_') for col in df.columns]

# Put date first
df = df[['date'] + [col for col in df.columns if col != 'date']]

# Convert Time to start_time as datetime and calculate end_time
df['start_time'] = pd.to_datetime(df['date'] + ' ' + df['time'], format='%a %d %b %H:%M')
df['duration'] = df['duration'].str.extract('(\d+)').astype(int)  # Extract duration in minutes
df['end_time'] = df['start_time'] + pd.to_timedelta(df['duration'], unit='m')

# Delete the Link column
df = df.drop(['link', 'time'], axis=1)

# Move Authors column to the last position
df = df[[col for col in df.columns if col != 'authors'] + ['authors']]

df = df[['session', 'date', 'start_time', 'end_time', 'duration', 'talk_title','authors']]

# Display the final data frame
df.head()

Unnamed: 0,session,date,start_time,end_time,duration,talk_title,authors
85,Session 11: Machine Learning & Information Ret...,Thu 19 May,1900-05-19 11:00:00,1900-05-19 11:04:00,4,On the Naturalness of Fuzzer Generated Code,"[Rajeswari Hita Kambhamettu, John Billos, Caro..."
59,Session 7: Developer Wellbeing & Project Commu...,Wed 18 May,1900-05-18 21:26:00,1900-05-18 21:30:00,4,Exploring Apache Incubator Project Trajectorie...,"[Anirudh Ramchandran, Likang Yin, Vladimir Fil..."
179,"Blended Technical Session 4 (Introspection, Vi...",Tue 24 May,1900-05-24 11:54:00,1900-05-24 12:15:00,21,Discussions and Q&A,[]
178,"Blended Technical Session 4 (Introspection, Vi...",Tue 24 May,1900-05-24 11:46:00,1900-05-24 11:54:00,8,Investigating the Impact of Forgetting in Soft...,"[Utku Unal, Eray Tüzün, Tamer Gezici, Ausaf Ah..."
122,Session 14: Software QualityTechnical Papers/I...,Fri 20 May,1900-05-20 04:36:00,1900-05-20 04:50:00,14,Discussions and Q&A,[]


In [11]:
df['start_time'] = pd.to_datetime(df['start_time']).astype(int) // 10**9 // 60
df['end_time'] = pd.to_datetime(df['end_time']).astype(int) // 10**9 // 60

df.head()

Unnamed: 0,session,date,start_time,end_time,duration,talk_title,authors
85,Session 11: Machine Learning & Information Ret...,Thu 19 May,-36617100,-36617096,4,On the Naturalness of Fuzzer Generated Code,"[Rajeswari Hita Kambhamettu, John Billos, Caro..."
59,Session 7: Developer Wellbeing & Project Commu...,Wed 18 May,-36617914,-36617910,4,Exploring Apache Incubator Project Trajectorie...,"[Anirudh Ramchandran, Likang Yin, Vladimir Fil..."
179,"Blended Technical Session 4 (Introspection, Vi...",Tue 24 May,-36609846,-36609825,21,Discussions and Q&A,[]
178,"Blended Technical Session 4 (Introspection, Vi...",Tue 24 May,-36609854,-36609846,8,Investigating the Impact of Forgetting in Soft...,"[Utku Unal, Eray Tüzün, Tamer Gezici, Ausaf Ah..."
122,Session 14: Software QualityTechnical Papers/I...,Fri 20 May,-36616044,-36616030,14,Discussions and Q&A,[]


In [12]:
df.dtypes

session       object
date          object
start_time     int64
end_time       int64
duration       int64
talk_title    object
authors       object
dtype: object

In [13]:
len(df)

10

In [14]:
df["session"].unique()[:3]

array(['Session 11: Machine Learning & Information RetrievalTechnical PapersatMSR Main room - odd hoursChair(s):Phuong T. NguyenUniversity of L’Aquila',
       "Session 7: Developer Wellbeing & Project CommunicationTechnical Papers/Data and Tool Showcase Track/Industry TrackatMSR Main room - odd hoursChair(s):Bram AdamsQueen's University, Kingston, Ontario",
       'Blended Technical Session 4 (Introspection, Vision, and Human Aspects)Technical Papers/Registered Reports/Data and Tool Showcase TrackatRoom 315+316Chair(s):Ayushi RastogiUniversity of Groningen, The Netherlands'],
      dtype=object)

In [15]:
df[df["session"]=='Session 1Technical Papers/Registered ReportsatMSR Main room - even hoursChair(s):Hongyu ZhangUniversity of Newcastle,Masud RahmanDalhousie University']

Unnamed: 0,session,date,start_time,end_time,duration,talk_title,authors


In [16]:
def merge_intervals(intervals):
    if not intervals:
        return 0
    
    # Sort intervals by the start time
    intervals.sort(key=lambda x: x[0])
    
    merged = [intervals[0]]
    for current_start, current_end in intervals[1:]:
        last_end = merged[-1][1]
        
        if current_start <= last_end:
            # There's an overlap, extend the previous interval
            merged[-1] = (merged[-1][0], max(last_end, current_end))
        else:
            # No overlap, add this interval as is
            merged.append((current_start, current_end))
    
    # Compute total duration in minutes (or another unit as desired)
    total_duration_minutes = sum((end - start) for start, end in merged)   # convert seconds to minutes
    return total_duration_minutes

total_durations = []

for date, group in df.groupby('date'):
    intervals = list(zip(group['start_time'], group['end_time']))
    total_duration = merge_intervals(intervals)
    total_durations.append({'date': date, 'total_duration': total_duration})

# Converting the result into a DataFrame
total_duration_df = pd.DataFrame(total_durations)

total_duration_df

Unnamed: 0,date,total_duration
0,Fri 20 May,21
1,Thu 19 May,22
2,Tue 24 May,44
3,Wed 18 May,4


In [17]:
dates = df.groupby("date")["duration"].sum().reset_index()


dates

Unnamed: 0,date,duration
0,Fri 20 May,21
1,Thu 19 May,22
2,Tue 24 May,44
3,Wed 18 May,4


## Perform linear programming

In [18]:
df_input = df[['talk_title', 'duration']].copy()

# Display the head of df_input to verify
df_input.head()

Unnamed: 0,talk_title,duration
85,On the Naturalness of Fuzzer Generated Code,4
59,Exploring Apache Incubator Project Trajectorie...,4
179,Discussions and Q&A,21
178,Investigating the Impact of Forgetting in Soft...,8
122,Discussions and Q&A,14


In [19]:
sessions = df.groupby("session")["duration"].sum()

sessions.head()

session
Blended Technical Session 4 (Introspection, Vision, and Human Aspects)Technical Papers/Registered Reports/Data and Tool Showcase TrackatRoom 315+316Chair(s):Ayushi RastogiUniversity of Groningen, The Netherlands                                        44
Session 11: Machine Learning & Information RetrievalTechnical PapersatMSR Main room - odd hoursChair(s):Phuong T. NguyenUniversity of L’Aquila                                                                                                              4
Session 14: Software QualityTechnical Papers/Industry Track/Data and Tool Showcase TrackatMSR Main room - even hoursChair(s):Kla TantithamthavornMonash University,Simone ScalabrinoUniversity of Molise                                                   14
Session 15: Collaboration & Open SourceRegistered Reports/Data and Tool Showcase Track/Technical Papers/Industry TrackatMSR Main room - odd hoursChair(s):Massimiliano Di PentaUniversity of Sannio, Italy,Fiorella ZampettiUniversity

In [20]:
# Convert df_input to a string representation
df_input_str = df_input.to_string(index=False)

# Construct the prompt
prompt = f"""
Program creation is the process of taking all the accepted papers to a conference and allocating a presentation slot for each paper with parallel sessions. 
The PC chairs of a conference typically do this manually. 
Assign sessions to the following papers based on the following constraints:
1. The total length of all paper presentations within a session should be smaller than the length of the session they are in. 
2. No new sessions should be added. '
3. The output should contain the schedule in the form of the csv representation of a data frame. This csv representation should be in three quotes (```) on both sides so that I can easily extract it from your result and make a data frame. 


The list of paper and session titles are below: 

Paper durations:
{df_input_str}

Session Lengths:
{sessions.to_string(index=False)}



Make sure all the papers are included in the schedule. 

"""

# Print the prompt to verify
print(prompt[:500])


Program creation is the process of taking all the accepted papers to a conference and allocating a presentation slot for each paper with parallel sessions. 
The PC chairs of a conference typically do this manually. 
Assign sessions to the following papers based on the following constraints:
1. The total length of all paper presentations within a session should be smaller than the length of the session they are in. 
2. No new sessions should be added. '
3. The output should contain the schedule 


In [21]:
len(prompt)

2051

In [22]:
response = client.chat.completions.create(
    model= "gpt-4-0125-preview", #"gpt-3.5-turbo-0125",  # You can switch this to "gpt-4-turbo-preview", "gpt-3.5-turbo-0125"
    messages=[
        {
            "role": "user",
            "content": prompt
        },
    ],
    temperature=0.8,
    max_tokens=256,
    top_p=1,
    frequency_penalty=0,
    presence_penalty=0
)
ans_string = response.choices[0].message.content

In [23]:
len(ans_string)

647

In [24]:
print(ans_string)

```
session,talk_title,duration
44,On the Naturalness of Fuzzer Generated Code,4
44,Exploring Apache Incubator Project Trajectories with APEX,4
44,Investigating the Impact of Forgetting in Software Development,8
44,Toward Granular Automatic Unit Test Case Generation,4
44,Tooling for Time- and Space-efficient git Repository Mining,4
14,Discussions and Q&A,14
7,Code Review Practices for Refactoring Changes: An Empirical Study on OpenStack,7
10,Back to the future: Empirical Revolution(s) in Software Engineering,10
4,Discussions and Q&A,21
8,Challenges in Migrating Imperative Deep Learning Programs to Graph Execution: An Empirical Study,15
```


In [25]:
# Use regular expressions to find the CSV string
# The pattern looks for a string enclosed in triple quotes
match = re.search("```(.*?)```", ans_string, re.DOTALL)
if match:
    csv_string = match.group(1)  # Extract the actual CSV data
    print("Extracted CSV:\n", csv_string)

    # Convert the CSV string to a DataFrame
    # StringIO is used to convert the string to a file-like object
    df_results = pd.read_csv(StringIO(csv_string))

    # Display the DataFrame
    print("Got the df!")
else:
    print("No CSV data found in the string.")
    
df_results.head()

Extracted CSV:
 
session,talk_title,duration
44,On the Naturalness of Fuzzer Generated Code,4
44,Exploring Apache Incubator Project Trajectories with APEX,4
44,Investigating the Impact of Forgetting in Software Development,8
44,Toward Granular Automatic Unit Test Case Generation,4
44,Tooling for Time- and Space-efficient git Repository Mining,4
14,Discussions and Q&A,14
7,Code Review Practices for Refactoring Changes: An Empirical Study on OpenStack,7
10,Back to the future: Empirical Revolution(s) in Software Engineering,10
4,Discussions and Q&A,21
8,Challenges in Migrating Imperative Deep Learning Programs to Graph Execution: An Empirical Study,15

Got the df!


Unnamed: 0,session,talk_title,duration
0,44,On the Naturalness of Fuzzer Generated Code,4
1,44,Exploring Apache Incubator Project Trajectorie...,4
2,44,Investigating the Impact of Forgetting in Soft...,8
3,44,Toward Granular Automatic Unit Test Case Gener...,4
4,44,Tooling for Time- and Space-efficient git Repo...,4


In [26]:
df_results

Unnamed: 0,session,talk_title,duration
0,44,On the Naturalness of Fuzzer Generated Code,4
1,44,Exploring Apache Incubator Project Trajectorie...,4
2,44,Investigating the Impact of Forgetting in Soft...,8
3,44,Toward Granular Automatic Unit Test Case Gener...,4
4,44,Tooling for Time- and Space-efficient git Repo...,4
5,14,Discussions and Q&A,14
6,7,Code Review Practices for Refactoring Changes:...,7
7,10,Back to the future: Empirical Revolution(s) in...,10
8,4,Discussions and Q&A,21
9,8,Challenges in Migrating Imperative Deep Learni...,15
