In [1]:
from pulp import *
import numpy as np

In [2]:
import pandas as pd
import re
from bs4 import BeautifulSoup

# Without error capture to ensure the output (whether success or failure) is reported back.
# Attempt to read tables from the HTML file using pandas
file_path = "data/Program - MSR 2022.html"

In [3]:
with open(file_path, 'r') as file:
    html_content = file.read()

In [4]:
soup = BeautifulSoup(html_content, 'lxml')

In [5]:
rows = soup.find_all('tr', class_='hidable')

In [6]:
# Regex to match date pattern (e.g., "Tue 16 May")
date_pattern = re.compile(r"\b(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)\s\d{1,2}\s(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)")

In [7]:
extracted_rows = []

# Loop through each row to extract required details
for row in rows:
    temp_row = []
    
    # Extract session details from parent div if not already extracted
    session_div = row.find_previous('div', class_='session-info-in-table')
    session = session_div.get_text(strip=True) if session_div else ''
    
    datetime_info = row.find('td', class_='text-right')
    talk_info = row.find_all('td')[-1]
    
    if datetime_info and talk_info:
        time = datetime_info.find('div', class_='start-time').get_text(strip=True) if datetime_info.find('div', class_='start-time') else ''
        duration = datetime_info.find('strong').get_text(strip=True) if datetime_info.find('strong') else ''
        
        # Calculate end time based on start time and duration if needed
        
        temp_row.append(time)
        # temp_row.append(end_time)  # Calculate and append end time if needed
        temp_row.append(duration)
        
        talk_title = talk_info.find('strong')
        if talk_title:
            temp_row.append(talk_title.get_text(strip=True))
            
            link = talk_title.find('a', href=True)
            temp_row.append(link['href'] if link else '')
        else:
            temp_row.extend(['', ''])
            
        # Extract authors
        authors_div = talk_info.find('div', class_='performers')
        authors = [author.get_text(strip=True) for author in authors_div.find_all('a')] if authors_div else []
        temp_row.append(authors)
        
        # Append session information
        temp_row.append(session)
        
        # Locate date information
        # Tip: You may need to adjust how you locate the 'date information' based on your HTML structure
        date_info = row.find_previous('div', class_='day-wrapper')
        if date_info:
            date_text = date_info.get_text(strip=True)
            matched_date = date_pattern.search(date_text)
            date = matched_date.group(0) if matched_date else "Date Not Found"
        else:
            date = None
        temp_row.append(date)
        
        extracted_rows.append(temp_row)

columns = ['Time', 'Duration', 'Talk Title', 'Link', 'Authors', 'Session', 'Date']
df = pd.DataFrame(extracted_rows, columns=columns)

df.head()

Unnamed: 0,Time,Duration,Talk Title,Link,Authors,Session,Date
0,22:00,4m,An Empirical Evaluation of GitHub Copilot’s Co...,#,"[Nhan Nguyen, Sarah Nadi]",Session 1Technical Papers/Registered Reportsat...,Tue 17 May
1,22:04,4m,Comments on Comments: Where Code Review and Do...,#,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May
2,22:08,7m,Does This Apply to Me? An Empirical Study of T...,#,"[Akalanka Galappaththi, Sarah Nadi, Christoph ...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May
3,22:15,7m,Towards Reliable Agile Iterative Planning via ...,#,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May
4,22:22,7m,BotHunter: An Approach to Detect Software Bots...,#,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May


In [8]:
# Convert all column names to lowercase and replace spaces with hyphens
df.columns = [col.lower().replace(' ', '_') for col in df.columns]

# Put date first
df = df[['date'] + [col for col in df.columns if col != 'date']]

# Convert Time to start_time as datetime and calculate end_time
df['start_time'] = pd.to_datetime(df['date'] + ' ' + df['time'], format='%a %d %b %H:%M')
df['duration'] = df['duration'].str.extract('(\d+)').astype(int)  # Extract duration in minutes
df['end_time'] = df['start_time'] + pd.to_timedelta(df['duration'], unit='m')

# Delete the Link column
df = df.drop(['link', 'time'], axis=1)

# Move Authors column to the last position
df = df[[col for col in df.columns if col != 'authors'] + ['authors']]

df = df[['session', 'date', 'start_time', 'end_time', 'duration', 'talk_title','authors']]

# Display the final data frame
df.head()

Unnamed: 0,session,date,start_time,end_time,duration,talk_title,authors
0,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,1900-05-17 22:00:00,1900-05-17 22:04:00,4,An Empirical Evaluation of GitHub Copilot’s Co...,"[Nhan Nguyen, Sarah Nadi]"
1,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,1900-05-17 22:04:00,1900-05-17 22:08:00,4,Comments on Comments: Where Code Review and Do...,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince..."
2,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,1900-05-17 22:08:00,1900-05-17 22:15:00,7,Does This Apply to Me? An Empirical Study of T...,"[Akalanka Galappaththi, Sarah Nadi, Christoph ..."
3,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,1900-05-17 22:15:00,1900-05-17 22:22:00,7,Towards Reliable Agile Iterative Planning via ...,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan..."
4,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,1900-05-17 22:22:00,1900-05-17 22:29:00,7,BotHunter: An Approach to Detect Software Bots...,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein..."


In [9]:
df['start_time'] = pd.to_datetime(df['start_time']).astype(int) // 10**9 // 60
df['end_time'] = pd.to_datetime(df['end_time']).astype(int) // 10**9 // 60

df.head()

Unnamed: 0,session,date,start_time,end_time,duration,talk_title,authors
0,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,-36619320,-36619316,4,An Empirical Evaluation of GitHub Copilot’s Co...,"[Nhan Nguyen, Sarah Nadi]"
1,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,-36619316,-36619312,4,Comments on Comments: Where Code Review and Do...,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince..."
2,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,-36619312,-36619305,7,Does This Apply to Me? An Empirical Study of T...,"[Akalanka Galappaththi, Sarah Nadi, Christoph ..."
3,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,-36619305,-36619298,7,Towards Reliable Agile Iterative Planning via ...,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan..."
4,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,-36619298,-36619291,7,BotHunter: An Approach to Detect Software Bots...,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein..."


In [10]:
df.dtypes

session       object
date          object
start_time     int64
end_time       int64
duration       int64
talk_title    object
authors       object
dtype: object

In [11]:
len(df)

187

In [12]:
df["session"].unique()[:3]

array(['Session 1Technical Papers/Registered ReportsatMSR Main room - even hoursChair(s):Hongyu ZhangUniversity of Newcastle,Masud RahmanDalhousie University',
       'Session 2: Maintenance (Issues & Smells)Technical Papers/Registered Reports/Data and Tool Showcase Track/Industry TrackatMSR Main room - odd hoursChair(s):Alessio FerrariCNR-ISTI',
       'Session 3: Introspection, Vision, and Human AspectsTechnical Papers/Data and Tool Showcase Track/Industry Track/Registered ReportsatMSR Main room - odd hoursChair(s):Alexander SerebrenikEindhoven University of Technology,Sebastian BaltesSAP SE & University of Adelaide'],
      dtype=object)

In [13]:
df[df["session"]=='Session 1Technical Papers/Registered ReportsatMSR Main room - even hoursChair(s):Hongyu ZhangUniversity of Newcastle,Masud RahmanDalhousie University']

Unnamed: 0,session,date,start_time,end_time,duration,talk_title,authors
0,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,-36619320,-36619316,4,An Empirical Evaluation of GitHub Copilot’s Co...,"[Nhan Nguyen, Sarah Nadi]"
1,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,-36619316,-36619312,4,Comments on Comments: Where Code Review and Do...,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince..."
2,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,-36619312,-36619305,7,Does This Apply to Me? An Empirical Study of T...,"[Akalanka Galappaththi, Sarah Nadi, Christoph ..."
3,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,-36619305,-36619298,7,Towards Reliable Agile Iterative Planning via ...,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan..."
4,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,-36619298,-36619291,7,BotHunter: An Approach to Detect Software Bots...,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein..."
5,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,-36619291,-36619284,7,Recommending Code Improvements Based on Stack ...,"[Chaiyong Ragkhitwetsagul, Matheus Paixao]"
6,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,-36619284,-36619270,14,Discussions and Q&A,[]


In [14]:
def merge_intervals(intervals):
    if not intervals:
        return 0
    
    # Sort intervals by the start time
    intervals.sort(key=lambda x: x[0])
    
    merged = [intervals[0]]
    for current_start, current_end in intervals[1:]:
        last_end = merged[-1][1]
        
        if current_start <= last_end:
            # There's an overlap, extend the previous interval
            merged[-1] = (merged[-1][0], max(last_end, current_end))
        else:
            # No overlap, add this interval as is
            merged.append((current_start, current_end))
    
    # Compute total duration in minutes (or another unit as desired)
    total_duration_minutes = sum((end - start) for start, end in merged)   # convert seconds to minutes
    return total_duration_minutes

total_durations = []

for date, group in df.groupby('date'):
    intervals = list(zip(group['start_time'], group['end_time']))
    total_duration = merge_intervals(intervals)
    total_durations.append({'date': date, 'total_duration': total_duration})

# Converting the result into a DataFrame
total_duration_df = pd.DataFrame(total_durations)

total_duration_df

Unnamed: 0,date,total_duration
0,Fri 20 May,260
1,Mon 23 May,270
2,Thu 19 May,502
3,Tue 17 May,50
4,Tue 24 May,255
5,Wed 18 May,350


In [15]:
dates = df.groupby("date")["duration"].sum().reset_index()


dates

Unnamed: 0,date,duration
0,Fri 20 May,310
1,Mon 23 May,270
2,Thu 19 May,502
3,Tue 17 May,50
4,Tue 24 May,255
5,Wed 18 May,450


In [16]:
common_author_matrix = pd.DataFrame(np.zeros((len(df), len(df)), dtype=int), index=df['talk_title'], columns=df['talk_title'])

# Populate the matrix
for i in range(len(df)):
    for j in range(len(df)):
        # Intersect authors lists, if not empty set cell to 1
        if set(df.iloc[i]['authors']) & set(df.iloc[j]['authors']):
            common_author_matrix.iloc[i, j] = 1

common_author_matrix.head()

talk_title,An Empirical Evaluation of GitHub Copilot’s Code Suggestions,Comments on Comments: Where Code Review and Documentation Meet,Does This Apply to Me? An Empirical Study of Technical Context in Stack Overflow,Towards Reliable Agile Iterative Planning via Predicting Documentation Changes of Work Items,BotHunter: An Approach to Detect Software Bots in GitHub,Recommending Code Improvements Based on Stack Overflow Answer Edits,Discussions and Q&A,An Alternative Issue Tracking Dataset of Public Jira Repositories,"Smelly Variables in Ansible Infrastructure Code: Detection, Prevalence, and Lifetime",Beyond Duplicates: Towards Understanding and Predicting Link Types in Issue Tracking Systems,...,The General Index of Software Engineering Papers,Investigating the Impact of Forgetting in Software Development,Discussions and Q&A,Code Review Practices for Refactoring Changes: An Empirical Study on OpenStack,Painting the Landscape of Automotive Software in GitHub,SLNET: A Redistributable Corpus of 3rd-party Simulink Models,SoCCMiner: A Source Code-Comments and Comment-Context Miner,An Exploratory Study on Refactoring Documentation in Issues Handling,Between JIRA and GitHub: ASFBot and its Influence on Human Comments in Issue Trackers,Discussions and Q&A
talk_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
An Empirical Evaluation of GitHub Copilot’s Code Suggestions,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Comments on Comments: Where Code Review and Documentation Meet,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Does This Apply to Me? An Empirical Study of Technical Context in Stack Overflow,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Towards Reliable Agile Iterative Planning via Predicting Documentation Changes of Work Items,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BotHunter: An Approach to Detect Software Bots in GitHub,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Perform linear programming

In [17]:
papers = df[["talk_title","authors","duration"]]
papers.head()

Unnamed: 0,talk_title,authors,duration
0,An Empirical Evaluation of GitHub Copilot’s Co...,"[Nhan Nguyen, Sarah Nadi]",4
1,Comments on Comments: Where Code Review and Do...,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince...",4
2,Does This Apply to Me? An Empirical Study of T...,"[Akalanka Galappaththi, Sarah Nadi, Christoph ...",7
3,Towards Reliable Agile Iterative Planning via ...,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan...",7
4,BotHunter: An Approach to Detect Software Bots...,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein...",7


In [18]:
df.dtypes

session       object
date          object
start_time     int64
end_time       int64
duration       int64
talk_title    object
authors       object
dtype: object

In [19]:
sessions = df.groupby("session")["duration"].sum()

sessions.head()

session
Blended Technical Session 1 (Integration, Large-scale mining, and Software Ecosystems)Technical Papers/Data and Tool Showcase TrackatRoom 315+316Chair(s):Bogdan VasilescuCarnegie Mellon University, USA              90
Blended Technical Session 2 (Machine Learning and Information Retrieval)Technical Papers/Data and Tool Showcase TrackatRoom 315+316Chair(s):Preetha ChatterjeeDrexel University, USA                                   90
Blended Technical Session 3 (Smells and Maintenance)Technical Papers/Mining Challenge/Registered Reports/Data and Tool Showcase TrackatRoom 315+316Chair(s):Andy ZaidmanDelft University of Technology                 90
Blended Technical Session 4 (Introspection, Vision, and Human Aspects)Technical Papers/Registered Reports/Data and Tool Showcase TrackatRoom 315+316Chair(s):Ayushi RastogiUniversity of Groningen, The Netherlands    75
Blended Technical Session 5 (Miscellaneous)Technical Papers/Data and Tool Showcase Track/Mining ChallengeatRoom 315+316C

In [20]:
df.groupby("session")["duration"].sum().values

array([90, 90, 90, 75, 90, 50, 50, 90, 50, 50, 50, 50, 50, 50, 50, 50, 60,
       50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 50, 51, 51])

In [21]:
df.groupby("session").size().values

array([8, 7, 7, 6, 7, 1, 7, 3, 1, 8, 8, 7, 7, 7, 7, 9, 8, 7, 7, 7, 7, 8,
       8, 8, 8, 7, 1, 1, 1, 1, 4, 4])

In [22]:
len(sessions)

32

In [23]:
# Assumption of variables for illustration
num_sessions = len(sessions)  # Number of sessions as defined by PC chairs
session_lengths = sessions.values  # Length of each session in minutes
num_tracks = 1 # FOR NOW, ASSUME WE ONLY HAVE A SINGLE TRACK


In [24]:
# Initiate the problem
prob = LpProblem("Conference_Schedule_Optimization", LpMinimize)

# Decision variables
schedule = LpVariable.dicts("schedule", 
                            ((i, j, k) 
                             for i in range(len(papers)) # paper number
                             for j in range(num_sessions) # session number
                             for k in range(num_tracks) # track number
                            ) 
                            ,cat='Binary' # to make it precise
                       )
# start_times = LpVariable.dicts("start_times", 
#                             (i 
#                              for i in range(len(papers)) # paper number
#                             ) # start time
#                             ,cat='Continuous'
#                        )
# end_times = LpVariable.dicts("end_times", 
#                             (i 
#                              for i in range(len(papers)) # paper number
#                             ) # end time
#                             ,cat='Continuous'
#                        )

# Objective function: For the basic structure, we make it a dummy one as our main focus is on satisfying constraints
prob += 0, "ArbitraryObjective"

In [25]:
# Constraints

# for i in range(len(papers)):
#     for j in range(num_sessions):
#         for k in range(num_tracks):
#             prob += schedule[(i, j, k)] >= 0
#             prob += schedule[(i, j, k)] <= 1


# Ensure each paper is scheduled exactly once
for i in range(len(papers)):
    prob += lpSum(schedule[(i, j, k)] for j in range(num_sessions) for k in range(num_tracks)) == 1, f"One_placement_paper_{i}"

# Do not exceed session length
for j in range(num_sessions):
    for k in range(num_tracks):
        prob += lpSum(schedule[(i, j, k)] * papers.loc[i]["duration"] for i in range(len(papers))) <= session_lengths[j], f"Session_length_limit_{j}_{k}"

        
# # No parallel scheduling of papers with common authors
# # We implement it by adding constraints to make sure that no papers with common authors are scheduled on parallel sessions. 
# for i1 in range(len(papers)):
#     for i2 in range(len(papers)):
#         if i1 != i2 and set(papers.loc[i1]["authors"]).intersection(papers.loc[i1]["authors"]):
#             for j in range(num_sessions):
#                 for k1 in range(num_tracks):
#                     for k2 in range(num_tracks):
#                         prob += schedule[(i1, j, k1)] + schedule[(i2, j, k2)] <= 1, f"No_parallel_{j1}_{j2}_session_{i}"


In [26]:
# Solve the problem
prob.solve()

Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /opt/conda/lib/python3.10/site-packages/pulp/solverdir/cbc/linux/64/cbc /var/tmp/4c73a7df0b484a0fa438968e852b8778-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /var/tmp/4c73a7df0b484a0fa438968e852b8778-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 224 COLUMNS
At line 24162 RHS
At line 24382 BOUNDS
At line 30368 ENDATA
Problem MODEL has 219 rows, 5985 columns and 11968 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 0 - 0.05 seconds
Cgl0005I 187 SOS with 5984 members
Cgl0004I processed model has 219 rows, 5984 columns (5984 integer (5984 of which binary)) and 11968 elements
Cbc0038I Initial state - 54 integers unsatisfied sum - 14.7971
Cbc0038I Pass   1: suminf.    8.86048 (42) obj. 0 iterations 188
Cbc0038I Pass   2: suminf.    8.54619 (42) obj. 0 iterations 8
Cbc0038I


KeyboardInterrupt



In [None]:
# Initialize an empty list to hold the results
results = []

# Iterate over each session and track
for j in range(num_sessions):
    for k in range(num_tracks):
        session_number = j + 1
        # Iterate over each paper
        for i in range(len(papers)):
            if schedule[(i, j, k)].varValue == 1:
                # Prepare a dict with the required information
                paper_info = {
                    'session': session_number,
                    'title': papers.loc[i]['talk_title'],
                    'duration': papers.loc[i]['duration'],
                    'validity': 1
                }
                # Append to the results list
                results.append(paper_info)
            elif schedule[(i, j, k)].varValue > 0:
                # Prepare a dict with the required information
                paper_info = {
                    'session': session_number,
                    'title': papers.loc[i]['talk_title'],
                    'duration': papers.loc[i]['duration'],
                    'validity': 0
                }
                # Append to the results list
                results.append(paper_info)

# Convert the list of dictionaries into a DataFrame
df_results = pd.DataFrame(results)

# If you need to see the first few rows of the DataFrame to ensure it's correct
df_results.head()

In [None]:
len(df_results), len(df)

In [None]:
df_results["validity"].mean()

In [None]:
new_session_lengths = (df_results.groupby("session")["duration"].sum() + pd.Series(index=range(1,1+num_sessions), data = 0)).fillna(0)
new_session_lengths

In [None]:
df.groupby("session")["duration"].sum().values

In [None]:
df.groupby("session")["duration"].sum().values - new_session_lengths