In [1]:
from pulp import *
import numpy as np

In [2]:
import pandas as pd
import re
from bs4 import BeautifulSoup

# Without error capture to ensure the output (whether success or failure) is reported back.
# Attempt to read tables from the HTML file using pandas
file_path = "data/Program - MSR 2022.html"

In [3]:
with open(file_path, 'r') as file:
    html_content = file.read()

In [4]:
soup = BeautifulSoup(html_content, 'lxml')

In [5]:
rows = soup.find_all('tr', class_='hidable')

In [6]:
# Regex to match date pattern (e.g., "Tue 16 May")
date_pattern = re.compile(r"\b(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)\s\d{1,2}\s(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)")

In [34]:
extracted_rows = []

# Loop through each row to extract required details
for row in rows:
    temp_row = []
    
    # Extract session details from parent div if not already extracted
    session_div = row.find_previous('div', class_='session-info-in-table')
    session = session_div.get_text(strip=True) if session_div else ''
    
    datetime_info = row.find('td', class_='text-right')
    talk_info = row.find_all('td')[-1]
    
    if datetime_info and talk_info:
        time = datetime_info.find('div', class_='start-time').get_text(strip=True) if datetime_info.find('div', class_='start-time') else ''
        duration = datetime_info.find('strong').get_text(strip=True) if datetime_info.find('strong') else ''
        
        # Calculate end time based on start time and duration if needed
        
        temp_row.append(time)
        # temp_row.append(end_time)  # Calculate and append end time if needed
        temp_row.append(duration)
        
        talk_title = talk_info.find('strong')
        if talk_title:
            temp_row.append(talk_title.get_text(strip=True))
            
            link = talk_title.find('a', href=True)
            temp_row.append(link['href'] if link else '')
        else:
            temp_row.extend(['', ''])
            
        # Extract authors
        authors_div = talk_info.find('div', class_='performers')
        authors = [author.get_text(strip=True) for author in authors_div.find_all('a')] if authors_div else []
        temp_row.append(authors)
        
        # Append session information
        temp_row.append(session)
        
        # Locate date information
        # Tip: You may need to adjust how you locate the 'date information' based on your HTML structure
        date_info = row.find_previous('div', class_='day-wrapper')
        if date_info:
            date_text = date_info.get_text(strip=True)
            matched_date = date_pattern.search(date_text)
            date = matched_date.group(0) if matched_date else "Date Not Found"
        else:
            date = None
        temp_row.append(date)
        
        extracted_rows.append(temp_row)

columns = ['Time', 'Duration', 'Talk Title', 'Link', 'Authors', 'Session', 'Date']
df = pd.DataFrame(extracted_rows, columns=columns)

df.head()

Unnamed: 0,Time,Duration,Talk Title,Link,Authors,Session,Date
0,22:00,4m,An Empirical Evaluation of GitHub Copilot’s Co...,#,"[Nhan Nguyen, Sarah Nadi]",Session 1Technical Papers/Registered Reportsat...,Tue 17 May
1,22:04,4m,Comments on Comments: Where Code Review and Do...,#,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May
2,22:08,7m,Does This Apply to Me? An Empirical Study of T...,#,"[Akalanka Galappaththi, Sarah Nadi, Christoph ...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May
3,22:15,7m,Towards Reliable Agile Iterative Planning via ...,#,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May
4,22:22,7m,BotHunter: An Approach to Detect Software Bots...,#,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May


In [37]:
df["Time"] = pd.to_datetime(df['Date'] + ' ' + df['Time'], format='%a %d %b %H:%M')
df.head()

  df["Time"] = pd.to_datetime(df['Date'] + ' ' + df['Time'], format='%a %d %b %H:%M')


TypeError: unsupported operand type(s) for +: 'Timestamp' and 'str'

In [8]:
# Convert all column names to lowercase and replace spaces with hyphens
df.columns = [col.lower().replace(' ', '_') for col in df.columns]

# Put date first
df = df[['date'] + [col for col in df.columns if col != 'date']]

# Convert Time to start_time as datetime and calculate end_time
df['start_time'] = pd.to_datetime(df['date'] + ' ' + df['time'], format='%a %d %b %H:%M')
df['duration'] = df['duration'].str.extract('(\d+)').astype(int)  # Extract duration in minutes
df['end_time'] = df['start_time'] + pd.to_timedelta(df['duration'], unit='m')

# Delete the Link column
df = df.drop(['link', 'time'], axis=1)

# Move Authors column to the last position
df = df[[col for col in df.columns if col != 'authors'] + ['authors']]

df = df[['session', 'date', 'start_time', 'end_time', 'duration', 'talk_title','authors']]

# Display the final data frame
df.head()

Unnamed: 0,session,date,start_time,end_time,duration,talk_title,authors
0,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,1900-05-17 22:00:00,1900-05-17 22:04:00,4,An Empirical Evaluation of GitHub Copilot’s Co...,"[Nhan Nguyen, Sarah Nadi]"
1,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,1900-05-17 22:04:00,1900-05-17 22:08:00,4,Comments on Comments: Where Code Review and Do...,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince..."
2,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,1900-05-17 22:08:00,1900-05-17 22:15:00,7,Does This Apply to Me? An Empirical Study of T...,"[Akalanka Galappaththi, Sarah Nadi, Christoph ..."
3,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,1900-05-17 22:15:00,1900-05-17 22:22:00,7,Towards Reliable Agile Iterative Planning via ...,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan..."
4,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,1900-05-17 22:22:00,1900-05-17 22:29:00,7,BotHunter: An Approach to Detect Software Bots...,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein..."


In [9]:
df.dtypes

session               object
date                  object
start_time    datetime64[ns]
end_time      datetime64[ns]
duration               int64
talk_title            object
authors               object
dtype: object

In [10]:
len(df)

187

In [11]:
df["session"].unique()[:3]

array(['Session 1Technical Papers/Registered ReportsatMSR Main room - even hoursChair(s):Hongyu ZhangUniversity of Newcastle,Masud RahmanDalhousie University',
       'Session 2: Maintenance (Issues & Smells)Technical Papers/Registered Reports/Data and Tool Showcase Track/Industry TrackatMSR Main room - odd hoursChair(s):Alessio FerrariCNR-ISTI',
       'Session 3: Introspection, Vision, and Human AspectsTechnical Papers/Data and Tool Showcase Track/Industry Track/Registered ReportsatMSR Main room - odd hoursChair(s):Alexander SerebrenikEindhoven University of Technology,Sebastian BaltesSAP SE & University of Adelaide'],
      dtype=object)

In [12]:
df[df["session"]=='Session 1Technical Papers/Registered ReportsatMSR Main room - even hoursChair(s):Hongyu ZhangUniversity of Newcastle,Masud RahmanDalhousie University']

Unnamed: 0,session,date,start_time,end_time,duration,talk_title,authors
0,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,1900-05-17 22:00:00,1900-05-17 22:04:00,4,An Empirical Evaluation of GitHub Copilot’s Co...,"[Nhan Nguyen, Sarah Nadi]"
1,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,1900-05-17 22:04:00,1900-05-17 22:08:00,4,Comments on Comments: Where Code Review and Do...,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince..."
2,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,1900-05-17 22:08:00,1900-05-17 22:15:00,7,Does This Apply to Me? An Empirical Study of T...,"[Akalanka Galappaththi, Sarah Nadi, Christoph ..."
3,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,1900-05-17 22:15:00,1900-05-17 22:22:00,7,Towards Reliable Agile Iterative Planning via ...,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan..."
4,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,1900-05-17 22:22:00,1900-05-17 22:29:00,7,BotHunter: An Approach to Detect Software Bots...,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein..."
5,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,1900-05-17 22:29:00,1900-05-17 22:36:00,7,Recommending Code Improvements Based on Stack ...,"[Chaiyong Ragkhitwetsagul, Matheus Paixao]"
6,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,1900-05-17 22:36:00,1900-05-17 22:50:00,14,Discussions and Q&A,[]


In [13]:
common_author_matrix = pd.DataFrame(np.zeros((len(df), len(df)), dtype=int), index=df['talk_title'], columns=df['talk_title'])

# Populate the matrix
for i in range(len(df)):
    for j in range(len(df)):
        # Intersect authors lists, if not empty set cell to 1
        if set(df.iloc[i]['authors']) & set(df.iloc[j]['authors']):
            common_author_matrix.iloc[i, j] = 1

common_author_matrix.head()

talk_title,An Empirical Evaluation of GitHub Copilot’s Code Suggestions,Comments on Comments: Where Code Review and Documentation Meet,Does This Apply to Me? An Empirical Study of Technical Context in Stack Overflow,Towards Reliable Agile Iterative Planning via Predicting Documentation Changes of Work Items,BotHunter: An Approach to Detect Software Bots in GitHub,Recommending Code Improvements Based on Stack Overflow Answer Edits,Discussions and Q&A,An Alternative Issue Tracking Dataset of Public Jira Repositories,"Smelly Variables in Ansible Infrastructure Code: Detection, Prevalence, and Lifetime",Beyond Duplicates: Towards Understanding and Predicting Link Types in Issue Tracking Systems,...,The General Index of Software Engineering Papers,Investigating the Impact of Forgetting in Software Development,Discussions and Q&A,Code Review Practices for Refactoring Changes: An Empirical Study on OpenStack,Painting the Landscape of Automotive Software in GitHub,SLNET: A Redistributable Corpus of 3rd-party Simulink Models,SoCCMiner: A Source Code-Comments and Comment-Context Miner,An Exploratory Study on Refactoring Documentation in Issues Handling,Between JIRA and GitHub: ASFBot and its Influence on Human Comments in Issue Trackers,Discussions and Q&A
talk_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
An Empirical Evaluation of GitHub Copilot’s Code Suggestions,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Comments on Comments: Where Code Review and Documentation Meet,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Does This Apply to Me? An Empirical Study of Technical Context in Stack Overflow,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Towards Reliable Agile Iterative Planning via Predicting Documentation Changes of Work Items,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
BotHunter: An Approach to Detect Software Bots in GitHub,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Perform linear programming

In [14]:
papers = df[["talk_title","authors","duration"]]
papers.head()

Unnamed: 0,talk_title,authors,duration
0,An Empirical Evaluation of GitHub Copilot’s Co...,"[Nhan Nguyen, Sarah Nadi]",4
1,Comments on Comments: Where Code Review and Do...,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince...",4
2,Does This Apply to Me? An Empirical Study of T...,"[Akalanka Galappaththi, Sarah Nadi, Christoph ...",7
3,Towards Reliable Agile Iterative Planning via ...,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan...",7
4,BotHunter: An Approach to Detect Software Bots...,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein...",7


In [15]:
df.dtypes

session               object
date                  object
start_time    datetime64[ns]
end_time      datetime64[ns]
duration               int64
talk_title            object
authors               object
dtype: object

In [16]:
sessions = df.groupby("session")["duration"].sum()

sessions.head()

session
Blended Technical Session 1 (Integration, Large-scale mining, and Software Ecosystems)Technical Papers/Data and Tool Showcase TrackatRoom 315+316Chair(s):Bogdan VasilescuCarnegie Mellon University, USA              90
Blended Technical Session 2 (Machine Learning and Information Retrieval)Technical Papers/Data and Tool Showcase TrackatRoom 315+316Chair(s):Preetha ChatterjeeDrexel University, USA                                   90
Blended Technical Session 3 (Smells and Maintenance)Technical Papers/Mining Challenge/Registered Reports/Data and Tool Showcase TrackatRoom 315+316Chair(s):Andy ZaidmanDelft University of Technology                 90
Blended Technical Session 4 (Introspection, Vision, and Human Aspects)Technical Papers/Registered Reports/Data and Tool Showcase TrackatRoom 315+316Chair(s):Ayushi RastogiUniversity of Groningen, The Netherlands    75
Blended Technical Session 5 (Miscellaneous)Technical Papers/Data and Tool Showcase Track/Mining ChallengeatRoom 315+316C

In [17]:
# Assumption of variables for illustration
num_sessions = len(sessions)  # Number of sessions as defined by PC chairs
session_lengths = sessions.values  # Length of each session in minutes
num_tracks = 2
# papers = [
#     {"title": "Paper1", "authors": ["Nhan Nguyen", "Sarah Nadi"], "duration": 20, "topics": ["GitHub", "Code Assistance"]},
#     {"title": "Paper2", "authors": ["Nikitha Rao"], "duration": 25, "topics": ["Code Review", "Documentation"]},
#     {"title": "Paper3", "authors": ["Akalanka Galappaththi"], "duration": 15, "topics": ["Empirical Study", "Type Systems"]},
#     # Add other papers similarly...
# ]

In [25]:
# Initiate the problem
prob = LpProblem("Conference_Schedule_Optimization", LpMinimize)

# Decision variables
schedule = LpVariable.dicts("schedule", ((i, j, k) for i in range(num_sessions) for j in range(len(papers)) for k in range(num_tracks)), cat='Binary')

# Objective function: For the basic structure, we make it a dummy one as our main focus is on satisfying constraints
prob += 0, "ArbitraryObjective"

In [26]:
# Constraints

# Ensure each paper is scheduled exactly once
for j in range(len(papers)):
    prob += lpSum(schedule[(i, j, k)] for i in range(num_sessions) for k in range(num_tracks)) == 1, f"One_placement_paper_{j}"

# Do not exceed session length
for i in range(num_sessions):
    for k in range(num_tracks):
        prob += lpSum(schedule[(i, j, k)] * papers.loc[j]["duration"] for j in range(len(papers))) <= session_lengths[i], f"Session_length_limit_{i}_{k}"

# No parallel scheduling of papers with common authors
# This part is tricky and depends on your exact requirement and how you've structured your data
# A simple and not very efficient way to approach it is to loop through every combination of papers and sessions
for i in range(num_sessions):
    for k1 in range(num_tracks):
        for k2 in range(k1+1, num_tracks):
            for j1, paper1 in enumerate(papers):
                for j2, paper2 in enumerate(papers):
                    if j1 != j2 and set(paper1["authors"]).intersection(paper2["authors"]):
                        print(k1, k2, j1, j2)
                        # If papers have at least one common author, they cannot be scheduled in parallel
                        prob += schedule[(i, j1, k1)] + schedule[(i, j2, k2)] <= 1, f"No_parallel_{j1}_{j2}_session_{i}"


In [27]:
# Solve the problem
prob.solve()

Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /opt/conda/lib/python3.10/site-packages/pulp/solverdir/cbc/linux/64/cbc /var/tmp/3484bf2d78414d6f92268f1e06ae3b11-pulp.mps -timeMode elapsed -branch -printingOptions all -solution /var/tmp/3484bf2d78414d6f92268f1e06ae3b11-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 256 COLUMNS
At line 48130 RHS
At line 48382 BOUNDS
At line 60352 ENDATA
Problem MODEL has 251 rows, 11969 columns and 23936 elements
Coin0008I MODEL read with 0 errors
Option for timeMode changed from cpu to elapsed
Continuous objective value is 0 - 0.07 seconds
Cgl0005I 187 SOS with 11968 members
Cgl0004I processed model has 251 rows, 11968 columns (11968 integer (11968 of which binary)) and 23936 elements
Cbc0038I Initial state - 28 integers unsatisfied sum - 7.15201
Cbc0038I Pass   1: suminf.    2.86487 (12) obj. 0 iterations 215
Cbc0038I Pass   2: suminf.    1.68000 (8) obj. 0 iterations 49
Cbc

1

In [30]:
# Output schedule (simple print, you'd need to format it more nicely)
for i in range(num_sessions):
    for k in range(num_tracks):
        print(f"Session {i+1}, Track {k+1}")
        for j in range(len(papers)):
            if schedule[(i, j, k)].varValue == 1:
                print(f"  Paper: {papers.loc[j]['talk_title']} | Duration: {papers.loc[j]['duration']} mins")
        print("\n")

Session 1, Track 1
  Paper: Senatus: A Fast and Accurate Code-to-Code Recommendation Engine | Duration: 7 mins
  Paper: From Models to Systems: Rethinking the Role of Software Engineering for Machine Learning | Duration: 35 mins


Session 1, Track 2
  Paper: Discussions and Q&A | Duration: 14 mins


Session 2, Track 1
  Paper: TSSB-3M: Mining single statement bugs at massive scale | Duration: 8 mins
  Paper: Investigating the Impact of Forgetting in Software Development | Duration: 8 mins


Session 2, Track 2
  Paper: A Large-scale Dataset of (Open Source) License Text VariantsData and Tool Showcase Award | Duration: 8 mins
  Paper: Smelly Variables in Ansible Infrastructure Code: Detection, Prevalence, and Lifetime | Duration: 15 mins
  Paper: npm-filter: Automating the mining of dynamic information from npm packages | Duration: 8 mins
  Paper: Discussions and Q&A | Duration: 28 mins


Session 3, Track 1
  Paper: Operationalizing Threats to MSR Studies by Simulation-Based TestingDisti