In [1]:
import datetime
from icalendar import Calendar
import pandas as pd

## Datat preprocessing

In [2]:
def ics_to_dataframe(file_path):
    # Open and parse the .ics file
    with open(file_path, 'rb') as f:
        cal = Calendar.from_ical(f.read())

    # Prepare a list to hold event data
    events = []

    for component in cal.walk():
        if component.name == "VEVENT":
            # Extract relevant data from each event
            try:
                title, authors = component.get('summary').split(" - ", 1)  # Expecting exactly 2 items
                authors = authors.split('\,')
            except ValueError:  # If not exactly 2 items, then it's a ValueError
                title = component.get('summary')  # Take the whole summary as title
                authors = None  # Default, no authors found
                
            start_dt = component.get('dtstart').dt
            end_dt = component.get('dtend').dt
            title = component.get('summary')
            description = component.get('description')
            
            if isinstance(start_dt, datetime.datetime) and isinstance(end_dt, datetime.datetime):
                # Calculate duration
                duration = (end_dt - start_dt).total_seconds() / 60  # duration in minutes
                # Formatting date and start time
                date = start_dt.strftime('%Y-%m-%d')
                start_time = start_dt.strftime('%H:%M')
                

                # Append event info to the list
                events.append([date, start_time, duration, title, authors, description])

    # Create a DataFrame
    df = pd.DataFrame(events, columns=['date', 'start time', 'duration_mins', 'title', 'authors', 'description'])

    return df

file_path = "data/event-calendar-2022.ics"
df = ics_to_dataframe(file_path)
df.head()

Unnamed: 0,date,start time,duration_mins,title,authors,description
0,2022-05-18,02:00,4.0,[MSR Technical Papers] An Empirical Evaluation...,"[Nhan Nguyen, Sarah Nadi]",GitHub and OpenAI recently launched GitHub Cop...
1,2022-05-18,02:04,4.0,[MSR Technical Papers] Comments on Comments: W...,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince...",An important function of code review is to inc...
2,2022-05-18,02:08,7.0,[MSR Technical Papers] Does This Apply to Me? ...,"[Akalanka Galappaththi, Sarah Nadi, Christoph ...",Stack Overflow has become an essential technic...
3,2022-05-18,02:15,7.0,[MSR Technical Papers] Towards Reliable Agile ...,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan...","In agile iterative development, an agile team ..."
4,2022-05-18,02:22,7.0,[MSR Technical Papers] BotHunter: An Approach ...,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein...",Bots have become popular in software projects ...


In [3]:
def parse_sessions(file_path):
    with open(file_path, 'rb') as f:
        cal = Calendar.from_ical(f.read())

    sessions = {}
    
    for component in cal.walk('vevent'):
        start_dt = component.get('dtstart').dt
        end_dt = component.get('dtend').dt
        summary = str(component.get('summary'))
        session_name = summary.split(': ')[-1]  # Assuming session name is after last colon
        
        # Key by start date, end date, and location for simplicity; tweak as needed
        key = (start_dt, end_dt)
        sessions[key] = session_name
    
    return sessions

# Function to find a session for a given event
def find_session_for_event(sessions, event_start, event_end):
    # Note: This simplistic approach assumes exact match of start/end times. 
    # Consider more complex logic if sessions/events don't align exactly.
    key = (event_start, event_end)
    return sessions.get(key, "Unknown Session")

# Load sessions
session_calendar_path = "data/session-calendar-2022.ics"
sessions = parse_sessions(session_calendar_path)

# Map events to sessions and add a new column
df['session_name'] = df.apply(lambda row: find_session_for_event(sessions, 
                                             datetime.datetime.strptime(row['date'] + ' ' + row['start time'], '%Y-%m-%d %H:%M'),
                                             datetime.datetime.strptime(row['date'] + ' ' + row['start time'], '%Y-%m-%d %H:%M') + datetime.timedelta(minutes=row['duration_mins'])),
                                             axis=1)

df.head()

Unnamed: 0,date,start time,duration_mins,title,authors,description,session_name
0,2022-05-18,02:00,4.0,[MSR Technical Papers] An Empirical Evaluation...,"[Nhan Nguyen, Sarah Nadi]",GitHub and OpenAI recently launched GitHub Cop...,Unknown Session
1,2022-05-18,02:04,4.0,[MSR Technical Papers] Comments on Comments: W...,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince...",An important function of code review is to inc...,Unknown Session
2,2022-05-18,02:08,7.0,[MSR Technical Papers] Does This Apply to Me? ...,"[Akalanka Galappaththi, Sarah Nadi, Christoph ...",Stack Overflow has become an essential technic...,Unknown Session
3,2022-05-18,02:15,7.0,[MSR Technical Papers] Towards Reliable Agile ...,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan...","In agile iterative development, an agile team ...",Unknown Session
4,2022-05-18,02:22,7.0,[MSR Technical Papers] BotHunter: An Approach ...,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein...",Bots have become popular in software projects ...,Unknown Session


In [4]:
import pandas as pd
import re
from bs4 import BeautifulSoup

# Without error capture to ensure the output (whether success or failure) is reported back.
# Attempt to read tables from the HTML file using pandas
file_path = "data/Program - MSR 2022.html"

In [5]:
with open(file_path, 'r') as file:
    html_content = file.read()

In [6]:
soup = BeautifulSoup(html_content, 'lxml')

In [7]:
rows = soup.find_all('tr', class_='hidable')

In [8]:
date_pattern = re.compile(r"\b(?:Mon|Tue|Wed|Thu|Fri|Sat|Sun)\s\d{1,2}\s(?:Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)")

In [9]:
extracted_rows = []

# Loop through each row to extract required details
for row in rows:
    temp_row = []
    
    # Extract session details from parent div if not already extracted
    session_div = row.find_previous('div', class_='session-info-in-table')
    session = session_div.get_text(strip=True) if session_div else ''
    
    datetime_info = row.find('td', class_='text-right')
    talk_info = row.find_all('td')[-1]
    
    if datetime_info and talk_info:
        time = datetime_info.find('div', class_='start-time').get_text(strip=True) if datetime_info.find('div', class_='start-time') else ''
        duration = datetime_info.find('strong').get_text(strip=True) if datetime_info.find('strong') else ''
        
        # Calculate end time based on start time and duration if needed
        
        temp_row.append(time)
        # temp_row.append(end_time)  # Calculate and append end time if needed
        temp_row.append(duration)
        
        talk_title = talk_info.find('strong')
        if talk_title:
            temp_row.append(talk_title.get_text(strip=True))
            
            link = talk_title.find('a', href=True)
            temp_row.append(link['href'] if link else '')
        else:
            temp_row.extend(['', ''])
            
        # Extract authors
        authors_div = talk_info.find('div', class_='performers')
        authors = [author.get_text(strip=True) for author in authors_div.find_all('a')] if authors_div else []
        temp_row.append(authors)
        
        # Append session information
        temp_row.append(session)
        
        # Locate date information
        # Tip: You may need to adjust how you locate the 'date information' based on your HTML structure
        date_info = row.find_previous('div', class_='day-wrapper')
        if date_info:
            date_text = date_info.get_text(strip=True)
            matched_date = date_pattern.search(date_text)
            date = matched_date.group(0) if matched_date else "Date Not Found"
        else:
            date = None
        temp_row.append(date)
        
        extracted_rows.append(temp_row)

columns = ['Time', 'Duration', 'Talk Title', 'Link', 'Authors', 'Session', 'Date']
df_1 = pd.DataFrame(extracted_rows, columns=columns)

df_1.head()

Unnamed: 0,Time,Duration,Talk Title,Link,Authors,Session,Date
0,22:00,4m,An Empirical Evaluation of GitHub Copilot’s Co...,#,"[Nhan Nguyen, Sarah Nadi]",Session 1Technical Papers/Registered Reportsat...,Tue 17 May
1,22:04,4m,Comments on Comments: Where Code Review and Do...,#,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May
2,22:08,7m,Does This Apply to Me? An Empirical Study of T...,#,"[Akalanka Galappaththi, Sarah Nadi, Christoph ...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May
3,22:15,7m,Towards Reliable Agile Iterative Planning via ...,#,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May
4,22:22,7m,BotHunter: An Approach to Detect Software Bots...,#,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May


In [10]:
df.columns

Index(['date', 'start time', 'duration_mins', 'title', 'authors',
       'description', 'session_name'],
      dtype='object')

In [11]:
df_merged = pd.concat([df_1, df[['description']]], axis=1)

In [12]:
df_merged.head(3)

Unnamed: 0,Time,Duration,Talk Title,Link,Authors,Session,Date,description
0,22:00,4m,An Empirical Evaluation of GitHub Copilot’s Co...,#,"[Nhan Nguyen, Sarah Nadi]",Session 1Technical Papers/Registered Reportsat...,Tue 17 May,GitHub and OpenAI recently launched GitHub Cop...
1,22:04,4m,Comments on Comments: Where Code Review and Do...,#,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May,An important function of code review is to inc...
2,22:08,7m,Does This Apply to Me? An Empirical Study of T...,#,"[Akalanka Galappaththi, Sarah Nadi, Christoph ...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May,Stack Overflow has become an essential technic...


In [13]:
df_merged.columns

Index(['Time', 'Duration', 'Talk Title', 'Link', 'Authors', 'Session', 'Date',
       'description'],
      dtype='object')

In [14]:
df_input = df_merged[['Time', 'Duration', 'Talk Title', 'Link', 'Authors', 'Session', 'Date',
       'description']]
df_input = df_input[(df_input['Talk Title']!="Discussions and Q&A")&(df_input.description!="")].reset_index(drop=True)
df_input = df_input.reset_index()
df_input = df_input.rename(columns={
    'Talk Title': 'title',
    'description': 'abstract',
    'index': 'paper_number'})
df_input_no_duplication = df_input[~df_input.Session.str.startswith("Blended")]

In [15]:
df_input_no_duplication.head(3)

Unnamed: 0,paper_number,Time,Duration,title,Link,Authors,Session,Date,abstract
0,0,22:00,4m,An Empirical Evaluation of GitHub Copilot’s Co...,#,"[Nhan Nguyen, Sarah Nadi]",Session 1Technical Papers/Registered Reportsat...,Tue 17 May,GitHub and OpenAI recently launched GitHub Cop...
1,1,22:04,4m,Comments on Comments: Where Code Review and Do...,#,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May,An important function of code review is to inc...
2,2,22:08,7m,Does This Apply to Me? An Empirical Study of T...,#,"[Akalanka Galappaththi, Sarah Nadi, Christoph ...",Session 1Technical Papers/Registered Reportsat...,Tue 17 May,Stack Overflow has become an essential technic...


In [16]:
# !python3 -m spacy download en_core_web_sm

In [17]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
import spacy
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nlp = spacy.load("en_core_web_sm")

    
def preprocess_text(text):
    tokens = word_tokenize(text)
    tokens = [word.lower() for word in tokens if word.isalnum()]
    filtered_tokens = [word for word in tokens if word not in stopwords.words('english')]
    # doc = nlp(" ".join(filtered_tokens))
    # technical_words = [token.text for token in doc if token.pos_ == "NOUN"]
    lemmatizer = WordNetLemmatizer()
    normalized_text = ' '.join([lemmatizer.lemmatize(token) for token in filtered_tokens])
    return normalized_text

def calculate_similarity(text1, text2):
    processed_text1 = preprocess_text(text1)
    processed_text2 = preprocess_text(text2)
    
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform([processed_text1, processed_text2])
    
    similarity = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
    return similarity

# Example text for two papers
# paper_text1 = df_merged.iloc[0]['description']
# paper_text2 = df.iloc[1]['description']

# similarity_score = calculate_similarity(paper_text1, paper_text2)
# print("Similarity Score:", similarity_score)

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [42]:
df_tfidf_input = df_input_no_duplication.copy()
df_tfidf_input['preprocessed_abstract'] = df_tfidf_input.abstract.apply(preprocess_text)

In [134]:
df_tfidf_input.columns

Index(['paper_number', 'Time', 'Duration', 'title', 'Link', 'Authors',
       'Session', 'Date', 'abstract', 'preprocessed_abstract'],
      dtype='object')

In [154]:
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import defaultdict
from time import time

from sklearn import metrics

vectorizer = TfidfVectorizer(
    max_df=0.5,
    min_df=5,
    stop_words="english",
)
t0 = time()
X_tfidf = vectorizer.fit_transform(df_tfidf_input['title'] + " " + df_tfidf_input['abstract'])

print(f"vectorization done in {time() - t0:.3f} s")
print(f"n_samples: {X_tfidf.shape[0]}, n_features: {X_tfidf.shape[1]}")

vectorization done in 0.015 s
n_samples: 117, n_features: 502


In [155]:
X_tfidf[0]

<1x502 sparse matrix of type '<class 'numpy.float64'>'
	with 37 stored elements in Compressed Sparse Row format>

In [156]:
from sklearn.cluster import KMeans
import numpy as np
score_result = []
for i in range(5):
    kmeans = KMeans(
        n_clusters=5,
        max_iter=100,
        n_init=1,
        random_state=i,
    ).fit(X_tfidf)
    cluster_ids, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)
    completeness_score = metrics.completeness_score(df_tfidf_input['Session'], kmeans.labels_)
    homegeneity_score = metrics.homogeneity_score(df_tfidf_input['Session'], kmeans.labels_)
    score_result.append({
        'iteration': i,
        'completeness_score': completeness_score,
        'homegeneity_score': homegeneity_score
    })
    
    print(f"Number of elements assigned to each cluster: {cluster_sizes}")
    print(f"completeness score: {completeness_score}")
    print(f"homogeneity score: {homegeneity_score}") 

Number of elements assigned to each cluster: [34 20 27 22 14]
completeness score: 0.2815845559290487
homogeneity score: 0.15166431862779559
Number of elements assigned to each cluster: [43 21 15 29  9]
completeness score: 0.3349498336124785
homogeneity score: 0.1706057174075869
Number of elements assigned to each cluster: [15 45 16 15 26]
completeness score: 0.34067971516855966
homogeneity score: 0.1756256702238799
Number of elements assigned to each cluster: [25 29 25 12 26]
completeness score: 0.29586349729247186
homogeneity score: 0.1598967588460001
Number of elements assigned to each cluster: [15 28 46 20  8]
completeness score: 0.3710156387851561
homogeneity score: 0.18584223091115898


In [157]:
pd.DataFrame(score_result).mean()

iteration             2.000000
completeness_score    0.324819
homegeneity_score     0.168727
dtype: float64

In [158]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

lsa = make_pipeline(TruncatedSVD(n_components=10), Normalizer(copy=False))
t0 = time()
X_lsa = lsa.fit_transform(X_tfidf)
explained_variance = lsa[0].explained_variance_ratio_.sum()

print(f"LSA done in {time() - t0:.3f} s")
print(f"Explained variance of the SVD step: {explained_variance * 100:.1f}%")

LSA done in 0.020 s
Explained variance of the SVD step: 21.2%


In [159]:
df_tfidf_input['Session'].nunique()

19

In [160]:
from sklearn.cluster import KMeans
import numpy as np
score_result = []
for i in range(5):
    kmeans = KMeans(
        n_clusters=5,
        max_iter=100,
        n_init=1,
        random_state=i,
    ).fit(X_lsa)
    cluster_ids, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)
    completeness_score = metrics.completeness_score(df_tfidf_input['Session'], kmeans.labels_)
    homegeneity_score = metrics.homogeneity_score(df_tfidf_input['Session'], kmeans.labels_)
    
    score_result.append({
        'iteration': i,
        'completeness_score': completeness_score,
        'homegeneity_score': homegeneity_score
    })
    
    print(f"Number of elements assigned to each cluster: {cluster_sizes}")
    print(f"completeness score: {completeness_score}")
    print(f"homogeneity score: {homegeneity_score}")    

Number of elements assigned to each cluster: [38 12 29 17 21]
completeness score: 0.3463367085973266
homogeneity score: 0.18242162770683176
Number of elements assigned to each cluster: [32 23 14 29 19]
completeness score: 0.4232906688922764
homogeneity score: 0.2282193659457069
Number of elements assigned to each cluster: [24 23 22 25 23]
completeness score: 0.38071812166306357
homogeneity score: 0.2103882957829479
Number of elements assigned to each cluster: [21 21 28 22 25]
completeness score: 0.3725703277862058
homogeneity score: 0.20515753025676708
Number of elements assigned to each cluster: [25 29 34 19 10]
completeness score: 0.3370356410435383
homogeneity score: 0.17832264599615777


In [161]:
pd.DataFrame(score_result).mean()

iteration             2.000000
completeness_score    0.371990
homegeneity_score     0.200902
dtype: float64

In [50]:
kmeans = KMeans(
        n_clusters=5,
        max_iter=100,
        n_init=1,
        random_state=1,
    ).fit(X_lsa)
cluster_ids, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)

In [51]:
df_tfidf_output = df_tfidf_input.copy()
df_tfidf_output['cluster'] = kmeans.labels_

In [53]:
metrics.completeness_score(df_tfidf_output['Session'], df_tfidf_output['cluster'])

0.37183301417743797

In [96]:
df = df_tfidf_output.copy()

df.columns = [col.lower().replace(' ', '_') for col in df.columns]
df = df[['date'] + [col for col in df.columns if col != 'date']]
df['start_time'] = pd.to_datetime("2022 " + df['date'] + ' ' + df['time'], format='%Y %a %d %b %H:%M')
df['duration'] = df['duration'].str.extract('(\d+)').astype(int)  # Extract duration in minutes
df['end_time'] = df['start_time'] + pd.to_timedelta(df['duration'], unit='m')

# Delete the Link column
df = df.drop(['link', 'time'], axis=1)

# Move Authors column to the last position
df = df[[col for col in df.columns if col != 'authors'] + ['authors']]

df = df[['session', 'date', 'start_time', 'end_time', 'duration', 'title', 'authors', 'cluster']]

# Display the final data frame
df.head()

Unnamed: 0,session,date,start_time,end_time,duration,title,authors,cluster
0,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,2022-05-17 22:00:00,2022-05-17 22:04:00,4,An Empirical Evaluation of GitHub Copilot’s Co...,"[Nhan Nguyen, Sarah Nadi]",1
1,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,2022-05-17 22:04:00,2022-05-17 22:08:00,4,Comments on Comments: Where Code Review and Do...,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince...",2
2,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,2022-05-17 22:08:00,2022-05-17 22:15:00,7,Does This Apply to Me? An Empirical Study of T...,"[Akalanka Galappaththi, Sarah Nadi, Christoph ...",1
3,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,2022-05-17 22:15:00,2022-05-17 22:22:00,7,Towards Reliable Agile Iterative Planning via ...,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan...",2
4,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,2022-05-17 22:22:00,2022-05-17 22:29:00,7,BotHunter: An Approach to Detect Software Bots...,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein...",3


In [97]:
df.date.value_counts()

date
Wed 18 May    47
Thu 19 May    38
Fri 20 May    26
Tue 17 May     6
Name: count, dtype: int64

In [98]:
# df = df[df.date == 'Fri 20 May']

In [99]:
df

Unnamed: 0,session,date,start_time,end_time,duration,title,authors,cluster
0,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,2022-05-17 22:00:00,2022-05-17 22:04:00,4,An Empirical Evaluation of GitHub Copilot’s Co...,"[Nhan Nguyen, Sarah Nadi]",1
1,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,2022-05-17 22:04:00,2022-05-17 22:08:00,4,Comments on Comments: Where Code Review and Do...,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince...",2
2,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,2022-05-17 22:08:00,2022-05-17 22:15:00,7,Does This Apply to Me? An Empirical Study of T...,"[Akalanka Galappaththi, Sarah Nadi, Christoph ...",1
3,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,2022-05-17 22:15:00,2022-05-17 22:22:00,7,Towards Reliable Agile Iterative Planning via ...,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan...",2
4,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,2022-05-17 22:22:00,2022-05-17 22:29:00,7,BotHunter: An Approach to Detect Software Bots...,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein...",3
...,...,...,...,...,...,...,...,...
112,Session 16: Non-functional Properties (Availab...,Fri 20 May,2022-05-20 14:07:00,2022-05-20 14:11:00,4,A Large-scale Dataset of (Open Source) License...,[Stefano Zacchiroli],4
113,Session 16: Non-functional Properties (Availab...,Fri 20 May,2022-05-20 14:11:00,2022-05-20 14:18:00,7,SECOM: Towards a convention for security commi...,"[Sofia Reis, Rui Abreu, Hakan Erdogmus, Corina...",3
114,Session 16: Non-functional Properties (Availab...,Fri 20 May,2022-05-20 14:18:00,2022-05-20 14:25:00,7,Varangian: A Git Bot for Augmented Static Anal...,"[Saurabh Pujar, Yunhui Zheng, Luca Buratti, Bu...",3
115,Session 16: Non-functional Properties (Availab...,Fri 20 May,2022-05-20 14:32:00,2022-05-20 14:36:00,4,Is GitHub's Copilot as Bad As Humans at Introd...,"[Owura Asare, Mei Nagappan, N. Asokan]",3


In [100]:
df['start_time'] = pd.to_datetime(df['start_time']).astype(int) // 10**9 // 60
df['end_time'] = pd.to_datetime(df['end_time']).astype(int) // 10**9 // 60

df.head()

Unnamed: 0,session,date,start_time,end_time,duration,title,authors,cluster
0,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547080,27547084,4,An Empirical Evaluation of GitHub Copilot’s Co...,"[Nhan Nguyen, Sarah Nadi]",1
1,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547084,27547088,4,Comments on Comments: Where Code Review and Do...,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince...",2
2,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547088,27547095,7,Does This Apply to Me? An Empirical Study of T...,"[Akalanka Galappaththi, Sarah Nadi, Christoph ...",1
3,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547095,27547102,7,Towards Reliable Agile Iterative Planning via ...,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan...",2
4,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547102,27547109,7,BotHunter: An Approach to Detect Software Bots...,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein...",3


In [101]:
df.dtypes

session       object
date          object
start_time     int64
end_time       int64
duration       int64
title         object
authors       object
cluster        int32
dtype: object

In [102]:
print(df["session"].nunique())
df["session"].unique()[:3]

19


array(['Session 1Technical Papers/Registered ReportsatMSR Main room - even hoursChair(s):Hongyu ZhangUniversity of Newcastle,Masud RahmanDalhousie University',
       'Session 2: Maintenance (Issues & Smells)Technical Papers/Registered Reports/Data and Tool Showcase Track/Industry TrackatMSR Main room - odd hoursChair(s):Alessio FerrariCNR-ISTI',
       'Session 3: Introspection, Vision, and Human AspectsTechnical Papers/Data and Tool Showcase Track/Industry Track/Registered ReportsatMSR Main room - odd hoursChair(s):Alexander SerebrenikEindhoven University of Technology,Sebastian BaltesSAP SE & University of Adelaide'],
      dtype=object)

In [103]:
# df[df["session"]=='Session 1Technical Papers/Registered ReportsatMSR Main room - even hoursChair(s):Hongyu ZhangUniversity of Newcastle,Masud RahmanDalhousie University']

In [104]:
def merge_intervals(intervals):
    if not intervals:
        return 0
    
    # Sort intervals by the start time
    intervals.sort(key=lambda x: x[0])
    
    merged = [intervals[0]]
    for current_start, current_end in intervals[1:]:
        last_end = merged[-1][1]
        
        if current_start <= last_end:
            # There's an overlap, extend the previous interval
            merged[-1] = (merged[-1][0], max(last_end, current_end))
        else:
            # No overlap, add this interval as is
            merged.append((current_start, current_end))
    
    # Compute total duration in minutes (or another unit as desired)
    total_duration_minutes = sum((end - start) for start, end in merged)   # convert seconds to minutes
    return total_duration_minutes

total_durations = []

for date, group in df.groupby('date'):
    intervals = list(zip(group['start_time'], group['end_time']))
    total_duration = merge_intervals(intervals)
    total_durations.append({'date': date, 'total_duration': total_duration})

# Converting the result into a DataFrame
total_duration_df = pd.DataFrame(total_durations)

total_duration_df

Unnamed: 0,date,total_duration
0,Fri 20 May,140
1,Thu 19 May,212
2,Tue 17 May,36
3,Wed 18 May,252


In [105]:
dates = df.groupby("date")["duration"].sum().reset_index()

dates


Unnamed: 0,date,duration
0,Fri 20 May,140
1,Thu 19 May,212
2,Tue 17 May,36
3,Wed 18 May,288


In [106]:
df.head(5)

Unnamed: 0,session,date,start_time,end_time,duration,title,authors,cluster
0,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547080,27547084,4,An Empirical Evaluation of GitHub Copilot’s Co...,"[Nhan Nguyen, Sarah Nadi]",1
1,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547084,27547088,4,Comments on Comments: Where Code Review and Do...,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince...",2
2,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547088,27547095,7,Does This Apply to Me? An Empirical Study of T...,"[Akalanka Galappaththi, Sarah Nadi, Christoph ...",1
3,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547095,27547102,7,Towards Reliable Agile Iterative Planning via ...,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan...",2
4,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547102,27547109,7,BotHunter: An Approach to Detect Software Bots...,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein...",3


In [107]:
df.cluster.value_counts()

cluster
3    35
0    25
1    22
2    20
4    15
Name: count, dtype: int64

In [108]:
df.iloc[0]['cluster']

1

In [109]:
# common_cluster_matrix = pd.DataFrame(np.zeros((len(df), len(df)), dtype=int), index=df['title'], columns=df['title'])
common_cluster_pairs = {}

# Populate the matrix
for i in range(len(df)):
    for j in range(len(df)):
        # Intersect authors lists, if not empty set cell to 1
        if i!=j and df.iloc[i]['cluster'] == df.iloc[j]['cluster']:
            common_cluster_pairs[(i,j)] = 1

# common_cluster_pairs

In [110]:
common_author_matrix = pd.DataFrame(np.zeros((len(df), len(df)), dtype=int), index=df['title'], columns=df['title'])

# Populate the matrix
for i in range(len(df)):
    for j in range(len(df)):
        # Intersect authors lists, if not empty set cell to 1
        if set(df.iloc[i]['authors']) & set(df.iloc[j]['authors']):
            common_author_matrix.iloc[i, j] = 1

# common_author_matrix.head()

In [111]:
papers = df[["title","authors","duration"]].reset_index(drop=True)
papers.head()

Unnamed: 0,title,authors,duration
0,An Empirical Evaluation of GitHub Copilot’s Co...,"[Nhan Nguyen, Sarah Nadi]",4
1,Comments on Comments: Where Code Review and Do...,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince...",4
2,Does This Apply to Me? An Empirical Study of T...,"[Akalanka Galappaththi, Sarah Nadi, Christoph ...",7
3,Towards Reliable Agile Iterative Planning via ...,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan...",7
4,BotHunter: An Approach to Detect Software Bots...,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein...",7


In [112]:
df.head()

Unnamed: 0,session,date,start_time,end_time,duration,title,authors,cluster
0,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547080,27547084,4,An Empirical Evaluation of GitHub Copilot’s Co...,"[Nhan Nguyen, Sarah Nadi]",1
1,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547084,27547088,4,Comments on Comments: Where Code Review and Do...,"[Nikitha Rao, Jason Tsay, Martin Hirzel, Vince...",2
2,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547088,27547095,7,Does This Apply to Me? An Empirical Study of T...,"[Akalanka Galappaththi, Sarah Nadi, Christoph ...",1
3,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547095,27547102,7,Towards Reliable Agile Iterative Planning via ...,"[Jirat Pasuksmit, Patanamon Thongtanunam, Shan...",2
4,Session 1Technical Papers/Registered Reportsat...,Tue 17 May,27547102,27547109,7,BotHunter: An Approach to Detect Software Bots...,"[Ahmad Abdellatif, Mairieli Wessel, Igor Stein...",3


In [113]:
sessions = df.groupby("session")["duration"].sum()

sessions.head()


session
HackathonHackathon/Technical PapersatMSR Main room - odd hoursChair(s):Gregorio RoblesUniversidad Rey Juan Carlos,Jesus M. Gonzalez-BarahonaUniversidad Rey Juan Carlos,Maëlick ClaesUniversity of Oulu    30
Mining ChallengeMining Challenge/Technical PapersatMSR Main room - even hoursChair(s):Steffen HerboldTU Clausthal                                                                                          28
Session 10: SecurityTechnical Papers/Data and Tool Showcase Track/Registered ReportsatMSR Main room - odd hoursChair(s):Triet Le Huynh MinhThe University of Adelaide                                      34
Session 11: Machine Learning & Information RetrievalTechnical PapersatMSR Main room - odd hoursChair(s):Phuong T. NguyenUniversity of L’Aquila                                                             39
Session 12: Integration & Large-Scale MiningTechnical Papers/Data and Tool Showcase TrackatMSR Main room - even hoursChair(s):Jin L.C. GuoMcGill University,Amjed TahirM

In [114]:
df.groupby("session")["duration"].sum().values

array([30, 28, 34, 39, 36, 36, 36, 38, 36, 36, 33, 33, 36, 34, 37, 37, 37,
       30, 50])

In [115]:
df.groupby("session").size().values

array([6, 7, 7, 6, 6, 6, 6, 8, 6, 6, 6, 6, 6, 7, 7, 7, 7, 6, 1])

In [116]:
# Assumption of variables for illustration
num_sessions = len(sessions)  # Number of sessions as defined by PC chairs
session_lengths = sessions.values  # Length of each session in minutes
# num_tracks = 1 # FOR NOW, ASSUME WE ONLY HAVE A SINGLE TRACK

In [117]:
len(papers)

117

In [118]:
# Initiate the problem


from pulp import *
import numpy as np


prob = LpProblem("Conference_Schedule_Optimization", LpMaximize)

# Decision variables
x = LpVariable.dicts("schedule", 
                            ((i, j) 
                             for i in range(len(papers)) # paper number
                             for j in range(num_sessions) # session number
                            ) 
                            ,cat='Binary' # to make it efficient
                       )
z = LpVariable.dicts('product', ((i,j,m,n)
                                  for i in range(len(papers))
                                  for j in range(len(papers))
                                  for m in range(num_sessions)
                                  for n in range(num_sessions)
                                 
                                )
                                 , 0, 1, cat='Continuous')

obj = lpSum(common_cluster_pairs[pair] * z[(pair[0], pair[1],j, j)]
             for pair in common_cluster_pairs
             for j in range(num_sessions)
             )

prob += obj


for pair in common_cluster_pairs:
    for j in range(num_sessions):
        prob += z[(pair[0], pair[1], j, j)] >= x[(pair[0],j)] + x[(pair[1],j)] - 1
        prob += z[(pair[0], pair[1], j, j)] <= x[(pair[0],j)]
        prob += z[(pair[0], pair[1], j, j)] <=  x[(pair[1],j)]

# start_times = LpVariable.dicts("start_times", 
#                             (i 
#                              for i in range(len(papers)) # paper number
#                             ) # start time
#                             ,cat='Continuous'
#                        )
# end_times = LpVariable.dicts("end_times", 
#                             (i 
#                              for i in range(len(papers)) # paper number
#                             ) # end time
#                             ,cat='Continuous'
#                        )

# Objective function: For the basic structure, we make it a dummy one as our main focus is on satisfying constraints
# prob += 0, "ArbitraryObjective"


In [119]:
len(papers)

117

In [120]:
 
session_lengths_extended = [i + 10 for i in session_lengths]

In [121]:
session_lengths_extended

[40, 38, 44, 49, 46, 46, 46, 48, 46, 46, 43, 43, 46, 44, 47, 47, 47, 40, 60]

In [122]:
for i in range(len(papers)):
    for j in range(num_sessions):
            prob += x[(i, j)] >= 0
            prob += x[(i, j)] <= 1


# Ensure each paper is scheduled exactly once
for i in range(len(papers)):
    prob += lpSum(x[(i, j)] for j in range(num_sessions)) == 1, f"One_placement_paper_{i}"

# Do not exceed session length
for j in range(num_sessions):
    prob += lpSum(x[(i, j)] * papers.loc[i]["duration"] for i in range(len(papers))) <= session_lengths_extended[j], f"Session_length_limit_{j}"


In [123]:
listSolvers(onlyAvailable=True)

['PULP_CBC_CMD']

In [124]:
# Choose the solver and set the time limit
# solver = GLPK(msg=False, timeLimit=600)  # Set the time limit to 600 seconds
prob.solve(PULP_CBC_CMD(timeLimit=2000))

Welcome to the CBC MILP Solver 
Version: 2.10.3 
Build Date: Dec 15 2019 

command line - /opt/conda/lib/python3.10/site-packages/pulp/solverdir/cbc/linux/64/cbc /var/tmp/8109478a86b9474ca703512aecbe80d2-pulp.mps -max -sec 2000 -timeMode elapsed -branch -printingOptions all -solution /var/tmp/8109478a86b9474ca703512aecbe80d2-pulp.sol (default strategy 1)
At line 2 NAME          MODEL
At line 3 ROWS
At line 166581 COLUMNS
At line 611904 RHS
At line 778481 BOUNDS
At line 834703 ENDATA
Problem MODEL has 166576 rows, 56221 columns and 386878 elements
Coin0008I MODEL read with 0 errors
seconds was changed from 1e+100 to 2000
Option for timeMode changed from cpu to elapsed
Continuous objective value is 2842 - 113.67 seconds
Cgl0003I 2032 fixed, 0 tightened bounds, 6610 strengthened rows, 6820 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 14696 strengthened rows, 1987 substitutions
Cgl0003I 0 fixed, 0 tightened bounds, 8751 strengthened rows, 3516 substitutions
Cgl0003I 0 fixed, 0 tight

1

In [125]:

# Initialize an empty list to hold the results
results = []

# Iterate over each session and track
for j in range(num_sessions):
    session_number = j + 1
    # Iterate over each paper
    for i in range(len(papers)):
        if x[(i, j)].varValue is not None and  x[(i, j)].varValue== 1:
            # Prepare a dict with the required information
            paper_info = {
                'optimized_session': session_number,
                'title': papers.loc[i]['title'],
                'duration': papers.loc[i]['duration'],
                'validity': 1
            }
            # Append to the results list
            results.append(paper_info)
        elif x[(i, j)].varValue is not None and  x[(i, j)].varValue > 0:
            # Prepare a dict with the required information
            paper_info = {
                'optimized_session': session_number,
                'title': papers.loc[i]['title'],
                'duration': papers.loc[i]['duration'],
                'validity': 0
            }
            # Append to the results list
            results.append(paper_info)

# Convert the list of dictionaries into a DataFrame
df_results = pd.DataFrame(results)

# If you need to see the first few rows of the DataFrame to ensure it's correct
df_results.head()


Unnamed: 0,optimized_session,title,duration,validity
0,1,Starting the InnerSource Journey: Key Goals an...,7,1
1,1,Extracting corrective actions from code reposi...,7,1
2,1,Code Review Practices for Refactoring Changes:...,7,1
3,1,Automatically Prioritizing and Assigning Tasks...,7,1
4,2,Investigating the Impact of Forgetting in Soft...,4,1


In [126]:
df_results["validity"].mean()

1.0

In [127]:
df_merged_result = pd.merge(df[['title', 'session', 'cluster']], df_results[['title', 'optimized_session']], on='title', how='left')

In [128]:
df_merged_result.sort_values('optimized_session')

Unnamed: 0,title,session,cluster,optimized_session
97,Code Review Practices for Refactoring Changes:...,Session 15: Collaboration & Open SourceRegiste...,2,1
96,Extracting corrective actions from code reposi...,Session 14: Software QualityTechnical Papers/I...,0,1
16,Starting the InnerSource Journey: Key Goals an...,"Session 3: Introspection, Vision, and Human As...",0,1
102,Automatically Prioritizing and Assigning Tasks...,Session 15: Collaboration & Open SourceRegiste...,1,1
24,Which bugs are missed in code reviews: An empi...,Mining ChallengeMining Challenge/Technical Pap...,2,2
...,...,...,...,...
83,FaST: A linear time stack trace alignment heur...,Session 12: Integration & Large-Scale MiningTe...,1,18
60,SniP: An Efficient Stack Tracing Framework for...,Session 9: Scaling & CloudIndustry Track/Regis...,1,18
31,Empirical Standards for Repository Mining,Tutorial: Empirical Standards for Repository M...,0,19
50,The Unsolvable Problem or the Unheard Answer? ...,Session 7: Developer Wellbeing & Project Commu...,0,19


In [129]:
new_session_lengths = (df_results.groupby("optimized_session")["duration"].sum() + pd.Series(index=range(1,1+num_sessions), data = 0)).fillna(0)
new_session_lengths

optimized_session
1     28
2     27
3     28
4     48
5     45
6     26
7     36
8     25
9     28
10    33
11    43
12    37
13    21
14    36
15    42
16    41
17    41
18    33
19    58
dtype: int64

In [130]:
df.groupby("session")["duration"].sum().values

array([30, 28, 34, 39, 36, 36, 36, 38, 36, 36, 33, 33, 36, 34, 37, 37, 37,
       30, 50])

In [131]:
completeness_score = metrics.completeness_score(df_merged_result['session'], df_merged_result['optimized_session'])
homegeneity_score = metrics.homogeneity_score(df_merged_result['session'], df_merged_result['optimized_session'])
print(f"completeness score: {completeness_score}")
print(f"homogeneity score: {homegeneity_score}")  

completeness score: 0.4234040953172233
homogeneity score: 0.4211182858948719


In [132]:
metrics.completeness_score(df_merged_result['session'], df_merged_result['cluster'])

0.37183301417743797

In [133]:
metrics.homogeneity_score(df_merged_result['session'], df_merged_result['cluster'])

0.20060151486657662