#### Importing libraries

In [24]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive/')

# Add project path to system path
import sys
sys.path.append('/content/drive/MyDrive/ADS2001 project/')

# Core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning tools
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Reading might be very slow since this csv file is the largest (around 500 MB)
df = pd.read_csv('/content/drive/MyDrive/ADS2001 project/data/virtualInternshipData_processed.csv')
df = df.drop(columns=['content'])

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## **Data Processing & Aggregation**

The requirement of this assignment is to do data analysis on a team-level statistics.
Therefore, it is necessary to have aggregation up to team-level so that we could procceed it for our next step, namely **Baseline Modelling**

We do the following addaptation for each colums available:
1. First, we drop `RoleName`,`content` and `userIDs` as they are no longer needed for further steps.
2. We aggregate at the level of `groupIDs`, then `roomName`.
3. For `m_experimental_testing`, `m_making_design_choices`, `m_asking_questions`, `j_customer_consultants_requests`, `j_performance_parameters_requirements` and  `j_communication`, we are taking up the sum.
4. For `mentor_to_player_ratio`, we take the mean value of it.
5. Then, we drop `mentor_count` and `player_count`.
5. For `sentiment`, we are taking the mean value of it.
6. For `activeness`, we are taking the mean value.
7. We drop `content`.
8. For the rest features, we take the sum of them.

One big question appeared here ---- How about `OutcomeScore`?



### **Method 1: Mean, Median and Mode**


In [25]:
unique_groupid = df.groupIDs.unique()
unique_stage = df.roomName.unique()
temp_1 = df.copy()[['groupIDs', 'roomName', 'userIDs', 'OutcomeScore']][df['RoleName']!='Mentor'].drop_duplicates()


def aggregate(agg_function, agg_function_Name):
    '''
    Takes in an aggregation function, and apply it onto OutcomeScore column

    Args:
        agg_function: The aggregation function to be implemented
        agg_function_Name: The name of the aggregation function

    Returns:
        agg_df: A pandas dataframe containing 'groupIDs', 'roomName', and aggregated 'OutcomeScore'
    '''
    agg_df = pd.DataFrame(columns=['groupIDs', 'roomName', f'OutcomeScore_{agg_function_Name}'])
    for i in range(len(unique_groupid)):
        for j in range(len(unique_stage)):
            cur_df = temp_1[(temp_1['groupIDs'] == unique_groupid[i]) & (temp_1['roomName'] == unique_stage[j])]
            agg_val = cur_df.OutcomeScore.agg(agg_function)
            agg_df.loc[len(agg_df)] = [unique_groupid[i], unique_stage[j], agg_val]

    return agg_df


mean_agg = aggregate('mean', 'mean')
mean_agg.dropna(axis=0, inplace=True)
mean_agg['OutcomeScore_mean'] = mean_agg['OutcomeScore_mean'].astype(int)

median_agg = aggregate('median', 'median')
median_agg.dropna(axis=0, inplace=True)
median_agg['OutcomeScore_median'] = median_agg['OutcomeScore_median'].astype(int)

mode_agg = pd.DataFrame(columns=['groupIDs', 'roomName', f'OutcomeScore_mode'])
for i in range(len(unique_groupid)):
    for j in range(len(unique_stage)):
        cur_df = temp_1[(temp_1['groupIDs'] == unique_groupid[i]) & (temp_1['roomName'] == unique_stage[j])]
        if list(cur_df.OutcomeScore.agg('mode')):
            agg_val = list(cur_df.OutcomeScore.agg('mode'))[0]
        else:
            continue
        mode_agg.loc[len(mode_agg)] = [unique_groupid[i], unique_stage[j], agg_val]


print(mode_agg.shape)
print(median_agg.shape)
print(mean_agg.shape)

(594, 3)
(594, 3)
(594, 3)


In [26]:
mean_agg.head()

Unnamed: 0,groupIDs,roomName,OutcomeScore_mean
0,2a,Introduction and Workflow Tutorial with Entran...,3
1,2a,Background research on dialysis,5
3,2a,Reflection team discussion of surfactants,3
4,2a,Summarize internal consultant requirements,3
5,2a,Choose consultants to analyze,3


### **Method 2: Weighted Average Marks**

In [29]:
temp_2 = df.copy()[df['RoleName']!='Mentor'].drop_duplicates()

def wam(group, indicator: str):
    w = group[indicator].values
    x = group['OutcomeScore'].values
    return (np.dot(w, x) / np.sum(w)).astype(int)

indicator = 'player_activeness'
wam_agg = temp_2.groupby(['groupIDs', 'roomName'])[['groupIDs',indicator, 'OutcomeScore']].apply(wam, indicator = indicator).reset_index(name='OutcomeScore_wam')

wam_agg.shape

(594, 3)

In [30]:
wam_agg.head(3)

Unnamed: 0,groupIDs,roomName,OutcomeScore_wam
0,2a,Background research on dialysis,5
1,2a,Choose consultants to analyze,3
2,2a,Individual analysis of first batch,3


### **Method 3: Gradient Descent & Least Square**

In [31]:
import numpy as np

np.random.seed(0)
X = np.array(temp_2['OutcomeScore'])
y = 2 * X + 1  # Add some noise

# Add bias term (x0 = 1)
X_b = np.c_[np.ones((X.shape[0], 1)), X]  # Shape: (100, 2)

# Initialize parameters
theta = np.zeros(2)
alpha = 0.01
epochs = 20
m = len(y)

# Stochastic Gradient Descent
for epoch in range(epochs):
    for i in range(m):
        xi = X_b[i].reshape(1, -1)  # (1, 2)
        yi = y[i]
        prediction = xi.dot(theta)
        error = (prediction - yi)**2
        gradient = xi.T.dot(error)  # shape (2, 1)
        theta -= alpha * gradient.flatten()

# Final parameters
print("Learned theta:", theta)

# Evaluation using least squares cost function
predictions = X_b.dot(theta)
cost = (1 / (2 * m)) * np.sum((predictions - y) ** 2)
print("Final least squares cost:", cost)

  error = (prediction - yi)**2


Learned theta: [nan nan]
Final least squares cost: nan


In [32]:
class BGD:
    def __init__(self, learning_rate=0.01, epochs=100):
        self.lr = learning_rate
        self.epochs = epochs
        self.group_weights = {}

    def fit(self, df):
        # Filter and group data
        player_data = df.copy()[df['RoleName'] != 'Mentor']
        groups = player_data.groupby(['groupIDs', 'roomName'])

        # Process each group
        for (group, room), gdf in groups:
            # Get user scores and initialise weights
            users = gdf['userIDs'].values
            scores = gdf['OutcomeScore'].values
            scores = 0 if np.isnan(scores).any() else scores
            n_users = len(users)

            # Initialise equal weights (sum=1)
            weights = np.ones(n_users) / n_users

            user_weight_map = dict(zip(users, weights))

            # SGD optimisation
            for _ in range(self.epochs):
                # Calculate weighted aggregate
                agg_score = np.dot(weights, scores)
                agg_score = 0 if np.isnan(agg_score) else agg_score



                # Compute gradient (dL/dw = 2*(agg - x_i)*x_i)
                error = (agg_score - scores)

                grad = 2 * scores * error.mean()

                # Update weights
                weights -= self.lr * grad

                # Project to probability simplex
                weights = np.maximum(weights, 0)
                weights /= weights.sum()


            # Store final weights
            self.group_weights[(group, room)] = dict(zip(users, weights))


    def run(self, group, room, user_scores):
        """Aggregate scores using learned weights"""
        if (group, room) not in self.group_weights:
            return int(round(np.mean(list(user_scores.values()))))

        weights = self.group_weights[(group, room)]
        weights = {k: (0 if np.isnan(v) else v) for k, v in weights.items()}
        valid_users = [u for u in user_scores if u in weights]
        # print(weights)
        # print(valid_users)
        # print(user_scores)

        if not valid_users:
            return int(round(np.mean(list(user_scores.values()))))

        w = np.array([weights[u] for u in valid_users])
        s = np.array([user_scores[u] for u in valid_users])

        return int(round(np.dot(w, s)))

    def run_all(self, df):
        """Apply aggregation to entire dataframe"""
        results = []
        groups = df.groupby(['groupIDs', 'roomName'])

        for (group, room), gdf in groups:
            user_scores = dict(zip(gdf['userIDs'], gdf['OutcomeScore']))
            agg_score = self.run(group, room, user_scores)
            results.append({
                'groupIDs': group,
                'roomName': room,
                'OutcomeScore_bgd': agg_score
            })

        return pd.DataFrame(results)

In [33]:
data = df.copy()[df['RoleName'] != 'Mentor'][['groupIDs', 'roomName','RoleName', 'userIDs' ,'OutcomeScore']].drop_duplicates()
bgd = BGD(learning_rate=0.01, epochs = 100)
bgd.fit(data)

bgd_agg = bgd.run_all(data)

In [34]:
print(bgd_agg.shape)
bgd_agg.head(3)

(594, 3)


Unnamed: 0,groupIDs,roomName,OutcomeScore_bgd
0,2a,Background research on dialysis,5
1,2a,Choose consultants to analyze,4
2,2a,Individual analysis of first batch,3


### **Evaluation**

Let's think about a way to evaluate which is better.
Currently we have `OutcomeScore` before aggregation, which is our original data, and `OutcomeScore` after the aggregation.

The best way of evaluating it is by mean-squared-error.


In [35]:
def evaluate_aggregated_outcome(aggregated_df, method: str):
    """
    Computes the squared error between original and pre-aggregated OutcomeScore.

    Args:
        method (str): Aggregation method used.
        aggregated_df (pd.DataFrame): Aggregated DataFrame with precomputed OutcomeScores.

    Returns:
        float: Total squared error between original and aggregated scores.
    """
    ########## Pre-processing: original dataframe

    # First, we take only "Player" scores. And take unique columns so that everything dont be duplicated due to chatlog frequencies.
    original_scores = df.copy()[['groupIDs', 'roomName', 'userIDs', 'OutcomeScore']][df['RoleName']!='Mentor'].drop_duplicates()
    # Then, we aggregate it up to 'list' for further analysis
    original_scores = original_scores.groupby(['groupIDs', 'roomName'])['OutcomeScore'].apply(list)


    ########## Pre-processing: aggregated dataframe

    # then we make a dictionary based on aggregated dataframe
    outcome_score_name = f'OutcomeScore_{method}' if method != '' else 'OutcomeScore'
    aggregated_scores = aggregated_df.copy()[['groupIDs', 'roomName', outcome_score_name]].drop_duplicates()
    aggregated_scores = aggregated_scores.set_index(['groupIDs', 'roomName'])[outcome_score_name].to_dict()


    ########## Calculate MSE
    total_squared_error = 0.0
    total_observations = 0  # Count individual scores

    for group, scores in original_scores.items():
        if group in aggregated_scores:
            for score in scores:
                total_squared_error += (score - aggregated_scores[group]) ** 2
                total_observations += 1

    mean_squared_error = round(total_squared_error / total_observations, 2)

    return 0 if method == '' else mean_squared_error

In [36]:
def evaluate_all_methods():
    """
    Run evaluate_aggregated_outcome functions on all aggregated types

    Returns:
        evaluation (pd.DataFrame): A dataframe containing evaluation results
    """

    evaluation = pd.DataFrame({
        'Method': ['Original dataframe','Mode', 'Mean',  'Median','WAM', 'Gradient Descent'],
        'Mean Squared Error (MSE)': [evaluate_aggregated_outcome(df, ''),evaluate_aggregated_outcome( mode_agg, 'mode'), evaluate_aggregated_outcome(mean_agg, 'mean'), evaluate_aggregated_outcome(median_agg, 'median'),  evaluate_aggregated_outcome(wam_agg, 'wam'),  evaluate_aggregated_outcome(bgd_agg, 'bgd')],
        'row': [df.shape[0],mode_agg.shape[0], mean_agg.shape[0], median_agg.shape[0], wam_agg.shape[0], bgd_agg.shape[0] ],
        'aggregated': [df, mode_agg, mean_agg, median_agg, wam_agg, bgd_agg]
    })

    return evaluation

We will run the evaluation(s) after all methods are developed

In [37]:
evaluate_all_methods().drop(columns=['aggregated'])

Unnamed: 0,Method,Mean Squared Error (MSE),row
0,Original dataframe,0.0,19180
1,Mode,2.97,594
2,Mean,1.86,594
3,Median,1.87,594
4,WAM,2.07,594
5,Gradient Descent,1.75,594


### **Final Aggregation**

In [38]:
cols_to_drop = ['RoleName', 'content', 'userIDs']
df_cleaned = df.copy()[df['RoleName']!='Mentor'].drop(columns=[col for col in cols_to_drop if col in df.columns]).drop_duplicates()


# --- Define aggregation rules ---
agg_dict = {
    'm_experimental_testing': 'sum',
    'm_making_design_choices': 'sum',
    'm_asking_questions': 'sum',
    'j_customer_consultants_requests': 'sum',
    'j_performance_parameters_requirements': 'sum',
    'j_communication': 'sum',
    'mentor_to_player_ratio': 'mean',
    #'sentiment_score': 'mean',
    'player_activeness': 'sum',
    'mentor_activeness': 'sum',
    'wordCount': 'sum'
}

# Apply 'sum' for all other remaining numeric columns (excluding those already handled)
rest_dict = {
    col: 'sum' for col in df_cleaned.columns
    if col not in agg_dict and col not in ['groupIDs', 'roomName', 'OutcomeScore', 'mentor_count', 'player_count']
}

# Combine all aggregation functions
full_agg_dict = {**agg_dict, **rest_dict}

# --- Apply aggregation ---
aggregated = df_cleaned.groupby(['groupIDs', 'roomName']).agg(full_agg_dict).reset_index()
final_df = aggregated.copy()


In [39]:
best_agg = evaluate_all_methods().sort_values(by='Mean Squared Error (MSE)').iloc[1,]
final_df['OutcomeScore'] = best_agg['aggregated'].iloc[:,-1]

In [40]:
final_df.head(3)

Unnamed: 0,groupIDs,roomName,m_experimental_testing,m_making_design_choices,m_asking_questions,j_customer_consultants_requests,j_performance_parameters_requirements,j_communication,mentor_to_player_ratio,player_activeness,...,yo,yooo,yoooooo,yup,zach,zachary,zane,zelin,zero,OutcomeScore
0,2a,Background research on dialysis,0,0,0,0,0,0,0.142857,88.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
1,2a,Choose consultants to analyze,0,0,2,0,1,0,0.142857,327.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
2,2a,Individual analysis of first batch,0,0,0,0,0,0,0.142857,175.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3


### **Export of aggregated data for further analysis**

In [41]:
final_df.to_csv(path_or_buf='/content/drive/MyDrive/ADS2001 project/data/virtualInternshipData_aggregated.csv')