#### Importing libraries

In [2]:
# Install required packages
# NOTE(review): the spaCy model "en_core_web_sm" is loaded by TextAnalysis further down but is
# not installed here — presumably it is preinstalled in the Colab runtime; confirm.
!pip install contractions better_profanity nltk wordcloud textblob

# Mount Google Drive
# The CSV files read at the bottom of this cell live under this mount point.
from google.colab import drive
drive.mount('/content/drive/')

# Add project path to system path
import sys
sys.path.append('/content/drive/MyDrive/ADS2001 project/')

# Core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# NLP tools
# NOTE(review): `contractions` is imported twice in this group and `re` is imported again under
# "Utility libraries"; the repeats are harmless no-ops.
import nltk
from nltk import pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
import contractions
from better_profanity import profanity
from wordcloud import WordCloud
import re
import contractions
import spacy
from textblob import Word
from functools import lru_cache
import requests

# Download required NLTK data
# These resources back the stopword list, word_tokenize, pos_tag and WordNetLemmatizer
# calls made by the TextAnalysis class.
nltk.download('vader_lexicon', quiet=True)
nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('averaged_perceptron_tagger_eng', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('punkt_tab', quiet = True)

# Machine Learning tools
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Utility libraries
from tqdm.notebook import tqdm
import re
import string

# Load the data.
# df             : the main chat dataset that every later cell extends with new columns.
# processed_text : previously saved output of the text-processing pipeline; later cells read
#                  its 'content_processed' column.
df = pd.read_csv('/content/drive/MyDrive/ADS2001 project/data/virtualInternshipData_preprocessed.csv')
processed_text = pd.read_csv('/content/drive/MyDrive/ADS2001 project/data/processed_word.csv')

Mounted at /content/drive/


## **Exploratory Data Analysis (EDA) & Data Processing**

### Structural



#### Group Size, Number of Mentor, Player Mentor Ratio

In [3]:
# Calculate the player/mentor ratio, and merge it into the main dataframe.
#
# For every group we count the distinct mentors and the distinct players, derive
# mentor_count / group_size, and append the three values as new columns on the
# right-hand side of df (one value per row, looked up via the row's groupIDs).
ratio_columns = ['group_size', 'mentor_count', 'mentor_to_player_ratio']

# One row per distinct (group, user) pair for each role, then count users per group.
(mentor_sum, player_sum) = (
    df[df['RoleName'] == 'Mentor'][['groupIDs', 'userIDs']]
      .drop_duplicates()
      .groupby('groupIDs')
      .agg(mentor_count=('userIDs', 'count')),

    df[df['RoleName'] == 'Player'][['groupIDs', 'userIDs']]
      .drop_duplicates()
      .groupby('groupIDs')
      .agg(group_size=('userIDs', 'count'))
)

# Calculate mentor-player ratio.
# The concat is an outer alignment on groupIDs, so a group that lacks mentors (or
# players) gets NaN in the corresponding count and in the ratio. The fill keys must be
# real column names: the previous key 'mentor_sum' matched no column and was ignored.
mentor_player_ratio = pd.concat([
    mentor_sum,
    player_sum,
    (mentor_sum['mentor_count'] / player_sum['group_size']).rename('mentor_to_player_ratio')
], axis=1).fillna({'mentor_to_player_ratio': 0, 'mentor_count': 0})

# Look up the per-group values for every row; join(on=...) matches df['groupIDs']
# against the group index of mentor_player_ratio and keeps df's row index.
temp = df[['groupIDs']].join(mentor_player_ratio[ratio_columns], on='groupIDs', how='left')

# Append to the right end of df. Dropping any previous copies first keeps the cell
# idempotent: re-running it no longer produces duplicated column names.
# NOTE(review): the trailing fillna(0) is kept from the original and also replaces
# missing values in every pre-existing column of df, not only the three new ones.
df = pd.concat(
    [df.drop(columns=ratio_columns, errors='ignore'), temp[ratio_columns]],
    axis=1
).fillna(0)


### Progress

#### Activeness

In [4]:
# Activeness = number of rows a user contributed while acting in a given role.
is_player = df['RoleName'] == 'Player'
is_mentor = df['RoleName'] == 'Mentor'

# Rows per user, counted separately for player rows and mentor rows
player_activeness = (
    df[is_player]
    .groupby('userIDs')
    .size()
    .rename('player_activity_count')
    .reset_index()
)
mentor_activeness = (
    df[is_mentor]
    .groupby('userIDs')
    .size()
    .rename('mentor_activity_count')
    .reset_index()
)

# Attach both counts to every row of the corresponding user
df = df.merge(player_activeness, on='userIDs', how='left')
df = df.merge(mentor_activeness, on='userIDs', how='left')

# Users who never acted in a role have no count after the left merge: use 0.
# pop() removes the temporary merge column while handing back its values.
df['player_activeness'] = df.pop('player_activity_count').fillna(0)
df['mentor_activeness'] = df.pop('mentor_activity_count').fillna(0)

#### Engagement

In [5]:
# 1. Total engagement of a row = sum of its coded-behaviour indicator columns
engagement_metrics = [
    'm_experimental_testing',
    'm_making_design_choices',
    'm_asking_questions',
    'j_customer_consultants_requests',
    'j_performance_parameters_requirements',
    'j_communication'
]
df['engagement'] = df.loc[:, engagement_metrics].sum(axis='columns')

# 2. Role-specific engagement: the row's engagement when the row belongs to that
#    role, otherwise 0
role_by_engagement_column = {
    'player_engagement': 'Player',
    'mentor_engagement': 'Mentor',
}
for engagement_column, role_name in role_by_engagement_column.items():
    df[engagement_column] = df['engagement'].where(df['RoleName'] == role_name, 0)

#### Mentor behaviour

In [6]:
# Derive mentor behaviour features from the rows written by mentors.
mentor_mask = df['RoleName'] == 'Mentor'

# New feature name -> source indicator column it is copied from
mentor_feature_sources = {
    'mentor_questioning': 'm_asking_questions',
    'mentor_directiveness': 'm_making_design_choices',
}

# 1. Keep the source value on mentor rows only; every other row starts as NaN
for feature_name, source_column in mentor_feature_sources.items():
    df[feature_name] = df[source_column].where(mentor_mask)

# 2. Within each group, carry the most recent mentor value forward onto the rows that
#    follow it.
# 3. Rows that are still NaN afterwards (no earlier mentor row in their group) become 0.
#    An alternative would be the mean of the mentor rows instead of 0.
mentor_features = list(mentor_feature_sources)
df[mentor_features] = (
    df.groupby('groupIDs')[mentor_features]
    .ffill()
    .fillna(0)
)

### Natural Language Processing


**<span style="background-color: #fff3b0">Text Preprocessing Steps:</span>**

**Removed punctuations:**  
Example: "hello!" → "hello", "word." → "word"

**Lowercased all characters:**  
Ensures uniformity (e.g., "Hello" → "hello").

**Expanded contractions:**  
Example: "don't" → "do not", "won't" → "will not"

---

**<span style="background-color: #fff3b0">Key Observations & Challenges</span>**

**TF-IDF Limitation:**  
**Problem:** Words with low frequency (e.g., typos, slang like "yeahhhhhh", "uyo", "yup") get high IDF scores, falsely marking them as "unique".  
**Impact:** Noise in the model due to non-standard words.  
**Discussion Point:** How to handle slang/typos? (e.g., regex rules, custom dictionaries?)

**Stemming Needed:**  
Example: "craziest" → "crazy" (reduces redundancy).

**Open Question:** Should we use lemmatization?  
(May alter semantics—e.g., "running" → "run" could affect model accuracy.)






We have applied lemmatization to every sentence in the "content" column.

Lemmatization means that we change each word in each string to its root form; for example, "running" becomes "run", and "booking" becomes "book".

Then we removed all the stopwords from each string (e.g. "hi", "the", "is", etc.; these are words that carry little information).

In [7]:
class TextAnalysis:
    """Chainable text-normalisation pipeline for a single message.

    Every processing step rewrites ``self.processed_text`` and returns ``self`` so
    the steps can be chained; ``execute()`` runs the full pipeline and returns the
    final string. ``self.original_text`` always holds the value passed to the
    constructor, untouched.

    The spaCy model and the stopword list are loaded once and shared by all
    instances. The notebook creates one instance per dataframe row, so reloading
    them in every ``__init__`` repeats the same work for each message.
    """

    # Lazily populated, class-wide caches (see _shared_nlp / _custom_stopwords).
    _nlp_cache = None
    _stopword_cache = None

    def __init__(self, text):
        """Prepare the pipeline for ``text``.

        Args:
            text: The message to process. ``None`` and NaN are treated as an empty
                message and other non-string values are converted with ``str()``,
                so missing dataframe cells do not crash the pipeline.
        """
        self.original_text = text
        self.processed_text = self._coerce_text(text)
        self.stop_words = self._custom_stopwords()
        self.lemmatizer = WordNetLemmatizer()
        self.negation_words = {'not', 'no', 'never', 'none'}
        self.nlp = self._shared_nlp()
        # Filled by extract_entities(); defined up front so reading it is always safe.
        self.entities = []
        # Custom slang dictionary since API calls are unreliable
        self.slang_dict = {
            'ur': 'your',
            'thx': 'thanks',
            'gr8': 'great',
            'u': 'you',
            'r': 'are',
            'plz': 'please',
            'btw': 'by the way'
        }

    @staticmethod
    def _coerce_text(text):
        """Return ``text`` as a string, mapping None/NaN to the empty string."""
        # NaN is the only float that is not equal to itself.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        return text if isinstance(text, str) else str(text)

    @classmethod
    def _shared_nlp(cls):
        """Return the spaCy pipeline, loading it on first use only."""
        if cls._nlp_cache is None:
            cls._nlp_cache = spacy.load("en_core_web_sm")
        return cls._nlp_cache

    def _custom_stopwords(self):
        """Return the English stopword set minus the words we want to keep.

        The kept words ('no', 'not', 'just', 'only', 'same') are removed from the
        stopword list so that remove_stopwords() leaves them in the text. Each call
        returns a fresh ``set`` so an instance may modify its own copy.
        """
        cls = type(self)
        if cls._stopword_cache is None:
            keep_words = {'no', 'not', 'just', 'only', 'same'}
            cls._stopword_cache = frozenset(stopwords.words('english')) - keep_words
        return set(cls._stopword_cache)

    def clean_text(self):
        """Basic text cleaning.

        Drops non-ASCII characters, replaces every character that is not a word
        character, whitespace or an apostrophe with a space, and collapses runs of
        whitespace.
        """
        self.processed_text = self.processed_text.encode('ascii', 'ignore').decode('ascii')
        # Keep apostrophes for contractions
        self.processed_text = re.sub(r'[^\w\s\']', ' ', self.processed_text)
        self.processed_text = re.sub(r'\s+', ' ', self.processed_text).strip()
        return self

    def expand_contractions(self):
        """Expand all contractions using the ``contractions`` package."""
        self.processed_text = contractions.fix(self.processed_text)
        return self

    def normalize_case(self):
        """Convert to lowercase except tokens tagged as proper nouns (NNP)."""
        words = word_tokenize(self.processed_text)
        tagged = pos_tag(words)
        normalized = []
        for w, tag in tagged:
            if tag == 'NNP':
                normalized.append(w)
            else:
                normalized.append(w.lower())
        self.processed_text = ' '.join(normalized)
        return self

    def normalize_slang(self):
        """Replace slang with formal equivalents using the custom dictionary.

        The lookup is an exact, case-sensitive match on each token.
        """
        words = word_tokenize(self.processed_text)
        normalized = [self.slang_dict.get(word, word) for word in words]
        self.processed_text = ' '.join(normalized)
        return self

    def spell_check(self):
        """Correct a token only when TextBlob's top suggestion is confident (> 0.8).

        Title-cased tokens (treated as proper nouns) and values of the slang
        dictionary are left untouched.
        """
        words = word_tokenize(self.processed_text)
        corrected = []
        for word in words:
            # Don't correct proper nouns or already corrected words
            if word.istitle() or word in self.slang_dict.values():
                corrected.append(word)
                continue

            # Query the spell checker once per word and reuse the result.
            suggestions = Word(word).spellcheck()
            # Only correct if confidence is high
            if suggestions and suggestions[0][1] > 0.8:
                corrected.append(suggestions[0][0])
            else:
                corrected.append(word)
        self.processed_text = ' '.join(corrected)
        return self

    def handle_negation(self):
        """Fuse a negation word with the alphabetic word that follows it.

        "not good" becomes the single token "not_good". A negation word followed by
        a non-alphabetic token, or at the very end of the text, is kept unchanged.
        """
        words = word_tokenize(self.processed_text)
        processed = []
        i = 0
        while i < len(words):
            if words[i] in self.negation_words and i+1 < len(words):
                # Only combine with following word if it's not punctuation
                if words[i+1].isalpha():
                    processed.append(f"{words[i]}_{words[i+1]}")
                    i += 2  # the following word has been consumed
                else:
                    processed.append(words[i])
                    i += 1
            else:
                processed.append(words[i])
                i += 1
        self.processed_text = ' '.join(processed)
        return self

    def lemmatize(self):
        """Lemmatize each token using its POS tag; negated tokens are left as-is."""
        words = word_tokenize(self.processed_text)
        tagged = pos_tag(words)
        lemmatized = []
        for word, tag in tagged:
            # Skip negated terms (contains underscore)
            if '_' in word:
                lemmatized.append(word)
                continue

            pos = self._get_wordnet_pos(tag)
            if pos:
                lemma = self.lemmatizer.lemmatize(word, pos=pos)
                lemmatized.append(lemma)
            else:
                # No WordNet category for this tag: keep the token unchanged.
                lemmatized.append(word)
        self.processed_text = ' '.join(lemmatized)
        return self

    def extract_entities(self):
        """Store (text, label) pairs of spaCy entities in ``self.entities``.

        The processed text itself is not modified.
        """
        doc = self.nlp(self.processed_text)
        self.entities = [(ent.text, ent.label_) for ent in doc.ents]
        return self

    def remove_stopwords(self):
        """Remove stopwords while preserving negated (underscore) tokens."""
        words = word_tokenize(self.processed_text)
        filtered = []
        for word in words:
            # Keep words with underscores (negations)
            if '_' in word:
                filtered.append(word)
            elif word.lower() not in self.stop_words:
                filtered.append(word)
        self.processed_text = ' '.join(filtered)
        return self

    def _get_wordnet_pos(self, treebank_tag):
        """Map a Treebank POS tag to a WordNet POS letter, or None if unmapped."""
        if treebank_tag.startswith('J'):
            return 'a'  # Adjective
        elif treebank_tag.startswith('V'):
            return 'v'  # Verb
        elif treebank_tag.startswith('N'):
            return 'n'  # Noun
        elif treebank_tag.startswith('R'):
            return 'r'  # Adverb
        return None

    def execute(self):
        """Run the full pipeline and return the processed string.

        Negation handling runs before lemmatization and stopword removal so that
        the fused "not_x" tokens pass through both steps unchanged.
        """
        return (self.clean_text()
                .expand_contractions()
                .normalize_case()
                .normalize_slang()
                .spell_check()
                .handle_negation()
                .lemmatize()
                .extract_entities()
                .remove_stopwords()
                .clean_text()
                .processed_text)

In [8]:
# Example sentence containing contractions, slang and a clipped word
sample_sentence = "I can't believe ur not using thx awesome NLP tools, they're gr8 for analyzin' data!"

# Build the analyzer and run every step of the pipeline
analyzer = TextAnalysis(sample_sentence)
result = analyzer.execute()

# Show the input next to the processed output
print("Original:", analyzer.original_text)
print("Processed:", result)

Original: I can't believe ur not using thx awesome NLP tools, they're gr8 for analyzin' data!
Processed: not_believe not_using thanks awesome NLP tool great analyze ' data


Next, we apply the whole class to every row of the dataframe:



```
# The following code takes about 14 hours to run, so I ran it once beforehand and saved the result.

!pip install swifter
import swifter
df['content'] = df['content'].swifter.apply(lambda text: TextAnalysis(text).execute())

```

But I did it previously and saved the result. So it's no longer necessary to run the code again.



In [None]:
# Swap the raw messages for the saved pipeline output (rows are matched on the index)
df = df.assign(content=processed_text['content_processed'])

In [9]:
from collections import Counter
import pandas as pd

def get_term_freq(series):
    """Count whitespace-separated terms over all non-missing entries of ``series``.

    Entries are converted to ``str`` before splitting; missing values are skipped.
    Returns a ``collections.Counter`` mapping each term to its number of occurrences.
    """
    documents = series.dropna().astype(str)
    return Counter(term for document in documents for term in document.split())

# Term counts in the dataframe's content column versus the saved processed text
raw_freq = get_term_freq(df['content'])
processed_freq = get_term_freq(processed_text['content_processed'])

# One single-column frame per source, indexed by term
raw_df = pd.DataFrame.from_dict(raw_freq, orient='index', columns=['raw_count'])
processed_df = pd.DataFrame.from_dict(processed_freq, orient='index', columns=['processed_count'])

# Outer join keeps terms that occur on only one side (their missing count becomes 0);
# the difference is raw minus processed.
comparison = (
    raw_df
    .join(processed_df, how='outer')
    .fillna(0)
    .assign(difference=lambda counts: counts['raw_count'] - counts['processed_count'])
)
comparison

Unnamed: 0,raw_count,processed_count,difference
!,3.0,0.0,3.0
!0%,1.0,0.0,1.0
"""",2.0,0.0,2.0
"""11""",1.0,0.0,1.0
"""2"".",1.0,0.0,1.0
...,...,...,...
|3s|The,1.0,0.0,1.0
|4s|If,1.0,0.0,1.0
~,1.0,0.0,1.0
ÛÏLow,1.0,0.0,1.0


In [10]:
# Vocabulary (set of distinct whitespace-separated terms) on each side
raw_terms = {
    term
    for message in df['content'].dropna().astype(str)
    for term in message.split()
}
processed_terms = {
    term
    for message in processed_text['content_processed'].dropna().astype(str)
    for term in message.split()
}

# Terms that exist only in the raw text, i.e. removed during processing
removed_terms = raw_terms.difference(processed_terms)

# Terms that exist only in the processed text (e.g. markers or rewritten words added
# by the processing)
added_terms = processed_terms.difference(raw_terms)

print(f"Number of terms removed: {len(removed_terms)}")
print(f"Number of terms added: {len(added_terms)}")

Number of terms removed: 9191
Number of terms added: 1692


In [11]:
# Words per message before and after processing, and how many words were lost
raw_word_counts = df['content'].str.split().str.len()
processed_word_counts = processed_text['content_processed'].str.split().str.len()

df = df.assign(
    raw_length=raw_word_counts,
    processed_length=processed_word_counts,
    length_diff=lambda frame: frame['raw_length'] - frame['processed_length'],
)

print(f"Average length reduction: {df['length_diff'].mean()}")

Average length reduction: 6.0694473409801875


In [12]:
def jaccard_similarity(text1, text2):
    """Jaccard similarity between the word sets of two texts.

    Each text is split on whitespace and turned into a set of words; the result is
    ``len(intersection) / len(union)``.

    Args:
        text1, text2: The texts to compare. Anything that is not a string (for
            example the NaN/None of a missing dataframe cell) is treated as an
            empty text instead of raising ``AttributeError``.

    Returns:
        A value between 0 and 1; 0 when both word sets are empty.
    """
    def _word_set(text):
        # Missing cells arrive as float NaN or None, which have no .split().
        return set(text.split()) if isinstance(text, str) else set()

    set1 = _word_set(text1)
    set2 = _word_set(text2)
    union = len(set1 | set2)
    return len(set1 & set2) / union if union != 0 else 0

# Compare each message in df with the processed text in the same position
df['jaccard_sim'] = list(
    map(jaccard_similarity, df['content'], processed_text['content_processed'])
)

print(f"Average Jaccard similarity: {df['jaccard_sim'].mean()}")

Average Jaccard similarity: 0.2451673191729663


#### Sentiment Analysis

**Sentiment Analysis** is a technique that identifies and extracts subjective information from text, such as opinions, emotions, and attitudes. It classifies sentiment as positive, negative, or neutral (and sometimes more granular emotions like happy, angry, or sad).

<br>

For a document:
$$
\text{Sentiment} = \frac{\sum_{i=1}^n \text{polarity}_i \cdot \text{weight}_i}{\sum_{i=1}^n \text{weight}_i}
$$

Where:
- $\text{polarity}\ \in [-1,1]$: the predefined sentiment score available in the library. It measures sentiment polarity on a continuous scale from **-1 (negative)** to **+1 (positive)**. For example, "terrible" is assigned a sentiment score of -0.9.
- $\text{weight}$: Term importance (frequency)



For the content column, we converted the text to strings and applied sentiment analysis using TextBlob, which gives each message a polarity score between -1 (negative) and 1 (positive). We then reduced the score to a binary label: messages with a score greater than 0 are labelled 1 (positive), and messages with a score of 0 or below (neutral or negative) are labelled -1. Using only -1 and 1 ensures that no other values are taken into account.

In [None]:
# Polarity of every message as scored by TextBlob
message_text = df['content'].astype(str)
df['sentiment_score'] = message_text.map(lambda message: TextBlob(message).sentiment.polarity)

# Binary label: 1 when the score is strictly positive, otherwise -1 (neutral/negative)
df['binary_sentiment'] = df['sentiment_score'].gt(0).map({True: 1, False: -1})

# Show the first few rows
sentiment_columns = ['content', 'sentiment_score', 'binary_sentiment']
print(df[sentiment_columns].head())

                                             content  sentiment_score  \
0                       hello team welcome nephrotex         0.800000   
1           maria williams design adviser internship         0.000000   
2                                      help question         0.000000   
3  please introduce name prefer call workpro reco...         0.083333   
4  just want make sure everyone find chat interfa...         0.321429   

   binary_sentiment  
0                 1  
1                -1  
2                -1  
3                 1  
4                 1  


Then, we created a violin plot to visualize the distribution of binary sentiment scores across different groups. The plot displays the sentiment distribution for each group in the group_id column, using the binary_sentiment column as the variable of interest. This helps identify the sentiment spread for each group, where the inner points represent individual data points.

Next, we calculated the average sentiment score for each group by grouping the data by group_id and computing the mean of sentiment_score. This calculation helps to understand the overall sentiment of each group. The result was stored in avg_sentiment_score.

Following that, we created a bar plot to show the average binary sentiment score for each group. The binary_sentiment column was grouped by group_id, and the mean was calculated to display the average sentiment score for each group. This provides insight into whether each group is mostly positive or negative based on the binary sentiment values.

Finally, we generated a histogram to visualize the count of binary sentiment scores across the entire dataset. This plot shows the frequency of each binary sentiment value (-1 and 1), helping to understand how many positive and negative sentiments were recorded across all chat messages.

#### TF-IDF

**Term Frequency (TF)**


Term Frequency measures how often a term $t$ appears in a document $d$:
<br>  
$$
\text{TF}(t, d) = \frac{\text{Count of } t \text{ in } d}{\text{Total terms in } d}
$$
<br>

**Inverse Document Frequency (IDF)**

Downweights terms common across all documents $D$:
<br>  
$$
\text{IDF}(t, D) = \log \left( \frac{|D|}{\text{Documents containing } t} \right)
$$
<br>

**TF-IDF**
Combines TF and IDF:  
<br>
$$
\text{TF-IDF}(t, d, D) = \text{TF}(t, d) \times \text{IDF}(t, D)
$$


In [None]:
# Step 4: Apply TF-IDF
# TfidfVectorizer rejects NaN documents, so a missing message is treated as an empty
# document (its TF-IDF row is all zeros).
documents = df['content'].fillna('').astype(str)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(documents)

# Step 5: Check feature names and TF-IDF vectors
features = vectorizer.get_feature_names_out()
print("Features:", len(features))

#print("TF-IDF Shape:", X.shape)
# Every row of X has one entry per vocabulary term, so this equals len(features).
# (The previous label said "Sum of each row" although the row length was printed.)
print("Length of each row:", X.shape[1])

# Convert the sparse matrix to a dense frame once, one column per term, and align it
# with df's index so the concat below pairs the rows correctly.
tfidf_df = pd.DataFrame(X.toarray(), columns=features, index=df.index)

# Merge it with the original DataFrame: TF-IDF columns are appended on the right.
# NOTE(review): a vocabulary term equal to an existing column name would yield a
# duplicated column label here — confirm none collide.
df_combined = pd.concat([df, tfidf_df], axis=1)
df = df_combined
df.head(3)


Features: 4438
Sum of each row: 4438


Unnamed: 0,groupIDs,roomName,RoleName,userIDs,content,m_experimental_testing,m_making_design_choices,m_asking_questions,j_customer_consultants_requests,j_performance_parameters_requirements,...,yippee,yo,yooo,yoooooo,yup,zach,zachary,zane,zelin,zero
0,2a,Introduction and Workflow Tutorial with Entran...,Mentor,1,hello team welcome nephrotex,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2a,Introduction and Workflow Tutorial with Entran...,Mentor,1,maria williams design adviser internship,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2a,Introduction and Workflow Tutorial with Entran...,Mentor,1,help question,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### **Exporting**

In [None]:
# Write the processed dataframe to Drive without the index column.
# Original author's note: don't run this.
processed_csv_path = '/content/drive/MyDrive/ADS2001 project/data/virtualInternshipData_processed.csv'
df.to_csv(processed_csv_path, index=False)