In [39]:
import pandas as pd
import numpy as np
import statistics as stat
import seaborn as sns
import matplotlib.pyplot as plt
import random
import re

In [79]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

# Preprocessing function for transcripts
def preprocess_text(text):
    if text is None:  # Check if the text is None
        return ""  # Return an empty string if text is None
    text = text.lower()  # Convert text to lowercase
    # # Removing Punctuation
    # text = re.sub(r'[^\w\s]', '', text)
    # Tokenization
    words = text.split()
    # Removing Stop Words
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]
    # Stemming
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    return ' '.join(words)  # Return the processed text as a single string

In [90]:
import pandas as pd

# Step 1: Load the Transcripts
scores_file = "data/scores.csv"  # target
transcripts_file = 'data/transcripts.csv'  # Replace with your transcripts file path
scores_df = pd.read_csv(scores_file)
transcripts_df = pd.read_csv(transcripts_file, header=None)

# Step 2: Initialize a list to hold structured data
structured_data = []

# Step 3: Process each row in the DataFrame
for index, row in transcripts_df.iterrows():
    # participant_id = row[0]  # Extract participant ID from the first column using iloc
    # transcript = row[1]  # Get the transcript from the second column using iloc
    participant_id = row[0]
    transcript = row[1]
    parts = transcript.split('|')  # Split the transcript based on '|'
    
    # Initialize variables to hold questions and answers
    current_question = ""
    current_answer = ""
    question_number = 0  # Initialize question number for the participant
    
    # Step 4: Iterate through the parts and extract questions and answers
    for part in parts:
        part = part.strip()  # Remove leading/trailing whitespace
        if part.startswith("Interviewer:"):
            # If there's an ongoing answer, save it before starting a new question
            if current_answer:
                structured_data.append({
                    'Participant': participant_id,  # Create unique question ID
                    'participant&question': f"{participant_id}q{question_number}",  # Create unique question ID
                    'question': current_question,
                    'answer': current_answer.strip()
                })
                current_answer = ""  # Reset current answer
            # Set the current question and increment question number
            current_question = part.replace("Interviewer:", "").strip()
            question_number += 1  # Increment question number
        elif part.startswith("Interviewee:"):
            # Append the answer
            current_answer += part.replace("Interviewee:", "").strip() + " "
    
    # After the loop, check if there's an ongoing answer to save
    if current_question and current_answer:
        structured_data.append({
            'Participant': participant_id,  # Create unique question ID
            'participant&question': f"{participant_id}q{question_number}",  # Create unique question ID
            'question': current_question,
            'answer': current_answer.strip()
        })

# Step 5: Create a DataFrame from the structured data
structured_df = pd.DataFrame(structured_data)

# Step 6: Display the structured DataFrame
print("Structured DataFrame:")
print(structured_df.head())
# Apply preprocessing to the transcript data
structured_df['processed_answers'] = structured_df['answer'].apply(preprocess_text)
# Create the merged df for directly for later use
merged_df = structured_df.merge(scores_df[['Participant', 'Overall', 'Excited']], on='Participant', how='left')
# Save to a new CSV file
merged_df.to_csv('data/clean_transcripts.csv', index=False)  
print(merged_df.head())
# Step 7: Save the Structured Data to a CSV file
# structured_df.to_csv('data/clean_transcripts.csv', index=False)  # Save to a new CSV file

Structured DataFrame:
  Participant participant&question  \
0          p1                 p1q1   
1          p1                 p1q2   
2          p1                 p1q3   
3          p1                 p1q4   
4          p1                 p1q5   

                                            question  \
0                              So how are you doing?   
1         Ok well  so please tell me about yourself.   
2                                              mhhmm   
3  So please tell me about a time that you demons...   
4  Tell me about a time when your working on a te...   

                                              answer  
0                                    Im pretty good.  
1  ok  uhm  so have you looked at my resume or sh...  
2  So ah  my interest kinda laid both in a little...  
3  Ok  uhm  one of the things we have to do for C...  
4  Ahh  I guess the easiest team project I just I...  
  Participant participant&question  \
0          p1                 p1q1   
1     

In [82]:
scores_file = "data/scores.csv" # target
transcript_file = "data/transcripts.csv" # inpput data
scores_data = pd.read_csv(scores_file)
transcript_data = pd.read_csv(transcript_file)
# Make the entire transcript lower case
transcript_data['transcript'] = transcript_data['transcript'].str.lower()

# drop the unnecesary columns 
columns_to_drop = ['Answer', 'Question']
transcript_data = transcript_data.drop(columns=[col for col in columns_to_drop if col in transcript_data.columns])

#creating structured dataset for improving regression based scoring; since the scoring is based on question wise
structured_data = []
# Iterate through each row in the transcripts DataFrame
for index, row in transcript_data.iterrows():
    transcript = row[1]  # Get the transcript from the first column
    participant_id = row[0]  # Extract participant ID from the transcript
    # print(f"Raw Transcript for {participant_id}: {transcript}")  # Debugging: Print the raw transcript
    parts = transcript.split('|')  # Split the transcript based on '|'
    # Print the split parts for debugging
    # print(f"Split Parts for {participant_id}: {parts}")  # Debugging: Print the split parts
    # Initialize variables to hold questions and answers
    current_question = ""
    current_answer = ""
    question_number = 0  # Initialize question number for the participant
    # Step 4: Iterate through the parts and extract questions and answers
    for part in parts:
        part = part.strip()  # Remove leading/trailing whitespace
        if part.startswith("Interviewer:"):
            # If there's an ongoing answer, save it before starting a new question
            if current_answer:
                structured_data.append({
                    'participant_id': f"{participant_id}q{question_number}",  # Create unique question ID
                    'question': current_question,
                    'answer': current_answer.strip()
                })
                current_answer = ""  # Reset current answer
            
            # Set the current question and increment question number
            current_question = part.replace("Interviewer:", "").strip()
            question_number += 1  # Increment question number
        elif part.startswith("Interviewee:"):
            # Append the answer
            current_answer += part.replace("Interviewee:", "").strip() + " "
    
    # After the loop, check if there's an ongoing answer to save
    if current_question and current_answer:
        structured_data.append({
            'participant_id': f"{participant_id}q{question_number}",  # Create unique question ID
            'question': current_question,
            'answer': current_answer.strip()
        })
# Convert structured data to a DataFrame for better visualization (optional)
structured_df = pd.DataFrame(structured_data)

# Apply preprocessing to the transcript data
structured_df['processed_answers'] = structured_df['answer'].apply(preprocess_text)
#create the merged df for directly for later use
merged_df = structured_df.merge(scores_data[['Participant', 'Overall', 'Excited']], on='Participant', how='left')
# print(merged_df.head())
merged_df.to_csv('data/clean_transcripts.csv', index=False)  # Save to a new CSV file
print(merged_df.head())
# transcript_data["Overall"] = scores_data["Overall"]

Structured DataFrame:
Empty DataFrame
Columns: []
Index: []


KeyError: 'answer'

Cleaning the transcripts dataset

In [38]:

# we use cleaning the stop words and stemmer for improving the textual data





  Participant                                         transcript   Overall  \
0          p1  interviewer: so how are you doing?|interviewee...  5.297316   
1         p10  interviewer: so  how you doing?|interviewee: g...  4.725115   
2         p11  interviewer: so  tell me about yourself. |inte...  5.010430   
3         p12  interviewer: so how are you doing today?|inter...  5.038526   
4         p13  interviewer: how are you doing today?|intervie...  4.251251   

    Excited                               processed_transcript  
0  5.043890  interviewer: doing?|interviewee: im pretti goo...  
1  4.383947  interviewer: doing?|interviewee: great you?|in...  
2  4.297760  interviewer: tell yourself. |interviewee: uhh ...  
3  5.322526  interviewer: today?|interviewee: i'm good you?...  
4  3.579510  interviewer: today?|interviewee: good.|intervi...  


In [35]:
# Splitting the participants into 5 folds
num_folds = 5

# Grab participant numbers from the scores csv file
interviews = scores_data['Participant'].unique()
participants = list(set([re.sub(r'^pp?|q\d+', '', item) for item in interviews]))
random.shuffle(participants)
participant_folds = [participants[i::num_folds] for i in range(num_folds)]

for i, fold in enumerate(participant_folds):
  print(f"Fold {i + 1}: {fold}")
  print("\tLength: ", len(fold))

# Grab all the correct interview names associated with each participant
interview_folds = []
for fold in participant_folds:
  interview_folds.append([item for num in fold for item in (f"p{num}", f"pp{num}")])

Fold 1: ['11', '89', '65', '42', '57', '29', '17', '7', '1', '10', '67', '47', '49', '81']
	Length:  14
Fold 2: ['24', '16', '52', '69', '12', '21', '73', '53', '37', '44', '78', '64', '3', '31']
	Length:  14
Fold 3: ['6', '86', '50', '45', '13', '20', '74', '58', '60', '85', '70', '56', '30', '8']
	Length:  14
Fold 4: ['72', '15', '33', '32', '27', '25', '43', '79', '62', '61', '71', '84', '34', '4']
	Length:  14
Fold 5: ['5', '48', '22', '59', '55', '66', '76', '83', '14', '80', '35', '77', '63']
	Length:  13


In [36]:
def get_data_splits(data, fold_number):
    """
    Split data into training, validation, and testing sets based on a specified fold.
    
    Parameters:
        data (DataFrame): The complete dataset.
        fold_number (int): The fold to use for testing (0-based index).
        
    Returns:
        tuple: (training_set, validation_set, testing_set)
    """
    # Quick check on fold number
    assert 0 <= fold_number < len(interview_folds), "Fold_number must be between 0 and len(folds) - 1"

    # Split the data
    test_set = data[data['Participant'].isin( interview_folds[fold_number] )]
    val_set = data[data['Participant'].isin( interview_folds[(fold_number + 1) % len(interview_folds)] )]
    train_set_parts = [item for i, fold in enumerate(interview_folds) if i not in [fold_number, (fold_number + 1) % len(interview_folds)] for item in fold]
    train_set = data[data['Participant'].isin(train_set_parts)]
    return train_set, val_set, test_set

Creating the text embedding

In [None]:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import torch
from transformers import BertTokenizer, BertModel

nltk.download('punkt')  # Tokenizer
nltk.download('punkt_tab')
# nltk.download('averaged_perceptron_tagger_eng')  # POS Tagger
nltk.download('averaged_perceptron_tagger')
nltk.download('vader_lexicon') # Vader