In [None]:
# Step 1: Install VADER and import libraries
!pip install vaderSentiment

import io
import re

import pandas as pd
from google.colab import files
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Step 2: Upload CSV from R
uploaded = files.upload()  # Select your .csv
if not uploaded:
    # An empty result means the upload dialog was cancelled; fail loudly
    # rather than falling through to whatever file is already on disk.
    raise ValueError("No file was uploaded; select the interview corpus .csv and re-run this cell.")

# Step 3: Load the CSV
# Parse the bytes of the file that was just uploaded instead of opening a
# hard-coded filename. This works whatever the file is called, and cannot
# pick up a stale copy left on disk by an earlier upload (Colab may save a
# re-uploaded file under a de-duplicated name).
uploaded_name = next(iter(uploaded))
df = pd.read_csv(io.BytesIO(uploaded[uploaded_name]))

# Step 4: Remove standalone "yes", "no", "yeah", etc. from text
def clean_affirmations(text, affirmations=("yes", "no", "yeah")):
    """Remove sentences that consist solely of a bare affirmation/negation.

    The text is split into sentences after '.', '!' or '?' followed by
    whitespace, so decimals such as "3.5" stay intact and each kept sentence
    retains its original terminator. A sentence is dropped when, ignoring
    case and surrounding punctuation, it is exactly one of `affirmations`.
    Sentences that merely start with such a word ("No, I had no plan.") are
    kept.

    Parameters
    ----------
    text : str or missing value
        Raw response text. Missing values (None / NaN) yield "".
        Non-string values are converted with ``str``.
    affirmations : iterable of str, optional
        Words treated as standalone affirmations (case-insensitive).
        Defaults to "yes", "no" and "yeah".

    Returns
    -------
    str
        The remaining sentences joined by single spaces ("" if none remain).
    """
    if pd.isna(text):
        return ""
    drop = {word.lower() for word in affirmations}
    # Look-behind keeps the terminator attached to its sentence; requiring
    # whitespace after it avoids splitting inside numbers like "3.5".
    sentences = re.split(r'(?<=[.!?])\s+', str(text).strip())
    kept = [
        sentence
        for sentence in sentences
        # Skip empty fragments and sentences that are only an affirmation.
        if sentence and sentence.strip(" .!?,;:").lower() not in drop
    ]
    return ' '.join(kept)

# Apply cleaning function
df['text_cleaned'] = df['text'].apply(clean_affirmations)

# Step 5: Initialise VADER
analyzer = SentimentIntensityAnalyzer()


def mean_sentence_compound(text):
    """Return the mean VADER compound score over the sentences of `text`.

    Sentences are split after '.', '!' or '?' followed by whitespace; blank
    fragments are ignored. Returns 0.0 when there is nothing to score.
    """
    sentences = [s for s in re.split(r'(?<=[.!?])\s+', str(text)) if s.strip()]
    if not sentences:
        return 0.0
    total = sum(analyzer.polarity_scores(s)['compound'] for s in sentences)
    return total / len(sentences)


# Step 6: Apply VADER to cleaned text
# Whole-document compound score (kept so existing downstream code still works).
# NOTE: on long transcripts this value saturates near +/-1 because the
# compound score normalises a valence sum that keeps growing with text
# length, so differences between documents become tiny.
df['vader_score'] = df['text_cleaned'].apply(lambda x: analyzer.polarity_scores(str(x))['compound'])

# Sentence-level alternative: score each sentence and average, which keeps
# the result on a comparable scale regardless of transcript length.
df['vader_score_sentence_mean'] = df['text_cleaned'].apply(mean_sentence_compound)

# Step 7: Assign basic sentiment label
def label_sentiment(score):
    """Translate a compound score into a coarse sentiment label.

    Scores at or above 0.05 are 'positive', scores at or below -0.05 are
    'negative', and everything else is 'neutral'.
    """
    return (
        'positive' if score >= 0.05
        else 'negative' if score <= -0.05
        else 'neutral'
    )

# Derive the categorical label from each row's compound score.
df['vader_label'] = df['vader_score'].map(label_sentiment)

# Step 8: Preview results
# First ten rows, restricted to the identifying and sentiment columns.
df.loc[:, ['participant_id', 'timepoint', 'vader_score', 'vader_label']].head(10)

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/126.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m122.9/126.0 kB[0m [31m4.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


Saving interview_corpus.csv to interview_corpus.csv


Unnamed: 0,participant_id,timepoint,vader_score,vader_label
0,A003,midline,0.9998,positive
1,A004,midline,0.999,positive
2,A005,midline,0.999,positive
3,A013,midline,0.9849,positive
4,A015,midline,0.9999,positive
5,A017,midline,0.9994,positive
6,B006,midline,0.9999,positive
7,B007,midline,0.9989,positive
8,B018,midline,0.999,positive
9,A003,endline,0.9999,positive


In [None]:
# Show all rows
# Lift the row limit only for this print; option_context restores the
# previous display setting afterwards, so later cells are not affected.
with pd.option_context('display.max_rows', None):
    print(df)

   participant_id timepoint  \
0            A003   midline   
1            A004   midline   
2            A005   midline   
3            A013   midline   
4            A015   midline   
5            A017   midline   
6            B006   midline   
7            B007   midline   
8            B018   midline   
9            A003   endline   
10           A004   endline   
11           A005   endline   
12           A013   endline   
13           A015   endline   
14           A017   endline   
15           B006   endline   
16           B007   endline   
17           B018   endline   

                                                 text  \
0   Yes. Yes. Before PRB began I never had a five-...   
1   Yes. Before PRB began, I seen myself in five y...   
2   Yes. Before PRB, I ain't really see myself now...   
3   Yeah. No, I ain't have a fucking plan. I was j...   
4   Yes. To be honest, I was trying to get back in...   
5   A017. Yes. Five years, I probably see myself i...   
6   Yes Bef

In [None]:
# Example: calculate sentiment change
# Reshape to one row per participant with one score column per timepoint.
df_wide = df.pivot(values='vader_score', columns='timepoint', index='participant_id')
# change > 0 means the endline score is higher than the midline score.
df_wide['change'] = df_wide['endline'].sub(df_wide['midline'])
print(df_wide)

timepoint       endline  midline  change
participant_id                          
A003             0.9999   0.9998  0.0001
A004             0.9998   0.9990  0.0008
A005             0.9997   0.9990  0.0007
A013             0.9966   0.9849  0.0117
A015             1.0000   0.9999  0.0001
A017             0.9999   0.9994  0.0005
B006             0.8777   0.9999 -0.1222
B007             0.9996   0.9989  0.0007
B018             0.9993   0.9990  0.0003
