In [None]:
import pandas as pd
import numpy as np
import spacy
import seaborn as sns
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz,process
import re
from sentence_transformers import SentenceTransformer
#berttopic, 
from transformers import pipeline
from gensim.corpora import Dictionary
from nltk.corpus import stopwords
import string
import glob

In [None]:
# Discover the raw survey exports: every CSV file sitting in the raw-data folder.
csv_files = glob.glob('./Raw Files/*.%s' % 'csv')
csv_files

In [None]:
# Load the raw survey export; index_col=0 makes the first CSV column the row index.
df=pd.read_csv('./Raw Files/raw_survey_data.csv',index_col=0)

# Initial Preprocessing code


In [None]:
# Split responses by the length of the translated text so that only the longer,
# more informative ones continue through the pipeline.
# Rows with a missing 'raw_translation' have a NaN length, fail all three
# comparisons and therefore end up in none of the frames below.
response_length=df['raw_translation'].str.len()  # computed once, reused for every bucket
short_responses_df=df[response_length<7]
medium_responses_df=df[(response_length>=7) & (response_length<=35)]
# .copy() gives the filtered frame its own data, so the column assignments made in
# later cells do not trigger SettingWithCopyWarning or write into a slice of the raw frame.
df=df[response_length>35].copy()

In [None]:
# Drop ordinal suffixes (1st, 2nd, 3rd, 4th, ...) so the survey date parses with one fixed format.
survey_date_text = df['Day of Survey Date'].str.replace(r'\b(\d{1,2})(st|nd|rd|th)\b', r'\1', regex=True)
df['date'] = pd.to_datetime(survey_date_text, format='%B %d, %Y')
# Monthly period of each survey, used later for grouping over time.
df['month_year'] = df['date'].dt.to_period('M')

In [None]:
# Number of retained responses per survey month.
df.month_year.value_counts()

# Preprocessing Functions 

In [None]:
#from transformers import BertTokenizer
# Shared spaCy pipeline; in_depth_preprocess_text uses it for stop-word filtering and lemmatization.
nlp=spacy.load('en_core_web_sm')

# Abbreviations and contractions mapped to their expanded form (add entries as the
# survey context requires). Insertion order matters: the preprocessing functions apply
# the entries one after another, so the complete forms must come before the generic
# suffix fallbacks at the end.
contractions_dict={
    # titles / domain abbreviations
    'dr':'doctor',
    'dr.':'doctor',
    'doc':'doctor',
    'doc.':'doctor',
    'prov':'provider',
    'prov.':'provider',
    'mr':'mister',
    'mr.':'mister',
    'mrs.':'misses',
    'mrs':'misses',
    'rep':'representative',
    'rep.':'representative',
    # pronoun contractions
    "i'm": "i am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "we're": "we are",
    "they're": "they are",
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "i'd": "i would",
    "you'd": "you would",
    "he'd": "he would",
    "she'd": "she would",
    "we'd": "we would",
    "they'd": "they would",
    "i'll": "i will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "we'll": "we will",
    "they'll": "they will",
    # negations ("won't" used to be listed twice; the duplicate key is removed)
    "can't": "cannot",
    "won't": "will not",
    "ain't": "is not",
    "aren't": "are not",
    "isn't": "is not",
    "wasn't": "was not",
    "weren't": "were not",
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "don't": "do not",
    "doesn't": "does not",
    "didn't": "did not",
    "wouldn't": "would not",
    "couldn't": "could not",  # was missing next to wouldn't / shouldn't
    "shan't": "shall not",
    "shouldn't": "should not",
    "mightn't": "might not",
    "mustn't": "must not",
    # question words and other fixed forms
    "let's": "let us",
    "who's": "who is",
    "what's": "what is",
    "here's": "here is",
    "there's": "there is",
    "where's": "where is",
    "when's": "when is",
    "why's": "why is",
    "how's": "how is",
    "y'all": "you all",
    # generic suffix fallbacks; the leading space in the expansion separates it from the word stem
    "n't": " not",
    "'re": " are",
    "'s": " is",
    "'d": " would",
    "'ll": " will",
    "'ve": " have",
    "'m": " am" }

def _coerce_to_text(text):
    """Return ``text`` as a string; None and float NaN (missing cells) become ''."""
    if text is None or (isinstance(text,float) and text!=text):
        return ''
    return text if isinstance(text,str) else str(text)


def _expand_contractions(text,contractions_dict):
    """Replace every key of ``contractions_dict`` found in ``text`` by its expansion.

    Entries are applied one after another in dictionary order, case-insensitively.
    Keys are matched literally: they are regex-escaped, so the '.' in 'mr.' no longer
    acts as a wildcard that turned "mri" into "mister" or "dry" into "doctor".
    An entry whose expansion starts with whitespace (" not", " is", ...) is treated as a
    suffix that must follow a word character ("couldn't" -> "could not"); every other
    key must start at the beginning of a word. No key may be followed by a word character.
    """
    # Typographic apostrophes would otherwise hide contractions from the straight-quote keys.
    text=text.replace('\u2019',"'")

    for contraction, expanded in contractions_dict.items():
        left_guard=r'(?<=\w)' if expanded[:1].isspace() else r'(?<!\w)'
        pattern=left_guard + re.escape(contraction) + r'(?!\w)'
        # A callable replacement inserts the expansion verbatim (no backslash templates).
        text=re.sub(pattern,lambda _match,replacement=expanded: replacement,text,flags=re.IGNORECASE)
    return text


#use for trained models like Bert/Transformers
def simplified_preprocess_text(text,contractions_dict,lowercase=True):
    """Light clean-up that keeps punctuation and word forms intact.

    Args:
        text: Raw response. None/NaN yields ''; other non-strings are converted with str().
        contractions_dict: Mapping of contraction/abbreviation -> expansion, applied in order.
        lowercase: Lowercase the text first; pass False when feeding a cased model.
            Expansions are inserted exactly as written in ``contractions_dict``.

    Returns:
        The text with contractions expanded, runs of three or more identical non-digit
        characters collapsed to one, and whitespace normalized.
    """
    text=_coerce_to_text(text)

    if lowercase:
        text=text.lower()

    text=_expand_contractions(text,contractions_dict)

    # Collapse elongations ("sooo" -> "so", "!!!" -> "!") but leave digits alone,
    # otherwise numbers are corrupted ("1000" would become "10").
    text=re.sub(r'(\D)\1{2,}',r'\1',text)
    text=re.sub(r'\s+', ' ',text).strip() #normalize whitespace

    return text


#useful for non-bert models
def in_depth_preprocess_text(text,contractions_dict,lowercase=True):
    """Aggressive clean-up: simplified preprocessing, then punctuation removal and lemmatization.

    Args:
        text: Raw response. None/NaN yields ''.
        contractions_dict: Mapping of contraction/abbreviation -> expansion, applied in order.
        lowercase: Lowercase the text first (default True).

    Returns:
        Space-joined lemmas of the tokens that the module-level spaCy pipeline ``nlp``
        marks as neither stop word nor punctuation.
    """
    text=simplified_preprocess_text(text,contractions_dict,lowercase)

    text=re.sub(r'[^\w\s]', '',text) #remove punctuation
    text=re.sub(r'\s+', ' ',text).strip() #removing punctuation can leave double spaces behind

    doc=nlp(text)
    lemmatized_tokens=[token.lemma_ for token in doc if not token.is_stop and not token.is_punct]

    return ' '.join(lemmatized_tokens)
    

In [None]:
# One text column per preprocessing variant, because cased and uncased models
# work best with differently prepared input.
df['simplified_preprocess_text_lower']=df['raw_translation'].apply(simplified_preprocess_text,args=(contractions_dict,))
df['simplified_preprocess_text_orig']=df['raw_translation'].apply(simplified_preprocess_text,args=(contractions_dict,),lowercase=False)
df['in_depth_preprocess_text']=df['raw_translation'].apply(in_depth_preprocess_text,args=(contractions_dict,))

# EDA - Extra 

In [None]:
# Size of each raw open-ended response: characters and whitespace-separated words.
open_ended_responses=df['Open Ended Response']
df['raw_character_count']=open_ended_responses.map(len)
df['raw_word_count']=open_ended_responses.map(lambda response: len(response.split()))

In [None]:
#readability score
import textstat

# Flesch-Kincaid grade level of every translated response.
# NOTE: the column name's spelling is kept as-is because other code may refer to it.
df['readiability_score']=df['raw_translation'].map(textstat.flesch_kincaid_grade)

# Sentiment Analysis

In [None]:
from transformers import pipeline
from flair.models import TextClassifier
from flair.data import Sentence

In [None]:
# Three Hugging Face sentiment classifiers; their differing label vocabularies are
# harmonized later by map_labels_to_sentiment.
bert_analyzer=pipeline(task='sentiment-analysis',model='nlptown/bert-base-multilingual-uncased-sentiment')
roberta_analyzer=pipeline(task='sentiment-analysis',model='cardiffnlp/twitter-roberta-base-sentiment')
distilbert_analyzer=pipeline(task='sentiment-analysis',model='distilbert-base-uncased-finetuned-sst-2-english')

In [None]:
# Fourth sentiment model (Flair), used next to the three transformers pipelines.
flair_analyzer=TextClassifier.load('en-sentiment')

In [None]:
def apply_sentiment_analysis(row):
    """Run all four sentiment models on one DataFrame row.

    The lowercased text goes to the BERT, DistilBERT and Flair models and the
    original-case text goes to RoBERTa; only the first 512 characters of either
    text are scored.

    Returns:
        pd.Series holding ``<model>_label`` and ``<model>_score`` for bert,
        distilbert, flair and roberta (in that order).
    """
    max_chars=512
    lowered_text=row['simplified_preprocess_text_lower'][:max_chars]
    cased_text=row['simplified_preprocess_text_orig'][:max_chars]

    # Each pipeline returns a list with one prediction dict for a single input.
    bert_top=bert_analyzer(lowered_text)[0]
    distilbert_top=distilbert_analyzer(lowered_text)[0]

    # Flair writes its prediction onto the Sentence object instead of returning it.
    flair_sentence=Sentence(lowered_text)
    flair_analyzer.predict(flair_sentence)
    flair_top=flair_sentence.labels[0]

    roberta_top=roberta_analyzer(cased_text)[0]

    model_outputs={}
    model_outputs['bert_label']=bert_top['label']
    model_outputs['bert_score']=bert_top['score']
    model_outputs['distilbert_label']=distilbert_top['label']
    model_outputs['distilbert_score']=distilbert_top['score']
    model_outputs['flair_label']=flair_top.value
    model_outputs['flair_score']=flair_top.score
    model_outputs['roberta_label']=roberta_top['label']
    model_outputs['roberta_score']=roberta_top['score']
    return pd.Series(model_outputs)


In [None]:
# Score every response with all four models; column order matches the Series
# returned by apply_sentiment_analysis (label then score, per model).
sentiment_output_columns=[f'{model}_{field}'
                          for model in ('bert','distilbert','flair','roberta')
                          for field in ('label','score')]
df[sentiment_output_columns]=df.apply(apply_sentiment_analysis,axis=1)

In [None]:
#outputs of sentiment models need to be mapped to "Positive","Negative" and "Neutral" labels
def map_labels_to_sentiment(label,column_name):
    """Translate one model-specific label into 'POSITIVE', 'NEGATIVE' or 'NEUTRAL'.

    Args:
        label: Raw label emitted by the model (e.g. '4 stars', 'LABEL_0', 'POSITIVE').
        column_name: Column the label came from; selects the vocabulary to use
            ('bert_label', 'distilbert_label', 'roberta_label' or 'flair_label').

    Returns:
        The harmonized sentiment. Unknown labels and unknown columns map to 'NEUTRAL'.
    """
    label_vocabularies={
        'bert_label':{
            '5 stars':'POSITIVE',
            '4 stars':'POSITIVE',
            '3 stars':'NEUTRAL',
            '2 stars':'NEGATIVE',
            # The star-rating model spells its lowest rating in the singular; matching only
            # '1 stars' sent the most negative reviews to the NEUTRAL fallback.
            '1 star':'NEGATIVE',
            '1 stars':'NEGATIVE',  # previous spelling, still accepted
        },
        'distilbert_label':{
            'POSITIVE':'POSITIVE',
            'NEGATIVE':'NEGATIVE',
        },
        'roberta_label':{
            'LABEL_2':'POSITIVE',
            'LABEL_1':'NEUTRAL',  # was misspelled 'LABEL 1'
            'LABEL_0':'NEGATIVE',
        },
        'flair_label':{
            'POSITIVE':'POSITIVE',
            'NEGATIVE':'NEGATIVE',
        },
    }
    return label_vocabularies.get(column_name,{}).get(label,'NEUTRAL')
    

In [None]:
# Four-model vote: a polarity is accepted only when at least 3 of the 4 models agree on it
def aggregate_sentiments(row):
    """Combine the four per-model labels of ``row`` into one final sentiment.

    Returns 'POSITIVE' or 'NEGATIVE' when at least three models vote for it after
    harmonization with map_labels_to_sentiment, otherwise 'NEUTRAL'.
    """
    model_columns=('bert_label','distilbert_label','roberta_label','flair_label')
    votes=[map_labels_to_sentiment(row[column],column) for column in model_columns]

    for polarity in ('POSITIVE','NEGATIVE'):
        if votes.count(polarity)>=3:
            return polarity
    return 'NEUTRAL'

In [None]:
# Row-wise majority vote over the four model labels.
df['final_sentiment']=df.apply(aggregate_sentiments,axis='columns')

In [None]:
# Number of responses per final sentiment label.
df_sent=df['final_sentiment'].value_counts()

In [None]:
# Colour every bar by the sentiment it represents. value_counts() orders its index by
# frequency, so a fixed positional colour list could, for example, paint NEUTRAL green.
sentiment_colors={'POSITIVE':'green','NEUTRAL':'orange','NEGATIVE':'red'}
colors=[sentiment_colors.get(label,'gray') for label in df_sent.index]
df_sent.plot(kind='bar',color=colors)
plt.title('Value Counts for Sentiments Uncovered')
plt.xlabel('Sentiment Label')
plt.ylabel('Counts')
plt.show()

# Identifying Medical Areas

In [None]:
# Medical specialty -> keywords whose presence in a response signals that specialty.
# NOTE(review): extract_medical_areas compares against lowercased, letters-only text;
# confirm that entries such as "x-ray" and "PT" are matched the way they are intended to be.
medical_areas = {
    'cardiology': ['heart', 'cardio', 'cardiologist'],
    'dermatology': ['skin', 'derma', 'dermatologist'],
    'oncology': ['cancer', 'tumor', 'oncologist'],
    'dentistry': ['teeth', 'dentist', 'dental', 'jaw'],
    'neurology': ['nervous', 'brain', 'neuro'],
    'pediatrics': ['pediatrician', 'pediatrist'],
    'urology': ['kidney', 'urine', 'urinary', 'urinate'],
    'gastroenterology': ['stomach', 'gut', 'digestive', 'gastro', 'gastroenterologist'],
    'orthopedics/podiatry': ['bones', 'joints', 'knee', 'toe', 'orthopedist', 'ankle', 'foot', 'podiatrist'],
    'gynecology': ['gynecologist', 'reproductive', 'ovaries', 'vagina', 'uterus'],
    'rheumatology': ['arthritis', 'rheumatologist'],
    'endocrinology': ['hormones', 'glands', 'endocrinologist'],
    'psychiatry/psychology': ['psychiatrist', 'psychologist', 'therapist', 'counseling', 'psych'],
    'pulmonology': ['lungs', 'respiratory', 'pulmonary', 'pulmonologist'],
    'hematology': ['hematologist', 'anemia', 'blood'],
    'nephrology': ['kidneys', 'renal', 'nephrologist'],
    'ophthalmology': ['eyesight', 'vision', 'ophthalmologist'],
    'otolaryngology': ['otolaryngologist'],
    'immunology': ['allergies', 'immunologist'],
    'radiology': ['x-ray', 'radiologist'],
    'anesthesiology': ['anesthesia', 'anesthesiologist', 'anesthetist'],
    'physical therapy': ['rehabilitation', 'physiotherapy', 'physical therapy', 'PT'],
}

In [None]:
#refine in V3
def extract_medical_areas(text):
    """Return the medical areas mentioned in ``text`` as a sorted, comma-separated string.

    The text is lowercased and reduced to letters and whitespace, then every keyword of
    the module-level ``medical_areas`` mapping is looked for:

    * Regular keywords are normalized the same way as the text (so "x-ray" can match
      "xray") and count as present when they occur as a substring or when
      ``fuzz.partial_ratio`` against the cleaned text exceeds 95.
    * All-uppercase keywords (acronyms such as "PT") are matched case-sensitively as a
      standalone token of the original text. Lowercasing them would make them match
      inside unrelated words ("pt" is a substring of "prescription" and "symptoms").

    Args:
        text: Survey response. Non-string values (e.g. NaN) yield ''.

    Returns:
        Area names joined by ', ', or '' when nothing matched.
    """
    if not isinstance(text,str):
        return ''

    text_cleaned = re.sub(r'[^a-zA-Z\s]', '', text.lower())
    matched_areas=set()

    for area,keywords in medical_areas.items():
        for keyword in keywords:
            if keyword.isupper():
                acronym_pattern=r'(?<![A-Za-z])' + re.escape(keyword) + r'(?![A-Za-z])'
                found=re.search(acronym_pattern,text) is not None
            else:
                keyword_cleaned=re.sub(r'[^a-zA-Z\s]', '', keyword.lower())
                found=bool(keyword_cleaned) and (
                    keyword_cleaned in text_cleaned
                    or fuzz.partial_ratio(keyword_cleaned,text_cleaned)>95
                )

            if found:
                matched_areas.add(area)
                break  # one hit per area is enough; skip the remaining keywords

    return ', '.join(sorted(matched_areas))
        

In [None]:
# Tag medical areas only for responses of at least 25 characters; shorter ones get an empty string.
df['medical_presence']=df['raw_translation'].map(lambda response: '' if len(response) < 25 else extract_medical_areas(response))

In [None]:
# How often each combination of medical areas occurs.
df['medical_presence'].value_counts()

# Usefulness Score for Reviews

In [None]:
#can apply more to preprocessed text or raw text (v2)
from collections import Counter
from nltk.tokenize import word_tokenize 
from nltk.util import ngrams 
from nltk.corpus import stopwords, words 
import pandas as pd
import re 
import nltk

# NLTK data used below: 'words' backs the valid-word lookup, and the punkt tokenizer
# models back word_tokenize, which raises LookupError when they are not installed.
nltk.download('words')
nltk.download('punkt')
nltk.download('punkt_tab')  # resource name expected by newer NLTK releases - TODO confirm against the pinned version

# Define stopwords and command words 
#stop_words = set(stopwords.words('english'))
# Words that signal a request or instruction; a response's share of them feeds the usefulness score.
command_words = { 'need', 'must', 'should', 'stop', 'fix', 'do', 'don’t', 'can’t', 'please', 'ensure', 'verify', 'check', 'validate', 'confirm', 'achieve', 'address', 'follow', 'complete', 'review', 'implement', 'execute', 'provide', 'resolve', 'avoid', 'improve', 'assess', 'analyze', 'remove', 'adjust', 'notify', 'update', 'submit', 'request', 'clarify', 'refrain', 'consider', 'begin', 'start', 'facilitate','necessary'}

# Load a list of valid words 
valid_words = set(words.words())

# Function to check for non-words
def is_non_word(token):
    """Return True when ``token`` is absent from ``valid_words`` and is not purely alphabetic.

    NOTE(review): a token made only of ASCII letters is never reported, even when it is
    not in the word list - confirm that alphabetic gibberish is meant to pass.
    """
    if token.lower() in valid_words:
        return False
    return re.match(r'^[a-zA-Z]+$', token) is None

# Function to calculate non-word penalty
def calculate_non_word_penalty(tokens,valid_words):
    """Return the share of non-word tokens, capped at 0.25.

    A token counts as a non-word when its lowercase form is missing from ``valid_words``
    and it is not made up solely of ASCII letters (same rule as ``is_non_word``).

    Args:
        tokens: Sequence of token strings.
        valid_words: Container of accepted lowercase words. It is now actually used;
            previously the argument was ignored in favour of the module-level set.

    Returns:
        ``min(non_words / len(tokens), 0.25)``, or 0 for an empty token list.
    """
    if len(tokens)==0:
        return 0

    non_words = sum(
        1 for token in tokens
        if token.lower() not in valid_words and not re.match(r'^[a-zA-Z]+$', token)
    )
    return min(non_words / len(tokens), 0.25)

# Function to calculate command word score
def calculate_command_word_score(text, command_words):
    """Return the fraction of tokens in ``text`` that belong to ``command_words``.

    The text is lowercased and tokenized with ``word_tokenize``; a text without tokens scores 0.
    """
    tokens = word_tokenize(text.lower())
    if len(tokens) == 0:
        return 0
    command_hits = len([token for token in tokens if token in command_words])
    return command_hits / len(tokens)

# Function to calculate n-gram frequencies across the corpus
def calculate_corpus_ngram_frequencies(corpus_texts, ngram_range=2):
    """Count every n-gram (n = 2 .. ``ngram_range``) over all texts of the corpus.

    Each text is lowercased and tokenized with ``word_tokenize``.

    Returns:
        Counter mapping n-gram tuples to their total number of occurrences.
    """
    corpus_counts = Counter()
    for document in corpus_texts:
        document_tokens = word_tokenize(document.lower())
        for size in range(2, ngram_range + 1):
            corpus_counts.update(ngrams(document_tokens, size))
    return corpus_counts

# Function to calculate n-gram repetition score for a single text
def calculate_ngram_repetition_score(text, ngram_freqs, ngram_range=2):
    """Return the n-gram repetition penalty of ``text``, between 0 and 0.25.

    For every n from 2 to ``ngram_range`` the largest ratio
    ``count in this text / count in the corpus`` over the text's n-grams is taken and
    capped at 0.25. The result is the largest capped value over all n; previously a
    later n silently overwrote the penalty found for a smaller n.

    NOTE(review): an n-gram that occurs only in this text has ratio 1, which already
    hits the cap - confirm that this is the intended notion of "repetition".

    Args:
        text: Text to score; lowercased and tokenized with ``word_tokenize``.
        ngram_freqs: Mapping of n-gram tuple -> corpus count.
        ngram_range: Largest n to consider (default 2, i.e. bigrams only).

    Returns:
        The penalty, or 0 when the text is too short to contain any n-gram.
    """
    tokens = word_tokenize(text.lower())
    ngram_repetition_penalty = 0

    for n in range(2, ngram_range + 1):
        ngrams_count = Counter(ngrams(tokens, n))
        if not ngrams_count:
            continue  # text shorter than n tokens

        max_repetition = max(
            (count / ngram_freqs[ngram]
             for ngram, count in ngrams_count.items()
             # n-grams unknown to the corpus (or with a zero count) cannot be rated
             if ngram_freqs.get(ngram, 0) > 0),
            default=0,
        )
        ngram_repetition_penalty = max(ngram_repetition_penalty, min(max_repetition, 0.25))

    return ngram_repetition_penalty

# Function to calculate usefulness score
def calculate_usefulness_score(processed_text_2, raw_translation, sentiment_label, medical_presence, ngram_freqs, ngram_range=2):
    """Combine length, word quality, repetition, command words, sentiment and medical
    content into a single usefulness score.

    Args:
        processed_text_2: Preprocessed version of the response.
        raw_translation: Less processed version of the same response.
        sentiment_label: Final sentiment; 'POSITIVE' and 'NEGATIVE' get weight 1.1, anything else 1.0.
        medical_presence: Detected medical areas; a non-missing, non-blank value gets weight 1.1, otherwise 1.
        ngram_freqs: Corpus n-gram counts used for the repetition penalty.
        ngram_range: Largest n-gram size considered (default 2).

    Returns:
        The weighted sum of the components. The sentiment and medical weights are
        multiplied in as-is, so the result can slightly exceed 1.
    """
    processed_tokens = word_tokenize(processed_text_2.lower())
    raw_tokens = word_tokenize(raw_translation.lower())

    # Length saturates at 25 processed tokens.
    length_score = min(len(processed_tokens) / 25, 1)

    # For both penalty and command score, take the larger of the two text versions.
    non_word_penalty = max(
        calculate_non_word_penalty(processed_tokens, valid_words),
        calculate_non_word_penalty(raw_tokens, valid_words),
    )
    command_word_score = max(
        calculate_command_word_score(processed_text_2, command_words),
        calculate_command_word_score(raw_translation, command_words),
    )

    ngram_repetition_penalty = calculate_ngram_repetition_score(processed_text_2, ngram_freqs, ngram_range)

    if sentiment_label in ('POSITIVE', 'NEGATIVE'):
        sentiment_weight = 1.1
    else:
        sentiment_weight = 1.0

    mentions_medical_area = pd.notna(medical_presence) and medical_presence.strip() != ''
    medical_weight = 1.1 if mentions_medical_area else 1

    return (0.35 * length_score +
            0.15 * (1 - non_word_penalty) +
            0.15 * (1 - ngram_repetition_penalty) +
            0.10 * command_word_score +
            0.15 * sentiment_weight +
            0.10 * medical_weight)

# Corpus-wide bigram counts come first; every row is scored against them afterwards.
corpus_texts = df['in_depth_preprocess_text'].tolist()
ngram_freqs = calculate_corpus_ngram_frequencies(corpus_texts, ngram_range=2)

def _score_row(row):
    """Usefulness score of one DataFrame row."""
    return calculate_usefulness_score(
        row['in_depth_preprocess_text'],
        row['simplified_preprocess_text_orig'],
        row['final_sentiment'],
        row['medical_presence'],
        ngram_freqs,
    )

df['usefulness_score'] = df.apply(_score_row, axis=1)

In [None]:
# Mean usefulness score per final sentiment label.
df.groupby('final_sentiment')['usefulness_score'].agg('mean')

In [None]:
#usefulness_scores over time

In [None]:
# Turn the monthly periods into plain strings; they serve as x-axis labels in the plot below.
df['month_year']=df['month_year'].astype(str)
# Force a numeric dtype; values that cannot be parsed become NaN instead of raising.
df['usefulness_score']=pd.to_numeric(df['usefulness_score'],errors='coerce')

In [None]:
# Monthly mean usefulness: overall, and split by final sentiment.
avg_usefulness_over_time=df.groupby('month_year',as_index=False)['usefulness_score'].mean()
avg_usefulness_per_sentiment=df.groupby(['month_year','final_sentiment'],as_index=False)['usefulness_score'].mean()

In [None]:
color_palette_2 = {'POSITIVE':'Green',
                   'NEGATIVE':'Red',
                   'NEUTRAL':'Orange'}

fig, ax = plt.subplots(figsize=(10,6))

# One line per sentiment class ...
for sentiment,color in color_palette_2.items():
    sentiment_data=avg_usefulness_per_sentiment.loc[avg_usefulness_per_sentiment['final_sentiment']==sentiment]
    sns.lineplot(x='month_year',y='usefulness_score',data=sentiment_data,label=sentiment,color=color,ax=ax)

# ... plus the overall monthly average in black.
sns.lineplot(x='month_year',y='usefulness_score',data=avg_usefulness_over_time,label='Overall Average Usefulness',color='black',ax=ax)

ax.set_title('Average Usefulness Over Time')
ax.set_xlabel('Month-Year')
ax.set_ylabel('Usefulness Score')

ax.legend(title='Sentiment')
ax.tick_params(axis='x',labelrotation=90)
fig.tight_layout()
plt.show()