In [None]:
import pandas as pd
from rapidfuzz import fuzz
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from collections import Counter
import swifter

In [None]:
def cleanliness_count(review,word_set):
    """Flag whether a tokenised review mentions any word from a keyword list.

    Args:
        review: Iterable of hashable tokens from a single review (the caller
            in this notebook passes ``text.split()``).
        word_set: Collection of keywords to look for; may be ``None`` or empty.

    Returns:
        1 if at least one keyword is exactly equal to one of the tokens,
        otherwise 0. The result is always an int: previously the "keywords
        given but none matched" case fell off the end of the function and
        returned ``None``.
    """
    if not word_set:
        return 0
    # Build the token set once so every keyword lookup is a constant-time check.
    tokens = set(review)
    return int(any(word in tokens for word in word_set))
def cleanliness_count_fuzzy(review,word_set,threshold=80):
    """Flag whether any review token approximately matches a keyword.

    Args:
        review: Iterable of tokens from a single review.
        word_set: Collection of keywords to compare against; may be ``None``
            or empty.
        threshold: Minimum ``fuzz.ratio`` score for a keyword/token pair to
            count as a match. Defaults to 80, the value that used to be
            hard-coded.

    Returns:
        1 as soon as one keyword/token pair scores at or above ``threshold``,
        otherwise 0.
    """
    # Behave like cleanliness_count when there are no keywords: nothing can
    # match, and iterating over None would raise a TypeError.
    if not word_set:
        return 0
    for word in review:
        for category_word in word_set:
            if fuzz.ratio(category_word, word) >= threshold:
                return 1
    return 0
  
    
_VADER_ANALYZER = None


def _get_vader_analyzer():
    """Return a shared SentimentIntensityAnalyzer, creating it on first use."""
    global _VADER_ANALYZER
    if _VADER_ANALYZER is None:
        _VADER_ANALYZER = SentimentIntensityAnalyzer()
    return _VADER_ANALYZER


def get_sentiment(listing_id, reviews_df, category,word_set=None):
    """Summarise the VADER compound sentiment of one listing's reviews.

    Args:
        listing_id: Value matched against the ``listing_id`` column of
            ``reviews_df``.
        reviews_df: DataFrame with a ``listing_id`` column and a ``review``
            column holding the review text.
        category: Label inserted into the names of the returned entries.
        word_set: Optional collection of keywords. When given (and non-empty),
            only reviews for which ``cleanliness_count`` reports a keyword hit
            are scored; otherwise every review of the listing is scored.

    Returns:
        pandas.Series with two entries, ``median_sentiment_<category>`` and
        ``average_sentiment_<category>``. Both are 0 when no review is left
        to score.
    """
    # Filter reviews for the given listing_id
    listing_reviews = reviews_df[reviews_df['listing_id'] == listing_id]
    if word_set:
        # bool(...) treats both 0 and None from cleanliness_count as "no hit";
        # astype(bool) keeps the mask boolean even when there are no rows.
        mentions_category = listing_reviews['review'].apply(
            lambda text: bool(cleanliness_count(text.split(), word_set=word_set))
        ).astype(bool)
        reviews = listing_reviews[mentions_category]
    else:
        reviews = listing_reviews

    if reviews.empty:
        return pd.Series({f'median_sentiment_{category}': 0,
                f'average_sentiment_{category}': 0})

    # Fetch the analyzer only when there is text to score, and reuse a single
    # instance across calls instead of constructing a new one every time.
    sid = _get_vader_analyzer()
    sentiment = reviews['review'].apply(
        lambda text: sid.polarity_scores(text)['compound']
    )
    return pd.Series({f'median_sentiment_{category}': sentiment.median(),
            f'average_sentiment_{category}': sentiment.mean()})


In [None]:
def _load_word_list(path):
    """Read one keyword per line from ``path``, stripped and lower-cased.

    Blank lines are skipped so an empty string never ends up as a keyword.
    """
    with open(path, 'r') as f:
        return [word for word in (line.strip().lower() for line in f) if word]


# One keyword file per review category.
clean_words = _load_word_list('./Dictionaries/cleanliness_words.txt')
accuracy_words = _load_word_list('./Dictionaries/accuracy_words.txt')
checkin_words = _load_word_list('./Dictionaries/checkin_words.txt')
communication_words = _load_word_list('./Dictionaries/communication_words.txt')
location_words = _load_word_list('./Dictionaries/location_words.txt')
value_words = _load_word_list('./Dictionaries/value_words.txt')

# Load the listings, drop rows without comments, and split each comments
# string into a list of individual reviews on the '``' delimiter.
df = (
    pd.read_csv('./Data/combined_short_removed_en_only.csv')
    .dropna(subset=['comments'])
    .assign(comments=lambda frame: frame['comments'].apply(lambda text: text.split('``')))
)
# NOTE(review): convert_dtypes() returns a new frame and the result is not
# assigned here, so df itself keeps its dtypes. Confirm whether
# `df = df.convert_dtypes()` was intended.
df.convert_dtypes()

In [None]:
# Drop duplicate keywords from every category list (list order is not kept).
(
    clean_words,
    accuracy_words,
    checkin_words,
    communication_words,
    location_words,
    value_words,
) = (
    list(set(words))
    for words in (
        clean_words,
        accuracy_words,
        checkin_words,
        communication_words,
        location_words,
        value_words,
    )
)

# Category label -> keyword list. 'overall' maps to None instead of a list,
# which get_sentiment treats as "score every review of the listing".
category_list = dict(
    clean=clean_words,
    accurate=accuracy_words,
    checkin=checkin_words,
    communication=communication_words,
    location=location_words,
    value=value_words,
    overall=None,
)

In [None]:
# Flatten the listings into one record per (listing, review) pair. Iterating
# the two needed columns directly avoids building a full row Series for every
# listing, which is what iterrows() does.
reviews_data = [
    {'listing_id': listing_id, 'review': review}
    for listing_id, comments in zip(df['listing_id'], df['comments'])
    for review in comments
]

# Passing the columns explicitly keeps 'listing_id' and 'review' present even
# when there are no records, so later column lookups do not fail.
reviews_df = pd.DataFrame(reviews_data, columns=['listing_id', 'review'])

In [None]:
def calculate_sentiments(key, value, df, reviews_df):
    """Add the median/average sentiment columns for one category to ``df``.

    Args:
        key: Category label; used in the names of the two columns written.
        value: Keyword collection forwarded to ``get_sentiment`` as ``word_set``.
        df: DataFrame with a ``listing_id`` column; it is modified in place.
        reviews_df: Reviews table forwarded to ``get_sentiment``.

    Returns:
        The same ``df`` object, now holding ``median_sentiment_<key>`` and
        ``average_sentiment_<key>``.
    """
    target_columns = [f'median_sentiment_{key}', f'average_sentiment_{key}']

    def score_listing(listing_id):
        # One get_sentiment call per listing; it returns a two-entry Series.
        return get_sentiment(listing_id, reviews_df=reviews_df, category=key, word_set=value)

    df[target_columns] = df['listing_id'].apply(score_listing)
    return df

# with concurrent.futures.ThreadPoolExecutor() as executor:
#     futures = [executor.submit(calculate_sentiments, key, value, df, reviews_df) for key, value in category_list.items()]
#     for future in concurrent.futures.as_completed(futures):
#         df = future.result()

# Index the reviews by listing once, so each get_sentiment call receives only
# that listing's rows instead of re-scanning the whole reviews table for every
# listing and every category.
reviews_by_listing = {
    listing_id: listing_reviews
    for listing_id, listing_reviews in reviews_df.groupby('listing_id')
}
# Empty frame with the same columns, handed over for listings without reviews.
no_reviews = reviews_df.iloc[0:0]

# For every category, score each listing and store the two summary columns.
for key,value in category_list.items():
    df[[f'median_sentiment_{key}',f'average_sentiment_{key}']] = df['listing_id'].swifter.apply(
        lambda listing_id: get_sentiment(
            listing_id,
            word_set=value,
            reviews_df=reviews_by_listing.get(listing_id, no_reviews),
            category=key,
        )
    )


In [None]:
# Write the listings frame to disk as UTF-8 CSV, without the index column.
df.to_csv(
    path_or_buf='./Data/cleaned_with_sentiment_scores.csv',
    encoding='utf-8',
    index=False,
)