# Clothing Reviews and predictive sentiment analysis

By: Chuan Law

In [None]:
# Prerequisite for the `pattern` framework imported in the next cell:
# fetch NLTK's 'omw-1.4' resource before anything from `pattern` is used.
import nltk

nltk.download("omw-1.4")

In [None]:
from pattern.web import Twitter
from pattern.text.en import tag
from pattern.vector import KNN, count
from tqdm import tqdm
import pandas as pd
import json
import copy


In [None]:
# Load the reviews and report the record count together with the sizes of a
# 70/30 train/test split.
df = pd.read_json('./datasets/renttherunway_final_data.json')

record_count = len(df)
print("Total records:", record_count)
print("Train", round(record_count * 0.7))
print("Test", round(record_count * 0.3))

Explore whether lower review ratings correlate with dress size and age, to check the dataset for bias.

In [None]:
# Sample the dataset structure and values. Leaving the frame as the cell's
# last expression lets the notebook render it as a table instead of the
# truncated plain-text form that print() produces.
df.head()

I have decided to focus on the relationship between size, age, body type and weight and the review rating.
The following sections process the data and generate the metrics I will use to make assumptions about the dataset at hand and to judge whether it contains any obvious bias.

In [None]:
# Size buckets used by the tallying loop that follows:
#   xsmall: size <= 8, small: 8 < size <= 10, medium: 10 < size < 14, large: size >= 14

# Zeroed counter templates; every rating entry receives its own deep copy.
body_dict = dict.fromkeys(
    ['hourglass', 'straight & narrow', 'athletic', 'apple', 'full bust', 'petite', 'pear'],
    0,
)
size_dict = dict.fromkeys(["large", "medium", "small", "xsmall"], 0)
age_dict = dict.fromkeys(["<19", "20-29", "30-39", "40-49", "50+"], 0)

# One entry per possible str(rating) value, plus 'nan' for rows with no rating.
_rating_keys = ('1.0', '2.0', '3.0', '4.0', '5.0', '6.0', '7.0', '8.0', '9.0', '10.0', 'nan')

# Each group of counters is wrapped in a single-element list; consumers read
# it with [0].
all_values = {
    rating: {
        "total": 0,
        "body_type": [copy.deepcopy(body_dict)],
        "age": [copy.deepcopy(age_dict)],
        "size": [copy.deepcopy(size_dict)],
    }
    for rating in _rating_keys
}

# Distinct raw values seen in each column (handy for inspecting the data).
sizes_set = set()
body_type_set = set()
age_set = set()

# Body types that have a counter in every rating entry.
_counted_body_types = (
    'hourglass', 'apple', 'straight & narrow', 'athletic', 'full bust', 'petite', 'pear',
)

# Walk every review once and bump the counters of its rating entry.
for _, review in df.iterrows():
    per_rating = all_values[str(review['rating'])]
    per_rating["total"] += 1

    dress_size = review["size"]
    figure = review["body type"]
    reviewer_age = review["age"]

    sizes_set.add(dress_size)
    body_type_set.add(figure)
    age_set.add(reviewer_age)

    # TODO: use single variables here to automatically populate each size
    # The branches are mutually exclusive; a NaN size matches none of them.
    size_counts = per_rating["size"][0]
    if dress_size >= 14:
        size_counts["large"] += 1
    elif 10 < dress_size < 14:
        size_counts["medium"] += 1
    elif 8 < dress_size <= 10:
        size_counts["small"] += 1
    elif dress_size <= 8:
        size_counts["xsmall"] += 1

    # The "<19" bucket also counts age 19 itself. Ages that fall between two
    # bucket edges (e.g. 19.5) and NaN ages are not counted anywhere.
    age_counts = per_rating["age"][0]
    if reviewer_age <= 19.0:
        age_counts["<19"] += 1
    elif 20.0 <= reviewer_age <= 29.0:
        age_counts["20-29"] += 1
    elif 30.0 <= reviewer_age <= 39.0:
        age_counts["30-39"] += 1
    elif 40.0 <= reviewer_age <= 49.0:
        age_counts["40-49"] += 1
    elif reviewer_age >= 50.0:
        age_counts["50+"] += 1

    # Any other (or missing) body type is left uncounted.
    if figure in _counted_body_types:
        per_rating["body_type"][0][figure] += 1

In [None]:
# Grand totals across every rating that has at least one review.
# Note: medium-sized reviews are counted in `total` but have no total of their own.
total = total_large = total_small = total_xsmall = 0

for key, stats in all_values.items():
    if stats['total'] == 0:
        # remove the ratings with no reviews
        continue

    size_counts = stats["size"][0]
    total += stats["total"]
    total_large += size_counts["large"]
    total_small += size_counts["small"]
    total_xsmall += size_counts["xsmall"]

    print(f"Size totals {key}: {size_counts}")
    print(f"Age Groups {key}: {stats['age'][0]}")
    print(f"Body Type totals {key}: {stats['body_type'][0]}")
    print(f"Rating amount {key}: {stats['total']}\n")

print(f"Total records: {total}")
print(f"Total large reviews: {total_large}")
print(f"Total small reviews: {total_small}")
print(f"Total xsmall reviews: {total_xsmall}")
# print(f"All sizes: {sizes_set}")
# print(f"All body types: {body_type_set}")
# print(f"All age groups: {age_set}")

The number of large (including xlarge) sized reviews in the dataset is 73706 (38%)

The number of small (including xsmall) sized reviews in the dataset is 91126 (47%)

Highlight the data gap for a particular group (e.g. age 34 with an apple body type): compare the number of review ratings for this group with that of a more common body type.

In [None]:
# All calculations here are for dataset evaluation. They read the per-rating
# tallies in `all_values` and the grand `total` computed in the previous cell.
# Ratings 2 and 4 are treated as "bad", ratings 8 and 10 as "good".


def _tally(rating, group, bucket):
    """Return one counter, e.g. _tally("2.0", "size", "large")."""
    return all_values[rating][group][0][bucket]


def _pct(part, whole):
    """Return `part` as a percentage of `whole`."""
    return (part / whole) * 100


# --- per-rating figures ---
rating_2_size_large = _tally("2.0", "size", "large")
rating_2_size_small = _tally("2.0", "size", "small")
rating_2_total = all_values["2.0"]["total"]
rating_2_hourglass_total = _tally("2.0", "body_type", "hourglass")

rating_4_size_large = _tally("4.0", "size", "large")
rating_4_size_small = _tally("4.0", "size", "small")
rating_4_total = all_values["4.0"]["total"]

rating_8_size_large = _tally("8.0", "size", "large")
rating_8_size_small = _tally("8.0", "size", "small")
rating_8_total = all_values["8.0"]["total"]

rating_10_size_large = _tally("10.0", "size", "large")
rating_10_size_small = _tally("10.0", "size", "small")
rating_10_total = all_values["10.0"]["total"]
rating_10_hourglass_total = _tally("10.0", "body_type", "hourglass")

bad_rating_total = rating_2_total + rating_4_total
good_rating_total = rating_8_total + rating_10_total

# --- specific statistics, e.g. large sized bad reviews ---
# NOTE: xsmall counts are folded into `total_size_small`, so
# `total_size_xsmall` intentionally stays at 0 and "small" means size <= 10.
total_size_large = sum(_tally(rating, "size", "large") for rating in all_values)
total_size_small = sum(
    _tally(rating, "size", "small") + _tally(rating, "size", "xsmall")
    for rating in all_values
)
total_size_xsmall = 0

# rating counts - bad (small includes xsmall)
total_bad_large_reviews = rating_2_size_large + rating_4_size_large
total_bad_small_reviews = (
    rating_2_size_small
    + rating_4_size_small
    + _tally("2.0", "size", "xsmall")
    + _tally("4.0", "size", "xsmall")
)

# rating counts - good (small includes xsmall)
total_good_large_reviews = rating_8_size_large + rating_10_size_large
total_good_small_reviews = (
    rating_8_size_small
    + rating_10_size_small
    + _tally("8.0", "size", "xsmall")
    + _tally("10.0", "size", "xsmall")
)

# percentages of sizes
pct_size_large_total = _pct(total_size_large, total)
pct_size_small_total = _pct(total_size_small + total_size_xsmall, total)

# percentages of ratings
pct_rating_2_total = _pct(rating_2_total, total)
pct_rating_4_total = _pct(rating_4_total, total)
pct_rating_10_total = _pct(rating_10_total, total)

# share of the bad / good reviews that came from each size group
pct_bad_large = _pct(total_bad_large_reviews, bad_rating_total)
pct_good_large = _pct(total_good_large_reviews, good_rating_total)

pct_bad_small = _pct(total_bad_small_reviews, bad_rating_total)
pct_good_small = _pct(total_good_small_reviews, good_rating_total)

pct_bad_rating_total = _pct(bad_rating_total, total)
pct_good_rating_total = _pct(good_rating_total, total)

Display the metrics here

In [None]:
# Report the metrics computed in the previous cell. A trailing "\n" in an
# entry leaves a blank line after that group.
_report_lines = [
    f"{pct_bad_rating_total:.2f}% of bad ratings in total ({bad_rating_total}/{total})",
    f"{pct_good_rating_total:.2f}% of good ratings in total ({good_rating_total}/{total})\n",
    # how many large sized women gave a bad review compared to small sized women
    f"Bad reviews for large size: {total_bad_large_reviews}/{bad_rating_total} ({pct_bad_large:.2f}%)",
    f"Bad reviews for small size: {total_bad_small_reviews}/{bad_rating_total} ({pct_bad_small:.2f}%)\n",
    f"Good reviews for large size: {total_good_large_reviews}/{good_rating_total} ({pct_good_large:.2f}%)",
    f"Good reviews for small size: {total_good_small_reviews}/{good_rating_total} ({pct_good_small:.2f}%)\n",
    f"{pct_size_large_total:.2f}% of reviews are large (size 14 or greater) ({total_size_large}/{total})",
    f"{pct_size_small_total:.2f}% of reviews are small (size 10 or less) ({total_size_small}/{total})\n",
]

for _line in _report_lines:
    print(_line)

Almost 40% of the dataset has a large dress size, and 44% of these users had a bad experience with the dress. This shows a slight bias towards larger dress sizes being reviewed as bad compared to smaller sizes.

Here I have assumed that any review with a rating of 8 or higher is positive.
Conversely, a rating of 4 or lower is considered negative.

In [None]:
# Initialize new model
knn = KNN()

# Train the model to label a review "good" or "bad" from the adjectives in
# its text. Only clearly positive (rating >= 9) and clearly negative
# (rating <= 3) reviews are used.
# NOTE(review): the next cell re-creates `knn` and retrains it with the
# thresholds 8 / 4, so the model trained here is discarded.
# The slice is presumably the 70% train split printed when the dataset was
# loaded - TODO confirm.
train_rows = df[0:134781]

for index, row in tqdm(train_rows.iterrows(), total=len(train_rows)):
    if row['rating'] >= 9:
        p = "good"
    elif row['rating'] <= 3:
        p = "bad"
    else:
        # Neutral or missing rating. Without this skip, `v` and `p` would
        # still hold the previous review's values and that review would be
        # trained a second time (or a NameError raised on the very first row).
        continue

    # TODO see if we can add body type and age here to train the model with
    # have a think about what we can use that for to predict...
    v = tag(row['review_text'].lower())
    v = [word for word, pos in v if pos == 'JJ']  # JJ = adjective
    v = count(v)
    if v:  # skip reviews that produced no countable adjectives
        knn.train(v, type=p)


In [None]:
# Initialize new model
knn = KNN()

# Train KNN model on the train split of data. Will determine the rating based
# on review text: reviews rated >= 8 are labelled "good", reviews rated <= 4
# are labelled "bad", and only their adjectives are used as features.
train_rows = df[0:134781]

for index, row in tqdm(train_rows.iterrows(), total=len(train_rows)):
    if row['rating'] >= 8:
        p = "good"
    elif row['rating'] <= 4:
        p = "bad"
    else:
        # Neutral or missing rating. Without this skip, `v` and `p` would
        # still hold the previous review's values and that review would be
        # trained a second time (or a NameError raised on the very first row).
        continue

    # TODO see if we can add body type and age here to train the model with
    # have a think about what we can use that for to predict...
    v = tag(row['review_text'].lower())
    v = [word for word, pos in v if pos == 'JJ']  # JJ = adjective
    v = count(v)
    if v:  # skip reviews that produced no countable adjectives
        knn.train(v, type=p)

The following reviews are placed into the classifier to test how good the outputs are. So far they are looking correct.

In [None]:
# Spot-check the classifier on a handful of hand-written reviews.
_sample_reviews = [
    "this dress is really lovely. it's got great lines and does amazing things for your bust. however, the two slits have a kind of loin-cloth effect",
    'uncomfortable shapeless unflattering',
    'beautiful cute dress',
    'not reccommend, this dress is unforgiving, short, lumpy, stiff and needless, its disappointing and awful.',
    'uncomfortable and unflattering',
    'I love this',
]

for _sample in _sample_reviews:
    print(knn.classify(_sample))

In [None]:
# Test the model on a slice of the held-out part of the dataset and check
# whether each prediction matches the review's rating.
# Ground truth: rating >= 8 is "good", rating <= 4 is "bad"; anything in
# between is ignored.

list_of_correct_good_predictions = []
list_of_correct_bad_predictions = []

# ground truths
total_good_reviews = 0
total_bad_reviews = 0


def _prediction_record(row):
    """Return the fields kept for a correctly predicted review."""
    return {
        "userid": row['user_id'],
        "rating": row['rating'],
        "text": row['review_text'],
        "size": row['size'],
        "age": row["age"]
    }


def _percentage(correct, out_of):
    """Return correct/out_of as a percentage, or 0.0 when `out_of` is zero."""
    return (correct / out_of) * 100 if out_of else 0.0


test_rows = df[135082:136082]

for index, row in tqdm(test_rows.iterrows(), total=len(test_rows)):
    prediction = knn.classify(row['review_text'].lower())

    if row["rating"] >= 8:
        total_good_reviews += 1
        if prediction == "good":
            list_of_correct_good_predictions.append(_prediction_record(row))

    elif row["rating"] <= 4:
        total_bad_reviews += 1
        if prediction == "bad":
            list_of_correct_bad_predictions.append(_prediction_record(row))

    # Uncomment these to sample some of the predictions
#     if prediction == "bad" and row["rating"] >= 8:
#         print(f"Bad predicted review {row['rating']}: {row['review_text']}")

#     if prediction == "good" and row["rating"] <= 4:
#         print(f"Good predicted review {row['rating']}: {row['review_text']}")
print("-" * 10)

length_correct_good_predtiction = len(list_of_correct_good_predictions)
length_correct_bad_predtiction = len(list_of_correct_bad_predictions)

# The slice may contain no good (or no bad) reviews at all; _percentage keeps
# the report from failing with ZeroDivisionError in that case.
good_pct = _percentage(length_correct_good_predtiction, total_good_reviews)
bad_pct = _percentage(length_correct_bad_predtiction, total_bad_reviews)

print(f"Total amount of correct good predictions: {length_correct_good_predtiction}/ {total_good_reviews} ({good_pct:.2f}%)")
print(f"Total amount of correct bad predictions: {length_correct_bad_predtiction} / {total_bad_reviews} ({bad_pct:.2f}%)")

500it [02:49,  2.94it/s]
----------
Total amount of correct good predictions: 444/ 57763 (0.7686581375621072%)
Total amount of correct bad predictions: 0 / 57763 (0.0%)

Now I want to compare the number of adjectives found in good-rated reviews with the number found in bad-rated reviews.

In [None]:
# Separate all the good reviews from the bad reviews - this takes quite a
# while on the whole dataset.
# For quicker computation time, I have reduced this to the first 10000 records;
# results of the full dataset are documented in the report.
#
# Collect the adjectives of the good reviews (rating >= 8) as 'positive' words
# and the adjectives of the bad reviews (rating <= 4) as 'negative' words.


def _adjectives_per_review(texts):
    """Return the adjectives found in `texts`, one entry per distinct word per text.

    Each text is tagged, the words tagged 'JJ' (adjective) are kept and run
    through `count`; the keys of that count are appended to the result, so a
    word can occur once for every review that uses it.
    """
    words = []
    for text in texts:
        adjectives = [word for word, pos in tag(text) if pos == 'JJ']
        words += list(count(adjectives).keys())
    return words


good_df = []
bad_df = []

sample_rows = df[0:10000]
for index, row in tqdm(sample_rows.iterrows(), total=len(sample_rows)):
    if row['rating'] >= 8:
        good_df.append(row)
    elif row['rating'] <= 4:
        bad_df.append(row)
    # ratings in between (or missing) belong to neither group

print(f"Amount of positive reviews: {len(good_df)}")
print(f"Amount of negative reviews: {len(bad_df)}")

# Build real DataFrames (one row per review). pd.concat on the row Series
# would instead flatten every field of every review into one long Series,
# which fails on an empty list and, with a single matching review, makes
# ['review_text'] a plain string that is then iterated character by character.
good_result = pd.DataFrame(good_df, columns=df.columns)
bad_result = pd.DataFrame(bad_df, columns=df.columns)

print("Computing list of positive words...")
good_words = _adjectives_per_review(good_result['review_text'])

print("Computing list of negative words...")
bad_words = _adjectives_per_review(bad_result['review_text'])

print("Done")

I noticed that a lot of the positive adjectives present in reviews rated good were also present in reviews rated bad - probably because these words are used in combination to describe the dress. So, assuming that the same adjective appearing in both good and bad reviews would 'cancel out', so to speak, I wanted to find the number of uniquely 'bad' adjectives in the limited set of reviews.

In [None]:
# De-duplicate the adjectives collected from the good and the bad reviews.
good_words_final = list(set(good_words))
bad_words_final = list(set(bad_words))

print(f"Total amount of unique positive adjectives: {len(good_words_final)}")
print(f"Total amount of unique negative adjectives: {len(bad_words_final)}")

# Keep only the adjectives that never occur in a good review. Membership is
# tested against a set so the filter is linear rather than quadratic in the
# vocabulary size; the result and its order are unchanged.
_good_vocabulary = set(good_words_final)
bad_words_final_filtered = [word for word in bad_words_final if word not in _good_vocabulary]
print(f"Total amount of negative adjectives that aren't identified in positive reviews: {len(bad_words_final_filtered)}:{bad_words_final_filtered}")