In [1]:
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")  # This is to ignore any warnings that might pop up during execution

# Basic libraries to manipulate data
import matplotlib.pyplot as plt  # Matplotlib for data visualization
import numpy as np  # Numpy for numerical computations
import pandas as pd  # Pandas for data manipulation

# Seed NumPy's global random generator so random draws repeat on a fresh top-to-bottom run.
# NOTE(review): later cells that sample without an explicit random_state
# (e.g. DataFrame.sample) presumably rely on this seed — confirm.
np.random.seed(42)

In [2]:
# Folder that holds the input CSV files, relative to the notebook
data_path = "./data/"

# File names of the two datasets stored under data_path
release_log_filename = "release_log.csv"
reviews_filename = "chatgpt_reviews.csv"

# Read each CSV once; these frames are kept untouched as backups so the
# raw data can be restored without going back to disk
backup_release_log = pd.read_csv(f"{data_path}{release_log_filename}")
backup_reviews = pd.read_csv(f"{data_path}{reviews_filename}")

# The analysis below operates on independent copies of the backups
release_log_df = backup_release_log.copy()
reviews_df = backup_reviews.copy()

# Data Transformation on ChatGPT Reviews Dataset

In [4]:
# Add a column holding the number of characters in each review.
# The vectorised `.str.len()` accessor replaces `.apply(len)`: it gives the
# same counts for text, but a missing review (NaN) yields NaN instead of
# raising "TypeError: object of type 'float' has no len()".
reviews_df['reviews_length'] = reviews_df['review'].str.len()
reviews_df.head()

Unnamed: 0,date,title,review,rating,reviews_length
0,2023-05-21 16:42:24,Much more accessible for blind users than the ...,Up to this point I’ve mostly been using ChatGP...,4,1407
1,2023-05-27 21:57:27,"4.5 stars, here’s why","I recently downloaded the app and overall, it'...",4,1420
2,2023-05-19 10:16:22,"Almost 5 stars, but… no search function",This app would almost be perfect if it wasn’t ...,4,1390
3,2023-06-09 07:49:36,"Good, but Siri support would take it to the ne...",I appreciate the devs implementing Siri suppor...,4,1160
4,2023-05-31 10:20:48,App review,"No doubt, this technology is absolutely life-c...",1,1429


## Exercise 1: 
Create a pivot table to check the average length of reviews by rating (values = reviews_length; index = rating, aggregate function = mean)

# Data transformation

## Pivoting

**How can we analyze the average review length by rating?**

In [4]:
# Show the first rows of reviews_df again as a reminder of the available
# columns (including reviews_length) before building the pivot table
reviews_df.head()

Unnamed: 0,date,title,review,rating,reviews_length
0,2023-05-21 16:42:24,Much more accessible for blind users than the ...,Up to this point I’ve mostly been using ChatGP...,4,1407
1,2023-05-27 21:57:27,"4.5 stars, here’s why","I recently downloaded the app and overall, it'...",4,1420
2,2023-05-19 10:16:22,"Almost 5 stars, but… no search function",This app would almost be perfect if it wasn’t ...,4,1390
3,2023-06-09 07:49:36,"Good, but Siri support would take it to the ne...",I appreciate the devs implementing Siri suppor...,4,1160
4,2023-05-31 10:20:48,App review,"No doubt, this technology is absolutely life-c...",1,1429


In [5]:
# Average review length per star rating: one row for each distinct `rating`,
# with `reviews_length` aggregated by its mean
pivot_table = reviews_df.pivot_table(
    values='reviews_length',
    index='rating',
    aggfunc='mean',
)
pivot_table

Unnamed: 0_level_0,reviews_length
rating,Unnamed: 1_level_1
1,164.659847
2,177.826087
3,182.492462
4,246.543478
5,229.528319


## Melting

## Exercise 2:
Create a melted table to check the review lengths and ratings for each title (id_vars = title; value_vars = rating and reviews_length)

In [6]:
# Reshape from wide to long: `title` stays as the identifier column, while
# `rating` and `reviews_length` are stacked into `variable` / `value` pairs
melted_data = reviews_df.melt(
    id_vars=['title'],
    value_vars=['rating', 'reviews_length'],
)
melted_data

Unnamed: 0,title,variable,value
0,Much more accessible for blind users than the ...,rating,4
1,"4.5 stars, here’s why",rating,4
2,"Almost 5 stars, but… no search function",rating,4
3,"Good, but Siri support would take it to the ne...",rating,4
4,App review,rating,1
...,...,...,...
4111,Fantastic App with Room for Enhancements,reviews_length,1163
4112,"Awesome technology, deplorable tactics",reviews_length,205
4113,Legit amazing,reviews_length,364
4114,Amazing!!,reviews_length,432


# Feature Engineering

## Exercise 3:
Create a new variable "length_category" with the cut function, where reviews_length is labelled "Short" (lengths between 0 and 100), "Medium" (lengths between 100 and 500) and "Long" (lengths over 500)

In [11]:
# Option 1: bin the review lengths with pd.cut
# Bin edges are closed on the right: up to 100 -> Short,
# over 100 up to 500 -> Medium, over 500 -> Long.
bins = [0, 100, 500, float('inf')]
labels = ['Short', 'Medium', 'Long']

# The call is written as a single expression with the arguments inside the
# parentheses; ending a line with a bare `=` and continuing on the next
# line is a SyntaxError.
# include_lowest=True keeps a zero-length review in "Short" (instead of NaN),
# matching the `length <= 100` rule of the Option 2 function.
reviews_df['length_category'] = pd.cut(
    reviews_df['reviews_length'],
    bins=bins,
    labels=labels,
    include_lowest=True,
)

reviews_df

Unnamed: 0,date,title,review,rating,reviews_length,length_category
0,2023-05-21 16:42:24,Much more accessible for blind users than the ...,Up to this point I’ve mostly been using ChatGP...,4,1407,Long
1,2023-05-27 21:57:27,"4.5 stars, here’s why","I recently downloaded the app and overall, it'...",4,1420,Long
2,2023-05-19 10:16:22,"Almost 5 stars, but… no search function",This app would almost be perfect if it wasn’t ...,4,1390,Long
3,2023-06-09 07:49:36,"Good, but Siri support would take it to the ne...",I appreciate the devs implementing Siri suppor...,4,1160,Long
4,2023-05-31 10:20:48,App review,"No doubt, this technology is absolutely life-c...",1,1429,Long
...,...,...,...,...,...,...
2053,2023-05-18 18:27:04,Fantastic App with Room for Enhancements,The ChatGPT iOS app is an outstanding product....,5,1163,Long
2054,2023-05-18 17:17:44,"Awesome technology, deplorable tactics",Sam Altman’s blatant attempt at regulatory cap...,2,205,Medium
2055,2023-06-25 04:55:57,Legit amazing,So I like to role-play on this app because of ...,5,364,Medium
2056,2023-06-25 04:20:59,Amazing!!,I’m so grateful that they finally added iPad c...,5,432,Medium


In [8]:
# Option 2
def length_category(length):
    """Return the size label for a single review length.

    Lengths up to and including 100 are 'Short', lengths above 100 and up
    to and including 500 are 'Medium', and everything else is 'Long'.
    """
    # Guard clauses: each check only runs when the previous one failed,
    # so the second test implicitly covers 100 < length <= 500.
    if length <= 100:
        return 'Short'
    if length <= 500:
        return 'Medium'
    return 'Long'

In [9]:
# Run the Option 2 helper on every review length and store the resulting
# labels in a separate 'Length Category' column
reviews_df['Length Category'] = reviews_df['reviews_length'].map(length_category)
reviews_df.head()

Unnamed: 0,date,title,review,rating,reviews_length,Length Category
0,2023-05-21 16:42:24,Much more accessible for blind users than the ...,Up to this point I’ve mostly been using ChatGP...,4,1407,Long
1,2023-05-27 21:57:27,"4.5 stars, here’s why","I recently downloaded the app and overall, it'...",4,1420,Long
2,2023-05-19 10:16:22,"Almost 5 stars, but… no search function",This app would almost be perfect if it wasn’t ...,4,1390,Long
3,2023-06-09 07:49:36,"Good, but Siri support would take it to the ne...",I appreciate the devs implementing Siri suppor...,4,1160,Long
4,2023-05-31 10:20:48,App review,"No doubt, this technology is absolutely life-c...",1,1429,Long


## Binning and discretization

How can we segment ratings into high, medium, and low categories?

## Exercise 4: 
Create a new variable "rating_category" with the cut function, where rating is labelled "Low" (ratings between 0 and 2), "Medium" (ratings between 2 and 4) and "High" (ratings over 4)

In [12]:
# Rating bands, closed on the right: up to 2 -> Low, over 2 up to 4 -> Medium,
# over 4 -> High (the last edge is positive infinity, so there is no upper cap)
bins = [0, 2, 4, float('inf')]
labels = ['Low', 'Medium', 'High']

# Label every review's star rating with its band
reviews_df['rating_category'] = pd.cut(
    reviews_df['rating'],
    bins=bins,
    labels=labels,
)

# Display five randomly drawn rows as a spot check of the new column
reviews_df.sample(5)

Unnamed: 0,date,title,review,rating,reviews_length,length_category,rating_category
1298,2023-05-29 09:30:11,log in to apple's account,Log in to the official website and can't be op...,1,51,Short,Low
591,2023-05-19 02:32:18,First message for ChatGPT in iOS App Store,"Well, must to say, this is first version. Must...",4,129,Medium,Medium
1318,2023-05-18 17:20:40,TruthGPT >>,Elon Musk >> Sam Altman\nTruthGPT >> ChatGPT\n...,5,126,Medium,High
1067,2023-06-10 23:02:47,GPT 3.0,"Great, only issue is it doesn’t have any knowl...",4,70,Short,Medium
29,2023-05-22 03:43:48,Cutting-Edge Conversations in Your Pocket,Written by ChatGPT: \n\nThe ChatGPT app is a g...,5,876,Long,High
