In [23]:
import pandas as pd

# Load the tab-separated review file. on_bad_lines="skip" makes pandas skip
# malformed lines instead of raising, so the frame can hold fewer rows than the
# file has lines.
df = pd.read_csv("camera_dataset.tsv", delimiter="\t", on_bad_lines="skip")
print(df.head())

# list the columns (printed explicitly: a bare expression in the middle of a
# cell is evaluated but never displayed, only the cell's last expression is)
print(df.columns.tolist())

# list the unique values for column "product_category"
df["product_category"].unique()

  marketplace  customer_id       review_id  product_id  product_parent  \
0          US      2975964  R1NBG94582SJE2  B00I01JQJM       860486164   
1          US     23526356  R273DCA6Y0H9V7  B00TCO0ZAA       292641483   
2          US     52764145   RQVOXO7WUOFK6  B00B7733E0        75825744   
3          US     47348933  R1KWKSF21PO6HO  B006ZN4U34       789352955   
4          US     33680700  R38H3UO1J190GI  B00HUEBGMU        19067902   

                                       product_title product_category  \
0  GoPro Rechargeable Battery 2.0 (HERO3/HERO3+ o...           Camera   
1  Professional 58mm Center Pinch Lens Cap for CA...           Camera   
2  Spy Tec Z12 Motion Activated Intelligent Secur...           Camera   
3  Celestron UpClose G2 10x25 Monocular, Black (7...           Camera   
4  Vidpro XM-L Wired Lavalier microphone - 20' Au...           Camera   

   star_rating  helpful_votes  total_votes vine verified_purchase  \
0            5              0            0    N

array(['Camera'], dtype=object)

In [24]:
# Keep only the rows of df whose "product_category" is not null.
df_remove_null = df.loc[df["product_category"].notna()]

# Non-null value count per column of the filtered frame df_remove_null.
df_remove_null.count()



marketplace          1800845
customer_id          1800845
review_id            1800845
product_id           1800845
product_parent       1800845
product_title        1800845
product_category     1800845
star_rating          1800845
helpful_votes        1800845
total_votes          1800845
vine                 1800845
verified_purchase    1800845
review_headline      1800827
review_body          1800753
review_date          1800836
dtype: int64

In [25]:
# Count how many rows in dataframe df have the literal placeholder "[null]" in
# the column "product_category". The placeholder is defined once so that the
# value searched for and the value named in the message cannot drift apart
# (previously the comparison used '-' while the message reported '[null]').
null_placeholder = "[null]"
null_count = int(df["product_category"].eq(null_placeholder).sum())
# Print the result
print(f"Number of rows with '{null_placeholder}' value: {null_count}")


Number of rows with '[null]' value: 0


In [26]:
# Columns kept for the analysis: customer_id, product_id, product_title,
# star_rating, helpful_votes, total_votes, vine, verified_purchase,
# review_headline, review_body

trimmed_df = df.loc[
    :,
    [
        "customer_id",
        "product_id",
        "product_title",
        "star_rating",
        "helpful_votes",
        "total_votes",
        "vine",
        "verified_purchase",
        "review_headline",
        "review_body",
    ],
]

In [27]:
# Preview the first 50 rows of the trimmed frame.
trimmed_df.iloc[:50]

Unnamed: 0,customer_id,product_id,product_title,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body
0,2975964,B00I01JQJM,GoPro Rechargeable Battery 2.0 (HERO3/HERO3+ o...,5,0,0,N,Y,Five Stars,ok
1,23526356,B00TCO0ZAA,Professional 58mm Center Pinch Lens Cap for CA...,5,0,0,N,Y,Love it!!!,"Perfect, even sturdier than the original!"
2,52764145,B00B7733E0,Spy Tec Z12 Motion Activated Intelligent Secur...,2,1,1,N,Y,Another Motion Detect Fail,"If the words, &#34;Cheap Chinese Junk&#34; com..."
3,47348933,B006ZN4U34,"Celestron UpClose G2 10x25 Monocular, Black (7...",5,0,0,N,Y,Exactly what I wanted and expected.,Exactly what I wanted and expected. Perfect fo...
4,33680700,B00HUEBGMU,Vidpro XM-L Wired Lavalier microphone - 20' Au...,5,1,1,N,Y,Good mic at a Good Price...Not Canon Though.,I will look past the fact that they tricked me...
5,30301059,B008MW6Y12,NIX 8 inch Hi-Res Digital Photo Frame with Mot...,3,0,0,N,Y,"The controls are pretty slow, and I can't get ...","The controls are pretty slow, and I can't get ..."
6,28282645,B00TE8XKIS,Polaroid ZIP Mobile Printer Parent ASIN,3,8,8,N,N,"Fun toy for making stickers, but expensive to ...",The printer came in a small fairly plain box w...
7,502818,B00ZKDUFBQ,GeekPro 2.0-Inch WIFI HD 1080P 12MP Sports Cam...,5,0,1,N,Y,Five Stars,Great camera for the price.
8,1481233,B010BZ7S2Q,Sony HDR-AZ1VR Action Cam Mini Camcorder 1080p...,5,0,2,N,Y,Five Stars,Product is very good and satisfactory.<br /><b...
9,27885926,B00HRXSSRA,"ChiliPower DMW-BLC12, DMW-BLC12E, DMW-BLC12PP ...",1,0,0,N,Y,Sucky.,"Lasted a few hours with its first charge, whic..."


In [28]:
# Number of rows whose helpful_votes is strictly greater than 0.
count = trimmed_df['helpful_votes'].gt(0).sum()
print(count)


707547


In [29]:
# Number of rows whose total_votes is strictly greater than 0.
count = trimmed_df['total_votes'].gt(0).sum()
print(count)


843673


In [30]:
# Random sampler to pull 10,000 rows from trimmed_df. random_state is fixed so
# that a fresh kernel draws the same rows every time; without it every count,
# distribution and model score computed from random_sample changes per run.
random_sample = trimmed_df.sample(n=10000, random_state=42)

# Number of sampled reviews with a 1-star rating.
count = int(random_sample['star_rating'].eq(1).sum())
print(f"Number of rows = 1: {count}")

Number of rows = 1: 942


In [31]:
# Remove rows with a null "review_body" or "review_headline" from the random
# sample, so every remaining value in those two columns is present.
random_sample = random_sample.dropna(subset=['review_body', 'review_headline'])


In [32]:
from nltk.corpus import stopwords
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from transformers import GPT2Tokenizer, GPT2Model

# Initialize GPT-2 tokenizer. The end-of-sequence token is reused as the
# padding token so that padding='max_length' below has a token to pad with.
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

stop_words = set(stopwords.words('english'))


def _clean_review_text(text_series, min_word_length=3):
    """Normalise a Series of review strings before tokenisation.

    Lower-cases the text, strips every character that is neither a word
    character nor whitespace, strips digit runs, then drops English stop words
    and words shorter than ``min_word_length`` characters.

    The patterns are raw strings passed with ``regex=True``. Since pandas 2.0
    ``Series.str.replace`` treats the pattern as a literal string by default,
    so without the flag neither punctuation nor digits would be removed.

    Args:
        text_series: Series of strings (no nulls).
        min_word_length: shortest word length that is kept.

    Returns:
        A Series of cleaned, single-space-joined strings with the same index.
    """
    normalised = (
        text_series.str.lower()
        .str.replace(r'[^\w\s]', '', regex=True)
        .str.replace(r'\d+', '', regex=True)
    )
    return normalised.apply(
        lambda text: ' '.join(
            word for word in text.split()
            if word not in stop_words and len(word) >= min_word_length
        )
    )


def _encode_texts(text_series, max_length):
    """Encode each string into ids and an attention mask of ``max_length``.

    Every text is truncated or padded to exactly ``max_length`` tokens and
    returned as PyTorch tensors (``return_tensors='pt'``).
    """
    return text_series.apply(lambda text: tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    ))


# Clean both text columns in place and keep the raw GPT-2 token strings next to
# them as "<column>_tokens".
for column in ('review_body', 'review_headline'):
    random_sample[column] = _clean_review_text(random_sample[column])
    random_sample[f'{column}_tokens'] = random_sample[column].apply(
        lambda text: tokenizer.tokenize(text) if isinstance(text, str) else []
    )


encoded_reviews = _encode_texts(random_sample['review_body'], max_length=1024)  # GPT-2 can handle longer sequences
encoded_headlines = _encode_texts(random_sample['review_headline'], max_length=128)

# squeeze() drops the leading batch dimension of size 1 that return_tensors='pt'
# adds, leaving one 1-D tensor per review.
random_sample['review_input_ids'] = encoded_reviews.apply(lambda enc: enc['input_ids'].squeeze())
random_sample['review_attention_mask'] = encoded_reviews.apply(lambda enc: enc['attention_mask'].squeeze())

random_sample['headline_input_ids'] = encoded_headlines.apply(lambda enc: enc['input_ids'].squeeze())
random_sample['headline_attention_mask'] = encoded_headlines.apply(lambda enc: enc['attention_mask'].squeeze())



In [33]:
def map_stars_to_sentiment(star_rating):
    """Translate a 1-5 star rating into its sentiment word.

    Args:
        star_rating: the rating value; compared with ``==`` against 1..5.

    Returns:
        'terrible', 'bad', 'ok', 'good' or 'great' for ratings 1 through 5,
        and 'unknown' for anything else.
    """
    labels_by_stars = (
        (1, 'terrible'),
        (2, 'bad'),
        (3, 'ok'),
        (4, 'good'),
        (5, 'great'),
    )
    # Equality scan (not a dict lookup) so unhashable or odd inputs behave
    # exactly as they would in a chain of == comparisons.
    for stars, label in labels_by_stars:
        if star_rating == stars:
            return label
    return 'unknown'

# Sentiment word for every sampled review, derived from its star rating.
random_sample['sentiment'] = random_sample['star_rating'].map(map_stars_to_sentiment)

# Ordinal label per sentiment word: terrible=0 ... great=4. A sentiment that is
# not in this mapping (e.g. 'unknown') becomes NaN in sentiment_label.
sentiment_mapping = {
    label: index
    for index, label in enumerate(['terrible', 'bad', 'ok', 'good', 'great'])
}

random_sample['sentiment_label'] = random_sample['sentiment'].map(sentiment_mapping)

# One indicator column per sentiment word ("sentiment_<word>"), appended to the sample.
sentiment_dummies = pd.get_dummies(random_sample['sentiment'], prefix='sentiment')
random_sample = pd.concat([random_sample, sentiment_dummies], axis=1)

# Class balance, as absolute counts and as percentages.
print("\nSentiment Distribution:")
print(random_sample['sentiment'].value_counts())

print("\nSentiment Distribution (%):")
print(random_sample['sentiment'].value_counts(normalize=True) * 100)



Sentiment Distribution:
sentiment
great       5910
good        1906
terrible     942
ok           754
bad          487
Name: count, dtype: int64

Sentiment Distribution (%):
sentiment
great       59.105911
good        19.061906
terrible     9.420942
ok           7.540754
bad          4.870487
Name: proportion, dtype: float64


In [35]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Function to pad sequences to the same length
def pad_sequences(sequences, max_length=None, pad_value=0, dtype=None):
    """Pad or truncate 1-D sequences to a common length.

    Each sequence is right-padded with ``pad_value`` or cut to ``max_length``
    and stacked into one 2-D array.

    Args:
        sequences: iterable of 1-D sequences (lists, arrays, ...). It is
            materialised once, so a one-shot iterator or generator works too.
        max_length: target length. When None, the length of the longest
            sequence is used.
        pad_value: value written into the padded positions (default 0).
        dtype: dtype of the result. When None it is the common dtype of the
            non-empty input sequences, so integer ids stay integers instead of
            being upcast to float by the padding.

    Returns:
        ``np.ndarray`` of shape ``(len(sequences), max_length)``. An empty
        ``sequences`` gives an array with zero rows instead of raising.
    """
    arrays = [np.asarray(seq) for seq in sequences]

    if not arrays:
        # max() over nothing would raise; return a well-formed empty batch.
        empty_dtype = np.float64 if dtype is None else dtype
        return np.empty((0, max_length or 0), dtype=empty_dtype)

    if max_length is None:
        max_length = max(len(arr) for arr in arrays)

    if dtype is None:
        # Ignore empty sequences: np.asarray([]) is float64 and would
        # otherwise force a float result for integer data.
        seen_dtypes = {arr.dtype for arr in arrays if arr.size}
        dtype = np.result_type(*seen_dtypes) if seen_dtypes else np.float64

    padded = np.full((len(arrays), max_length), pad_value, dtype=dtype)
    for row, arr in zip(padded, arrays):
        kept = arr[:max_length]
        # `row` is a view into `padded`, so this writes into the result.
        row[:len(kept)] = kept

    return padded




STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Review Body - Logistic Regression:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       181
           1       0.00      0.00      0.00        84
           2       0.05      0.01      0.02       170
           3       0.27      0.05      0.09       383
           4       0.60      0.94      0.73      1182

    accuracy                           0.57      2000
   macro avg       0.18      0.20      0.17      2000
weighted avg       0.41      0.57      0.45      2000


Review Body - Random Forest:
              precision    recall  f1-score   support

           0       0.25      0.01      0.01       181
           1       0.00      0.00      0.00        84
           2       0.00      0.00      0.00       170
           3       0.29      0.02      0.03       383
           4       0.59      0.99      0.74      1182

    accuracy                           0.59      2000
   macro avg       0.23      0.20      0.16      2000
weighted avg

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


['best_headline_model.joblib']