In [1]:
# import dependencies and the data to be analyzed
import pandas as pd
import numpy as np

# Read the review table from the zipped CSV (compression is inferred from the file name)
df = pd.read_csv("vine_table.zip", compression="infer")
# Preview the first five rows
df.head(n=5)

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
0,RTIS3L2M1F5SM,5,0,0,N,Y
1,R1ZV7R40OLHKD,5,0,0,N,Y
2,R3BH071QLH8QMC,1,0,1,N,Y
3,R127K9NTSXA2YH,3,0,0,N,Y
4,R32ZWUXDJPW27Q,4,0,0,N,Y


In [2]:
# Minimum number of total votes a review must have to be included in the analysis
# (used below as an inclusive ">=" threshold on the "total_votes" column)
total_votes: int = 20

In [3]:
# Keep only the reviews whose total vote count meets the `total_votes` threshold (inclusive)
twenty_total_df = df.loc[lambda frame: frame["total_votes"] >= total_votes]
twenty_total_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
55,R4PKAZRQJJX14,1,21,34,N,N
74,R2CI0Y288CC7E2,1,21,35,N,Y
209,R127WEQY2FM1T3,1,147,175,N,Y
289,R3EZ0EPYLDA34S,1,14,31,N,Y
483,R2FJ94555FZH32,2,55,60,N,N


In [4]:
# Narrow the selection to reviews where at least half of the votes were "helpful"
helpful_df = twenty_total_df.loc[
    lambda frame: frame["helpful_votes"] / frame["total_votes"] >= .5
]
helpful_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
55,R4PKAZRQJJX14,1,21,34,N,N
74,R2CI0Y288CC7E2,1,21,35,N,Y
209,R127WEQY2FM1T3,1,147,175,N,Y
483,R2FJ94555FZH32,2,55,60,N,N
537,R1U3AR67RE273L,1,51,65,N,Y


In [5]:
# Vine subset: rows of the helpful reviews flagged "Y" in the "vine" column
vine_df = helpful_df.loc[helpful_df["vine"] == "Y"]
vine_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
32611,R3KKUSGFZWSUIY,5,56,63,Y,N
33112,R10FO5UKKVZBK2,3,23,23,Y,N
69680,RM4KSGEOR7MU1,5,19,24,Y,N
155361,RG7VRMYLEXD23,4,22,26,Y,N
239327,R11O4YSCPSNL6L,3,20,26,Y,N


In [6]:
# Non-vine subset: rows of the helpful reviews flagged "N" in the "vine" column
no_vine_df = helpful_df.loc[helpful_df["vine"] == "N"]
no_vine_df.head()

Unnamed: 0,review_id,star_rating,helpful_votes,total_votes,vine,verified_purchase
55,R4PKAZRQJJX14,1,21,34,N,N
74,R2CI0Y288CC7E2,1,21,35,N,Y
209,R127WEQY2FM1T3,1,147,175,N,Y
483,R2FJ94555FZH32,2,55,60,N,N
537,R1U3AR67RE273L,1,51,65,N,Y


In [7]:
# Number of vine reviews, i.e. how many entries the review_id column of vine_df holds
vine_length = len(vine_df["review_id"])
vine_length

94

In [8]:
# Number of non-vine reviews, i.e. how many entries the review_id column of no_vine_df holds
no_vine_length = len(no_vine_df["review_id"])
no_vine_length

40471

In [9]:
# Tally the vine reviews per star rating (most frequent rating first).
# Series.value_counts() is the direct API for counting one column;
# DataFrame.value_counts(subset=...) expects column labels, not a Series.
vine_df["star_rating"].value_counts()

star_rating
5    48
4    24
3    16
2     5
1     1
dtype: int64

In [10]:
# Count the 5-star vine reviews by matching the rating value itself.
# Taking the first entry of a value_counts() result would give the count of the
# most frequent rating, which is the 5-star count only when 5 stars is the mode.
# int(...) keeps the result a plain Python int.
vine_5 = int((vine_df["star_rating"] == 5).sum())
vine_5

48

In [11]:
# Tally the non-vine reviews per star rating (most frequent rating first).
# Series.value_counts() is the direct API for counting one column;
# DataFrame.value_counts(subset=...) expects column labels, not a Series.
no_vine_df["star_rating"].value_counts()

star_rating
5    15663
1    10303
4     6738
3     4379
2     3388
dtype: int64

In [12]:
# Count the 5-star non-vine reviews by matching the rating value itself.
# Taking the first entry of a value_counts() result would give the count of the
# most frequent rating, which is the 5-star count only when 5 stars is the mode.
# int(...) keeps the result a plain Python int.
no_vine_5 = int((no_vine_df["star_rating"] == 5).sum())
no_vine_5

15663

In [13]:
# Share of the filtered vine reviews that are 5-star, expressed as a
# fraction between 0 and 1 (multiply by 100 for a percentage)
vine_5_ratio = vine_5/vine_length
vine_5_ratio

0.5106382978723404

In [14]:
# Share of the filtered non-vine reviews that are 5-star, expressed as a
# fraction between 0 and 1 (multiply by 100 for a percentage)
no_vine_5_ratio = no_vine_5/no_vine_length
no_vine_5_ratio

0.38701786464381904