In [1]:
%matplotlib inline
import matplotlib.pyplot as plt 
import pandas as pd
import numpy as np
import seaborn as sns
import scipy as sp
from scipy import stats
import statsmodels.api as sm

sns.set()

In [2]:
# Read the filtered Yelp reviews file into the working dataframe `df`
REVIEWS_CSV = 'yelp_reviews_filtered.csv'
df = pd.read_csv(REVIEWS_CSV)

## Hypothesis Testing

In [3]:
#create a function to draw bootstrap replicates
def bootstrap_replicate_1D(data, func, rng=None):
    """Compute a single bootstrap replicate of a statistic.

    Parameters
    ----------
    data : 1-D array-like
        Observations to resample.
    func : callable
        Statistic applied to the resampled observations (e.g. ``np.mean``).
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness. ``None`` (the default) uses NumPy's global
        ``np.random`` state, so ``np.random.seed`` still controls the draw.

    Returns
    -------
    The value of ``func`` evaluated on ``len(data)`` observations drawn
    from ``data`` with replacement.
    """
    # Both the np.random module and Generator/RandomState objects expose
    # choice(a, size=...), so one code path serves seeded and unseeded use.
    sampler = np.random if rng is None else rng
    bs_sample = sampler.choice(data, size=len(data))
    return func(bs_sample)


def draw_bs_reps(data, func, size=1, rng=None):
    """Draw bootstrap replicates.

    Parameters
    ----------
    data : 1-D array-like
        Observations to resample.
    func : callable
        Statistic computed on every resample; must return a scalar.
    size : int, optional
        Number of replicates to draw (default 1).
    rng : numpy.random.Generator or numpy.random.RandomState, optional
        Source of randomness shared by all replicates; pass a seeded
        generator for reproducible results. ``None`` uses ``np.random``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(size,)`` holding one replicate per entry.
    """
    # Convert once so e.g. a pandas Series is not re-converted on every draw
    values = np.asarray(data)

    # Initialize array of replicates: bs_replicates
    bs_replicates = np.empty(size)

    # Generate replicates
    for i in range(size):
        bs_replicates[i] = bootstrap_replicate_1D(values, func, rng=rng)

    return bs_replicates

### Is there a significant difference between text length in high and low rated reviews?

Null hypothesis: The average text length of reviews above 3 == the average text length of reviews below 3

Alternative hypothesis: The average text length of reviews above 3 != the average text length of reviews below 3

significance level = .05

In [4]:
# Split reviews by star rating; reviews rated exactly 3 land in neither group
high_rating = df.loc[df['review_rating'].gt(3)]
low_rating = df.loc[df['review_rating'].lt(3)]

In [5]:
# Independent two-sample t-test on text length (high- vs low-rated reviews)
stats.ttest_ind(high_rating['text_length'], low_rating['text_length'])

Ttest_indResult(statistic=-27.788654464057906, pvalue=1.1406181296993607e-168)

The t-test's null hypothesis is that the two means are identical; with a p-value far below the .05 significance level, the result allows us to reject that hypothesis.

What is the probability that text_length is higher in low rated reviews?

In [6]:
# Observed gap between the group means (high-rated minus low-rated)
diff_means = high_rating['text_length'].mean() - low_rating['text_length'].mean()

# Bootstrap the mean of each group 10,000 times
bs_replicates_hl = draw_bs_reps(data=high_rating['text_length'], func=np.mean, size=10000)
bs_replicates_ll = draw_bs_reps(data=low_rating['text_length'], func=np.mean, size=10000)

# Element-wise difference gives bootstrap replicates of the difference of means
bs_replicatestl = np.subtract(bs_replicates_hl, bs_replicates_ll)

In [7]:
# Share of bootstrap differences (high - low) that are <= 0
p = np.mean(bs_replicatestl <= 0)
print('p-value =', p)

# Central 95% of the bootstrap differences
conf95 = np.percentile(bs_replicatestl, q=[2.5, 97.5])
print('95% confidence interval: ', conf95)
print('Observed difference: ', diff_means)

p-value = 1.0
95% confidence interval:  [-204.26076835 -177.0693572 ]
Observed difference:  -190.56633469385076


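Based on the results we reject the null hypothesis and are confident that the average text length is larger for lower rated reviews: the t-test p-value is far below .05, and the bootstrap 95% confidence interval for the difference in means (high − low) lies entirely below 0. Note that the value printed as `p-value` above is the fraction of bootstrap replicates with a difference ≤ 0, i.e. the estimated probability that low rated reviews are longer; it is not a p-value for the null hypothesis.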

### Is there a significant difference between text length in high and low rated companies?

Null hypothesis: The average text length for companies rated above 3 == the average text length of companies rated below 3

Alternative hypothesis: The average text length for companies rated above 3 != the average text length of companies rated below 3

significance level = .05

In [8]:
# Split reviews by the rating of the reviewed company; exactly 3 is excluded
high_company = df.loc[df['company_rating'].gt(3)]
low_company = df.loc[df['company_rating'].lt(3)]

In [9]:
# Independent two-sample t-test on text length (high- vs low-rated companies)
stats.ttest_ind(high_company['text_length'], low_company['text_length'])

Ttest_indResult(statistic=-5.418520161172166, pvalue=6.045297064899137e-08)

The t-test's null hypothesis is that the two means are identical; with a p-value far below the .05 significance level, the result allows us to reject that hypothesis.

What is the probability that text_length is longer for low rated companies?

In [10]:
# Observed gap between the group means (high-rated minus low-rated companies)
diff_means = high_company['text_length'].mean() - low_company['text_length'].mean()

# Bootstrap the mean of each group 10,000 times
bs_replicates_hcl = draw_bs_reps(data=high_company['text_length'], func=np.mean, size=10000)
bs_replicates_lcl = draw_bs_reps(data=low_company['text_length'], func=np.mean, size=10000)

# Element-wise difference gives bootstrap replicates of the difference of means
bs_replicatesctl = np.subtract(bs_replicates_hcl, bs_replicates_lcl)

In [11]:
# Share of bootstrap differences (high - low) that are <= 0
p = np.mean(bs_replicatesctl <= 0)
print('p-value =', p)

# Central 95% of the bootstrap differences
conf95 = np.percentile(bs_replicatesctl, q=[2.5, 97.5])
print('95% confidence interval: ', conf95)
print('Observed difference: ', diff_means)

p-value = 1.0
95% confidence interval:  [-61.97624924 -29.58038267]
Observed difference:  -45.423446540501914


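Based on the results we reject the null hypothesis and are confident that the average text length is larger for lower rated companies: the t-test p-value is far below .05, and the bootstrap 95% confidence interval for the difference in means (high − low) lies entirely below 0. Note that the value printed as `p-value` above is the fraction of bootstrap replicates with a difference ≤ 0, i.e. the estimated probability that reviews of low rated companies are longer; it is not a p-value for the null hypothesis.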

### Is there a significant difference between review counts for high and low rated companies?

Null hypothesis: The average review count for companies rated above 3 == the average review count of companies rated below 3

Alternative hypothesis: The average review count for companies rated above 3 != the average review count of companies rated below 3

significance level = .05

In [12]:
#create a df that will allow you to compare the review counts for each company and their company rating
# numeric_only=True restricts both aggregations to numeric columns. On
# pandas >= 2.0 the default no longer drops non-numeric columns, so if df
# holds text columns .mean() raises TypeError and .sum() concatenates strings.
# NOTE(review): review_count_x is the per-business SUM of the row-level
# review_count column; if review_count is already a per-business total
# repeated on every review row, this inflates it -- confirm against the schema.
by_business = df.groupby('business_id')
review_counts = by_business.sum(numeric_only=True)
avg_company_rating = by_business.mean(numeric_only=True)
# Columns present in both frames get the _x (sums) / _y (means) suffixes
counts_by_company = review_counts.merge(avg_company_rating, left_on=review_counts.index, right_on=avg_company_rating.index)

#split this dataframe into high and low rated companies (high company count and low company count)
hcc = counts_by_company[counts_by_company.company_rating_y > 3]
lcc = counts_by_company[counts_by_company.company_rating_y < 3]


In [13]:
# Independent two-sample t-test on summed review counts (high- vs low-rated companies)
stats.ttest_ind(hcc['review_count_x'], lcc['review_count_x'])

Ttest_indResult(statistic=-1.170286535347613, pvalue=0.24732539587879276)

The t test assumed the means are identical, and the results do not allow us to reject the null hypothesis. There is not a statistically significant difference in the average review counts for high and low rated companies

### Is there a significant difference in Funny reviews for high and low rated companies?

Null hypothesis: The average Funny reviews for companies rated above 3 == the average Funny reviews of companies rated below 3

Alternative hypothesis: The average Funny reviews for companies rated above 3 != the average Funny reviews of companies rated below 3

significance level = .05

In [14]:
# Per-business averages of the numeric columns. numeric_only=True is required
# on pandas >= 2.0, where .mean() raises TypeError if df has text columns
# instead of silently dropping them.
companies = df.groupby('business_id').mean(numeric_only=True)
# Companies whose average company_rating is exactly 3 fall in neither group
high_vote = companies[companies.company_rating > 3]
low_vote = companies[companies.company_rating < 3]

In [15]:
# Independent two-sample t-test on per-company mean funny votes
stats.ttest_ind(high_vote['funny'], low_vote['funny'])

Ttest_indResult(statistic=0.023347574144448702, pvalue=0.9814641358335048)

The t test assumed the means are identical, and the results do not allow us to reject the null hypothesis. There is not a statistically significant difference in funny reviews for high and low rated companies

### Is there a significant difference in Useful reviews for high and low rated companies?

Null hypothesis: The average Useful reviews for companies rated above 3 == the average Useful reviews of companies rated below 3

Alternative hypothesis: The average Useful reviews for companies rated above 3 != the average Useful reviews of companies rated below 3

significance level = .05

In [16]:
# Independent two-sample t-test on per-company mean useful votes
stats.ttest_ind(high_vote['useful'], low_vote['useful'])

Ttest_indResult(statistic=0.32709783735571313, pvalue=0.7449326891860704)

The t test assumed the means are identical, and the results do not allow us to reject the null hypothesis. There is not a statistically significant difference in useful reviews for high and low rated companies

### Is there a significant difference in Cool reviews for high and low rated companies?

Null hypothesis: The average Cool reviews for companies rated above 3 == the average Cool reviews of companies rated below 3

Alternative hypothesis: The average Cool reviews for companies rated above 3 != the average Cool reviews of companies rated below 3

significance level = .05

In [17]:
# Independent two-sample t-test on per-company mean cool votes
stats.ttest_ind(high_vote['cool'], low_vote['cool'])

Ttest_indResult(statistic=2.5312459335363044, pvalue=0.01449151198707585)

The t-test's null hypothesis is that the two means are identical; the p-value (≈ .014) falls below the significance level of .05, which allows us to reject that hypothesis.

What is the probability of more cool reviews in higher rated companies?

In [18]:
# Observed gap between the group means (high-rated minus low-rated companies)
diff_means = high_vote['cool'].mean() - low_vote['cool'].mean()

# Bootstrap the mean of each group 10,000 times
bs_replicates_hco = draw_bs_reps(data=high_vote['cool'], func=np.mean, size=10000)
bs_replicates_lco = draw_bs_reps(data=low_vote['cool'], func=np.mean, size=10000)

# Element-wise difference gives bootstrap replicates of the difference of means
bs_replicates_cool = np.subtract(bs_replicates_hco, bs_replicates_lco)

In [19]:
# Share of bootstrap differences (high - low) that are >= 0
p = np.mean(bs_replicates_cool >= 0)
print('p-value =', p)

# Central 95% of the bootstrap differences
conf95 = np.percentile(bs_replicates_cool, q=[2.5, 97.5])
print('95% confidence interval: ', conf95)
print('Observed difference: ', diff_means)

p-value = 0.9978
95% confidence interval:  [0.1207656  0.80764126]
Observed difference:  0.44458562967754867


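Based on the results we reject the null hypothesis and are confident that the average cool reviews are higher for higher rated companies: the t-test p-value is below .05, and the bootstrap 95% confidence interval for the difference in means (high − low) lies entirely above 0. Note that the value printed as `p-value` above is the fraction of bootstrap replicates with a difference ≥ 0, i.e. the estimated probability that higher rated companies receive more cool votes; it is not a p-value for the null hypothesis.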

### Is there a significant difference in Funny reviews for high and low review ratings?

Null hypothesis: The average Funny reviews for reviews rated above 3 == the average Funny reviews for reviews rated below 3

Alternative hypothesis: The average Funny reviews for reviews rated above 3 != the average Funny reviews for reviews rated below 3

significance level = .05

In [20]:
# Per-review averages of the numeric columns. numeric_only=True is required
# on pandas >= 2.0, where .mean() raises TypeError if df has text columns
# instead of silently dropping them.
# NOTE(review): if review_id is unique per row this groupby only re-indexes
# the numeric columns by review_id -- confirm whether duplicates exist.
reviews = df.groupby('review_id').mean(numeric_only=True)
# Reviews rated exactly 3 fall in neither group
highr_vote = reviews[reviews.review_rating > 3]
lowr_vote = reviews[reviews.review_rating < 3]


In [21]:
# Independent two-sample t-test on funny votes (high- vs low-rated reviews)
stats.ttest_ind(highr_vote['funny'], lowr_vote['funny'])

Ttest_indResult(statistic=-8.131779555795593, pvalue=4.326425729250118e-16)

The t-test's null hypothesis is that the two means are identical; with a p-value far below the .05 significance level, the result allows us to reject that hypothesis.

What is the probability that average funny reviews are higher for lower review ratings?

In [22]:
# Observed gap between the group means (high-rated minus low-rated reviews)
diff_means = highr_vote['funny'].mean() - lowr_vote['funny'].mean()

# Bootstrap the mean of each group 10,000 times
bs_replicates_hrf = draw_bs_reps(data=highr_vote['funny'], func=np.mean, size=10000)
bs_replicates_lrf = draw_bs_reps(data=lowr_vote['funny'], func=np.mean, size=10000)

# Element-wise difference gives bootstrap replicates of the difference of means
bs_replicatesrf = np.subtract(bs_replicates_hrf, bs_replicates_lrf)

In [23]:
# Share of bootstrap differences (high - low) that are <= 0
p = np.mean(bs_replicatesrf <= 0)
print('p-value =', p)

# Central 95% of the bootstrap differences
conf95 = np.percentile(bs_replicatesrf, q=[2.5, 97.5])
print('95% confidence interval: ', conf95)
print('Observed difference: ', diff_means)

p-value = 1.0
95% confidence interval:  [-0.23212837 -0.14206482]
Observed difference:  -0.1876202869464002


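Based on the results we reject the null hypothesis and are confident that the average funny reviews are higher for lower review ratings: the t-test p-value is far below .05, and the bootstrap 95% confidence interval for the difference in means (high − low) lies entirely below 0. Note that the value printed as `p-value` above is the fraction of bootstrap replicates with a difference ≤ 0, i.e. the estimated probability that low rated reviews receive more funny votes; it is not a p-value for the null hypothesis.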

### Is there a significant difference in Useful reviews for high and low review ratings?

Null hypothesis: The average Useful reviews for reviews rated above 3 == the average Useful reviews for reviews rated below 3

Alternative hypothesis: The average Useful reviews for reviews rated above 3 != the average Useful reviews for reviews rated below 3

significance level = .05

In [24]:
# Independent two-sample t-test on useful votes (high- vs low-rated reviews)
stats.ttest_ind(highr_vote['useful'], lowr_vote['useful'])

Ttest_indResult(statistic=-6.433998189906983, pvalue=1.2540863734333058e-10)

The t-test's null hypothesis is that the two means are identical; with a p-value far below the .05 significance level, the result allows us to reject that hypothesis.

What is the probability that average useful reviews are higher for lower review ratings?

In [25]:
# Observed gap between the group means (high-rated minus low-rated reviews)
diff_means = highr_vote['useful'].mean() - lowr_vote['useful'].mean()

# Bootstrap the mean of each group 10,000 times
bs_replicates_hru = draw_bs_reps(data=highr_vote['useful'], func=np.mean, size=10000)
bs_replicates_lru = draw_bs_reps(data=lowr_vote['useful'], func=np.mean, size=10000)

# Element-wise difference gives bootstrap replicates of the difference of means
bs_replicatesru = np.subtract(bs_replicates_hru, bs_replicates_lru)

In [26]:
# Share of bootstrap differences (high - low) that are <= 0
p = np.mean(bs_replicatesru <= 0)
print('p-value =', p)

# Central 95% of the bootstrap differences
conf95 = np.percentile(bs_replicatesru, q=[2.5, 97.5])
print('95% confidence interval: ', conf95)
print('Observed difference: ', diff_means)

p-value = 1.0
95% confidence interval:  [-0.2803227  -0.14694699]
Observed difference:  -0.21364267929634817


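Based on the results we reject the null hypothesis and are confident that the average useful reviews are higher for lower review ratings: the t-test p-value is far below .05, and the bootstrap 95% confidence interval for the difference in means (high − low) lies entirely below 0. Note that the value printed as `p-value` above is the fraction of bootstrap replicates with a difference ≤ 0, i.e. the estimated probability that low rated reviews receive more useful votes; it is not a p-value for the null hypothesis.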

### Is there a significant difference in Cool reviews for high and low review ratings?

Null hypothesis: The average Cool reviews for reviews rated above 3 == the average Cool reviews for reviews rated below 3

Alternative hypothesis: The average Cool reviews for reviews rated above 3 != the average Cool reviews for reviews rated below 3

significance level = .05

In [27]:
# Independent two-sample t-test on cool votes (high- vs low-rated reviews)
stats.ttest_ind(highr_vote['cool'], lowr_vote['cool'])

Ttest_indResult(statistic=21.669653690837418, pvalue=1.1898962930781724e-103)

The t-test's null hypothesis is that the two means are identical; with a p-value far below the .05 significance level, the result allows us to reject that hypothesis.

What is the probability that average cool reviews are higher for higher review ratings?

In [28]:
# Observed gap between the group means (high-rated minus low-rated reviews)
diff_means = highr_vote['cool'].mean() - lowr_vote['cool'].mean()

# Bootstrap the mean of each group 10,000 times
bs_replicates_hrc = draw_bs_reps(data=highr_vote['cool'], func=np.mean, size=10000)
bs_replicates_lrc = draw_bs_reps(data=lowr_vote['cool'], func=np.mean, size=10000)

# Element-wise difference gives bootstrap replicates of the difference of means
bs_replicatesrc = np.subtract(bs_replicates_hrc, bs_replicates_lrc)

In [29]:
# Share of bootstrap differences (high - low) that are >= 0
p = np.mean(bs_replicatesrc >= 0)
print('p-value =', p)

# Central 95% of the bootstrap differences
conf95 = np.percentile(bs_replicatesrc, q=[2.5, 97.5])
print('95% confidence interval: ', conf95)
print('Observed difference: ', diff_means)

p-value = 1.0
95% confidence interval:  [0.46870195 0.56249207]
Observed difference:  0.5147021452491629


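Based on the results we reject the null hypothesis and are confident that the average cool reviews are higher for higher review ratings: the t-test p-value is far below .05, and the bootstrap 95% confidence interval for the difference in means (high − low) lies entirely above 0. Note that the value printed as `p-value` above is the fraction of bootstrap replicates with a difference ≥ 0, i.e. the estimated probability that high rated reviews receive more cool votes; it is not a p-value for the null hypothesis.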