# STATS

In [26]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import wrangle

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk.sentiment

In [21]:
train, val, test = wrangle.wrangle_glassdoor()
three_star = train[train.binned_rating_int == 3]
four_star = train[train.binned_rating_int == 4]

In [6]:
four_star.head()

Unnamed: 0_level_0,pros,cons,name,rating,ceo_approval,friend_recommendation,pros_cleaned,pros_lemmatized,cons_cleaned,cons_lemmatized,binned_rating,binned_rating_int
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
https://www.glassdoor.com/Reviews/Perficient-Reviews-E9329.htm,Perficient is an ethical company that actually...,"None at all, love, love, love this company!\nI...",Perficient,4.1,87.0,80.0,perficient is an ethical company that actually...,perficient ethical company actually value empl...,none at all love love love this company\nit is...,none love love love company good company canno...,Four,4
https://www.glassdoor.com/Reviews/MIT-Reviews-E2889.htm,"Very inspiring place to work at, to feel that ...",Depends on the project to how much organizatio...,MIT,4.4,90.0,86.0,very inspiring place to work at to feel that s...,inspiring place work feel something new happen...,depends on the project to how much organizatio...,depends project much organization team include...,Four,4
https://www.glassdoor.com/Reviews/Morningstar-Reviews-E3299.htm,"- Coworkers are amicable, and they're overall ...",- Base pay for the area could be slightly high...,Morningstar,4.1,94.0,86.0,coworkers are amicable and they ' re overall v...,coworkers amicable ' overall supportive unlimi...,base pay for the area could be slightly higher...,base pay area could slightly higher bonus prog...,Four,4
https://www.glassdoor.com/Reviews/L-Or%C3%A9al-Reviews-E3470.htm,Good pay and benefits. Paid time off. Work lif...,No cons at the moment\nonly Location could be ...,L'Oréal,4.0,92.0,79.0,good pay and benefits paid time off work life ...,good pay benefit paid time work life balance c...,no cons at the moment\nonly location could be ...,con moment location could better politics stre...,Four,4
https://www.glassdoor.com/Reviews/Novartis-Reviews-E6667.htm,"Strong rewards and incentives: salary, bonus, ...","Leadership says one thing (e.g., be unbossed) ...",Novartis,4.0,81.0,78.0,strong rewards and incentives salary bonus sto...,strong reward incentive salary bonus stock hea...,leadership says one thing eg be unbossed but c...,leadership say one thing eg unbossed consisten...,Four,4


# Significance of words (tf-idf)

The IDF (Inverse Document Frequency) score is calculated to measure the importance of a word within a collection of documents. The IDF score indicates how rare or common a word is across the entire corpus.

- A higher IDF score suggests that a word is more unique and significant within the collection of documents.

In [7]:
def generate_trigrams(lemmatized):
    words = lemmatized.split()
    trigrams = []
    
    if len(words) < 3:
        return trigrams

    for i in range(len(words) - 1):
        if len(words[i]) > 1 and len(words[i+1]) > 1:
            trigram = " ".join(words[i:i+3])
            trigrams.append(trigram)

    return trigrams

**Calculate the TF score**

In [8]:
documents = {
    'pros': " ".join(four_star.pros_lemmatized.values),
    'cons': " ".join(four_star.cons_lemmatized.values),
}

# Create an empty list to store the TF dataframes
tfs = []

# Iterate through documents and their corresponding text
for doc, text in documents.items():
    # Split the text into words, count their occurrences, and reset the index
    word_counts = pd.Series(generate_trigrams(text)).value_counts()

    # Rename the columns for clarity and calculate the term frequency (TF)
    tf_df = pd.DataFrame(word_counts).reset_index()
    tf_df.columns = ['word', 'count']
    tf_df["tf"] = tf_df['count'] / len(generate_trigrams(text))
    tf_df = tf_df.assign(doc = doc)

    # Append the TF dataframe to the list
    tfs.append(tf_df)

In [9]:
tfs[1]

Unnamed: 0,word,count,tf,doc
0,work life balance,317,0.001410,cons
1,none none none,101,0.000449,cons
2,pay could better,72,0.000320,cons
3,great place work,61,0.000271,cons
4,get thing done,48,0.000213,cons
...,...,...,...,...
184102,make sure company,1,0.000004,cons
184103,sure company applying,1,0.000004,cons
184104,company applying within,1,0.000004,cons
184105,applying within cox,1,0.000004,cons


**Calculate IDF score**

In [10]:
def idf(word):
    """
    calculates the Inverse Document Frequency (IDF) for a given word in a collection of documents.
    """
    n_occurences = sum([1 for doc in documents.values() if word in doc])
    return len(documents) / (n_occurences + 1)

In [11]:
# Calculate the if-idf score of each word and add to the if dataframe
tf_idf_scores = pd.concat(tfs, axis=0).assign(idf=lambda df: df.word.apply(idf)).assign(tf_idf=lambda df: df.idf * df.tf)

In [12]:
tf_idf_scores.head()

Unnamed: 0,word,count,tf,doc,idf,tf_idf
0,work life balance,1066,0.005938,pros,0.666667,0.003958
1,great place work,287,0.001599,pros,0.666667,0.001066
2,good work life,284,0.001582,pros,0.666667,0.001055
3,great company work,165,0.000919,pros,0.666667,0.000613
4,great work life,153,0.000852,pros,0.666667,0.000568


**Add sentiment scores for each word**

In [13]:
# use polarity_scores from that object
sia = nltk.sentiment.SentimentIntensityAnalyzer()
# grab the sentiment from each of the texts as they stand
tf_idf_scores['sentiment'] = tf_idf_scores.word.apply(lambda doc: sia.polarity_scores(doc)['compound'])

In [14]:
tf_idf_scores.head()

Unnamed: 0,word,count,tf,doc,idf,tf_idf,sentiment
0,work life balance,1066,0.005938,pros,0.666667,0.003958,0.0
1,great place work,287,0.001599,pros,0.666667,0.001066,0.6249
2,good work life,284,0.001582,pros,0.666667,0.001055,0.4404
3,great company work,165,0.000919,pros,0.666667,0.000613,0.6249
4,great work life,153,0.000852,pros,0.666667,0.000568,0.6249


**Add to csv**

In [15]:
# tf_idf_scores.to_csv("./data/three_star_tf_idf_scores.csv", mode="w")

In [16]:
# tf_idf_scores.to_csv("./data/four_star_tf_idf_scores.csv", mode="w")

### Test documents together

In [22]:
train = train.copy()
three_star = pd.read_csv("./data/three_star_tf_idf_scores.csv", index_col=0)
four_star = pd.read_csv("./data/four_star_tf_idf_scores.csv", index_col= 0)
three_star.head(2)

Unnamed: 0,word,count,tf,doc,idf,tf_idf,sentiment
0,work life balance,1320,0.003417,pros,0.666667,0.002278,0.0
1,great place work,495,0.001281,pros,0.666667,0.000854,0.6249


In [23]:
train.head(2)

Unnamed: 0_level_0,pros,cons,name,rating,ceo_approval,friend_recommendation,pros_cleaned,pros_lemmatized,cons_cleaned,cons_lemmatized,binned_rating,binned_rating_int
url,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
https://www.glassdoor.com/Reviews/Perficient-Reviews-E9329.htm,Perficient is an ethical company that actually...,"None at all, love, love, love this company!\nI...",Perficient,4.1,87.0,80.0,perficient is an ethical company that actually...,perficient ethical company actually value empl...,none at all love love love this company\nit is...,none love love love company good company canno...,Four,4
https://www.glassdoor.com/Reviews/Farmers-Insurance-Group-Reviews-E3955.htm,"This company is the best ever.\nLarge, establi...",I have nothing bad to say.\nManagement company...,Farmers Insurance Group,3.4,37.0,52.0,this company is the best ever\nlarge establish...,company best ever large established company so...,i have nothing bad to say\nmanagement company ...,nothing bad say management company get paid re...,Three,3


In [24]:
def Hypothesis_check(p_value, alpha = 0.05):
    if p_value < alpha:
        print("Reject null")
    else:
        print("Fail to reject null")

**1. Do employee reviews (pros and cons) significantly differ in terms of sentiment between 3-star and 4-star rated companies?**

- **Null Hypothesis (H0):** Employee reviews' sentiment does not significantly differ between 3-star and 4-star rated companies.
- **Alternate Hypothesis (H1):** Employee reviews' sentiment significantly differs between 3-star and 4-star rated companies.
   
   - Test: Two-sample t-test or Mann-Whitney U test if the sentiment scores are not normally distributed.

**Three star rating**

**Four star rating**

**2. Is there a significant difference in the frequency of trigrams between 3-star and 4-star rated companies in employee reviews (pros and cons)?**

- **Null Hypothesis (H0):** There is no significant difference in the frequency of trigrams between 3-star and 4-star rated companies in employee reviews.
- **Alternate Hypothesis (H1):** There is a significant difference in the frequency of trigrams between 3-star and 4-star rated companies in employee reviews.
   - Test: Chi-squared test or Fisher's exact test if the data is categorical.

**3. Does the CEO approval rating have a significant impact on the company's overall rating?**

- **Null Hypothesis (H0):** CEO approval rating does not have a significant impact on the company's overall rating.
- **Alternate Hypothesis (H1):** CEO approval rating has a significant impact on the company's overall rating.
   
   - Test: Pearson correlation coefficient or Spearman rank correlation.

**4. Are there significant differences in the friend recommendation scores between employees in 3-star and 4-star rated companies?**

- **Null Hypothesis (H0):** There are no significant differences in the friend recommendation scores between employees in 3-star and 4-star rated companies.
- **Alternate Hypothesis (H1):** There are significant differences in the friend recommendation scores between employees in 3-star and 4-star rated companies.

   - Test: Two-sample t-test or Mann-Whitney U test.

**5. Does the sentiment of employee reviews correlate with CEO approval ratings?**

- **Null Hypothesis (H0):** The sentiment of employee reviews does not correlate with CEO approval ratings.
- **Alternate Hypothesis (H1):** The sentiment of employee reviews correlates with CEO approval ratings.

   - Test: Pearson correlation coefficient or Spearman rank correlation.

**6. Is there a significant relationship between the sentiment of employee reviews and the friend recommendation scores?**

- **Null Hypothesis (H0):** There is no significant relationship between the sentiment of employee reviews and the friend recommendation scores.
- **Alternate Hypothesis (H1):** There is a significant relationship between the sentiment of employee reviews and the friend recommendation scores.

   - Test: Pearson correlation coefficient or Spearman rank correlation.

**7. Do certain trigrams appear significantly more frequently in pros compared to cons in employee reviews?**

- **Null Hypothesis (H0):** Certain trigrams do not appear significantly more frequently in pros compared to cons in employee reviews.
- **Alternate Hypothesis (H1):** Certain trigrams appear significantly more frequently in pros compared to cons in employee reviews.
   
   - Test: Chi-squared test or Fisher's exact test.

**8. Is there a significant difference in the sentiment of pros and cons in employee reviews?**

- **Null Hypothesis (H0):** There is no significant difference in sentiment between pros and cons in employee reviews.
- **Alternate Hypothesis (H1):** There is a significant difference in sentiment between pros and cons in employee reviews.

   - Test: Paired t-test or Wilcoxon signed-rank test.

**9. Does the frequency of specific trigrams in pros correlate with the CEO approval rating?**

- **Null Hypothesis (H0):** The frequency of specific trigrams in pros does not correlate with the CEO approval rating.
- **Alternate Hypothesis (H1):** The frequency of specific trigrams in pros correlates with the CEO approval rating.

   - Test: Pearson correlation coefficient or Spearman rank correlation.

**10. Is there a significant difference in sentiment between pros and cons for companies with a 3-star rating?**

- **Null Hypothesis (H0):** There is no significant difference in sentiment between pros and cons for companies with a 3-star rating.
- **Alternate Hypothesis (H1):** There is a significant difference in sentiment between pros and cons for companies with a 3-star rating.

   - Test: Paired t-test or Wilcoxon signed-rank test.

**11. Does the frequency of trigrams in employee reviews predict the likelihood of a 4-star rating for a company?**

- **Null Hypothesis (H0):** The frequency of trigrams in employee reviews does not predict the likelihood of a 4-star rating for a company.
- **Alternate Hypothesis (H1):** The frequency of trigrams in employee reviews predicts the likelihood of a 4-star rating for a company.

   - Test: Logistic regression or decision tree analysis.

**12. Is there a significant difference in sentiment between employee reviews in companies with high and low CEO approval ratings?**

- **Null Hypothesis (H0):** There is no significant difference in sentiment between employee reviews in companies with high and low CEO approval ratings.
- **Alternate Hypothesis (H1):** There is a significant difference in sentiment between employee reviews in companies with high and low CEO approval ratings.

   - Test: Independent samples t-test or Mann-Whitney U test.

**13. Does the CEO approval rating significantly predict the likelihood of a company receiving a 4-star rating?**

- **Null Hypothesis (H0):** CEO approval rating does not significantly predict the likelihood of a company receiving a 4-star rating.
- **Alternate Hypothesis (H1):** CEO approval rating significantly predicts the likelihood of a company receiving a 4-star rating.

   - Test: Logistic regression.

**14. Is there a significant difference in the sentiment of employee reviews between companies recommended by friends and those not recommended?**

- **Null Hypothesis (H0):** There is no significant difference in the sentiment of employee reviews between companies recommended by friends and those not recommended.
- **Alternate Hypothesis (H1):** There is a significant difference in the sentiment of employee reviews between companies recommended by friends and those not recommended.

   - Test: Independent samples t-test or Mann-Whitney U test.

**15. Do certain trigrams appear significantly more frequently in reviews of companies recommended by friends compared to those not recommended?**

- **Null Hypothesis (H0):** Certain trigrams do not appear significantly more frequently in reviews of companies recommended by friends compared to those not recommended.
- **Alternate Hypothesis (H1):** Certain trigrams appear significantly more frequently in reviews of companies recommended by friends compared to those not recommended.
   - Test: Chi-squared test or Fisher's exact test.