# **Audience Determination via Politeness and Formality**

**Welcome to our program!**

To run our program, first open the 'Runtime' menu and choose 'Run all'. When the very last code cell prompts you, enter the body text of your email, then follow the on-screen directions.

We hope you enjoy our program!

In [None]:
#importing libraries

import csv
import re
import nltk
from nltk.corpus import wordnet
from nltk import tokenize
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk import RegexpParser
import itertools
from itertools import permutations
!pip install autocorrect
from autocorrect import Speller
import matplotlib.pyplot as plt
from scipy.stats import linregress
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')
nltk.download('wordnet')

Collecting autocorrect
  Downloading autocorrect-2.6.1.tar.gz (622 kB)
[?25l[K     |▌                               | 10 kB 16.2 MB/s eta 0:00:01[K     |█                               | 20 kB 18.5 MB/s eta 0:00:01[K     |█▋                              | 30 kB 11.2 MB/s eta 0:00:01[K     |██                              | 40 kB 9.4 MB/s eta 0:00:01[K     |██▋                             | 51 kB 6.7 MB/s eta 0:00:01[K     |███▏                            | 61 kB 7.8 MB/s eta 0:00:01[K     |███▊                            | 71 kB 7.4 MB/s eta 0:00:01[K     |████▏                           | 81 kB 7.6 MB/s eta 0:00:01[K     |████▊                           | 92 kB 8.4 MB/s eta 0:00:01[K     |█████▎                          | 102 kB 6.9 MB/s eta 0:00:01[K     |█████▉                          | 112 kB 6.9 MB/s eta 0:00:01[K     |██████▎                         | 122 kB 6.9 MB/s eta 0:00:01[K     |██████▉                         | 133 kB 6.9 MB/s eta 0:00:01[K  

True

In [None]:
#FUNCTIONS

def clean(string1):
    '''Lower-case a string and blank out punctuation and digits.

    Every digit and every character that is neither a word character nor
    whitespace is replaced by a single space; runs of spaces are then
    collapsed to one space so that multi-word phrase lookups still line up
    after punctuation has been removed.

    Parameters:
        string1 -- the text to normalise.
    Returns:
        The normalised string. Newlines and tabs are left untouched.
    '''
    string1 = string1.lower()
    string1 = re.sub(r"[^\w\s]|\d", " ", string1)
    # Replacing "  " once only halves a run of spaces ("hi!!! there" kept a
    # double space), so collapse runs of any length instead.
    string1 = re.sub(r" {2,}", " ", string1)
    return string1

def audience_determiner(politeness, formality):
  '''Print the audience category that fits a politeness/formality pair.

  Parameters:
    politeness -- politeness score (higher means more polite)
    formality  -- formality score (higher means more formal; may be negative)
  Returns:
    None. The verdict is printed to stdout.

  The three categories are checked in order casual, neutral, respected.
  Two boundary comparisons are inclusive so that every (politeness,
  formality) pair lands in a category: politeness == .2 with formality
  below -.25 is casual, and politeness above .4 with formality == -.25 is
  respected. Previously those inputs matched no branch and nothing was
  printed.
  '''
  casual = (
      ((politeness < .2) and (formality < 0))
      or ((.2 <= politeness < .4) and (formality < -.25))
  )
  neutral = (
      ((politeness < .2) and (formality >= 0))
      or ((.2 <= politeness <= .4) and (-.25 <= formality <= 0))
      or ((.4 <= politeness <= .6) and (formality < -.25))
  )
  respected = (
      ((politeness >= .2) and (formality > 0))
      or ((politeness > .4) and (formality >= -.25))
      or ((politeness > .6) and (formality < -.25))
  )

  if casual:
    print("\n A suitable recipient for your email would be:")
    print("a CASUAL AUDIENCE or someone YOUNGER or LOWER STATUS than you in a given social context.")
  elif neutral:
    print("\n A suitable recipient for your email would be:")
    print("a NEUTRAL AUDIENCE or someone THE SAME AGE OR STATUS as you in a given social context.")
  elif respected:
    print("\n A suitable recipient for your email would be:")
    print("a RESPECTED AUDIENCE or someone OLDER or HIGHER STATUS as you in a given social context.")


In [None]:
#FORMALITY INDICATORS
   
#greetings (dear sir/madam) and Signatures (sincerely)
greetings = ["dear ", "hello ", "hello.", "hi ", "hi.", "hey ", "hey.", "good morning", "good evening", "good afternoon", "greetings", "to whom this may concern"]

signatures = ["sincerely,", "many thanks,", "best,", "thanks again,", "see you soon,", "thank you,", "thanks,"]

def greet_sign_finder(email_text):
  '''Count the distinct greeting and sign-off phrases present in an email.

  Parameters:
    email_text -- raw email text (case is ignored).
  Returns:
    The number of entries of `greetings` and `signatures` that occur in
    the text. Each entry counts at most once, however often it appears.
  '''
  lowered = email_text.lower()
  found = []
  for phrase in greetings + signatures:
    # Require the phrase to start at a word boundary; a plain substring
    # test counted "hey " inside "they " and "dear " inside "endear ".
    if re.search(r"(?<!\w)" + re.escape(phrase), lowered):
      found.append(phrase)
  return len(found)
  
#punctuation
def punct_checker(email_text):
  '''Report whether the text contains any sentence punctuation.

  Parameters:
    email_text -- raw email text.
  Returns:
    1 if at least one of . ? ! , ; occurs anywhere in the text, else 0
    (one point per whole text, not per punctuation mark).
  '''
  # Raw string and a character class without backslashes: the previous
  # non-raw "[\.\?!,;]" relied on invalid escape sequences.
  return 1 if re.search(r"[.?!,;]", email_text) else 0
  
#correct spelling
def spell_checker(email_text):
  '''Count the tokens that the autocorrect speller would change.

  Parameters:
    email_text -- raw email text.
  Returns:
    The number of tokens whose autocorrected form differs from the token.

  Tokens that are title-cased or fully upper-cased are left out before
  checking (presumably to spare names and acronyms -- TODO confirm).
  Punctuation and number tokens are also left out this way, because their
  upper-case form equals the token itself. Per the original author's note,
  a token only counts as misspelled when the speller recognises a
  correction for it, so random strings such as 'afjdksl' are not counted.
  '''
  tokens = word_tokenize(email_text)
  # Build a new list instead of calling tokens.remove() inside a loop over
  # tokens: removing during iteration skips the element after each removal,
  # so consecutive capitalised tokens slipped through the filter.
  candidates = [tok for tok in tokens
                if tok != tok.title() and tok != tok.upper()]
  spell = Speller(lang='en')
  misspelled_words = [tok for tok in candidates if spell(tok) != tok]
  return len(misspelled_words)
  
#correct capitalization
def cap_checker(email_text):
  '''Count the sentences whose first token starts with a lower-case letter.

  Parameters:
    email_text -- raw email text. It must not be lower-cased or cleaned
                  beforehand, otherwise every sentence is flagged (the
                  original note warns against using this on the already
                  lower-cased Enron corpus).
  Returns:
    The number of sentences that do not start with a capital.
  '''
  no_start_cap = []
  for sentence in tokenize.sent_tokenize(email_text):
    tokens = word_tokenize(sentence)
    if not tokens:
      continue
    # Look at the first character only. Comparing the whole token with
    # str.title() flagged correctly capitalised openers such as "NASA" or
    # "McDonald", because title() lower-cases every later letter.
    if tokens[0][0].islower():
      no_start_cap.append(sentence)
  return len(no_start_cap)
  
#formal lexicon
formal_lexicon = [
    "inquire", "temporarily", "piqued", "fortunate", "evening", "enchanted",
    "inform", "receive", "negative", "appear", "moreover", "purchase",
    "must", "deficiency", "reside", "verify", "assist", "opportunity",
    "utilize", "consume", "depart", "commence", "difficult", "disclose",
    "express", "initially", "fortunate", "determine", "discard", "distribute",
    "conceive", "perceive", "liberate", "request", "precede", "regard",
    "seek", "investigate", "resemble", "consult", "considerable", "disintegrate",
    "discard", "eject", "improved", "nevertheless", "consequently", "postpone",
    "resemble", "residence", "retain", "withdraw", "suspend", "adjourn",
    "additionally", "depict", "exceptional", "subsequently", "immediate", "sufficient",
    "insufficient", "terminate", "facilitate", "beneficial", "pursue", "obtain",
    "oppose", "haste", "notion", "envisage", "accumulate", "eliminate",
    "therefore", "thus", "endeavor", "amount", "evaluate", "accentuate",
    "revitalize", "reevaluate", "compensate",
]

def formal_lexicon_finder(email_text):
  '''Count the tokens of an email whose stem belongs to the formal lexicon.

  The text is cleaned, tokenised and Porter-stemmed; every token whose
  stem equals the stem of a `formal_lexicon` entry adds one to the count,
  so a repeated word is counted once per occurrence.
  '''
  stemmer = PorterStemmer()
  lexicon_stems = {stemmer.stem(entry) for entry in formal_lexicon}
  hits = 0
  for token in word_tokenize(clean(email_text)):
    if stemmer.stem(token) in lexicon_stems:
      hits += 1
  return hits
  
#no smiley faces, random symbols, emojis
# ":)))" and "(:" were fused into one entry by a missing comma, and "<3"
# was listed twice (so a single heart counted as two emojis).
emojis = [":)", ":))", ":)))", "(:", "):", ":(", ":-D", "D-:", ":0", "0:", "<3", ":P", ":O", "O:", ";)", ";("]

def emoji_finder(email_text):
  '''Count the distinct emoticons from `emojis` that occur in the text.

  Parameters:
    email_text -- raw email text (not cleaned; cleaning would strip the
                  punctuation the emoticons are made of).
  Returns:
    The number of `emojis` entries found as substrings. Overlapping
    entries each count, so ":)))" scores 3 (":)", ":))" and ":)))").
  '''
  # Clock times such as "10:00" contain ":0" and "0:" but are not smileys,
  # so blank them out before searching.
  text = re.sub(r"\d{1,2}:\d{2}", " ", email_text)
  expressed_emojis = [emoji for emoji in emojis if emoji in text]
  return len(expressed_emojis)
  
#few interjections/fillers
interjections = ["shh", "shhh", "psst", "shoo", "oh", "yo", "ahem", 
                 "yuck", "ew", "eww", "aw", "aww", "ugh", "phew", "phooey",
                 "yippee", "yay", "yeah", "brr", "eek", "alas", "bingo", 
                 "bravo", "eureka", "crikey", "gee", "gosh", "god", "hm",
                 "hmm", "aha", "huh", "duh", "ah", "ahh", "wow", "yikes",
                 "crap", "dang", "yeah", "yess", "ah", "whoops", "ope", "lol", 
                 "lmao", "btw", "asap"]
fillers = ["um", "uh"]

def itj_filler_finder(email_text):
  '''Count the distinct interjections and fillers used in an email.

  Parameters:
    email_text -- raw email text; it is cleaned and tokenised here.
  Returns:
    The number of different `interjections` entries plus the number of
    different `fillers` entries that appear as whole tokens.
  '''
  tokens = set(word_tokenize(clean(email_text)))
  # Set intersection counts each word once. Looping over the list counted
  # "yeah" and "ah" twice because they are listed twice.
  found_interjections = tokens.intersection(interjections)
  found_fillers = tokens.intersection(fillers)
  return len(found_interjections) + len(found_fillers)

#few contractions
def contraction_finder(email_text):
  '''Count contractions such as "don't", "I'm" or "we'll" in the text.

  Parameters:
    email_text -- raw email text (case is ignored).
  Returns:
    The number of contractions found. Forms ending in apostrophe + s are
    deliberately not counted, as before, because they cannot be told apart
    from possessives.
  '''
  clean_txt = email_text.lower()
  # word + apostrophe + 1-3 letters ending at a word boundary. The previous
  # pattern "[^s]{1,2}" also matched spaces, so plural possessives and
  # closing quotes counted as contractions. The typographic apostrophe
  # (U+2019) that mail clients insert is accepted too.
  contractions = re.findall(r"\b\w+['\u2019](?!s\b)[a-z]{1,3}\b", clean_txt)
  return len(contractions)

def formality_scorer(y):
  '''Combine the formality indicators of an email into one numeric score.

  Parameters:
    y -- raw email text.
  Returns:
    A float; greetings/sign-offs, punctuation and formal words raise it,
    misspellings, uncapitalised sentences, emojis and contractions lower
    it. If any interjection or filler is present the score is capped at 0.
  '''
  aF = float(greet_sign_finder(y))
  bF = float(punct_checker(y))
  cF = float(spell_checker(y))
  dF = float(cap_checker(y))
  eF = float(formal_lexicon_finder(y))
  fF = float(emoji_finder(y))
  gF = float(itj_filler_finder(y))
  hF = float(contraction_finder(y))
  result = (aF*.25)+(bF*.25)-(cF*.0625)-(dF*.03125)+(eF*.03125)-(fF*.25)-(hF*.125)
  if gF > 0:
    # This line used to read `result == 0`, a comparison with no effect, so
    # interjections never influenced the score. Cap at zero rather than
    # assign zero: a plain assignment would raise an already negative score
    # and so reward informal language.
    # NOTE(review): confirm the cap (vs. a hard reset to 0) is the intended penalty.
    result = min(result, 0.0)
  return result


In [None]:
#POLITENESS INDICATORS

#polite words
gratitude = ["thanks", "thank you", "please"]
apologizing = ["sorry", "apologies", "i apologize", "oops", "whoops", "excuse me"]

def polite_word_finder(email_text):
  '''Count the distinct gratitude and apology phrases used in an email.

  Parameters:
    email_text -- raw email text; it is cleaned (lower-cased, punctuation
                  removed) before matching.
  Returns:
    The number of entries of `gratitude` and `apologizing` that occur as
    whole words or phrases. Each entry counts at most once.
  '''
  clean_txt = clean(email_text)
  polite_words = []
  for phrase in gratitude + apologizing:
    # Whole-word match, tolerant of extra whitespace between the words of a
    # phrase. A plain substring test counted "please" in "pleased" and
    # counted "whoops" twice (as "oops" and "whoops").
    pattern = r"\b" + r"\s+".join(re.escape(word) for word in phrase.split()) + r"\b"
    if re.search(pattern, clean_txt):
      polite_words.append(phrase)
  return len(polite_words)

#inclusive we
def incl_we_finder(email_text):
  '''Return 1 when the word "we" occurs in the email, otherwise 0.

  Every "we" is treated as inclusive for now; the result does not grow
  with the number of occurrences.
  '''
  words = word_tokenize(clean(email_text))
  return 1 if "we" in words else 0

#hedging language
hedges = [
    "apparent", "apparently", "appear", "appeared", "appears", "approximately",
    "around", "assume", "assumed", "certain amount", "certain extent", "certain level",
    "claim", "claimed", "doubt", "doubtful", "essentially", "estimate",
    "estimated", "feel", "felt", "frequently", "from our perspective", "generally",
    "guess", "in general", "in most cases", "in most instances", "in our view", "indicate",
    "indicated", "largely", "likely", "mainly", "may", "maybe",
    "might", "mostly", "often", "on the whole", "ought", "perhaps",
    "plausible", "plausibly", "possible", "possibly", "postulate", "postulated",
    "presumable", "probable", "probably", "relatively", "roughly", "seems",
    "should", "sometimes", "somewhat", "suggest", "suggested", "suppose",
    "suspect", "tend to", "tends to", "typical", "typically", "uncertain",
    "uncertainly", "unclear", "unclearly", "unlikely", "usually", "broadly",
    "tended to", "presumably", "suggests", "from this perspective", "from my perspective", "in my view",
    "in this view", "in our opinion", "in my opinion", "to my knowledge", "fairly", "quite",
    "rather", "argue", "argues", "argued", "claims", "feels",
    "indicates", "supposed", "supposes", "suspects", "postulates",
]

def hedge_lang_finder(email_text):
  '''Count the distinct hedging words and phrases used in an email.

  Parameters:
    email_text -- raw email text; it is cleaned before matching.
  Returns:
    The number of `hedges` entries that occur as whole words or phrases.
    Each entry counts at most once.
  '''
  clean_txt = clean(email_text)
  hedge_lang = []
  for phrase in hedges:
    # The list already spells out each inflection ("appear", "appeared",
    # "appears"), so match whole words only. Substring matching counted
    # "appeared" twice and found "may" in "maybe" and "likely" in "unlikely".
    pattern = r"\b" + r"\s+".join(re.escape(word) for word in phrase.split()) + r"\b"
    if re.search(pattern, clean_txt):
      hedge_lang.append(phrase)
  return len(hedge_lang)
  
#tag questions
def tag_question_finder(email_text):
  '''Count the sentences that end in a tag question.

  Example: "We should go, shouldn't we?"

  Parameters:
    email_text -- raw email text (case is ignored).
  Returns:
    The number of sentences whose ending matches the tag-question pattern
    (comma, auxiliary with optional "'t", optional pronoun, question mark).
  '''
  # Raw string: the earlier non-raw pattern relied on invalid escape
  # sequences. The typographic apostrophe (U+2019) is accepted as well.
  tag_pattern = re.compile(r", \w+['\u2019]?t? ?\w+?\?$")
  count = 0
  # Count matches directly instead of appending result lists and removing
  # the empty ones while iterating over the same list.
  for sentence in tokenize.sent_tokenize(email_text):
    if tag_pattern.search(sentence.lower()):
      count += 1
  return count

#requests
req_starters = ["do", "could", "can", "would"]
req_objects = ["you", "i"]

# Pair every ordered selection of starters (as many as there are objects)
# with the objects, position by position: [("do", "you"), ("could", "i")], ...
permut = itertools.permutations(req_starters, len(req_objects))
req_combos = [list(zip(ordering, req_objects)) for ordering in permut]

# Flatten the nested pairs and drop duplicates via a set, then add the
# request forms that do not follow the starter/object scheme.
req_combos_flat = list(set(itertools.chain.from_iterable(req_combos)))
req_combos_flat.extend([("may", "i"), ("will", "you")])

def request_finder(email_text):
  '''Count the sentences that contain a syntactic request.

  A sentence counts when a request starter stands directly next to its
  paired object (either order, e.g. "could you"), for any pair listed in
  `req_combos_flat`. Sentences ending in a tag question are skipped.

  Parameters:
    email_text -- raw email text.
  Returns:
    The number of request sentences; each sentence counts at most once.
  '''
  requests = []
  for sentence in tokenize.sent_tokenize(email_text):
    # Same tag-question shape that tag_question_finder looks for. The old
    # check indexed `sentences` with a token position and anchored the
    # pattern at the start of the string, so it could never match.
    if re.search(r", \w+['\u2019]?t? ?\w+?\?$", sentence.lower()):
      continue
    words = word_tokenize(clean(sentence))
    # All adjacent word pairs. This covers every occurrence of a starter
    # (not only the first), never wraps around to the last word, and
    # includes the final sentence, which range(len - 1) used to skip.
    neighbours = set(zip(words, words[1:]))
    if any((starter, obj) in neighbours or (obj, starter) in neighbours
           for starter, obj in req_combos_flat):
      requests.append(sentence)
  return len(requests)

#commands
# ^ or CONJ (don't)? VB
def command_finder(email_text):
  '''Count the sentences that contain a syntactic imperative.

  Follows the rule noted by the original author, "^ or CONJ (don't)? VB":
  a clause that starts the sentence or follows a coordinating conjunction
  is a command when it begins, after an optional "don't" / "do not" /
  "never" / "always", with a base-form verb (POS tag VB). Sentences
  containing a question mark are never commands.

  Parameters:
    email_text -- raw email text.
  Returns:
    The number of command sentences; each sentence counts at most once.
  '''
  commands = []
  for sentence in tokenize.sent_tokenize(email_text):
    if "?" in sentence:
      continue
    # Tag each sentence on its own. Tagging the whole email made a single
    # base-form verb anywhere mark every non-question sentence as a command.
    tagged = nltk.pos_tag(word_tokenize(sentence.lower()))
    words = [word for word, _ in tagged]
    clause_starts = [0] + [pos + 1 for pos, (_, tag) in enumerate(tagged)
                           if tag == "CC"]
    for pos in clause_starts:
      # Step over the optional negation/adverb in front of the verb.
      if words[pos:pos + 2] in (["do", "n't"], ["do", "not"], ["don", "t"]):
        pos += 2
      elif words[pos:pos + 1] in (["never"], ["always"]):
        pos += 1
      if pos < len(tagged) and tagged[pos][1] == "VB":
        commands.append(sentence)
        break
  return len(commands)

#qualifiers
qualifiers = ["quite", "rather", "somewhat", "just", "indeed", "still", "almost", "fairly", "pretty", "even", "a bit", "a little", "a lot", "a whole lot", "a good deal", "a great deal", "kind of", "sort of"]

def qual_finder(email_text):
  '''Count the qualifiers used in an email.

  Two passes are made: every token tagged as an adverb (RB) that is in
  `qualifiers` counts once per occurrence; afterwards each remaining
  `qualifiers` entry (this is what catches the multi-word ones) counts
  once if it occurs in the lower-cased text as a whole word or phrase.

  Parameters:
    email_text -- raw email text.
  Returns:
    The total number of qualifiers counted by both passes.
  '''
  qualifiers1 = []
  tokens = word_tokenize(clean(email_text))
  for word, tag in nltk.pos_tag(tokens):
    if tag == "RB" and word in qualifiers:
      qualifiers1.append(word)

  lower = email_text.lower()
  for phrase in qualifiers:
    if phrase in qualifiers1:
      continue
    # Whole-word match: a substring test found "even" in "evening" and
    # "just" in "adjust".
    # NOTE(review): this pass also counts single-word qualifiers that were
    # not tagged RB, which bypasses the adverb filter above -- confirm intent.
    pattern = r"\b" + r"\s+".join(re.escape(word) for word in phrase.split()) + r"\b"
    if re.search(pattern, lower):
      qualifiers1.append(phrase)

  return len(qualifiers1)

#curse words
curses = [
    "damn", "shit", "fuck", "goddamn", "goddamnit", "motherfucker",
    "whore", "dick", "dickhead", "asshole", "ass", "bastard",
    "bitch", "cunt", "hell", "slut",
]

def curse_word_finder(email_text):
  '''Count the tokens of an email whose stem matches a listed curse word.

  The text is cleaned, tokenised and Porter-stemmed; each token whose stem
  equals the stem of a `curses` entry adds one to the count.
  '''
  stemmer = PorterStemmer()
  banned_stems = {stemmer.stem(entry) for entry in curses}
  token_stems = (stemmer.stem(token) for token in word_tokenize(clean(email_text)))
  return sum(1 for stem in token_stems if stem in banned_stems)

def misc_pol(email_text):
  '''Count miscellaneous polite phrases and positive words in an email.

  Parameters:
    email_text -- raw email text; it is cleaned before matching.
  Returns:
    The number of listed phrases/words that occur as whole words or
    phrases. Each entry counts at most once.
  '''
  misc_polite = ["if so", "if you can", "if you could", "you can", "your earliest convenience"]
  # "supurb" could never match a correctly spelled "superb", and the second
  # "wonderful" made that word count twice.
  positive_words = ["beautiful", "great", "wonderful", "excellent", "amazing", "super", "superb", "well done", "good job", "good work", "great work", "fantastic"]
  clean_txt = clean(email_text)
  misc = []
  for phrase in misc_polite + positive_words:
    # Whole-word match; a substring test found "super" in "supervisor".
    pattern = r"\b" + r"\s+".join(re.escape(word) for word in phrase.split()) + r"\b"
    if re.search(pattern, clean_txt):
      misc.append(phrase)
  return len(misc)


def politeness_scorer(x):
  '''Combine the politeness indicators of an email into one numeric score.

  Polite words and hedges weigh .25 each, qualifiers .125, and tag
  questions, miscellaneous polite language and inclusive "we" .0625 each.
  Any curse word overrides everything and yields 0. The command and
  request finders are not part of the score.
  '''
  finders = (
      polite_word_finder,
      incl_we_finder,
      hedge_lang_finder,
      tag_question_finder,
      qual_finder,
      curse_word_finder,
      misc_pol,
  )
  polite, we, hedge, tag, qual, curse, misc = (float(finder(x)) for finder in finders)

  if curse > 0:
    return 0
  return (polite*.25)+(hedge*.25)+(qual*.125)+(tag*.0625)+(misc*.0625)+(we*.0625)


In [None]:
#user interface

def _prompt_choice(valid_choices, reminder):
  '''Read input until it is one of valid_choices; return it normalised.

  The answer is stripped and lower-cased before it is checked and before it
  is returned, so "Y" is accepted and is also treated as "y" afterwards.
  '''
  answer = input().strip().lower()
  while answer not in valid_choices:
    print(reminder)
    answer = input().strip().lower()
  return answer

def _wants_to_quit():
  '''Ask whether to analyse another email; return True if the user quits.'''
  print("\n Would you like to enter another email or quit the program?")
  print("Type 'quit' to quit. Type anything else to enter another email.")
  return input().strip().lower() == "quit"

# Advice keyed by (audience category, age/status category) as typed by the user.
_SUGGESTIONS = {
    ("1", "1"): "When speaking to a friend younger/lower status than you, you can use informal, less polite language if you desire.",
    ("1", "2"): "When speaking to a friend the same age/status as you, you can use informal, less polite language if you desire.",
    ("1", "3"): "When speaking to a friend older/higher status than you, use polite, semi-formal language. Remember to say please and thank you, and using greetings and farewells is suggested.",
    ("2", "1"): "When speaking to a family member younger/lower status than you, use moderately polite, relatively informal \n language. You should model politeness to younger cousins or siblings, with words such as please and thank you, but you may use an informal tone.",
    ("2", "2"): "When speaking to a family member the same age/status as you, you can use informal, less polite language if you desire.",
    ("2", "3"): "When speaking to a family member older/higher status than you, use very polite language, such as please \n and thank you, with moderate formality. Be sure to use moderately formal greetings and signatures, capitalize your words, and fix misspellings.",
    ("3", "1"): "The audience situation you indicated is unlikely to occur. A mentor, professor, or teacher is likely to \n be a higher status than you. Even if, for some reason, this situation did occur, you still ought to use very polite and formal language, seeing as how you are learning from a mentor/teacher and should want to impress them. Please, thank you, greetings and signatures, proper capitalization, and proper spellings are important.",
    ("3", "2"): "When speaking to a mentor, professor, or teacher the same age/status as you, use moderate politeness and \n formality. Use greetings, signatures, and polite words such as please and thank you, but an informal tone is acceptable in most cases. \n You should still, however, use proper capitalization, punctuation, and spelling.",
    ("3", "3"): "When speaking to a mentor, professor, or teacher older/higher status than you, always use be highly polite \n and formal. Use greetings, signatures, and polite words such as please and thank you, and avoid more informal slang. \n Proofread to fix typos, misspellings, and improper capitalization/punctuation. \n Avoid using emojis, filler words, rude words, or interjections. When asking for something, hedge your request \n with qualifier words.",
    ("4", "1"): "When speaking to a co-worker or colleague younger/lower status than you, you may use informal, but polite, language. \n Greetings, signatures, and proper capitalization/punctuation is suggested but not always necessary, but \n make requests instead of commands, and always say please and thank you.",
    ("4", "2"): "When speaking to a co-worker or colleague the same age/status as you, you may use informal language and moderate \n politeness. You may address the recipient as if they were a friend, but be careful to make requests \n politely and qualify any direct or negative statements. The words please and thank you are suggested.",
    ("4", "3"): "When speaking to a co-worker or colleague older/higher status than you, use highly polite, semi-formal \n language. Use greetings and signatures, and proofread your email for capitalization, punctuation, \n and grammatical errors. Using the words please and thank you are always polite, and ensure that you leave out any rude words.",
}

instructions = "This program analyzes emails for politeness and formality and predicts an intended audience. \n \n"
print(instructions)

run = True
while run:
  user_email = input("Please enter the body text of your email: \n")

  user_p_score = politeness_scorer(user_email)
  user_f_score = formality_scorer(user_email)

  #audience categorizing
  audience_determiner(user_p_score, user_f_score)

  #Check to see if user's intentions matched audience category
  print("\n \n Was this your intended audience for your email?")
  print("Enter 'y' for yes, 'n' for no.")
  choice_1 = _prompt_choice(("y", "n"), "Please type 'y' or 'n'.")

  if choice_1 == "y":
    print("\n Good work! Your email is appropriately polite and formal for your intended audience.")
  else: #giving suggestions
    print("Would you like to see suggestions for increasing your politeness and formality?")
    print("Enter 'y' for yes, 'n' for no.")
    choice_2 = _prompt_choice(("y", "n"), "Please type 'y' or 'n'.")

    if choice_2 == "y":
      print("Who is your intended audience? Type the number corresponding to your audience category.")
      print("1 A FRIEND \n 2 A FAMILY MEMBER \n 3 A MENTOR, PROFESSOR, OR TEACHER, \n 4 A CO-WORKER OR COLLEAGUE")
      choice_3 = _prompt_choice(("1", "2", "3", "4"), "Please select a choice 1-4.")

      print("What is the age or status of your intended audience? Type the number corresponding to the category.")
      print("1 YOUNGER/LOWER STATUS THAN ME \n 2 THE SAME AGE/STATUS AS ME \n 3 OLDER/HIGHER STATUS THAN ME")
      choice_4 = _prompt_choice(("1", "2", "3"), "Please select a choice 1-3.")

      print(_SUGGESTIONS[(choice_3, choice_4)])

  # Both paths end with the same question. The statements that used to
  # follow it were unreachable (they came after break/continue) and read an
  # undefined name, so they are gone.
  if _wants_to_quit():
    run = False

# Sample email bodies; nothing in the visible code reads these variables.
# NOTE(review): presumably kept as ready-made inputs to paste at the prompt
# or pass to the scorers by hand -- confirm before removing.
sample_email_1 = "Hi Teresa, Just a quick question about the RSVP form. The form only lets me enter one email address for the mentor section, if I enter my primary faculty advisor is there a way it can be forwarded to my graduate student mentor as well? Thanks"
sample_email_2 = "Good evening, Dean Moore, I am just reaching out for an update on the letter to the editor. How is it going? Do you think you would be able to have it to me by Tuesday? Best," 
sample_email_3 = "Love you <3 these are for you AND Dad!"
sample_email_4 = "Hey Emily, I’m not on the weekly schedule. I know I sent in my form, is there some way I could hop on the Friday afternoon shift? It was the only one I could take"