In [143]:
import nltk
from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk import bigrams
from nltk import collocations
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/elliekim/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


# Cleaning Data
Create a variable, **text1**, that stores the user's pitch. 
Iterating over a raw Python string yields individual characters, so we must use **word_tokenize** to obtain word-level tokens. 

Create a variable, **text**, that stores the user's pitch as a list of word tokens:
`text = word_tokenize(text1)`

Then, for future functions, work with text, instead of text1. This ensures raw data is not manipulated.

In [148]:
# Raw elevator pitch analysed throughout this notebook. It is written as adjacent
# string literals (one per sentence) that Python joins into a single string.
text1 = (
    "Hello! I'm Ellie, a dedicated and ambitious Sophomore at UNC pursuing a major in Computer Science and a double minor in Statistics and Journalism. "
    "With a passion for technology, I am continuously seeking opportunities to create impactful solutions and intersect design and technology. "
    "My drive extends beyond technical skills; I am committed to leveraging technology for social good, advocating for accessibility and inclusivity in every tech product I work on. "
    "Connect and collaborate with me at cellie@unc.edu to make a positive difference together!"
)

def words(text1):
    """Split a raw pitch string into word-level tokens.

    Args:
        text1: The raw text to tokenize.

    Returns:
        Whatever ``word_tokenize`` returns for ``text1`` (a list of token strings).
    """
    tokens = word_tokenize(text1)
    return tokens

# Word-level tokens of the raw pitch; later cells work on this list.
text = words(text1)

lower_case_text = text1.lower()

# Reduce each token to its WordNet base form (e.g. "opportunities" -> "opportunity").
# WordNetLemmatizer is imported at the top of the notebook but no instance was ever
# created, so `lemmatizer` was an undefined name on a fresh kernel; create it here.
# NOTE(review): lemmatize() needs the NLTK 'wordnet' corpus available locally —
# confirm it is downloaded before running this cell.
lemmatizer = WordNetLemmatizer()
lemmatizedWords = [lemmatizer.lemmatize(word) for word in text]
print(lemmatizedWords)

# Cache of stop-word sets keyed by language. It is filled on first use so the
# corpus is read once instead of on every stop_word() call.
_STOPWORD_SETS = {}


def stop_word(word):
    """Return True if ``word`` is an English stop word, ignoring case.

    Args:
        word: A single token (string).

    Returns:
        bool: True when the lower-cased token is in NLTK's English stop-word
        list, False otherwise.
    """
    if "english" not in _STOPWORD_SETS:
        # Loaded lazily so that defining this function does not touch the corpus;
        # a frozenset makes each membership test constant-time.
        _STOPWORD_SETS["english"] = frozenset(stopwords.words("english"))
    return word.lower() in _STOPWORD_SETS["english"]


['Hello', '!', 'I', "'m", 'Ellie', ',', 'a', 'dedicated', 'and', 'ambitious', 'Sophomore', 'at', 'UNC', 'pursuing', 'a', 'major', 'in', 'Computer', 'Science', 'and', 'a', 'double', 'minor', 'in', 'Statistics', 'and', 'Journalism', '.', 'With', 'a', 'passion', 'for', 'technology', ',', 'I', 'am', 'continuously', 'seeking', 'opportunity', 'to', 'create', 'impactful', 'solution', 'and', 'intersect', 'design', 'and', 'technology', '.', 'My', 'drive', 'extends', 'beyond', 'technical', 'skill', ';', 'I', 'am', 'committed', 'to', 'leveraging', 'technology', 'for', 'social', 'good', ',', 'advocating', 'for', 'accessibility', 'and', 'inclusivity', 'in', 'every', 'tech', 'product', 'I', 'work', 'on', '.', 'Connect', 'and', 'collaborate', 'with', 'me', 'at', 'cellie', '@', 'unc.edu', 'to', 'make', 'a', 'positive', 'difference', 'together', '!']


# Basic Data Manipulation
`sorted(set(text))` returns the tokens in alphabetical order, with capitalized words coming first
`len(set(text))` returns the total # of distinct tokens in the text, while `len(text)` returns the total number of tokens

## Calculating the lexical richness, or the number of distinct words out of the total # of words
Type-Token Ratio(TTR): Our ratio calculates the # of unique words in proportion to the total number of words. 
Generally, a TTR between 0.2 and 0.4 is considered average, while a TTR above 0.4 is often seen as more diverse and rich.
`len(set(text)) / len(text)`

In [71]:
# This function calculates the lexical richness (type-token ratio): the number of
# distinct tokens divided by the total number of tokens.

def lexical_diversity(text, threshold=0.4):
    """Compute the type-token ratio of ``text`` and print feedback about it.

    Args:
        text: A sized collection of hashable tokens (e.g. the list produced by
            word tokenization).
        threshold: Ratio above which the text is reported as lexically rich.
            Defaults to 0.4, the cut-off this notebook uses.

    Returns:
        float: ``len(set(text)) / len(text)``, or ``0.0`` when ``text`` is empty.
    """
    # An empty token list has no defined ratio; report it instead of dividing by zero.
    if len(text) == 0:
        print("Your text is empty, so lexical richness cannot be calculated.")
        return 0.0

    diversity = len(set(text)) / len(text)
    if diversity > threshold:
        print("Your text has lexical richness!")
    else:
        print("You have low lexical richness. Try diversifying the words you are using.")
    return diversity

lexical_diversity(text)

Your text has lexical richness!


0.7052631578947368

# Frequency Distribution
Tells us the frequency of each vocabulary item in the text
Can determine the most over-used words

The function below creates a frequency distribution of words in the text. 
Then, it returns the words in the text that appear at least 5 times, excluding stop words.

In [102]:

def overused_words(text, min_count=5):
    """List the non-stop-word tokens that occur at least ``min_count`` times.

    Args:
        text: An iterable of tokens (e.g. the word-tokenized pitch).
        min_count: Minimum number of occurrences for a token to count as
            overused. The default of 5 matches the original ``> 4`` check.

    Returns:
        list: The overused tokens, most frequent first. Also prints a short
        message saying whether any were found.
    """
    fdist1 = FreqDist(text)
    # Walk the distribution via most_common() rather than set(text): set
    # iteration order can change between runs, which made the result order
    # unpredictable.
    overused = [
        w for w, count in fdist1.most_common()
        if count >= min_count and not stop_word(w)
    ]
    if len(overused) == 0:
        print("You have no overused words in your text. Good job!")
    else:
        print("Here are your overused words. Here are some alternatives you can use: ")
    return overused

overused_words(text)



You have no overused words in your text. Good job!


[]

# Roberta Pretrained Model
Use a model trained on a large corpus of data
Transformer model accounts for the words but also the context related to other words

**Important note: make sure to run these functions on the RAW text (`text1`), since the word-tokenized `text` is a list of strings**

In [106]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from scipy.special import softmax

In [107]:
# Name of the pretrained sentiment checkpoint to load. This is a plain string:
# the original f-prefix had no placeholders to interpolate.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment"

# Load the tokenizer and the sequence-classification model for that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

In [126]:
def polarity_scores_roberta(example, neg_threshold=0.2):
    """Score the sentiment of ``example`` with the loaded RoBERTa model.

    Args:
        example: Raw (untokenized) text to score.
        neg_threshold: Negative-class probability above which a warning is
            printed. Defaults to 0.2, the value this notebook used.

    Returns:
        dict: The softmax-normalized scores at index 0, 1 and 2 of the model's
        first output row, keyed by the negative, neutral and positive labels
        respectively.
    """
    # Encode the argument, not the notebook-level `text1`: the original ignored
    # `example`, so every call scored the same global string.
    encoded_text = tokenizer(example, return_tensors='pt')
    output = model(**encoded_text)
    # First element of the model output, first row, as a NumPy array.
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    scores_dict = {
        'Here is your roberta_neg score': scores[0],
        'Here is your roberta_neu score': scores[1],
        'Here is your roberta_pos score': scores[2],
    }
    if scores[0] > neg_threshold:
        print("Your text is too negative. Try adding some positive words: ")
    return scores_dict

polarity_scores_roberta(text1)

Your text is too negative. Try adding some positive words: 


{'Here is your roberta_neg score': 0.9823239,
 'Here is your roberta_neu score': 0.01427591,
 'Here is your roberta_pos score': 0.0034001828}

# Extracting Skills from Elevator Pitch


In [151]:
# Part-of-speech tags kept as skill candidates: the noun tags plus adjectives (JJ).
relevant_tags = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ']

# pos_tag pairs every token of the tokenized pitch with its part-of-speech tag.
tagged_tokens = pos_tag(text)

# Keep the token (first item of each pair) when its tag (second item) is relevant.
skills = [pair[0] for pair in tagged_tokens if pair[1] in relevant_tags]
print(skills)

['Hello', 'Ellie', 'dedicated', 'ambitious', 'Sophomore', 'UNC', 'major', 'Computer', 'Science', 'double', 'minor', 'Statistics', 'Journalism', 'passion', 'technology', 'opportunities', 'impactful', 'solutions', 'intersect', 'design', 'technology', 'drive', 'extends', 'technical', 'skills', 'technology', 'social', 'good', 'accessibility', 'inclusivity', 'tech', 'product', 'Connect', 'collaborate', 'cellie', '@', 'unc.edu', 'positive', 'difference']
