In [6]:
# Cleaning Text

text_data = ["  Interrobang. By Aishwarya Henriette     ",
             "Parking And Going. By Karl Gautier",
             "  Today Is The night. By Jarek Prakash    "]

strip_whitespace = [string.strip() for string in text_data]
strip_whitespace

remove_periods = [string.replace(".", "") for string in strip_whitespace]
remove_periods

# Also try a custom function for transformation
def capitalizer(string: str) -> str:
    return string.upper()

[capitalizer(string) for string in remove_periods]

['INTERROBANG BY AISHWARYA HENRIETTE',
 'PARKING AND GOING BY KARL GAUTIER',
 'TODAY IS THE NIGHT BY JAREK PRAKASH']

In [8]:
import re

def replace_letters_with_X(string: str) -> str:
    return re.sub(r"[a-zA-Z]", "X", string)

[replace_letters_with_X(string) for string in remove_periods]

['XXXXXXXXXXX XX XXXXXXXXX XXXXXXXXX',
 'XXXXXXX XXX XXXXX XX XXXX XXXXXXX',
 'XXXXX XX XXX XXXXX XX XXXXX XXXXXXX']

In [4]:
# Parsing and Cleaning HTML
from bs4 import BeautifulSoup

html = "<div class='full_name'>"\
    "<span style='font-weight:bold'>Masego"\
    "</span> Azra</div>"

soup = BeautifulSoup(html)

soup.find("div", {"class": "full_name"}).text

'Masego Azra'

In [7]:
# Removing Punctuation
import unicodedata
import sys

text_data = ['Hi!!!! I. Love. This. Song....',
             '10000% Agree!!!! #LoveIT',
             'Right?!?!']

punctuation = dict.fromkeys(
    (i for i in range(sys.maxunicode)
    if unicodedata.category(chr(i)).startswith('P')
    ),
    None
)

[string.translate(punctuation) for string in text_data]

['Hi I Love This Song', '10000 Agree LoveIT', 'Right']

In [None]:
# import nltk
from nltk.tokenize import word_tokenize

# nltk.download('all', download_dir='C:/nltk_data')

string = "The science of today is the technology of tomorrow"

word_tokenize(string)


['The', 'science', 'of', 'today', 'is', 'the', 'technology', 'of', 'tomorrow']

In [7]:
from nltk.tokenize import sent_tokenize

string = "The science of today is the technology of tomorrow. Tomorrow is today."

sent_tokenize(string)

['The science of today is the technology of tomorrow.', 'Tomorrow is today.']

In [11]:
# Removing Stop Words
from nltk.corpus import stopwords

tokenize_words = ['i',
                  'am',
                  'going',
                  'to',
                  'go',
                  'the',
                  'store',
                  'and',
                  'park']

stop_words = stopwords.words('english')

[word for word in tokenize_words if word not in stop_words]


['going', 'go', 'store', 'park']

In [1]:
# Stemming Words
from nltk.stem.porter import PorterStemmer

tokenize_words = ['i', 'am', 'humbled', 'by', 'this', 'traditional', 'meeting']

porter = PorterStemmer()

[porter.stem(word) for word in tokenize_words]

['i', 'am', 'humbl', 'by', 'thi', 'tradit', 'meet']

In [4]:
# Tagging Parts of Speech
from nltk import pos_tag, word_tokenize

text_data = "Chris loved outdoor running"

text_tagged = pos_tag(word_tokenize(text_data))

print(text_tagged)

[word for word, tag in text_tagged if tag in ['NN', 'NNS', 'NNP', 'NNPS']]

[('Chris', 'NNP'), ('loved', 'VBD'), ('outdoor', 'RP'), ('running', 'VBG')]


['Chris']

In [12]:
from sklearn.preprocessing import MultiLabelBinarizer

tweets = ["I am eating a burrito for breakfast",
          "Political science is an amazing field",
          "San Francisco is an awesome city"]

tagged_tweets = []

for tweet in tweets:
    tweet_tag = pos_tag(word_tokenize(tweet))
    # print(tweet_tag)
    tagged_tweets.append([tag for word, tag in tweet_tag])

one_hot_multi = MultiLabelBinarizer()

print(tagged_tweets)
print(one_hot_multi.fit_transform(tagged_tweets))
one_hot_multi.classes_

[['PRP', 'VBP', 'VBG', 'DT', 'NN', 'IN', 'NN'], ['JJ', 'NN', 'VBZ', 'DT', 'JJ', 'NN'], ['NNP', 'NNP', 'VBZ', 'DT', 'JJ', 'NN']]
[[1 1 0 1 0 1 1 1 0]
 [1 0 1 1 0 0 0 0 1]
 [1 0 1 1 1 0 0 0 1]]


array(['DT', 'IN', 'JJ', 'NN', 'NNP', 'PRP', 'VBG', 'VBP', 'VBZ'],
      dtype=object)

In [22]:
# Performing Named-Entity Recognition
import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Elon Musk offered to buy Twitter using $21B of his own money")

print(doc.ents)

for entity in doc.ents:
    print(entity.text, entity.label_, sep=",")

(Elon Musk, Twitter, 21B)
Elon Musk,PERSON
Twitter,PERSON
21B,MONEY


In [31]:
# Encoding Text as a Bag of Words
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

text_data = np.array(['I love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])

count = CountVectorizer()
bag_of_words = count.fit_transform(text_data)

bag_of_words

print(bag_of_words.toarray())

count.get_feature_names_out()

[[0 0 0 2 0 0 1 0]
 [0 1 0 0 0 1 0 1]
 [1 0 1 0 1 0 0 0]]


array(['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love',
       'sweden'], dtype=object)

In [37]:
count_2gram = CountVectorizer(ngram_range=(1,2),
                              stop_words='english',
                              vocabulary=['brazil'])

bag = count_2gram.fit_transform(text_data)

bag.toarray()

count_2gram.vocabulary_

{'brazil': 0}

In [42]:
# Weighting Word Importance
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

text_data = np.array(['I love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])

tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)
feature_matrix

feature_matrix.toarray()

tfidf.vocabulary_

{'love': 6,
 'brazil': 3,
 'sweden': 7,
 'is': 5,
 'best': 1,
 'germany': 4,
 'beats': 0,
 'both': 2}

In [46]:
# Using Text Vectors to Calculate Text Similarity in a Search Query
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

text_data = np.array(['I love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])

tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)

text = "I love Brazil. Brazil!"
vector = tfidf.transform([text])

cosine_similarities = linear_kernel(vector, feature_matrix).flatten()

related_doc_indices = cosine_similarities.argsort()[:-10:-1]

print([(text_data[i], cosine_similarities[i]) for i in related_doc_indices])

[(np.str_('I love Brazil. Brazil!'), np.float64(0.9999999999999999)), (np.str_('Germany beats both'), np.float64(0.0)), (np.str_('Sweden is best'), np.float64(0.0))]


In [6]:
# Using a Sentiment Analysis Classifier
from transformers import pipeline

classifier = pipeline("sentiment-analysis")

sentiment_1 = classifier("I hate machine learning! It's the absolute worst.")
sentiment_2 = classifier(
    "Machine learning is absolute"
    "bees knees I love it so much!")

print(sentiment_1, sentiment_2)


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.
Device set to use cpu


[{'label': 'NEGATIVE', 'score': 0.9998020529747009}] [{'label': 'POSITIVE', 'score': 0.9992170333862305}]
