# Feature Engineering for NLP in Python

# 1. Basic features and readability scores

## Introduction to NLP feature engineering

### One-hot encoding

In [None]:
# Print the features of df1
print(df1.columns)

"""
Index(['feature 1', 'feature 2', 'feature 3', 'feature 4', 'feature 5', 'label'], dtype='object')
"""

# Perform one-hot encoding.
# dtype=int keeps the indicator columns as 0/1; pandas >= 2.0 would otherwise
# produce boolean (True/False) columns by default.
df1 = pd.get_dummies(df1, columns=['feature 5'], dtype=int)

# Print the new features of df1
print(df1.columns)

"""
Index(['feature 1', 'feature 2', 'feature 3', 'feature 4', 'label', 'feature 5_female', 'feature 5_male'], dtype='object')
"""

# Print first five rows of df1
print(df1.head())

"""
       feature 1  feature 2  feature 3  feature 4  label  feature 5_female  feature 5_male
    0     29.000          0          0    211.338      1                 1               0
    1      0.917          1          2    151.550      1                 0               1
    2      2.000          1          2    151.550      0                 1               0
    3     30.000          1          2    151.550      0                 0               1
    4     25.000          1          2    151.550      0                 1               0
"""

## Basic feature extraction

### Character count of Russian tweets

In [None]:
# Create a feature char_count: the length of each entry in the 'content' column
tweets['char_count'] = tweets['content'].apply(len)

# Print the average character count
print(tweets['char_count'].mean())
      
# 103.462

### Word count of TED talks

In [None]:
# Function that returns number of words in a string
def count_words(string):
    """Return the number of whitespace-separated words in ``string``."""
    # split() with no argument splits on runs of whitespace and ignores
    # leading/trailing whitespace, so an empty string yields 0
    return len(string.split())

# Create a new feature word_count
ted['word_count'] = ted['transcript'].apply(count_words)

# Print the average word count of the talks
print(ted['word_count'].mean())

# 1987.1

### Hashtags and mentions in Russian tweets

In [None]:
# Function that returns number of hashtags in a string
def count_hashtags(string):
    """Return how many whitespace-separated words in ``string`` start with '#'."""
    # Each matching word contributes 1 to the total
    return sum(1 for token in string.split() if token.startswith('#'))

# Create a feature hashtag_count and display distribution
tweets['hashtag_count'] = tweets['content'].apply(count_hashtags)
tweets['hashtag_count'].hist()
plt.title('Hashtag count distribution')
plt.show()

In [None]:
# Function that returns number of mentions in a string
def count_mentions(string):
    """Return how many whitespace-separated words in ``string`` start with '@'."""
    # Each matching word contributes 1 to the total
    return sum(1 for token in string.split() if token.startswith('@'))

# Create a feature mention_count and display distribution
tweets['mention_count'] = tweets['content'].apply(count_mentions)
tweets['mention_count'].hist()
plt.title('Mention count distribution')
plt.show()

## Readability tests

### Readability of 'The Myth of Sisyphus'

In [None]:
# Import Textatistic
from textatistic import Textatistic

# Compute the readability scores 
readability_scores = Textatistic(sisyphus_essay).scores

# Print the flesch reading ease score
flesch = readability_scores['flesch_score']
print("The Flesch Reading Ease is %.2f" % (flesch))

# The Flesch Reading Ease is 81.67

### Readability of various publications

In [None]:
# Import Textatistic
from textatistic import Textatistic

# List of excerpts
excerpts = [forbes, harvard_law, r_digest, time_kids]

# Compute the gunning fog index of every excerpt, in the same order as `excerpts`
gunning_fog_scores = [
    Textatistic(excerpt).scores['gunningfog_score'] for excerpt in excerpts
]
  
# Print the gunning fog indices
print(gunning_fog_scores)

# [14.436002482929858, 20.735401069518716, 11.085587583148559, 5.926785009861934]

# 2. Text preprocessing, POS tagging and NER

## Tokenization and Lemmatization

### Tokenizing the Gettysburg address

In [None]:
import spacy

# Load the en_core_web_sm model
nlp = spacy.load('en_core_web_sm')

# Create a Doc object
doc = nlp(gettysburg)

# Generate the tokens
tokens = [token.text for token in doc]
print(tokens)

"""
['Four', 'score', 'and', 'seven', 'years', 'ago', 'our', 'fathers', 'brought', 'forth', 'on', 'this', 'continent', ',', 'a', 
'new', 'nation', ',', 'conceived', 'in', 'Liberty', ',', 'and', 'dedicated', 'to', 'the', 'proposition', 'that', 'all', 'men',
'are', 'created', 'equal', '.', 'Now', 'we', "'re", 'engaged', 'in', 'a', 'great', 'civil', 'war', ',', 'testing', 'whether', 
'that', 'nation', ',', 'or', 'any', 'nation', 'so', 'conceived', 'and', 'so', 'dedicated', ',', 'can', 'long', 'endure', '.', 
'We', "'re", 'met', 'on', 'a', 'great', 'battlefield', 'of', 'that', 'war', '.', 'We', "'ve", 'come', 'to', 'dedicate', 'a', 
'portion', 'of', 'that', 'field', ',', 'as', 'a', 'final', 'resting', 'place', 'for', 'those', 'who', 'here', 'gave', 'their', 
'lives', 'that', 'that', 'nation', 'might', 'live', '.', 'It', "'s", 'altogether', 'fitting', 'and', 'proper', 'that', 'we', 
'should', 'do', 'this', '.', 'But', ',', 'in', 'a', 'larger', 'sense', ',', 'we', 'ca', "n't", 'dedicate', '-', 'we', 'can', 
'not', 'consecrate', '-', 'we', 'can', 'not', 'hallow', '-', 'this', 'ground', '.', 'The', 'brave', 'men', ',', 'living', 'and',
'dead', ',', 'who', 'struggled', 'here', ',', 'have', 'consecrated', 'it', ',', 'far', 'above', 'our', 'poor', 'power', 'to', 
'add', 'or', 'detract', '.', 'The', 'world', 'will', 'little', 'note', ',', 'nor', 'long', 'remember', 'what', 'we', 'say', 
'here', ',', 'but', 'it', 'can', 'never', 'forget', 'what', 'they', 'did', 'here', '.', 'It', 'is', 'for', 'us', 'the', 
'living', ',', 'rather', ',', 'to', 'be', 'dedicated', 'here', 'to', 'the', 'unfinished', 'work', 'which', 'they', 'who', 
'fought', 'here', 'have', 'thus', 'far', 'so', 'nobly', 'advanced', '.', 'It', "'s", 'rather', 'for', 'us', 'to', 'be', 'here',
'dedicated', 'to', 'the', 'great', 'task', 'remaining', 'before', 'us', '-', 'that', 'from', 'these', 'honored', 'dead', 'we', 
'take', 'increased', 'devotion', 'to', 'that', 'cause', 'for', 'which', 'they', 'gave', 'the', 'last', 'full', 'measure', 'of',
'devotion', '-', 'that', 'we', 'here', 'highly', 'resolve', 'that', 'these', 'dead', 'shall', 'not', 'have', 'died', 'in', 
'vain', '-', 'that', 'this', 'nation', ',', 'under', 'God', ',', 'shall', 'have', 'a', 'new', 'birth', 'of', 'freedom', '-', 
'and', 'that', 'government', 'of', 'the', 'people', ',', 'by', 'the', 'people', ',', 'for', 'the', 'people', ',', 'shall', 
'not', 'perish', 'from', 'the', 'earth', '.']
"""

### Lemmatizing the Gettysburg address

In [None]:
# Print the gettysburg address
print(gettysburg)

"""
Four score and seven years ago our fathers brought forth on this continent, a new nation, conceived in Liberty, 
and dedicated to the proposition that all men are created equal. Now we're engaged in a great civil war, testing whether 
that nation, or any nation so conceived and so dedicated, can long endure. We're met on a great battlefield of that war. 
We've come to dedicate a portion of that field, as a final resting place for those who here gave their lives that that nation
might live. It's altogether fitting and proper that we should do this. But, in a larger sense, we can't dedicate - we can not 
consecrate - we can not hallow - this ground. The brave men, living and dead, who struggled here, have consecrated it, far above
our poor power to add or detract. The world will little note, nor long remember what we say here, but it can never forget what 
they did here. It is for us the living, rather, to be dedicated here to the unfinished work which they who fought here have 
thus far so nobly advanced. It's rather for us to be here dedicated to the great task remaining before us - that from these 
honored dead we take increased devotion to that cause for which they gave the last full measure of devotion - that we here 
highly resolve that these dead shall not have died in vain - that this nation, under God, shall have a new birth of 
freedom - and that government of the people, by the people, for the people, shall not perish from the earth. 
"""

In [None]:
import spacy

# Load the en_core_web_sm model
nlp = spacy.load('en_core_web_sm')

# Create a Doc object
doc = nlp(gettysburg)

# Generate lemmas
lemmas = [token.lemma_ for token in doc]

In [None]:
import spacy

# Load the en_core_web_sm model
nlp = spacy.load('en_core_web_sm')

# Create a Doc object
doc = nlp(gettysburg)

# Generate lemmas
lemmas = [token.lemma_ for token in doc]

# Convert lemmas into a string
print(' '.join(lemmas))

"""
 four score and seven year ago our father bring forth on this continent , a new nation , conceive in Liberty , and dedicate to
 the proposition that all man be create equal . now we be engage in a great civil war , test whether that nation , or any nation
 so conceive and so dedicated , can long endure . we be meet on a great battlefield of that war . we 've come to dedicate a 
 portion of that field , as a final resting place for those who here give their life that that nation might live . it be 
 altogether fitting and proper that we should do this . but , in a large sense , we ca n't dedicate - we can not consecrate - 
 we can not hallow - this ground . the brave man , live and dead , who struggle here , have consecrate it , far above our poor 
 power to add or detract . the world will little note , nor long remember what we say here , but it can never forget what they 
 do here . it be for we the living , rather , to be dedicate here to the unfinished work which they who fight here have thus 
 far so nobly advanced . it be rather for we to be here dedicate to the great task remain before we - that from these honor 
 dead we take increased devotion to that cause for which they give the last full measure of devotion - that we here highly 
 resolve that these dead shall not have die in vain - that this nation , under God , shall have a new birth of freedom - and 
 that government of the people , by the people , for the people , shall not perish from the earth 
"""

## Text cleaning

### Cleaning a blog post

In [None]:
# Load model and create Doc object
nlp = spacy.load('en_core_web_sm')
doc = nlp(blog)

# Generate lemmatized tokens
lemmas = [token.lemma_ for token in doc]

# Remove stopwords and non-alphabetic tokens
a_lemmas = [lemma for lemma in lemmas 
            if lemma.isalpha() and lemma not in stopwords]

# Print string after text cleaning
print(' '.join(a_lemmas))

"""
Twenty-first-century politics has witnessed an alarming rise of populism in the U.S. and Europe. The first warning signs came 
with the UK Brexit Referendum vote in 2016 swinging in the way of Leave. This was followed by a stupendous victory by 
billionaire Donald Trump to become the 45th President of the United States in November 2016. Since then, Europe has seen a 
steady rise in populist and far-right parties that have capitalized on Europe’s Immigration Crisis to raise nationalist and 
anti-Europe sentiments. Some instances include Alternative for Germany (AfD) winning 12.6% of all seats and entering the 
Bundestag, thus upsetting Germany’s political order for the first time since the Second World War, the success of the Five Star
Movement in Italy and the surge in popularity of neo-nazism and neo-fascism in countries such as Hungary, Czech Republic, Poland
and Austria.


century politic witness alarming rise populism Europe warning sign come UK Brexit Referendum vote swinging way Leave follow 
stupendous victory billionaire Donald Trump President United States November Europe steady rise populist far right party 
capitalize Europe Immigration Crisis raise nationalist anti europe sentiment instance include Alternative Germany AfD win seat 
enter Bundestag upset Germany political order time Second World War success Five Star Movement Italy surge popularity neo 
nazism neo fascism country Hungary Czech Republic Poland Austria

"""

### Cleaning TED talks in a dataframe

In [None]:
# Function to preprocess text
def preprocess(text):
    """Lemmatize ``text`` and drop stopwords and non-alphabetic lemmas.

    Relies on the module-level ``nlp`` pipeline and ``stopwords`` collection.

    Args:
        text: Raw string to clean.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    # The 'ner' and 'parser' components are disabled for this call
    parsed = nlp(text, disable=['ner', 'parser'])
    all_lemmas = (token.lemma_ for token in parsed)
    # Keep purely alphabetic lemmas that are not listed in `stopwords`
    kept = (lemma for lemma in all_lemmas
            if lemma.isalpha() and lemma not in stopwords)
    return ' '.join(kept)
  
# Apply preprocess to ted['transcript']
ted['transcript'] = ted['transcript'].apply(preprocess)
print(ted['transcript'])

"""
    0     talk new lecture TED I illusion create TED I t...
    1     representation brain brain break left half log...
    2     great honor today share Digital Universe creat...
    3     passion music technology thing combination thi...
    4     use want computer new program programming requ...
    5     I neuroscientist mixed background physics medi...
    6     Pat Mitchell day January begin like work love ...
    7     Taylor Wilson I year old I nuclear physicist l...
    8     I grow Northern Ireland right north end absolu...
    9     I publish article New York Times Modern Love c...
    10    Joseph Member Parliament Kenya picture Maasai ...
    11    hi I talk little bit music machine life specif...
    12    hi let I ask audience question lie child raise...
    13    historical record allow know ancient Greeks dr...
    14    good morning I little boy I experience change ...
    15    I slide I year ago time I short slide morning ...
    16    I like world I like share year old love story ...
    17    I fail woman I fail feminist I passionate opin...
    18    revolution century significant longevity revol...
    19    today baffle lady observe shell soul dwellsand...
    Name: transcript, dtype: object
"""

## Part-of-speech tagging

### POS tagging in Lord of the Flies

In [None]:
# Load the en_core_web_sm model
nlp = spacy.load('en_core_web_sm')

# Create a Doc object
doc = nlp(lotf)

# Generate tokens and pos tags
pos = [(token.text, token.pos_) for token in doc]
print(pos)

"""
He found himself understanding the wearisomeness of this life, where every path was an improvisation and a considerable part of
one’s waking life was spent watching one’s feet.


[('He', 'PRON'), ('found', 'VERB'), ('himself', 'PRON'), ('understanding', 'VERB'), ('the', 'DET'), ('wearisomeness', 'NOUN'),
('of', 'ADP'), ('this', 'DET'), ('life', 'NOUN'), (',', 'PUNCT'), ('where', 'ADV'), ('every', 'DET'), ('path', 'NOUN'), 
('was', 'VERB'), ('an', 'DET'), ('improvisation', 'NOUN'), ('and', 'CCONJ'), ('a', 'DET'), ('considerable', 'ADJ'), 
('part', 'NOUN'), ('of', 'ADP'), ('one', 'PRON'), ('’s', 'ADV'), ('waking', 'VERB'), ('life', 'NOUN'), ('was', 'AUX'), 
('spent', 'VERB'), ('watching', 'VERB'), ('one', 'NUM'), ('’s', 'NOUN'), ('feet', 'NOUN'), ('.', 'PUNCT')]
"""

### Counting nouns in a piece of text

In [None]:
nlp = spacy.load('en_core_web_sm')

# Returns number of proper nouns
def proper_nouns(text, model=nlp):
    """Count the tokens of ``text`` whose ``pos_`` tag is 'PROPN'.

    Args:
        text: String to analyse.
        model: Callable that turns ``text`` into an iterable of tokens exposing
            ``pos_``; defaults to the module-level ``nlp`` pipeline.

    Returns:
        The number of tokens tagged 'PROPN'.
    """
    # Run the pipeline once and tally the matching tokens
    return sum(1 for token in model(text) if token.pos_ == 'PROPN')

print(proper_nouns("Abdul, Bill and Cathy went to the market to buy apples.", nlp))

#  3

In [None]:
nlp = spacy.load('en_core_web_sm')

# Returns number of other nouns
def nouns(text, model=nlp):
    """Count the tokens of ``text`` whose ``pos_`` tag is 'NOUN'.

    Args:
        text: String to analyse.
        model: Callable that turns ``text`` into an iterable of tokens exposing
            ``pos_``; defaults to the module-level ``nlp`` pipeline.

    Returns:
        The number of tokens tagged 'NOUN'.
    """
    # Run the pipeline once and tally the matching tokens
    return sum(1 for token in model(text) if token.pos_ == 'NOUN')

print(nouns("Abdul, Bill and Cathy went to the market to buy apples.", nlp))

# 2

### Noun usage in fake news

In [None]:
headlines['num_propn'] = headlines['title'].apply(proper_nouns)

# Compute mean of proper nouns
real_propn = headlines[headlines['label'] == 'REAL']['num_propn'].mean()
fake_propn = headlines[headlines['label'] == 'FAKE']['num_propn'].mean()

# Print results
print("Mean no. of proper nouns in real and fake headlines are %.2f and %.2f respectively"%(real_propn, fake_propn))

# Mean no. of proper nouns in real and fake headlines are 2.40 and 4.67 respectively

In [None]:
headlines['num_noun'] = headlines['title'].apply(nouns)

# Compute mean of other nouns
real_noun = headlines[headlines['label'] == 'REAL']['num_noun'].mean()
fake_noun = headlines[headlines['label'] == 'FAKE']['num_noun'].mean()

# Print results
print("Mean no. of other nouns in real and fake headlines are %.2f and %.2f respectively"%(real_noun, fake_noun))


# Mean no. of other nouns in real and fake headlines are 2.28 and 1.84 respectively

## Named entity recognition

### Named entities in a sentence

In [None]:
# Load the required model
nlp = spacy.load('en_core_web_sm')

# Create a Doc instance 
text = 'Sundar Pichai is the CEO of Google. Its headquarters is in Mountain View.'
doc = nlp(text)

# Print all named entities and their labels
for ent in doc.ents:
    print(ent.text, ent.label_)
    
"""
    Google ORG
    Mountain View GPE
"""

### Identifying people mentioned in a news article

In [None]:
def find_persons(text):
    """Return the text of every named entity in ``text`` labelled 'PERSON'.

    Uses the module-level ``nlp`` pipeline; entities are returned in the order
    they appear in ``doc.ents``.
    """
    # Filter the recognised entities down to persons
    return [entity.text for entity in nlp(text).ents
            if entity.label_ == 'PERSON']

print(find_persons(tc))


"""
It’s been a busy day for Facebook exec op-eds. Earlier this morning, Sheryl Sandberg broke the site’s silence around the 
Christchurch massacre, and now Mark Zuckerberg is calling on governments and other bodies to increase regulation around the 
sorts of data Facebook traffics in. He’s hoping to get out in front of heavy-handed regulation and get a seat at the table 
shaping it.


['Sheryl Sandberg', 'Mark Zuckerberg']
"""

# 3. N-Gram models

## Building a bag of words model

### BoW model for movie taglines

In [None]:
# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Create CountVectorizer object
vectorizer = CountVectorizer()

# Generate matrix of word vectors
bow_matrix = vectorizer.fit_transform(corpus)

# Print the shape of bow_matrix
print(bow_matrix.shape)

"""
1            Roll the dice and unleash the excitement!
2    Still Yelling. Still Fighting. Still Ready for...
3    Friends are the people who let you be yourself...
4    Just When His World Is Back To Normal... He's ...
5                             A Los Angeles Crime Saga
Name: tagline, dtype: object

<script.py> output:
    (7033, 6614)
"""

### Analyzing dimensionality and preprocessing

In [None]:
# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Create CountVectorizer object
vectorizer = CountVectorizer()

# Generate matrix of word vectors
bow_lem_matrix = vectorizer.fit_transform(lem_corpus)

# Print the shape of bow_lem_matrix
print(bow_lem_matrix.shape)

"""
0    roll dice unleash excitement
1           yell fight ready love
2    friend people let let forget
3      world normal surprise life
4          los angeles crime saga
Name: 1, dtype: object

<script.py> output:
    (6959, 5223)
"""

### Mapping feature indices with feature names

In [None]:
# Create CountVectorizer object
vectorizer = CountVectorizer()

# Generate matrix of word vectors
bow_matrix = vectorizer.fit_transform(corpus)

# Convert bow_matrix into a DataFrame
bow_df = pd.DataFrame(bow_matrix.toarray())

# Map the column names to vocabulary.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement and returns the vocabulary in column order.
bow_df.columns = vectorizer.get_feature_names_out()

# Print bow_df
print(bow_df)

"""
['The lion is the king of the jungle', 'Lions have lifespans of a decade', 'The lion is an endangered species']

<script.py> output:
       an  decade  endangered  have  is  ...  lion  lions  of  species  the
    0   0       0           0     0   1  ...     1      0   1        0    3
    1   0       1           0     1   0  ...     0      1   1        0    0
    2   1       0           1     0   1  ...     1      0   0        1    1
    
    [3 rows x 13 columns]
"""

## Building a BoW Naive Bayes classifier

### BoW vectors for movie reviews

In [None]:
# Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Create a CountVectorizer object
vectorizer = CountVectorizer(lowercase=True, stop_words='english')

# Fit and transform X_train
X_train_bow = vectorizer.fit_transform(X_train)

# Transform X_test
X_test_bow = vectorizer.transform(X_test)

# Print shape of X_train_bow and X_test_bow
print(X_train_bow.shape)
print(X_test_bow.shape)

"""
    (250, 8158)
    (750, 8158)
"""

### Predicting the sentiment of a movie review

In [None]:
# Create a MultinomialNB object
clf = MultinomialNB()

# Fit the classifier
clf.fit(X_train_bow, y_train)

# Measure the accuracy
accuracy = clf.score(X_test_bow, y_test)
print("The accuracy of the classifier on the test set is %.3f" % accuracy)

# Predict the sentiment of a negative review
review = "The movie was terrible. The music was underwhelming and the acting mediocre."
prediction = clf.predict(vectorizer.transform([review]))[0]
print("The sentiment predicted by the classifier is %i" % (prediction))


"""
    The accuracy of the classifier on the test set is 0.732
    The sentiment predicted by the classifier is 0
"""

## Building n-gram models

### n-gram models for movie tag lines

In [None]:
# Generate n-grams up to n=1 (unigrams only)
vectorizer_ng1 = CountVectorizer(ngram_range=(1,1))
ng1 = vectorizer_ng1.fit_transform(corpus)

# Generate n-grams up to n=2 (unigrams and bigrams)
vectorizer_ng2 = CountVectorizer(ngram_range=(1,2))
ng2 = vectorizer_ng2.fit_transform(corpus)

# Generate n-grams up to n=3 (unigrams, bigrams and trigrams)
vectorizer_ng3 = CountVectorizer(ngram_range=(1,3))
ng3 = vectorizer_ng3.fit_transform(corpus)

# Print the number of features (matrix columns) for each model
print("ng1, ng2 and ng3 have %i, %i and %i features respectively" % (ng1.shape[1], ng2.shape[1], ng3.shape[1]))

"""
    ng1, ng2 and ng3 have 6614, 37100 and 76881 features respectively
"""

### Higher order n-grams for sentiment analysis

In [None]:
# Define an instance of MultinomialNB 
clf_ng = MultinomialNB()

# Fit the classifier
clf_ng.fit(X_train_ng, y_train)

# Measure the accuracy
accuracy = clf_ng.score(X_test_ng, y_test)
print("The accuracy of the classifier on the test set is %.3f" % accuracy)

# Predict the sentiment of a negative review
review = "The movie was not good. The plot had several holes and the acting lacked panache."
prediction = clf_ng.predict(ng_vectorizer.transform([review]))[0]
print("The sentiment predicted by the classifier is %i" % (prediction))

"""
    The accuracy of the classifier on the test set is 0.758
    The sentiment predicted by the classifier is 0
"""

### Comparing performance of n-gram models

In [None]:
# Time the whole split / vectorize / fit / score run
start_time = time.time()
# Splitting the data into training and test sets
train_X, test_X, train_y, test_y = train_test_split(df['review'], df['sentiment'], test_size=0.5, random_state=42, stratify=df['sentiment'])

# Generating ngrams (default CountVectorizer settings)
vectorizer = CountVectorizer()
train_X = vectorizer.fit_transform(train_X)
test_X = vectorizer.transform(test_X)

# Fit classifier
clf = MultinomialNB()
clf.fit(train_X, train_y)

# Print accuracy, time and number of dimensions.
# A "..." literal cannot span lines, so the message is split into two adjacent
# literals; they are joined before the % formatting is applied.
print("The program took %.3f seconds to complete. The accuracy on the test set is %.2f. "
      "The ngram representation had %i features." % (time.time() - start_time, clf.score(test_X, test_y), train_X.shape[1]))
      
"""
The program took 0.141 seconds to complete. The accuracy on the test set is 0.75. The ngram representation had 12347 features.
"""


In [None]:
# Time the whole split / vectorize / fit / score run
start_time = time.time()
# Splitting the data into training and test sets
train_X, test_X, train_y, test_y = train_test_split(df['review'], df['sentiment'], test_size=0.5, random_state=42, stratify=df['sentiment'])

# Generating ngrams (n from 1 to 3)
vectorizer = CountVectorizer(ngram_range=(1,3))
train_X = vectorizer.fit_transform(train_X)
test_X = vectorizer.transform(test_X)

# Fit classifier
clf = MultinomialNB()
clf.fit(train_X, train_y)

# Print accuracy, time and number of dimensions.
# A "..." literal cannot span lines, so the message is split into two adjacent
# literals; they are joined before the % formatting is applied.
print("The program took %.3f seconds to complete. The accuracy on the test set is %.2f. "
      "The ngram representation had %i features." % (time.time() - start_time, clf.score(test_X, test_y), train_X.shape[1]))
      
"""
The program took 0.971 seconds to complete. The accuracy on the test set is 0.77. The ngram representation had 178240 features.
"""

# 4. TF-IDF and similarity scores

## Building tf-idf document vectors

### tf-idf vectors for TED talks

In [None]:
# Import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TfidfVectorizer object
vectorizer = TfidfVectorizer()

# Generate matrix of word vectors
tfidf_matrix = vectorizer.fit_transform(ted)

# Print the shape of tfidf_matrix
print(tfidf_matrix.shape)

#  (500, 29158)

## Cosine similarity

### Computing dot product

In [None]:
# Initialize numpy vectors
A = np.array([1,3])
B = np.array([-2, 2])

# Compute dot product
dot_prod = np.dot(A, B)

# Print dot product
print(dot_prod)

# 4

### Cosine similarity matrix of a corpus

In [None]:
# Initialize an instance of tf-idf Vectorizer
tfidf_vectorizer = TfidfVectorizer()

# Generate the tf-idf vectors for the corpus
tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

# Compute and print the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix,tfidf_matrix)
print(cosine_sim)

"""
corpus:
 ['The sun is the largest celestial body in the solar system', 'The solar system consists of the sun and eight revolving planets', 'Ra was the Egyptian Sun God', 'The Pyramids were the pinnacle of Egyptian architecture', 'The quick brown fox jumps over the lazy dog']

<script.py> output:
    [[1.         0.36413198 0.18314713 0.18435251 0.16336438]
     [0.36413198 1.         0.15054075 0.21704584 0.11203887]
     [0.18314713 0.15054075 1.         0.21318602 0.07763512]
     [0.18435251 0.21704584 0.21318602 1.         0.12960089]
     [0.16336438 0.11203887 0.07763512 0.12960089 1.        ]]
"""

## Building a plot line based recommender

### Comparing linear_kernel and cosine_similarity

In [None]:
# Record start time
start = time.time()

# Compute cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Print cosine similarity matrix
print(cosine_sim)

# Print time taken
print("Time taken: %s seconds" %(time.time() - start))

"""
    [[1.         0.         0.         ... 0.         0.         0.        ]
     [0.         1.         0.         ... 0.         0.         0.        ]
     [0.         0.         1.         ... 0.         0.01418221 0.        ]
     ...
     [0.         0.         0.         ... 1.         0.01589009 0.        ]
     [0.         0.         0.01418221 ... 0.01589009 1.         0.        ]
     [0.         0.         0.         ... 0.         0.         1.        ]]
    Time taken: 1.0700690746307373 seconds
"""

In [None]:
# Record start time
start = time.time()

# Compute cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Print cosine similarity matrix
print(cosine_sim)

# Print time taken
print("Time taken: %s seconds" %(time.time() - start))

"""
    [[1.         0.         0.         ... 0.         0.         0.        ]
     [0.         1.         0.         ... 0.         0.         0.        ]
     [0.         0.         1.         ... 0.         0.01418221 0.        ]
     ...
     [0.         0.         0.         ... 1.         0.01589009 0.        ]
     [0.         0.         0.01418221 ... 0.01589009 1.         0.        ]
     [0.         0.         0.         ... 0.         0.         1.        ]]
    Time taken: 1.0700690746307373 seconds

<script.py> output:
    [[1.         0.         0.         ... 0.         0.         0.        ]
     [0.         1.         0.         ... 0.         0.         0.        ]
     [0.         0.         1.         ... 0.         0.01418221 0.        ]
     ...
     [0.         0.         0.         ... 1.         0.01589009 0.        ]
     [0.         0.         0.01418221 ... 0.01589009 1.         0.        ]
     [0.         0.         0.         ... 0.         0.         1.        ]]
    Time taken: 0.24660348892211914 seconds
"""

### Plot recommendation engine

In [None]:
# Initialize the TfidfVectorizer 
tfidf = TfidfVectorizer(stop_words='english')

# Construct the TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(movie_plots)

# Generate the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
 
# Generate recommendations
print(get_recommendations('The Dark Knight Rises', cosine_sim, indices))

"""
    1                              Batman Forever
    2                                      Batman
    3                              Batman Returns
    8                  Batman: Under the Red Hood
    9                            Batman: Year One
    10    Batman: The Dark Knight Returns, Part 1
    11    Batman: The Dark Knight Returns, Part 2
    5                Batman: Mask of the Phantasm
    7                               Batman Begins
    4                              Batman & Robin
    Name: title, dtype: object
"""

### The recommender function

In [None]:
# Generate mapping between titles and index.
# Duplicates are dropped on the titles (the Series index), keeping the first
# row for each title, so that indices[title] is a single value rather than a
# Series. Series.drop_duplicates() would only compare the values.
indices = pd.Series(metadata.index, index=metadata['title'])
indices = indices[~indices.index.duplicated(keep='first')]

def get_recommendations(title, cosine_sim, indices, titles=None, top_n=10):
    """Return the titles most similar to ``title``, most similar first.

    Args:
        title: Title to look up in ``indices``.
        cosine_sim: Similarity matrix; ``cosine_sim[i]`` is the sequence of
            scores of item ``i`` against every item.
        indices: Mapping from title to row position in ``cosine_sim``
            (for example a pandas Series indexed by title).
        titles: Series of titles addressed by position. Defaults to the
            module-level ``metadata['title']``.
        top_n: Maximum number of recommendations to return (default 10).

    Returns:
        The entries of ``titles`` for the ``top_n`` highest-scoring items,
        excluding the queried item itself.

    Raises:
        Whatever ``indices[title]`` raises when ``title`` is unknown
        (``KeyError`` for a pandas Series or a dict).
    """
    if titles is None:
        titles = metadata['title']

    # Get index of movie that matches title
    idx = indices[title]
    # A duplicated title makes a pandas lookup return several rows; use the first
    if hasattr(idx, 'iloc'):
        idx = idx.iloc[0]

    # Drop the queried movie explicitly instead of assuming it sorts first:
    # another movie with an equal score could otherwise take its place and the
    # queried movie would be recommended to itself.
    candidates = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]

    # Sort the movies based on the similarity scores (stable, highest first)
    ranked = sorted(candidates, key=lambda pair: pair[1], reverse=True)

    # Get the movie indices of the top_n most similar movies
    movie_indices = [i for i, _ in ranked[:top_n]]

    # Return the most similar movies
    return titles.iloc[movie_indices]

"""
               title                                            tagline
938  Cinema Paradiso  A celebration of youth, friendship, and the ev...
630         Spy Hard  All the action. All the women. Half the intell...
682        Stonewall                    The fight for the right to love
514           Killer                    You only hurt the one you love.
365    Jason's Lyric                                   Love is courage.

"""

### TED talk recommender

In [None]:
# Initialize the TfidfVectorizer 
tfidf = TfidfVectorizer(stop_words='english')

# Construct the TF-IDF matrix
tfidf_matrix = tfidf.fit_transform(transcripts)

# Generate the cosine similarity matrix
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
 
# Generate recommendations
print(get_recommendations('5 ways to kill your dreams', cosine_sim, indices))

"""
    453             Success is a continuous journey
    157                        Why we do what we do
    494                   How to find work you love
    149          My journey into movies that matter
    447                        One Laptop per Child
    230             How to get your ideas to spread
    497         Plug into your hard-wired happiness
    495    Why you will fail to have a great career
    179             Be suspicious of simple stories
    53                          To upgrade is human
    Name: title, dtype: object
"""

## Beyond n-grams: word embeddings

### Generating word vectors

In [None]:
# Create the doc object
doc = nlp(sent)

# Compute pairwise similarity scores
for token1 in doc:
  for token2 in doc:
    print(token1.text, token2.text, token1.similarity(token2))
    
"""
I like apples and oranges

<script.py> output:
    I I 1.0
    I like 0.13463897
    I apples -0.036133606
    I and -0.085230574
    I oranges 0.033708632
    like I 0.13463897
    like like 1.0
    like apples 0.0007651703
    like and 0.104521796
    like oranges -0.045859136
    apples I -0.036133606
    apples like 0.0007651703
    apples apples 1.0
    apples and -0.051072996
    apples oranges 0.46452007
    and I -0.085230574
    and like 0.104521796
    and apples -0.051072996
    and and 1.0
    and oranges 0.038236685
    oranges I 0.033708632
    oranges like -0.045859136
    oranges apples 0.46452007
    oranges and 0.038236685
    oranges oranges 1.0
"""

### Computing similarity of Pink Floyd songs

In [None]:
# Create Doc objects
mother_doc = nlp(mother)
hopes_doc = nlp(hopes)
hey_doc = nlp(hey)

# Print similarity between mother and hopes
print(mother_doc.similarity(hopes_doc))

# Print similarity between mother and hey
print(mother_doc.similarity(hey_doc))

"""
<script.py> output:
    0.39086030814019257
    0.8043759483951038
"""