In [1]:
# Natural Language Processing or Text Mining - Text Data is Unsupervised Learning Data.
# Unsupervised Learning means no proper structure, no variables, and also traditional models
# do not work on this data

# Text Data must be processed and convert to Supervised Learning where it will have proper
# structure, variable formation and traditional models will work.

# Different types of text data in terms of documents(PDF, Word Docs, OCR Files(PDF Images),
# Web Pages, Social Media Posts/Updates/Text Content, Databases(XML Format), Text from Images,
# IPO Documents(Redherring Prospectus), etc.)

In [2]:
# Web Scraping - Scraping text data from webpages. Web Pages are typically html pages
# that has many other factors along with text like headings, Page Details, Styling, fonts,
# etc.

# Scrape Text content only. Libraries used are requests, bs4, nltk
# requests library for url page scraping
# bs4 for scraping text content

In [3]:
import requests
from bs4 import BeautifulSoup

In [4]:
response = requests.get("https://www.britannica.com/science/climate-change/Climate-change-since-the-emergence-of-civilization")
soup = BeautifulSoup(response.text, 'lxml')

In [5]:
paragraphs = soup.find_all('p')
paragraphs_txt=[p.text for p in paragraphs]

In [6]:
# Text preprocessing - Cleaning up text using re library. re(Regular Expressions) library
# used for identifying different text patterns and clean them.

# Text patterns like email, digits, words, spaces, word boundaries(start/end), etc are
# predefined and are used for cleaning text data.

# Text preprocessing involves removing punctuations, special characters, digits, spaces,
# emoji's, hyperlinks, specific characters, etc.

![image.png](attachment:ed226b74-2beb-4118-9d91-67e4891c49e6.png)

In [7]:
import re

In [8]:
pattern=r'[^a-zA-Z0-9\s.]'

In [9]:
# Assuming paragraphs_txt is a list of strings from BeautifulSoup
# Join all paragraphs into a single string
processed_text = " ".join(paragraphs_txt)

# Remove numbers
processed_text = re.sub(r'[0-9]+', '', processed_text)

# Remove special characters/punctuation (replace with space to avoid merging words)
# pattern is defined as r'[^a-zA-Z0-9\s.]'
processed_text = re.sub(pattern, ' ', processed_text)

# Normalize multiple spaces to a single space and strip leading/trailing whitespace
processed_text = re.sub(r'\s+', ' ', processed_text).strip()

# Convert to lower case
paragraphs_txt = processed_text.lower()

In [10]:
# This step is now handled in the previous cell.

In [11]:
# This step is now handled in the previous cell.

In [12]:
# Tokenization - Break text into tokens/words or sentences.
# Sentence Tokenization - Breaking text into sentences. default delimiter is fullstop
# Word Tokenization - Breaking text into words or tokens. default delimiter is space

In [None]:
# !pip install nltk
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download("all")

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |  

True

In [None]:
climatesentences=sent_tokenize(paragraphs_txt) #store all sentences in the graph

In [None]:
# On Sentences - Sentiment Analysis is done. positive, negative, neutral sentiments are
# generated for each sentence.
# Many Sentiment Analysis Models like VADER, Text Blob Polarity Model, Stanford Sentiment Model
# ,nltk sentiment , etc.

# Most popular and highly accurate is Text Blob Library sentiment model. Text Blob Model
# provides 2 scores - Polarity Score and Subjectivity Score.
# Polarity Score is a value that lies between -1 to 1. Using this score sentiment classification
# done

# Subjectivity Score is a value that lies between 0 and 1. Close to 1 is high personal
# opinion (involves adverbs & Superlatives) and Close to 0 is low personal opinion.

In [None]:
# !pip install textblob
from textblob import TextBlob

In [None]:
TextBlob("Tendulkar is greatest batsman in Cricket").sentiment

Sentiment(polarity=1.0, subjectivity=1.0)

In [None]:
TextBlob("Tendulkar is great batsman").sentiment

Sentiment(polarity=0.8, subjectivity=0.75)

In [None]:
TextBlob("Tendulkar is most reputed cricketer").sentiment

Sentiment(polarity=0.5, subjectivity=0.5)

In [None]:
def analyze_sentiment(text):
    analysis=TextBlob(text)
    if analysis.sentiment.polarity>0:
        return "Positive"
    elif analysis.sentiment.polarity==0:
        return "Neutral"
    else:
        return "Negative"

In [None]:
import pandas as pd
climatesentences=pd.DataFrame(climatesentences,columns=['sentence'])

In [None]:
climatesentences['sentiment']=[str(analyze_sentiment(x)) for x in climatesentences['sentence']]

This line of code processes each sentence in your climatesentences DataFrame to determine its sentiment. Here's a breakdown:

climatesentences['sentence']: This selects the 'sentence' column from your climatesentences DataFrame.
for x in climatesentences['sentence']: This part creates a loop that goes through each individual sentence (x) within that 'sentence' column.
analyze_sentiment(x): For each sentence (x), it calls the analyze_sentiment function you defined earlier. This function takes a text string and returns 'Positive', 'Neutral', or 'Negative' based on its polarity.
str(...): This ensures that the output of analyze_sentiment (which is already a string) is explicitly treated as a string, although in this case, it might not change the value.
[...]: The square brackets indicate a list comprehension.


In [None]:
climatesentences['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1


In [None]:
climatesentences.head()

Unnamed: 0,sentence,sentiment


In [None]:
# NLP uses words or tokens as fundamental point of analysis
climatewords=word_tokenize(paragraphs_txt)

In [None]:
# isalnum() will select only words and digits. All special characters deleted
climatewords=[w for w in climatewords if w.isalnum()]
climatewords

[]

The isalnum() method is a built-in string method in Python. It returns True if all characters in the string are alphanumeric (i.e., either alphabets or numbers) and there is at least one character, and False otherwise.

In [None]:
# Remove Stopwords. Stopwords are list of words like is, a, an, the, then, to, etc. that
# are not required for analysis
from nltk.corpus import stopwords

In [None]:
english_stopwords=set(stopwords.words("english"))

In [None]:
climatewords=[w for w in climatewords if not w in english_stopwords] # store all words except stopwords

In [None]:
climatewords=[w for w in climatewords if len(w)>2] # Select words more than 2 characters

In [None]:
from nltk.probability import FreqDist

In [None]:
wordfreq=FreqDist(climatewords) #FRequency distribution of words

In [None]:
wordfreq.most_common(20)

[]

In [None]:
# Word Cloud is a vizual representation of most frequent words. large font size most frequent
# small font size less frequent.

from wordcloud import WordCloud

In [None]:
# wordcloud=WordCloud(width=1000,height=500,stopwords=english_stopwords,colormap="plasma",max_words=200).generate(str(climatewords))

In [None]:
#import matplotlib.pyplot as plt

#plt.imshow(wordcloud)

In [None]:
# Vectorization is process of converting text or word/tokens into matrix of numbers
# Algorithms will work only with numbers, hence words/tokens must be vectorized.
# 2 types of vectorization are widely used -
# a ) Document Term Matrix - By default documents/sentences in rows and words/tokens
# in columns.

# Doc1 - "the car is driven on the road"
# Doc2 - "the truck is driven on the highway"
# Doc3 - " car and truck are both driven on highway"

# post preprocessing
# Doc1 - "car", "driven", "road"
# Doc2 - "truck", "driven", "highway"
# Doc3 - "car", "truck", "driven", "highway"

# Document Term Matrix
#           car  driven road truck highway
# Doc 1 -    1     1     1     0    0
# Doc 2 -    0     1     0     1    1
# Doc 3 -    1     1     0     1    1

In [None]:
# CountVectorizer() is a predefined function for creating Document Term Matrix
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
DTM=CountVectorizer(max_features=30,stop_words="english")
# max_features = number of columns/words to be considered (Top N)

You can create a Document Term Matrix (DTM) not only using CountVectorizer but also with other vectorizers, most notably TfidfVectorizer.

In [None]:
# X_DTM=DTM.fit_transform(climatesentences['sentence'])

In [None]:
# pd.DataFrame(X_DTM.toarray(),columns=DTM.get_feature_names_out()).head()

TfidfVectorizer: This vectorizer also creates a matrix similar to a DTM, but instead of raw counts, the cells contain TF-IDF (Term Frequency-Inverse Document Frequency) values. TF-IDF gives more weight to words that are frequent in a specific document but rare across all documents, effectively highlighting terms that are more important to that particular document in the context of the entire corpus.

![image.png](attachment:07b0e29a-f72f-4321-b45e-de7881d6fad1.png)

In [None]:
# Doc1 - "the car is driven on the road"
# Doc2 - "the truck is driven on the highway"
# Doc3 - " car and truck are both driven on highway"

# post preprocessing
# Doc1 - "car", "driven", "road"
# Doc2 - "truck", "driven", "highway"
# Doc3 - "car", "truck", "driven", "highway"

# TFIDF for car
# Term Frequency of car - 1/3 * 1/4= 0.083
# Inverse Document Frequnecy of car = log(3/2)=0.4054
# TFIDF of car = 0.083 * 0.4054 = 0.0336482

# TFIDF Matrix
#           car         driven road  truck         highway
# Doc 1 -    0.0336482     1     1     0             0
# Doc 2 -    0             1     0     0.0336482   0.0336482
# Doc 3 -    0.0336482     1     0     0.0336482   0.0336482

In [None]:
# TFIDF of Driven
# Term Frequency of driven - 1/3 * 1/3 * 1/4 =0.02777
# Inverse Document Freqeuncy of driven - log(3/3) = 0
# TFIDF of driven - 0.02777 * 0 =0

# Laplace Smoothing must be done
# TFIDF with laplace smoothing = tf*log(N/(df+1))
# TFIDF of Driven
# Term Frequency of driven - 1/3 * 1/3 * 1/4 =0.02777
# Inverse Document Freqeuncy of driven - log(3/(3+1)) = -0.28768
# TFIDF of driven - 0.02777 * -0.28768 =-0.0079888736

# Sparse Matrices are matrices that have many Zeroes(Laplace smoothing is used in TF–IDF mainly for stability and edge-case handling)

#Without Laplace smoothing, the standard TF-IDF formula is TFIDF = Term Frequency (tf) * Inverse Document Frequency (idf).
 #The Inverse Document Frequency (idf) is calculated as log(N / df) where N is the total number of documents and df is the
 #document frequency of the term.

| Term \ Doc | Doc1   | Doc2   | Doc3   |
| ---------- | ------ | ------ | ------ |
| car        | 0      | 0      | 0      |
| driven     | −0.288 | −0.288 | −0.288 |
| road       | 0.405  | 0      | 0      |
| truck      | 0      | 0      | 0      |
| highway    | 0      | 0      | 0      |


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
tfidf=TfidfVectorizer(max_features=30,stop_words="english")

tfidf=TfidfVectorizer(max_features=30,stop_words="english"):

tfidf = TfidfVectorizer(...): This line initializes an object of the TfidfVectorizer class from scikit-learn. This object is designed to convert a collection of raw text documents into a matrix of TF-IDF features. TF-IDF stands for Term Frequency-Inverse Document Frequency, which is a numerical statistic intended to reflect how important a word is to a document in a collection or corpus.

max_features=30: This parameter tells the TfidfVectorizer to only consider the top 30 words (or features) that appear most frequently in your entire corpus of text. If you have thousands of unique words, this will select the 30 most significant ones based on TF-IDF scores.

stop_words="english": This parameter instructs the TfidfVectorizer to automatically remove common English stop words (like "the", "is", "a", "an", etc.) before processing the text. Stop words are usually filtered out because they carry little semantic meaning and don't contribute much to distinguishing between documents.

In [None]:
X_tfidf=tfidf.fit_transform(climatesentences['sentence'])

ValueError: empty vocabulary; perhaps the documents only contain stop words

In [None]:
pd.DataFrame(X_tfidf.toarray(),columns=tfidf.get_feature_names_out()).head()

In [None]:
# Unigrams - Single Words
# Bigrams - 2 Consecutive words
# Trigrams - 3 Consecutive words
# Unigrams, bigrams, and trigrams are terms used in natural language processing (NLP)
# to describe sequences of words, often referred to as n-grams:

In [None]:
tfidf_bigrams=TfidfVectorizer(max_features=30,stop_words="english",ngram_range=(2,2))

In [None]:
X_tfidf_bigrams=tfidf_bigrams.fit_transform(climatesentences['sentence'])

In [None]:
pd.DataFrame(X_tfidf_bigrams.toarray(),
             columns=tfidf_bigrams.get_feature_names_out()).head()