In [None]:
# Natural Language Processing (Text Mining) - text data is unstructured data.
# Unstructured means it has no fixed structure (no rows/columns of variables), so
# traditional models cannot be applied to the raw text directly.

# Text data must be processed and converted into a structured form with defined
# variables (features); only then will traditional models work on it.

# Different types of text data in terms of documents(PDF, Word Docs, OCR Files(PDF Images),
# Web Pages, Social Media Posts/Updates/Text Content, Databases(XML Format), Text from Images,
# IPO Documents(Redherring Prospectus), etc.)

In [None]:
# Web Scraping - extracting text data from web pages. Web pages are typically HTML pages
# that contain many other elements besides text, such as headings, page details, styling,
# fonts, etc.

# Scrape Text content only. Libraries used are requests, bs4, nltk
# requests library for url page scraping
# bs4 for scraping text content

In [None]:
import requests
from bs4 import BeautifulSoup

In [None]:
# Fetch the article. Fail fast on a stalled connection or an HTTP error (e.g. 403/404)
# instead of silently parsing an error page and carrying empty text through the notebook.
response = requests.get(
    "https://www.britannica.com/procon/climate-change-debate",
    timeout=30,
)
response.raise_for_status()

# Name the parser explicitly: without it bs4 picks whichever parser happens to be
# installed, emits GuessedAtParserWarning, and may build different trees per machine.
soup = BeautifulSoup(response.text, "html.parser")

In [None]:
# Collect every <p> element of the page, then keep only the text of each one.
paragraphs = soup.find_all('p')
paragraphs_txt = [tag.text for tag in paragraphs]

In [None]:
# Text preprocessing - cleaning up text using the re library. The re (Regular Expressions)
# library is used to identify different text patterns and clean them.

# Text patterns like email, digits, words, spaces, word boundaries(start/end), etc are
# predefined and are used for cleaning text data.

# Text preprocessing involves removing punctuation, special characters, digits, extra spaces,
# emojis, hyperlinks, other specific characters, etc.

![image.png](attachment:ed226b74-2beb-4118-9d91-67e4891c49e6.png)

In [None]:
import re

In [None]:
# Matches any single character that is NOT an ASCII letter, a digit, whitespace or a
# full stop; full stops are kept so the text can still be split into sentences later.
pattern=r'[^a-zA-Z0-9\s.]'

In [None]:
# Join the scraped paragraphs into one string before cleaning. Calling str() on the list
# would clean its Python repr instead, leaking list separators and escape-sequence
# residue (e.g. the "n" left over from "\n") into the text.
# The isinstance guard keeps this cell safe to re-run once paragraphs_txt is a string.
raw_text = paragraphs_txt if isinstance(paragraphs_txt, str) else " ".join(paragraphs_txt)

# Replace every disallowed character with a space (not an empty string) so that words
# joined only by punctuation, such as "mid-holocene", are not fused into one token.
paragraphs_txt = re.sub(pattern, " ", raw_text)

# re.sub(pattern_to_replace, replacement, text)

In [None]:
# Strip every run of digits from the cleaned text.
digit_runs = re.compile(r'[0-9]+')
paragraphs_txt = digit_runs.sub("", paragraphs_txt)

In [None]:
paragraphs_txt=paragraphs_txt.lower()

# Convert the text to lower case. Comparison word lists (predefined lists / lexicons)
# are in lower case, and string matching is case-sensitive: "ind" != "Ind", "ind" == "ind".
# NOTE(review): this runs before sentence tokenization, which presumably removes
# capitalization cues the sentence tokenizer could use - confirm whether to lowercase later.

In [None]:
# Tokenization - Break text into tokens/words or sentences.
# Sentence Tokenization - Breaking text into sentences. default delimiter is fullstop
# Word Tokenization - Breaking text into words or tokens. default delimiter is space

In [None]:
# !pip install nltk
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
#nltk.download("all")

In [None]:
 nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
# Split the cleaned text into sentences.
climatesentences=sent_tokenize(paragraphs_txt)

In [None]:
# On Sentences - Sentiment Analysis is done. positive, negative, neutral sentiments are
# generated for each sentence.
# Many Sentiment Analysis Models like VADER, Text Blob Polarity Model, Stanford Sentiment Model
# ,nltk sentiment , etc.

# Most popular and highly accurate is Text Blob Library sentiment model. Text Blob Model
# provides 2 scores - Polarity Score and Subjectivity Score.
# Polarity Score is a value between -1 and 1. Sentiment classification is done using
# this score.

# Subjectivity Score is a value that lies between 0 and 1. Close to 1 is high personal
# opinion (involves adverbs & Superlatives) and Close to 0 is low personal opinion.

In [None]:
# !pip install textblob
from textblob import TextBlob

In [None]:
TextBlob("Tendulkar is greatest batsman in Cricket").sentiment

Sentiment(polarity=1.0, subjectivity=1.0)

In [None]:
TextBlob("Tendulkar is great batsman").sentiment

Sentiment(polarity=0.8, subjectivity=0.75)

In [None]:
TextBlob("Tendulkar is most reputed cricketer").sentiment

Sentiment(polarity=0.5, subjectivity=0.5)

In [None]:
def analyze_sentiment(text):
    """Classify text by the sign of its TextBlob polarity score.

    Returns "Positive" when the polarity is above zero, "Neutral" when it is
    exactly zero, and "Negative" otherwise.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        return "Positive"
    if polarity == 0:
        return "Neutral"
    return "Negative"

In [None]:
import pandas as pd

# Wrap the tokenized sentences in a DataFrame with a single 'sentence' column.
climatesentences = pd.DataFrame(data=climatesentences, columns=['sentence'])

In [None]:
# Label every sentence, store the labels as the 'sentiment' column, then display the frame.
sentiment_labels = [str(analyze_sentiment(sentence)) for sentence in climatesentences['sentence']]
climatesentences['sentiment'] = sentiment_labels
climatesentences

Unnamed: 0,sentence,sentiment


In [None]:
climatesentences['sentiment'].value_counts()

Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1


In [None]:
climatesentences.head()

Unnamed: 0,sentence,sentiment


In [None]:
# NLP uses words (tokens) as the fundamental unit of analysis.
# Split the cleaned text into word tokens.
climatewords=word_tokenize(paragraphs_txt)

In [None]:
# Keep only purely alphanumeric tokens (letters and digits); any token containing
# another character, such as punctuation, is dropped.
climatewords = list(filter(str.isalnum, climatewords))

In [None]:
# Remove Stopwords. Stopwords are list of words like is, a, an, the, then, to, etc. that
# are not required for analysis
from nltk.corpus import stopwords

In [None]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
# English stop words as a set, for fast membership checks.
english_stopwords = {*stopwords.words("english")}

In [None]:
# Drop every token that appears in the English stop-word set.
climatewords = [token for token in climatewords if token not in english_stopwords]

In [None]:
# Keep only tokens longer than two characters.
climatewords = list(filter(lambda token: len(token) > 2, climatewords))

In [None]:
from nltk.probability import FreqDist

In [None]:
# Build a frequency distribution over the remaining tokens.
wordfreq=FreqDist(climatewords)

In [None]:
wordfreq.most_common(20)

[]

In [None]:
# A word cloud is a visual representation of the most frequent words: a larger font size
# means a more frequent word, a smaller font size a less frequent one.

from wordcloud import WordCloud

In [None]:
#wordcloud=WordCloud(width=1000,height=500,stopwords=english_stopwords,
 #                   colormap="plasma",max_words=200).generate(str(climatewords))

In [None]:
import matplotlib.pyplot as plt

#plt.imshow(wordcloud)

In [None]:
# Vectorization is process of converting text or word/tokens into matrix of numbers
# Algorithms will work only with numbers, hence words/tokens must be vectorized.
# 2 types of vectorization are widely used -
# a ) Document Term Matrix - By default documents/sentences in rows and words/tokens
# in columns.

# Doc1 - "the car is driven on the road"
# Doc2 - "the truck is driven on the highway"
# Doc3 - " car and truck are both driven on highway"

# post preprocessing
# Doc1 - "car", "driven", "road"
# Doc2 - "truck", "driven", "highway"
# Doc3 - "car", "truck", "driven", "highway"

# Document Term Matrix
#           car  driven road truck highway
# Doc 1 -    1     1     1     0    0
# Doc 2 -    0     1     0     1    1
# Doc 3 -    1     1     0     1    1

In [None]:
# CountVectorizer() is a predefined function for creating Document Term Matrix
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Configure the vectorizer that builds the Document Term Matrix.
DTM=CountVectorizer(max_features=30,stop_words="english")
# max_features = number of columns/words to keep: the top N terms ordered by term
# frequency across the corpus. stop_words="english" removes scikit-learn's built-in
# English stop-word list.

In [None]:
import pandas as pd
# Load the sentence data from a CSV file; later cells read its 'sentence' column.
# NOTE(review): this replaces the climatesentences DataFrame built from the scraped page
# in earlier cells, and no visible cell writes this CSV - confirm where the file comes from.
# NOTE(review): "/content/..." is an absolute path (presumably Colab) - confirm before
# running in another environment.
climatesentences=pd.read_csv("/content/climatechange.csv")

In [None]:
# Learn the vocabulary from the sentences and build the sparse count matrix
# (rows = sentences, columns = selected terms).
X_DTM=DTM.fit_transform(climatesentences['sentence'])

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 255 stored elements and shape (80, 30)>

In [None]:
pd.DataFrame(X_DTM.toarray(),columns=DTM.get_feature_names_out()).head()
# Document Term Matrix(DTM) aka Count Matrix

Unnamed: 0,america,atlantic,changes,climate,climatic,conditions,early,enso,europe,evidence,...,patterns,period,records,region,regions,temperatures,th,variation,variations,years
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
2,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


![image.png](attachment:07b0e29a-f72f-4321-b45e-de7881d6fad1.png)

No - a TF-IDF matrix is not the same as a document-term matrix, but it is built from it.

Document-term matrix = raw counts

TF-IDF matrix = weighted version of that matrix

In [None]:
# Doc1 - "the car is driven on the road"
# Doc2 - "the truck is driven on the highway"
# Doc3 - " car and truck are both driven on highway"

# post preprocessing
# Doc1 - "car", "driven", "road"
# Doc2 - "truck", "driven", "highway"
# Doc3 - "car", "truck", "driven", "highway"

# TFIDF for car
# Term Frequency (TF) is computed per document: count of the term in the doc / number of terms in the doc.
# TF of car: Doc1 = 1/3 = 0.3333, Doc2 = 0/3 = 0 (there is no "car" in Doc2), Doc3 = 1/4 = 0.25
# If Doc1 instead had 2 occurrences of "car" among 4 terms, TF = 2/4 = 0.5 (like a probability).
# Inverse Document Frequency of car = log(3/2) = 0.4055 - Number of documents = 3, Documents containing car = 2
# TFIDF of car: Doc1 = 0.3333 * 0.4055 = 0.1352, Doc3 = 0.25 * 0.4055 = 0.1014

# TFIDF Matrix (IDF: driven = log(3/3) = 0, road = log(3/1) = 1.0986, car/truck/highway = 0.4055)
#           car      driven   road     truck    highway
# Doc 1 -   0.1352   0        0.3662   0        0
# Doc 2 -   0        0        0        0.1352   0.1352
# Doc 3 -   0.1014   0        0        0.1014   0.1014

In [None]:
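# TFIDF of driven
# Term Frequency of driven - Doc1 = 1/3, Doc2 = 1/3, Doc3 = 1/4
# Inverse Document Frequency of driven - log(3/3) = 0
# TFIDF of driven - TF * 0 = 0 in every document, so a word that occurs in all documents gets no weight

# Laplace-style (add-one) smoothing must be done
# Note: tf*log(N/(df+1)) would give a negative IDF here: log(3/(3+1)) = -0.28768
# Smoothed IDF (scikit-learn default, smooth_idf=True) = log((1+N)/(1+df)) + 1
# Smoothed IDF of driven - log((1+3)/(1+3)) + 1 = 1
# TFIDF of driven with smoothing - Doc1 = 1/3 * 1 = 0.3333, Doc2 = 0.3333, Doc3 = 1/4 * 1 = 0.25
# (TfidfVectorizer additionally L2-normalizes each row, so its output values differ.)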

# Sparse Matrices are matrices that have many Zeroes

Laplace smoothing:
Laplace smoothing (also called add-one smoothing) is mainly used in probability models, especially Naïve Bayes text classifiers.

The core problem:

If a word never appears in a document/class, its probability becomes zero.

NB: P(Spam | email) ∝ P(word1 | Spam) * P(word2 | Spam) * ...

If any word has probability = 0,
the entire product becomes zero,
and the model will never classify such documents as Spam.

This is unrealistic and breaks the model.

✅ Laplace Smoothing solves this

We simply add 1 to every word count, even unseen words.
What this does:

Removes zero probabilities

Makes the model more stable

Handles unseen words gracefully

In [None]:
# Reload the sentence data from CSV.
# NOTE(review): the same file is already read into climatesentences in an earlier cell;
# this reload looks redundant unless the frame was modified in between - confirm.
climatesentences=pd.read_csv("/content/climatechange.csv")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

✅ What is TfidfVectorizer?

TfidfVectorizer is a tool in Python’s scikit-learn (sklearn) library that:

✔ Converts text documents into numerical features
✔ Using TF-IDF weighting
✔ Automatically handles tokenizing, counting, and weighting

In [None]:
# TF-IDF vectorizer limited to 30 terms, with English stop words removed.
tfidf=TfidfVectorizer(max_features=30,stop_words="english")

This means:

“Keep only the 30 most frequent words (ranked by term frequency across the whole dataset, not by tf-idf score).”

Why?

To reduce dimensionality

To remove rare/unimportant words

To make the TF-IDF matrix smaller and faster

stop_words="english"

This means:

“Remove all common English stopwords.”

what vectorizer does:
1) Preprocess the text

lowercase

remove stopwords

tokenize

normalize

2) Select the top 30 most frequent words

3) Compute the TF-IDF score for each term in each document

4) Produce a TF-IDF matrix

In [None]:
# Fit the vocabulary and IDF weights on the sentences and build the sparse TF-IDF matrix.
X_tfidf=tfidf.fit_transform(climatesentences['sentence'])

Document-Term Matrix (DTM)

Advantages (Adv):

Simplicity: It's straightforward to understand and implement, representing the raw count of terms in documents.
Direct Interpretability: Each cell directly shows how many times a word appears in a document, making it easy to see term frequency.
Good for Short Documents: For very short documents, raw counts can be quite informative.

Disadvantages (Disadv):
Sparsity: Most documents don't contain all words in the vocabulary, leading to many zero values and a sparse matrix, which can be computationally expensive and memory-intensive.
Lack of Semantic Meaning: It treats all words equally, regardless of their importance. Common words (like 'the', 'is', 'a') that appear frequently in many documents can dominate the representation, even if they carry little meaning.
High Dimensionality: The number of unique words (vocabulary size) can be very large, leading to a very wide matrix (many columns), which can be challenging for some machine learning algorithms.
No Weighting: It doesn't account for the importance of a word within a document or across the entire corpus.
TF-IDF Matrix (Term Frequency-Inverse Document Frequency)

Advantages (Adv):

Weights Term Importance: It assigns higher weights to words that are important (frequent within a document) but also unique (rare across the corpus). This helps filter out common, less informative words.
Improved Representation: Provides a more nuanced representation of document content compared to raw counts, often leading to better performance in tasks like text classification, clustering, and information retrieval.

Disadvantages (Disadv):

Sparsity: Like DTM, it can still result in a very sparse matrix, especially with a large vocabulary.
Loss of Word Order/Context: It treats text as a "bag of words," meaning it doesn't capture the order of words or their syntactic/semantic relationships. Phrases like "not good" are treated as separate words "not" and "good."
Sensitive to Corpus Size: The Inverse Document Frequency (IDF) component is highly dependent on the corpus. A word might be rare in one corpus but common in another, affecting its TF-IDF score.

In [None]:
pd.DataFrame(X_tfidf.toarray(),columns=tfidf.get_feature_names_out()).head()

Unnamed: 0,america,atlantic,changes,climate,climatic,conditions,early,enso,europe,evidence,...,patterns,period,records,region,regions,temperatures,th,variation,variations,years
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.590835,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.472958
2,0.0,0.0,0.658487,0.752592,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.658487,0.752592,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Unigrams - Single Words
# Bigrams - 2 Consecutive words
# Trigrams - 3 Consecutive words

In [None]:
tfidf_bigrams=TfidfVectorizer(max_features=30,stop_words="english",ngram_range=(2,2))

# ngram_range=(2,2) makes this TF-IDF vectorizer extract only bigrams (pairs of
# consecutive tokens) from the text, keeping at most 30 of them.
# NOTE(review): stop words appear to be removed before tokens are paired, so a bigram can
# join two words that were separated by a stop word in the original sentence - confirm.

In [None]:
# Build the sparse TF-IDF matrix of bigram features for the sentences.
X_tfidf_bigrams=tfidf_bigrams.fit_transform(climatesentences['sentence'])

In [None]:
pd.DataFrame(X_tfidf_bigrams.toarray(),
             columns=tfidf_bigrams.get_feature_names_out()).head()

Unnamed: 0,atlantic region,atmospheric circulation,climatic changes,early holocene,early midholocene,eastern north,enso patterns,enso variation,holocene epoch,ice age,...,solar radiation,studies indicate,summer insolation,temperature moisture,th century,thousand years,tree rings,united states,variation enso,years ago
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
from sklearn.cluster import KMeans

In [None]:
# Fit one K-Means model for each k = 1..15 and record its WCSS (within-cluster sum of
# squares, exposed by scikit-learn as inertia_) for the elbow plot.
MAX_CLUSTERS = 15
wcss = []
for k in range(1, MAX_CLUSTERS + 1):  # inclusive upper bound: 15 models, not 14
    # A fixed random_state makes the WCSS curve reproducible across runs; an explicit
    # n_init keeps the number of restarts independent of the installed version's default.
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    kmeans.fit(X_tfidf)
    wcss.append(kmeans.inertia_)

In [None]:
# Final model with 5 clusters. A fixed random_state makes the cluster assignments
# reproducible between runs; an explicit n_init keeps the number of restarts independent
# of the installed scikit-learn version's default.
kmeans_final = KMeans(n_clusters=5, n_init=10, random_state=42).fit(X_tfidf)

In [None]:
# SEE SIRS FILE