# TP-2: Processing and Document Representation

This notebook demonstrates various text preprocessing techniques, including removing punctuation, URLs, stopwords, lowercasing, tokenization, stemming, and lemmatization.


In [1]:
# Import Required Libraries
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk

# Download necessary NLTK data files
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('wordnet')

# Read the Data


In [2]:
# Load and Inspect the Dataset
# Load the dataset from the CSV file
file_path = 'elon_musk.csv'
data = pd.read_csv(file_path, header=None)

# Display the first few rows of the dataset
data.head()

Unnamed: 0,0
0,Text
1,https://en.wikipedia.org/wiki/Elon_Musk
2,Elon Reeve Musk FRS (/?i?l?n/ EE-lon; born Jun...
3,Musk was born to a Canadian mother and White S...
4,"In 2002, Musk founded SpaceX, an aerospace man..."


# Removing Punctuation


In [3]:
# Removing Punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

data[0] = data[0].apply(remove_punctuation)
data.head()

Unnamed: 0,0
0,Text
1,httpsenwikipediaorgwikiElonMusk
2,Elon Reeve Musk FRS iln EElon born June 28 197...
3,Musk was born to a Canadian mother and White S...
4,In 2002 Musk founded SpaceX an aerospace manuf...


# Removing the URLs


In [4]:
# Removing URLs
def remove_urls(text):
    return re.sub(r'http\S+', '', text)

data[0] = data[0].apply(remove_urls)
data.head()

Unnamed: 0,0
0,Text
1,
2,Elon Reeve Musk FRS iln EElon born June 28 197...
3,Musk was born to a Canadian mother and White S...
4,In 2002 Musk founded SpaceX an aerospace manuf...


# Removing Stop Words


In [5]:
# Extract and print all stop words in the CSV file
stop_words = set(stopwords.words('english'))

# Function to extract stop words from the text
def extract_stopwords(text):
    words = word_tokenize(text)
    return [word for word in words if word.lower() in stop_words] 

# Apply the function to extract stop words from each row
data['stopwords'] = data[0].apply(extract_stopwords)

# Print all stop words found in the CSV file
all_stopwords = data['stopwords'].explode().dropna().unique()
print("Stop words in the CSV file:", list(all_stopwords))

Stop words in the CSV file: ['is', 'a', 'and', 'He', 'the', 'at', 'of', 'The', 'With', 'an', 'as', 'in', 'to', 'both', 'was', 'before', 'where', 'he', 'but', 'with', 'his', 'by', 'for', 'same', 'which', 'In', 'now', 'that', 'on', 'has', 'been', 'such', 'about', 'had', 'did', 'not', 'down', 'from', 'won', 'against', 'him', 'who']


In [None]:
# Removing Stop Words
def remove_stopwords(text):
    stop_words = set(stopwords.words('english'))
    words = word_tokenize(text)
    return ' '.join([word for word in words if word.lower() not in stop_words])
data[0] = data[0].apply(remove_stopwords)
data.head()

Unnamed: 0,0,stopwords
0,Text,[]
1,,[]
2,Elon Reeve Musk FRS iln EElon born June 28 197...,"[is, a, and, He, is, the, and, at, and, of, of..."
3,Musk born Canadian mother White South African ...,"[was, to, a, and, and, in, He, the, of, before..."
4,2002 Musk founded SpaceX aerospace manufacture...,"[In, an, and, of, which, he, as, and, In, he, ..."


# Lowercasing for the sentences


In [None]:
# Lowercasing
data[0] = data[0].str.lower()
data.head()

Unnamed: 0,0,stopwords
0,text,[]
1,,[]
2,elon reeve musk frs iln eelon born june 28 197...,"[is, a, and, He, is, the, and, at, and, of, of..."
3,musk born canadian mother white south african ...,"[was, to, a, and, and, in, He, the, of, before..."
4,2002 musk founded spacex aerospace manufacture...,"[In, an, and, of, which, he, as, and, In, he, ..."


# Tokenization for the sentences


In [None]:
# Tokenization
data['tokens'] = data[0].apply(word_tokenize)
data.head()

Unnamed: 0,0,stopwords,tokens
0,text,[],[text]
1,,[],[]
2,elon reeve musk frs iln eelon born june 28 197...,"[is, a, and, He, is, the, and, at, and, of, of...","[elon, reeve, musk, frs, iln, eelon, born, jun..."
3,musk born canadian mother white south african ...,"[was, to, a, and, and, in, He, the, of, before...","[musk, born, canadian, mother, white, south, a..."
4,2002 musk founded spacex aerospace manufacture...,"[In, an, and, of, which, he, as, and, In, he, ...","[2002, musk, founded, spacex, aerospace, manuf..."


# Stemming for the sentences


In [None]:
# Stemming (Reduction to Root Word)
stemmer = PorterStemmer()
data['stemmed'] = data['tokens'].apply(lambda tokens: [stemmer.stem(token) for token in tokens])
data.head()

Unnamed: 0,0,stopwords,tokens,stemmed
0,text,[],[text],[text]
1,,[],[],[]
2,elon reeve musk frs iln eelon born june 28 197...,"[is, a, and, He, is, the, and, at, and, of, of...","[elon, reeve, musk, frs, iln, eelon, born, jun...","[elon, reev, musk, fr, iln, eelon, born, june,..."
3,musk born canadian mother white south african ...,"[was, to, a, and, and, in, He, the, of, before...","[musk, born, canadian, mother, white, south, a...","[musk, born, canadian, mother, white, south, a..."
4,2002 musk founded spacex aerospace manufacture...,"[In, an, and, of, which, he, as, and, In, he, ...","[2002, musk, founded, spacex, aerospace, manuf...","[2002, musk, found, spacex, aerospac, manufact..."


# Lemmatization


In [None]:
# Lemmatization (Reduction to Dictionary Form)
lemmatizer = WordNetLemmatizer()
data['lemmatized'] = data['tokens'].apply(lambda tokens: [lemmatizer.lemmatize(token) for token in tokens])
data.head()

Unnamed: 0,0,stopwords,tokens,stemmed,lemmatized
0,text,[],[text],[text],[text]
1,,[],[],[],[]
2,elon reeve musk frs iln eelon born june 28 197...,"[is, a, and, He, is, the, and, at, and, of, of...","[elon, reeve, musk, frs, iln, eelon, born, jun...","[elon, reev, musk, fr, iln, eelon, born, june,...","[elon, reeve, musk, fr, iln, eelon, born, june..."
3,musk born canadian mother white south african ...,"[was, to, a, and, and, in, He, the, of, before...","[musk, born, canadian, mother, white, south, a...","[musk, born, canadian, mother, white, south, a...","[musk, born, canadian, mother, white, south, a..."
4,2002 musk founded spacex aerospace manufacture...,"[In, an, and, of, which, he, as, and, In, he, ...","[2002, musk, founded, spacex, aerospace, manuf...","[2002, musk, found, spacex, aerospac, manufact...","[2002, musk, founded, spacex, aerospace, manuf..."
