In [1]:
# One-Hot Encoding

import pandas as pd

data = {"Color": ["Red", "Blue", "Green", "Blue"]}
df = pd.DataFrame(data)
df

Unnamed: 0,Color
0,Red
1,Blue
2,Green
3,Blue


In [2]:
df_encoded = pd.get_dummies(df, columns=["Color"], prefix="Color")
df_encoded

Unnamed: 0,Color_Blue,Color_Green,Color_Red
0,False,False,True
1,True,False,False
2,False,True,False
3,True,False,False


In [3]:
# Binning
data = {"Age": [23, 45, 18, 34, 67, 50, 21]}
df = pd.DataFrame(data)
df

Unnamed: 0,Age
0,23
1,45
2,18
3,34
4,67
5,50
6,21


In [4]:
bins = [0, 20, 40, 60, 100]
labels = ["0-20", "21-40", "41-60", "61+"]
df["Age_Group"] = pd.cut(df["Age"], bins=bins, labels=labels, right=False)
df

Unnamed: 0,Age,Age_Group
0,23,21-40
1,45,41-60
2,18,0-20
3,34,21-40
4,67,61+
5,50,41-60
6,21,21-40


In [6]:
%conda install nltk

3 channel Terms of Service accepted
Retrieving notices: done
Channels:
 - defaults
Platform: win-64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: c:\Users\paul\programming\courses\python-exercises\.conda

  added / updated specs:
    - nltk


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    click-8.2.1                |  py311haa95532_0         333 KB
    nltk-3.9.2                 |  py311ha55a155_0         4.0 MB
    regex-2025.9.1             |  py311h02ab6af_0         373 KB
    tqdm-4.67.1                |  py311h746a85d_0         191 KB
    ------------------------------------------------------------
                                           Total:         4.8 MB

The following NEW packages will be INSTALLED:

  click              pkgs/main/win-64::click-8.2.1-py311haa95532_0 
  nltk               pkgs/main/w



    current version: 25.5.1
    latest version: 25.9.1

Please update conda by running

    $ conda update -n base -c defaults conda




In [14]:
# Text Data Preprocessing
import nltk


from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

texts = ["This is a sample sentence.", "Text data preprocessing is important."]

nltk.download("stopwords", force=True)
stop_words = set(stopwords.words("english"))
stemmer = PorterStemmer()
vectorizer = CountVectorizer()


def preprocess_text(text):
    words = text.split()
    words = [stemmer.stem(word) for word in words if word.lower() not in stop_words]
    return " ".join(words)


cleaned_texts = [preprocess_text(text) for text in texts]

X = vectorizer.fit_transform(cleaned_texts)

print("Cleaned Texts:", cleaned_texts)
print("Vectorized Text:", X.toarray())

Cleaned Texts: ['sampl sentence.', 'text data preprocess important.']
Vectorized Text: [[0 0 0 1 1 0]
 [1 1 1 0 0 1]]


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\paul\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [15]:
# Feature Spliting
import pandas as pd

data = {
    "Full_Address": ["123 Elm St, Springfield, 12345", "456 Oak Rd, Shelbyville, 67890"]
}
df = pd.DataFrame(data)

df[["Street", "City", "Zipcode"]] = df["Full_Address"].str.extract(
    r"([0-9]+\s[\w\s]+),\s([\w\s]+),\s(\d+)"
)
df

Unnamed: 0,Full_Address,Street,City,Zipcode
0,"123 Elm St, Springfield, 12345",123 Elm St,Springfield,12345
1,"456 Oak Rd, Shelbyville, 67890",456 Oak Rd,Shelbyville,67890
