##  Working with Text and Categorical Data

In [2]:
import pandas as pd
import numpy as np

data = { 
'SKU': ['SKU-A-101', 'SKU-B-203', 'SKU-A-105', 'SKU-C-301', 
'SKU-B-207'], 
'Description': ['Red T-Shirt, Size M', 'Blue Jeans, Size L', 'Red Hoodie, Size M', 'Green Cap, One Size', 'Blue Shorts, Size L'], 
'Rating': ['Good', 'Excellent', 'Good', 'Average', 'Excellent'] 
} 
df_text = pd.DataFrame(data)
df_text

Unnamed: 0,SKU,Description,Rating
0,SKU-A-101,"Red T-Shirt, Size M",Good
1,SKU-B-203,"Blue Jeans, Size L",Excellent
2,SKU-A-105,"Red Hoodie, Size M",Good
3,SKU-C-301,"Green Cap, One Size",Average
4,SKU-B-207,"Blue Shorts, Size L",Excellent


In [4]:
# 81. From df_text, use str.extract() with a regular expression to create a new column 'Product_Code' that contains the middle part of the SKU (e.g., 'A', 'B', 'C').
df_text['Product_Code'] = df_text['SKU'].str.extract(r'SKU-([A-Z])-')
df_text[['SKU','Product_Code']]

Unnamed: 0,SKU,Product_Code
0,SKU-A-101,A
1,SKU-B-203,B
2,SKU-A-105,A
3,SKU-C-301,C
4,SKU-B-207,B


In [5]:
# 82. Create two new columns, 'Product_Type' and 'Size', by splitting the 'Description' column on the comma.
df_text[['Product_Type','Size']] = df_text['Description'].str.split(',', expand=True)
df_text['Product_Type'] = df_text['Product_Type'].str.strip()
df_text['Size'] = df_text['Size'].str.strip()
df_text[['Description','Product_Type','Size']]

Unnamed: 0,Description,Product_Type,Size
0,"Red T-Shirt, Size M",Red T-Shirt,Size M
1,"Blue Jeans, Size L",Blue Jeans,Size L
2,"Red Hoodie, Size M",Red Hoodie,Size M
3,"Green Cap, One Size",Green Cap,One Size
4,"Blue Shorts, Size L",Blue Shorts,Size L


In [6]:
# 83. Convert the 'Rating' column into a categorical data type with a specific order: ['Average', 'Good', 'Excellent']. 
rating_order = ['Average','Good','Excellent']
df_text['Rating'] = pd.Categorical(df_text['Rating'], categories=rating_order, ordered=True)
df_text[['Rating']]

Unnamed: 0,Rating
0,Good
1,Excellent
2,Good
3,Average
4,Excellent


In [7]:
# 84. After setting the categorical order, filter df_text to show only rows where the 'Rating' is 'Good' or better. 
df_text[df_text['Rating'] >= 'Good']

Unnamed: 0,SKU,Description,Rating,Product_Code,Product_Type,Size
0,SKU-A-101,"Red T-Shirt, Size M",Good,A,Red T-Shirt,Size M
1,SKU-B-203,"Blue Jeans, Size L",Excellent,B,Blue Jeans,Size L
2,SKU-A-105,"Red Hoodie, Size M",Good,A,Red Hoodie,Size M
4,SKU-B-207,"Blue Shorts, Size L",Excellent,B,Blue Shorts,Size L


In [8]:
# 85. Use str.get_dummies() on the 'Description' column to create dummy variables for each word. 
df_text['Description'].str.get_dummies(sep=' ')

Unnamed: 0,Blue,"Cap,",Green,"Hoodie,","Jeans,",L,M,One,Red,"Shorts,",Size,"T-Shirt,"
0,0,0,0,0,0,0,1,0,1,0,1,1
1,1,0,0,0,1,1,0,0,0,0,1,0
2,0,0,0,1,0,0,1,0,1,0,1,0
3,0,1,1,0,0,0,0,1,0,0,1,0
4,1,0,0,0,0,1,0,0,0,1,1,0


In [9]:
# 86. In the 'Description' column, replace the word 'Size' with 'Sz.'. 
df_text['Description'] = df_text['Description'].str.replace('Size','Sz.', regex=False)
df_text[['Description']]

Unnamed: 0,Description
0,"Red T-Shirt, Sz. M"
1,"Blue Jeans, Sz. L"
2,"Red Hoodie, Sz. M"
3,"Green Cap, One Sz."
4,"Blue Shorts, Sz. L"


In [10]:
# 87. Create a new boolean column 'Is_Red' that is True if the 'Description' contains the word 'Red'. 
df_text['Is_Red'] = df_text['Description'].str.contains('Red', case=False)
df_text[['Description','Is_Red']]

Unnamed: 0,Description,Is_Red
0,"Red T-Shirt, Sz. M",True
1,"Blue Jeans, Sz. L",False
2,"Red Hoodie, Sz. M",True
3,"Green Cap, One Sz.",False
4,"Blue Shorts, Sz. L",False


In [11]:
# 88. Concatenate the 'SKU' and 'Product_Type' columns (from question 82) into a new column called 'Product_ID', separated by a hyphen. 
df_text['Product_ID'] = df_text['SKU'] + '-' + df_text['Product_Type']
df_text[['Product_ID']]

Unnamed: 0,Product_ID
0,SKU-A-101-Red T-Shirt
1,SKU-B-203-Blue Jeans
2,SKU-A-105-Red Hoodie
3,SKU-C-301-Green Cap
4,SKU-B-207-Blue Shorts


In [12]:
# 89. Count the occurrences of each category in the ordered 'Rating' column. 
df_text['Rating'].value_counts()

Rating
Good         2
Excellent    2
Average      1
Name: count, dtype: int64

In [13]:
# 90. Find the SKUs of all products that are either a 'T-Shirt' or a 'Hoodie'. 
mask = df_text['Product_Type'].str.contains('T-Shirt|Hoodie', case=False, na=False)
df_text.loc[mask, 'SKU']

0    SKU-A-101
2    SKU-A-105
Name: SKU, dtype: object