In [None]:
## Installing the required package
pip install ucimlrepo

In [44]:
import pandas as pd
from ucimlrepo import fetch_ucirepo 
  
# Fetch dataset 
breast_cancer = fetch_ucirepo(id=14) 
  
# Getting data (as pandas dataframes) 
X = breast_cancer.data.features 
y = breast_cancer.data.targets 
  
#Join the X and y datasets into a general dataset
data = pd.concat([X, y], axis=1)
data.head(5)

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,Class
0,30-39,premeno,30-34,0-2,no,3,left,left_low,no,no-recurrence-events
1,40-49,premeno,20-24,0-2,no,2,right,right_up,no,no-recurrence-events
2,40-49,premeno,20-24,0-2,no,2,left,left_low,no,no-recurrence-events
3,60-69,ge40,15-19,0-2,no,2,right,left_up,no,no-recurrence-events
4,40-49,premeno,0-4,0-2,no,2,right,right_low,no,no-recurrence-events


## 1. Melt function

In [25]:
melted_data = pd.melt(data.head(), id_vars=['age', 'menopause'], var_name='Attribute', value_name='Value')

melted_data.head(10)

Unnamed: 0,age,menopause,Attribute,Value
0,30-39,premeno,tumor-size,30-34
1,40-49,premeno,tumor-size,20-24
2,40-49,premeno,tumor-size,20-24
3,60-69,ge40,tumor-size,15-19
4,40-49,premeno,tumor-size,0-4
5,30-39,premeno,inv-nodes,0-2
6,40-49,premeno,inv-nodes,0-2
7,40-49,premeno,inv-nodes,0-2
8,60-69,ge40,inv-nodes,0-2
9,40-49,premeno,inv-nodes,0-2


## 2. Pivot function

In [45]:
# Check for a column that we can use to demonstrate the pivot function without duplicates.
# We'll look at the counts of unique values in each categorical column.
unique_value_counts = {column: data[column].nunique() for column in data.columns if data[column].dtype == 'object'}
unique_value_counts

{'age': 6,
 'menopause': 3,
 'tumor-size': 11,
 'inv-nodes': 7,
 'node-caps': 2,
 'breast': 2,
 'breast-quad': 5,
 'irradiat': 2,
 'Class': 2}

In [46]:
# Creating a new DataFrame with a unique combination of 'age' and 'menopause' status
unique_combinations = data.drop_duplicates(subset=['age', 'menopause'])

# For the purpose of demonstration, we will pivot on the first few records to ensure no duplicates
pivot_subset = unique_combinations.head(10)

# Pivot the data
pivoted_data = pivot_subset.pivot(index='age', columns='menopause', values='deg-malig')

# Display the pivoted data
pivoted_data

menopause,ge40,lt40,premeno
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20-29,,,2.0
30-39,,,3.0
40-49,3.0,,2.0
50-59,2.0,2.0,2.0
60-69,2.0,1.0,
70-79,3.0,,


## 3. Aggregation function

In [36]:
# Checking the data type of the 'deg-malig' column
data['deg-malig'].dtype

dtype('int64')

In [37]:
# Demonstrating the aggregation function
# We'll group by 'menopause' and then aggregate the 'deg-malig' column by its mean.
aggregated_data = data.groupby('menopause').agg({'deg-malig': 'mean'}).reset_index()

# Display the aggregated data
aggregated_data

Unnamed: 0,menopause,deg-malig
0,ge40,2.093023
1,lt40,1.714286
2,premeno,2.026667


## 4. Iteration function

In [42]:
# We cast the new column to a boolean data type before assignment
data['high_deg_malig'] = False
# Calculate the mean of 'deg-malig' for the entire dataset
mean_deg_malig = data['deg-malig'].mean()
# Demonstrating iteration using iterrows
# We will add a new column 'high_deg_malig' to flag if 'deg-malig' is above the mean
for index, row in data.iterrows():
    data.at[index, 'high_deg_malig'] = row['deg-malig'] > mean_deg_malig
# Display the first few rows of the dataframe to show the new 'high_deg_malig' column
data.head()

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,Class,high_deg_malig
0,30-39,premeno,30-34,0-2,no,3,left,left_low,no,no-recurrence-events,True
1,40-49,premeno,20-24,0-2,no,2,right,right_up,no,no-recurrence-events,False
2,40-49,premeno,20-24,0-2,no,2,left,left_low,no,no-recurrence-events,False
3,60-69,ge40,15-19,0-2,no,2,right,left_up,no,no-recurrence-events,False
4,40-49,premeno,0-4,0-2,no,2,right,right_low,no,no-recurrence-events,False


## 5. Groupby function

In [43]:
# We'll group by 'age' and then count the occurrences of each 'Class'
grouped_data = data.groupby('age')['Class'].value_counts().unstack().reset_index()
# Display the grouped data
grouped_data

Class,age,no-recurrence-events,recurrence-events
0,20-29,1.0,
1,30-39,21.0,15.0
2,40-49,63.0,27.0
3,50-59,71.0,25.0
4,60-69,40.0,17.0
5,70-79,5.0,1.0
