# Import necessary libraries

In [1]:
import pandas as pd

# Call database

In [2]:
!pip install ucimlrepo

from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
breast_cancer = fetch_ucirepo(id=14) 
  
# data (as pandas dataframes) 
data = breast_cancer.data.features
data['class'] = breast_cancer.data.targets  # Add the target column as 'class'


[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip




In [3]:
# Convert all columns to string data type to prevent date interpretation
data = data.astype(str)  # Convierte todas las columnas a texto

# Print

In [4]:
# Display metadata about the dataset
print("Metadata:")
print(breast_cancer.metadata)

# Display variable information for the dataset
print("\nVariable Information:")
print(breast_cancer.variables)

# Display the first few rows of the dataset
print("\nDataset Preview:")
data

Metadata:
{'uci_id': 14, 'name': 'Breast Cancer', 'repository_url': 'https://archive.ics.uci.edu/dataset/14/breast+cancer', 'data_url': 'https://archive.ics.uci.edu/static/public/14/data.csv', 'abstract': 'This breast cancer domain was obtained from the University Medical Centre, Institute of Oncology, Ljubljana, Yugoslavia. This is one of three domains provided by the Oncology Institute that has repeatedly appeared in the machine learning literature. (See also lymphography and primary-tumor.)', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 286, 'num_features': 9, 'feature_types': ['Categorical'], 'demographics': ['Age'], 'target_col': ['Class'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1988, 'last_updated': 'Thu Mar 07 2024', 'dataset_doi': '10.24432/C51P4M', 'creators': ['Matjaz Zwitter', 'Milan Soklic'], 'intro_paper': None, 'additional_info': {'summ

Unnamed: 0,age,menopause,tumor-size,inv-nodes,node-caps,deg-malig,breast,breast-quad,irradiat,class
0,30-39,premeno,30-34,0-2,no,3,left,left_low,no,no-recurrence-events
1,40-49,premeno,20-24,0-2,no,2,right,right_up,no,no-recurrence-events
2,40-49,premeno,20-24,0-2,no,2,left,left_low,no,no-recurrence-events
3,60-69,ge40,15-19,0-2,no,2,right,left_up,no,no-recurrence-events
4,40-49,premeno,0-4,0-2,no,2,right,right_low,no,no-recurrence-events
...,...,...,...,...,...,...,...,...,...,...
281,30-39,premeno,30-34,0-2,no,2,left,left_up,no,recurrence-events
282,30-39,premeno,20-24,0-2,no,3,left,left_up,yes,recurrence-events
283,60-69,ge40,20-24,0-2,no,1,right,left_up,no,recurrence-events
284,40-49,ge40,30-34,5-Mar,no,3,left,left_low,no,recurrence-events


# Exploring the database

In [5]:
for column in data.columns:
    print(f"Unique values in '{column}':")
    print(data[column].unique())
    print("\n" + "-"*40 + "\n")

Unique values in 'age':
['30-39' '40-49' '60-69' '50-59' '70-79' '20-29']

----------------------------------------

Unique values in 'menopause':
['premeno' 'ge40' 'lt40']

----------------------------------------

Unique values in 'tumor-size':
['30-34' '20-24' '15-19' '0-4' '25-29' '50-54' '14-Oct' '40-44' '35-39'
 '9-May' '45-49']

----------------------------------------

Unique values in 'inv-nodes':
['0-2' '8-Jun' '11-Sep' '5-Mar' '15-17' '14-Dec' '24-26']

----------------------------------------

Unique values in 'node-caps':
['no' 'yes' 'nan']

----------------------------------------

Unique values in 'deg-malig':
['3' '2' '1']

----------------------------------------

Unique values in 'breast':
['left' 'right']

----------------------------------------

Unique values in 'breast-quad':
['left_low' 'right_up' 'left_up' 'right_low' 'central' 'nan']

----------------------------------------

Unique values in 'irradiat':
['no' 'yes']

----------------------------------------

U

It was identified that 'tumor-size' contains misencoded values: '14-Oct' and '9-May'.
The variable 'inv-nodes' also has misencoded values such as '8-Jun', "11-Sep, '5-Mar', and '14-Dec'.
Therefore, these values will be modified.

In [6]:
# Correct misencoded values in 'tumor-size'
data['tumor-size'] = data['tumor-size'].replace({
    '14-Oct': '10-14',
    '9-May': '5-9'
})

# Correct misencoded values in 'inv-nodes'
data['inv-nodes'] = data['inv-nodes'].replace({
    '8-Jun': '6-8',
    '5-Mar': '3-5',
    '11-Sep': '9-11',
    '14-Dec': '12-14'
})

# Display unique values in 'tumor-size' and 'inv-nodes' to verify the corrections
print("Unique values in 'tumor-size':")
print(data['tumor-size'].unique())
print("\nUnique values in 'inv-nodes':")
print(data['inv-nodes'].unique())


Unique values in 'tumor-size':
['30-34' '20-24' '15-19' '0-4' '25-29' '50-54' '10-14' '40-44' '35-39'
 '5-9' '45-49']

Unique values in 'inv-nodes':
['0-2' '6-8' '9-11' '3-5' '15-17' '12-14' '24-26']


# Melt

In [7]:
# Apply melt to transform selected columns into rows
melted_data = pd.melt(data, id_vars=['class'], value_vars=['age', 'tumor-size', 'inv-nodes'],
                      var_name='attribute', value_name='value')

# Display the result of melt
melted_data


Unnamed: 0,class,attribute,value
0,no-recurrence-events,age,30-39
1,no-recurrence-events,age,40-49
2,no-recurrence-events,age,40-49
3,no-recurrence-events,age,60-69
4,no-recurrence-events,age,40-49
...,...,...,...
853,recurrence-events,inv-nodes,0-2
854,recurrence-events,inv-nodes,0-2
855,recurrence-events,inv-nodes,0-2
856,recurrence-events,inv-nodes,3-5


# Pivot

In [8]:
# Create a pivot table to see the frequency of deg-malig by tumor-size
pivot_data = data.pivot_table(index='tumor-size', columns='deg-malig', values='class', aggfunc='count')

# Display the result of pivot
print("\nPivot table:")
pivot_data


Pivot table:


deg-malig,1,2,3
tumor-size,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0-4,3.0,4.0,1.0
10-14,14.0,12.0,2.0
15-19,9.0,15.0,6.0
20-24,10.0,27.0,13.0
25-29,11.0,26.0,17.0
30-34,13.0,21.0,26.0
35-39,2.0,7.0,10.0
40-44,5.0,9.0,8.0
45-49,1.0,1.0,1.0
5-9,2.0,2.0,


# Aggregation

In [13]:
# Aggregation: Calculate mean, standard deviation, minimum, and maximum for 'deg-malig'
deg_malig_stats = data['deg-malig'].agg(['mean', 'std', 'min', 'max'])

# Display the aggregated statistics
print("Aggregated Statistics for 'deg-malig':")
print(deg_malig_stats)

TypeError: Could not convert string '3222222122321331233122222213223231122121121211211122131112112112111332122221212221221321312322122212332221223111212213111231122222232322312222123222333333323111322112222221222222233313321222322222122132122232312231222231313333333122313322322333323332321333122323311323323332233333323133' to numeric

# Iteration

In [10]:
print("Iterating through each row and displaying selected values:\n")

for index, row in data.iterrows():
    # Display the 'age', 'tumor-size', and 'class' values for each row
    print(f"Index: {index}, Age: {row['age']}, Tumor Size: {row['tumor-size']}, Class: {row['class']}")
    
    # Limiting output to the first 20 rows
    if index >= 19:
        break

Iterating through each row and displaying selected values:

Index: 0, Age: 30-39, Tumor Size: 30-34, Class: no-recurrence-events
Index: 1, Age: 40-49, Tumor Size: 20-24, Class: no-recurrence-events
Index: 2, Age: 40-49, Tumor Size: 20-24, Class: no-recurrence-events
Index: 3, Age: 60-69, Tumor Size: 15-19, Class: no-recurrence-events
Index: 4, Age: 40-49, Tumor Size: 0-4, Class: no-recurrence-events
Index: 5, Age: 60-69, Tumor Size: 15-19, Class: no-recurrence-events
Index: 6, Age: 50-59, Tumor Size: 25-29, Class: no-recurrence-events
Index: 7, Age: 60-69, Tumor Size: 20-24, Class: no-recurrence-events
Index: 8, Age: 40-49, Tumor Size: 50-54, Class: no-recurrence-events
Index: 9, Age: 40-49, Tumor Size: 20-24, Class: no-recurrence-events
Index: 10, Age: 40-49, Tumor Size: 0-4, Class: no-recurrence-events
Index: 11, Age: 50-59, Tumor Size: 25-29, Class: no-recurrence-events
Index: 12, Age: 60-69, Tumor Size: 10-14, Class: no-recurrence-events
Index: 13, Age: 50-59, Tumor Size: 25-29, Cl

# Groupby

In [12]:
# Frequency and percentage table for categorical columns using aggregation
# Group by 'age' column and count frequencies
freq_table = data.groupby(['age']).size().reset_index(name='n')

# Calculate percentage
total = freq_table['n'].sum()
freq_table['%'] = (freq_table['n'] / total) * 100

# Display the frequency and percentage table
print("Frequency (n) and percentage (%) for age:")
freq_table

Frequency (n) and percentage (%) for age:


Unnamed: 0,age,n,%
0,20-29,1,0.34965
1,30-39,36,12.587413
2,40-49,90,31.468531
3,50-59,96,33.566434
4,60-69,57,19.93007
5,70-79,6,2.097902
