In [None]:
import numpy as np
import pandas as pd

In [None]:
# Path of the mammography CSV, relative to the notebook's working directory.
mamm_csv = "mamografias.csv"

In [None]:
# Treat "?" as the missing-value marker so read_csv turns those cells into NaN.
missing_values_format = ["?"]
mamm_data = pd.read_csv(mamm_csv, na_values=missing_values_format)
# Earlier variant kept for reference. NOTE(review): it passes the DataFrame
# `mamm_data` where the file path `mamm_csv` is expected, so it would not run as written.
# ma_df = pd.read_csv(mamm_data, na_values=missing_values_format, usecols=range(1,6))
mamm_data.head()

In [None]:
# Column dtypes as inferred by read_csv.
mamm_data.dtypes

In [None]:
# Summary statistics; with no arguments describe() reports the numeric columns only.
# To cover the non-numeric columns too: mamm_data.describe(include=("all"))
mamm_data.describe()

In [None]:
# Pairwise Pearson correlation between the numeric columns.
# At this point some columns still hold text labels (Shape is only mapped to
# numbers later in the notebook). Older pandas dropped such columns silently,
# while pandas >= 2.0 raises on them, so select the numeric columns explicitly.
mamm_data.select_dtypes(include="number").corr()

In [None]:
# (number of rows, number of columns) of the loaded table.
mamm_data.shape

In [None]:
# Number of rows per Severity value (missing values are not counted by default).
mamm_data['Severity'].value_counts()

# Visualización de los datos

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

In [None]:
# One histogram per predictor column, side by side, to compare their distributions.
# Each entry is (column name, number of bins, bar colour).
hist_specs = [
    ('BI-RADS', 6, "lightslategray"),
    ('Age', 12, "skyblue"),
    ('Shape', 5, "steelblue"),
    ('Margin', 5, "mediumslateblue"),
    ('Density', 4, "darkslategray"),
]

fig, axes = plt.subplots(1,5, sharey=False, figsize=(18,4))
ax1, ax2, ax3, ax4, ax5 = axes.flatten()

for hist_ax, (hist_col, hist_bins, hist_color) in zip(axes.flatten(), hist_specs):
    # Drop missing entries first: NaN mixed with the text labels of a
    # categorical column cannot be binned, and for the numeric columns the
    # NaNs would not be counted anyway.
    hist_ax.hist(mamm_data[hist_col].dropna(), bins=hist_bins, color=hist_color)
    hist_ax.set_xlabel(hist_col, fontsize="large")

# Only the left-most panel carries the y label.
ax1.set_ylabel("counts", fontsize="large")

plt.suptitle('Comparación de las distribuciones', ha='center', fontsize='x-large')
# Remove the spines BEFORE saving/showing; calling despine after plt.show()
# no longer affects the figure that was written to disk or displayed.
sns.despine(fig=fig, offset=10, trim=True)
plt.savefig("figures_python/compare_distribucions.pdf")
plt.show()

# Theme for the plots that follow: white grid, 15x15 default figure size.
sns.set(style="whitegrid", color_codes=True, rc={'figure.figsize':(15,15)})

In [None]:
# Number of cases per Shape category, split by Severity.
# NOTE(review): sns.set() without a `style` argument re-applies seaborn's default
# style, so a style chosen in an earlier cell is presumably not kept - confirm intended.
sns.set(rc={'figure.figsize':(10,5)})
# `x` must be passed by keyword: seaborn >= 0.12 rejects positional arguments
# other than `data`.
sns.countplot(x='Shape', data=mamm_data, hue='Severity')
plt.savefig("figures_python/countplot_shape_severity.pdf")

In [None]:
# Number of cases per BI-RADS value, split by Severity.
# `x` must be passed by keyword: seaborn >= 0.12 rejects positional arguments
# other than `data`.
sns.countplot(x='BI-RADS', data=mamm_data, hue='Severity')
plt.savefig("figures_python/countplot_bi_severity.pdf")

In [None]:
#sns.boxplot(x='BI-RADS', y="Shape", data=mamm_data, hue="Severity")

# PREPROCESADO DE DATOS

There are several different types of categorical data including:

* Binary: A variable that has only 2 values. For example, True/False or Yes/No.

* Ordinal: A variable whose values have a natural order, for example the finishing place in a race (first, second, third). A machine learning model may be able to use that order to make better predictions, so the encoding should preserve it.

* Nominal: A variable that has no numerical importance, for example color or city.

In [None]:
# Separate the table into predictors (every column except Severity) and the
# Severity target, keeping the predictors also as a plain NumPy array.
target = mamm_data['Severity']
data = mamm_data.drop(columns=['Severity'])
cols = list(data.columns)
data_values = data.to_numpy()
data.head()

In [None]:
# Print the distinct values found in each column (NaN is listed too when present)
# to see which labels every column uses.
for col in ['BI-RADS', 'Age', 'Shape', 'Margin', 'Density', 'Severity']:
    print(col, mamm_data[col].unique())

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn import impute

## LabelEncoder

In [None]:
# Independent copy of the table taken before mamm_data is label-encoded in place;
# it is the input for the ordered Shape encoding further down.
mamm_data_for_order = mamm_data.copy()

In [None]:
# Label encoder used by the next cell.
le = LabelEncoder()

In [None]:
# Encode the target as integers; `le` stays fitted on Severity afterwards.
mamm_data["Severity"] = le.fit_transform(mamm_data['Severity']) # malignant (1), benign (0)

# Encode Shape with plain label codes (no meaningful order). Only the observed
# values go through the encoder and missing entries stay NaN: otherwise NaN is
# either rejected by LabelEncoder or turned into one more category code, which
# would distort the correlations and would not match the ordered encoding,
# where missing shapes remain NaN.
shape_le = LabelEncoder()
shape_known = mamm_data['Shape'].notna()
shape_codes = pd.Series(np.nan, index=mamm_data.index)
shape_codes[shape_known] = shape_le.fit_transform(mamm_data.loc[shape_known, 'Shape'])
mamm_data["Shape"] = shape_codes

In [None]:
# Preview the table after label encoding.
mamm_data.head()  

## Encoding with order

In [None]:
# Fresh label encoder for the ordered-encoding copy of the data.
le = LabelEncoder()

In [None]:
# Explicit rank assigned to each Shape letter, in place of arbitrary label codes.
# NOTE(review): the rationale for this particular order is not stated here - confirm.
shape_order = {'N': 1.0, 'R': 2.0, 'O': 3.0, 'L': 4.0, 'I': 5.0}
mamm_data_for_order["Severity"] = le.fit_transform(mamm_data_for_order['Severity']) # malignant (1), benign (0)
# Values that are not keys of shape_order (including missing ones) become NaN.
mamm_data_for_order['Shape'] = mamm_data_for_order['Shape'].map(shape_order)

In [None]:
# Preview the table after the ordered Shape encoding.
mamm_data_for_order.head()

In [None]:
# Annotated correlation heatmap for the frame whose Shape column was label-encoded
# without an explicit order; saved as a PDF and displayed.
f, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    mamm_data.corr(),
    ax=ax,
    annot=True,
    fmt='.2f',
    linewidths=0.5,
)
f.suptitle('Correlación sin ordenar Shape en el encoding', ha='center', fontsize='x-large')
f.savefig("figures_python/corr.pdf")
plt.show()

In [None]:
# Annotated correlation heatmap for the frame whose Shape column was mapped to an
# explicit order; saved as a PDF and displayed.
f, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(
    mamm_data_for_order.corr(),
    ax=ax,
    annot=True,
    fmt='.2f',
    linewidths=0.5,
)
f.suptitle('Correlación ordenando Shape', ha='center', fontsize='x-large')
f.savefig("figures_python/corr_shape_ordered.pdf")
plt.show()