In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error


# Use seaborn's "whitegrid" style for the plots drawn in this notebook.
sns.set_theme(style="whitegrid")


In [None]:
# Path of the CSV file that the data-import cell reads.
dataset_file_name = 'diamonds.csv'


## 1) Importação de dados

In [None]:
# Load the dataset into a DataFrame and show its column names.
df = pd.read_csv(dataset_file_name)
df.columns


In [None]:
# Drop the 'Unnamed: 0' column (presumably the row index that was written
# into the CSV when it was saved -- confirm against the file).
# errors='ignore' keeps this cell safe to re-run: without it, a second run
# raises KeyError because the column has already been removed from `df`.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')


## 2) Descritiva

In [None]:
# Summary statistics from describe(), transposed so each described column is a row.
df.describe().T


### Boxplot dos preços

In [None]:
# Vertical boxplot of the price distribution.
ax = sns.boxplot(data=df, y='price')
ax.set_title('Preços')
# Price is plotted on the y-axis, so the label belongs there (it was
# previously attached to the x-axis, which carries no variable).
ax.set_ylabel('Preço')
plt.show()


### Scatterplot quilates

In [None]:
# Price against carat, labelled through the Axes object returned by seaborn.
ax = sns.scatterplot(data=df, x='carat', y='price')
ax.set_title("Quilates e preços")
ax.set_xlabel("Quilate")
ax.set_ylabel("Preço")
plt.show()


### Boxplot, cores de diamantes e preços

In [None]:
# Map each colour-grade letter to the hex colour used to fill its box.
colors_map = {
    'D': '#FFFFFF',
    'E': '#FEFFDF',
    'F': '#FEFFDF',
    'G': '#FEFFBF',
    'H': '#FEFFBF',
    'I': '#FEFF9F',
    'J': '#FEFF9F',
}

# Draw the boxes in the palette's own D -> J order so the colour gradient
# reads left to right, instead of relying on the order seaborn would pick
# for the categories on its own.
ax = sns.boxplot(
    data=df,
    x='color',
    y='price',
    palette=colors_map,
    order=list(colors_map),
)
ax.set_title("Cores e preços")
ax.set_xlabel("Cor")
ax.set_ylabel('Preço')
plt.show()


## 3) Correlações

In [None]:
# Pairwise correlations between the numeric columns only (numeric_only=True).
correlations = df.corr(numeric_only=True)


In [None]:
# Boolean mask that is True on and above the diagonal, so the heatmap only
# shows the lower triangle of the correlation matrix.
mask = np.triu(np.ones_like(correlations, dtype=bool))

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlations, mask=mask, annot=True, cmap='YlGnBu', ax=ax)
ax.set_title("Correlações")
plt.show()


## 4) Feature engineering

In [None]:
def missing_percentage(df: pd.DataFrame) -> pd.DataFrame:
    """Summarise the missing values of each column of a DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        The input DataFrame.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` that contains at least one null value,
        sorted by null count in descending order, with two columns:
        "Total" (number of null values) and "Percent" (null values as a
        percentage of ``len(df)``, rounded to two decimals). The result is
        empty when ``df`` has no missing values.
    """
    # Null count per column, largest first, keeping only the columns that
    # actually contain missing values.
    total = df.isnull().sum().sort_values(ascending=False)
    total = total[total != 0]

    # Share of rows that are null in each remaining column. There is
    # deliberately no second "!= 0" filter here: a tiny share that rounds to
    # 0.00 must stay aligned with its "Total" entry, otherwise the
    # concatenation below would report NaN for that column's "Percent".
    percent = (total / len(df) * 100).round(2)

    return pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])


In [None]:
# List the columns of df that contain missing values (author's note: the result was 0%).
missing_percentage(df)  # 0%


In [None]:
# Inspect the dtype of each column.
df.dtypes


In [None]:
# Engineered feature: the product of the x, y and z columns. This is a
# volume-like quantity despite the column name; the name 'area' is kept
# because the modelling cell selects the feature by that name.
df['area'] = df['x'] * df['y'] * df['z']
df.head()


## 5) Modelagem

### Treinando com 20% do dataset

In [None]:
# Feature matrix and target vector.
X = df.loc[:, ['carat', 'depth', 'table', 'area']]
y = df.loc[:, 'price']

# train_size=0.2: only 20% of the rows go to the training split, as stated
# in this section's heading; the remaining 80% are held out for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.2,
    random_state=95347,
)


In [None]:
# Linear regression estimator created with scikit-learn's default settings.
reg = LinearRegression()


In [None]:
# Fit the regression on the training split.
reg.fit(X_train, y_train)


## 6) Métricas

In [None]:
# NOTE: for LinearRegression, score() returns the coefficient of determination
# (R^2) on the given data, not a classification accuracy. The variable name is
# kept as-is because a later cell reads `accuracy`.
accuracy = reg.score(X_test, y_test)
accuracy


In [None]:
# Intercept term of the fitted model.
reg.intercept_


In [None]:
# Fitted coefficients, one per feature column passed to fit().
reg.coef_


### Coeficiente de determinação (R²) de 85.19%

In [None]:
# `accuracy` holds the value returned by LinearRegression.score(), i.e. the
# coefficient of determination (R^2), so it is reported under that name
# rather than as a "precision".
print(f"R² de {round(accuracy*100, 2)}%")


In [None]:
# Predict the target (price) for the held-out test split.
y_pred = reg.predict(X=X_test)


In [None]:
# Mean squared error on the test split, printed with two decimals.
print(f'{mean_squared_error(y_test, y_pred):.2f}')


In [None]:
# Mean absolute error on the test split, printed with two decimals.
print(f'{mean_absolute_error(y_test, y_pred):.2f}')


In [None]:
# Mean of the target (price) in the training split.
y_train.mean()


In [None]:
# Mean of the values predicted for the test split.
y_pred.mean()
