In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA

In [2]:
df = pd.read_csv('G:\Mi unidad\Descargas\diamonds.csv')
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31
2,0.23,Good,E,VS1,56.9,65.0,327,4.05,4.07,2.31
3,0.29,Premium,I,VS2,62.4,58.0,334,4.2,4.23,2.63
4,0.31,Good,J,SI2,63.3,58.0,335,4.34,4.35,2.75


In [3]:
df.describe()

Unnamed: 0,carat,depth,table,price,x,y,z
count,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0,53940.0
mean,0.79794,61.749405,57.457184,3932.799722,5.731157,5.734526,3.538734
std,0.474011,1.432621,2.234491,3989.439738,1.121761,1.142135,0.705699
min,0.2,43.0,43.0,326.0,0.0,0.0,0.0
25%,0.4,61.0,56.0,950.0,4.71,4.72,2.91
50%,0.7,61.8,57.0,2401.0,5.7,5.71,3.53
75%,1.04,62.5,59.0,5324.25,6.54,6.54,4.04
max,5.01,79.0,95.0,18823.0,10.74,58.9,31.8


- price price in US dollars (\$326--\$18,823)
- carat weight of the diamond (0.2--5.01)
- cut quality of the cut (Fair, Good, Very Good, Premium, Ideal)
- color diamond colour, from J (worst) to D (best)
- clarity a measurement of how clear the diamond is (I1 (worst), SI2, SI1, VS2, VS1, VVS2, VVS1, IF (best))

- x length in mm (0--10.74)

- y width in mm (0--58.9)

- z depth in mm (0--31.8)

- depth total depth percentage = z / mean(x, y) = 2 * z / (x + y) (43--79)

- table width of top of diamond relative to widest point (43--95)

In [4]:
df.color.value_counts()

color
G    11292
E     9797
F     9542
H     8304
D     6775
I     5422
J     2808
Name: count, dtype: int64

In [5]:
df.cut.value_counts()

cut
Ideal        21551
Premium      13791
Very Good    12082
Good          4906
Fair          1610
Name: count, dtype: int64

In [6]:
df.clarity.unique()

array(['SI2', 'SI1', 'VS1', 'VS2', 'VVS2', 'VVS1', 'I1', 'IF'],
      dtype=object)

ver si no hay registros nulos

In [7]:
df.isna().sum()

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64

Usando CODIFICACION ORDINAL

In [8]:
categorias = {"cut": {"Fair": 0, "Good": 1, "Very Good": 2, "Premium": 3, "Ideal": 4},
              "color": {"J": 0, "I": 1, "H": 2, "G": 3, "F": 4, "E": 5, "D": 6},
               "clarity": {"I1": 0, "SI2": 1, "SI1": 2, "VS2": 3, "VS1": 4, "VVS2": 5, "VVS1": 6, "IF": 8}}

In [9]:
new_columns = pd.DataFrame(columns= categorias)
new_columns.head()

Unnamed: 0,cut,color,clarity


In [11]:
data = df.replace(categorias)
data

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,4,5,1,61.5,55.0,326,3.95,3.98,2.43
1,0.21,3,5,2,59.8,61.0,326,3.89,3.84,2.31
2,0.23,1,5,4,56.9,65.0,327,4.05,4.07,2.31
3,0.29,3,1,3,62.4,58.0,334,4.20,4.23,2.63
4,0.31,1,0,1,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...,...
53935,0.72,4,6,2,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,1,6,2,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,2,6,2,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,3,2,1,61.0,58.0,2757,6.15,6.12,3.74


Escalamiento de los valores

In [12]:
y = data['cut']
X = data.loc[:, data.columns != 'cut']
X

Unnamed: 0,carat,color,clarity,depth,table,price,x,y,z
0,0.23,5,1,61.5,55.0,326,3.95,3.98,2.43
1,0.21,5,2,59.8,61.0,326,3.89,3.84,2.31
2,0.23,5,4,56.9,65.0,327,4.05,4.07,2.31
3,0.29,1,3,62.4,58.0,334,4.20,4.23,2.63
4,0.31,0,1,63.3,58.0,335,4.34,4.35,2.75
...,...,...,...,...,...,...,...,...,...
53935,0.72,6,2,60.8,57.0,2757,5.75,5.76,3.50
53936,0.72,6,2,63.1,55.0,2757,5.69,5.75,3.61
53937,0.70,6,2,62.8,60.0,2757,5.66,5.68,3.56
53938,0.86,2,1,61.0,58.0,2757,6.15,6.12,3.74


Dividiendo conjunto de entrenamiento y prueba

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [15]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

aplicando metodos de clasificacion 

In [16]:
lr = LogisticRegression()
lr.fit(X_train_scaled, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [18]:
lr_pred = lr.predict(X_test_scaled)

In [21]:
lr_accuracy = accuracy_score(y_test, lr_pred)
print(f"Logistic Regression Accuracy: {lr_accuracy:.2f}")

Logistic Regression Accuracy: 0.66


In [32]:
lr_precision = precision_score(y_test, lr_pred, average = 'micro')
print(f"Logistic Regression Precission: {lr_precision:.2f}")

Logistic Regression Precission: 0.66


In [22]:
knn = KNeighborsClassifier()
knn.fit(X_train_scaled, y_train)

In [23]:
knn_pred = knn.predict(X_test_scaled)

In [24]:
knn_accuracy = accuracy_score(y_test, knn_pred)
print(f"KNN Accuracy: {knn_accuracy:.2f}")

KNN Accuracy: 0.66


In [35]:
knn_precision = precision_score(y_test, lr_pred, average = 'micro')
print(f"KNN Precission: {knn_precision:.2f}")

KNN Precission: 0.66


In [25]:
svm = SVC()
svm.fit(X_train_scaled, y_train)

In [26]:
svm_pred = svm.predict(X_test_scaled)

In [27]:
svm_accuracy = accuracy_score(y_test, svm_pred)
print(f"SVM Accuracy: {svm_accuracy:.2f}")

SVM Accuracy: 0.75


In [34]:
svm_precision = precision_score(y_test, lr_pred, average = 'micro')
print(f"SVM Precission: {svm_precision:.2f}")

SVM Precission: 0.66


In [28]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)

In [29]:
dt_pred = dt.predict(X_test)

In [30]:
dt_accuracy = accuracy_score(y_test, dt_pred)
print(f"Decision Trees Accuracy: {dt_accuracy:.2f}")

Decision Trees Accuracy: 0.71


In [36]:
dt_precision = precision_score(y_test, lr_pred, average = 'micro')
print(f"DT Precission: {dt_precision:.2f}")

DT Precission: 0.66
