In [None]:
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix
from math import e
import seaborn as sns
import matplotlib.pyplot as plt
# Apply the 'ggplot' style sheet to every matplotlib/seaborn figure drawn below.
plt.style.use('ggplot')

# IPython magic: render figures inline in the notebook's output cells.
%matplotlib inline

In [None]:
# Read the diamonds CSV and rebuild the row index: reset_index() moves the
# current index into an 'index' column, which is dropped again right away.
diamonds = (
    pd.read_csv("../input/diamonds.csv", low_memory=False)
    .reset_index()
    .drop('index', axis=1)
)

### Exploratory analysis

In [None]:
# Print each column's dtype and non-null count to check the raw schema.
diamonds.info()

### All data seems to be in the correct format. Let's visualize it to inspect more.

In [None]:
# Display the first rows to eyeball the raw values.
diamonds.head()

### All rows have values; there are no NaNs.

### Let's plot a correlation matrix in order to get an idea of the most important features in the dataset.

In [None]:
# Pairwise correlation between the numeric columns only. At this point 'cut',
# 'color' and 'clarity' still hold text labels (they are encoded in a later
# cell). Recent pandas versions raise on DataFrame.corr() when non-numeric
# columns are present instead of silently skipping them, so the numeric
# columns are selected explicitly; this works on old and new pandas alike.
corr = diamonds.select_dtypes(include=['number']).corr()
# Annotated heatmap: each cell shows its correlation coefficient.
sns.heatmap(corr,annot=True,cmap='coolwarm')

### We can plot a cluster map that will tell us the importance of every variable by hierarchy and confirm the correlation between them.

In [None]:
# Hierarchically clustered heatmap of the correlation matrix `corr`.
# standard_scale=1 asks seaborn to standardize along the column dimension
# (per the seaborn docs: subtract the minimum, then divide by the maximum).
sns.clustermap(corr,cmap='coolwarm', standard_scale=1)

### In order to model the data, it all needs to be numeric; non-numeric values will be mapped to numbers following the order given in the data dictionary.
### CUT = (Fair, Good, Very Good, Premium, Ideal)
### COLOR = (D, E, F, G , H, I, J)
### CLARITY in order from best to worst = ( FL,IF, VVS1, VVS2, VS1, VS2, SI1, SI2, I1, I2, I3 )
### Descending orders will be inverted

In [None]:
# Ordinal encoding: every categorical label is replaced by its 1-based rank.
# Each list is written in ascending rank order (rank 1 first), following the
# orderings given in the notes that precede this cell.
ordinal_scales = (
    ('cut', ['Fair', 'Good', 'Very Good', 'Premium', 'Ideal']),
    ('color', ['J', 'I', 'H', 'G', 'F', 'E', 'D']),
    ('clarity', ['I1', 'SI2', 'SI1', 'VS2', 'VS1', 'VVS2', 'VVS1', 'IF']),
)
for column, grades in ordinal_scales:
    rank_of = {grade: rank for rank, grade in enumerate(grades, 1)}
    # Series.map yields NaN for any label that is missing from the lookup.
    diamonds[column] = diamonds[column].map(rank_of)

### The dataset has changed, variables have changed from object (text) to integers

In [None]:
# Re-check the dtypes now that 'cut', 'color' and 'clarity' have been mapped to numbers.
diamonds.info()

In [None]:
# Display the first rows again to see the numeric codes in place of the text labels.
diamonds.head()

### Let's split the data set: "X" holds the features required by the algorithm and "y" holds the "labels" to be classified. Since the price column holds numeric values, an algorithm trained to treat every distinct price as its own label would have very low accuracy. Ranges of prices work better, giving the algorithm a better chance of being right.
### The data set will be split into train and test sets to measure its effectiveness (80% train, 20% test).


In [None]:
# Feature matrix: the seven predictor columns handed to the classifiers.
feature_columns = ['carat', 'cut', 'color', 'clarity', 'x', 'y', 'z']
X = diamonds[feature_columns]
# Target: the raw price column (binned into classes in a later cell).
y = diamonds.price

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

# nClasses evenly spaced edges from 0 to the maximum price delimit
# nClasses - 1 equal-width price ranges.
nClasses = 12
yClasses = np.linspace(0, y.max(),nClasses)

# np.digitize (right=False) returns len(bins) for values equal to the last
# edge, which would put the top-priced rows into a degenerate extra class
# outside the real ranges. Clipping folds them into the last real range
# (and anything below the first edge into the first one).
topClass = len(yClasses) - 1
yTC = np.clip(np.digitize(y_train,bins=yClasses), 1, topClass)
yTestC = np.clip(np.digitize(y_test,bins=yClasses), 1, topClass)

### The range of each price label can be understood by looking at the bin edges. For example, range 1 goes from 0 to 1711.18.

In [None]:
# Show the price bin edges. The call form of print works on Python 2 and 3
# alike and matches the print(...) calls used in the rest of the notebook.
print(yClasses)

### Let's train an SVM algorithm

In [None]:
# Baseline: a support-vector classifier with scikit-learn's default
# hyperparameters, trained on the binned price labels.
model = SVC()
model.fit(X_train, yTC)

### Next we can see the test results by printing the metrics of the algorithm: first the confusion matrix, then precision, recall and F1-score (F1-measure); support is the number of samples in each price-range label.

In [None]:
# Score the fitted model on the held-out rows: print the confusion matrix
# first, then the per-class precision / recall / F1 report.
predictions = model.predict(X_test)
for report in (confusion_matrix, classification_report):
    print(report(yTestC, predictions))

### Precision reaches 85%, which is good, but it is possible to improve the algorithm by tuning its parameters using grid search.

In [None]:
# Hyperparameter grid for the SVC. gamma is a kernel coefficient that the
# linear kernel does not use, so it is searched for the RBF kernel only;
# crossing it with 'linear' would refit the same model once per gamma value.
param_grid = [
    {'kernel': ['rbf'], 'C': [0.1, 1, 10, 100], 'gamma': [1, 0.1, 0.01]},
    {'kernel': ['linear'], 'C': [0.1, 1, 10, 100]},
]
# refit=True retrains the best candidate on the full training set, so `grid`
# can be used directly for prediction; verbose=3 logs every fit.
grid = GridSearchCV(SVC(),param_grid,refit=True,verbose=3)
grid.fit(X_train,yTC)

In [None]:
# Parameter combination that scored best during the cross-validated search.
grid.best_params_

In [None]:
# The estimator refit with the best parameters found (available because refit=True).
grid.best_estimator_

In [None]:
# Predict the held-out rows with the refit best estimator.
# NOTE(review): grid_predictions is not referenced again in the visible cells.
grid_predictions = grid.predict(X_test)

In [None]:
# Refit a standalone SVC with every hyperparameter written out explicitly
# (C=100, gamma=0.1, RBF kernel), then score it on the held-out rows.
# NOTE(review): these values were presumably copied from the grid search
# result; verify against grid.best_estimator_.
svc_settings = dict(
    C=100, gamma=0.1, kernel='rbf',
    cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3,
    max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False,
)
model = SVC(**svc_settings).fit(X_train, yTC)
predictions = model.predict(X_test)
for report in (confusion_matrix, classification_report):
    print(report(yTestC, predictions))

### Not a big improvement, but it is an improvement. We can test another algorithm, Random Forest classification, to seek a further improvement.

In [None]:
# Random forest of 1000 trees trained on the same binned price labels.
# The forest is stochastic (bootstrap samples, random feature subsets), so it
# is seeded with the same value as the train/test split; without a seed the
# reported metrics change on every run.
rfc = RandomForestClassifier(n_estimators=1000, random_state=101)
rfc.fit(X_train,yTC)
# Score on the held-out rows: confusion matrix, then the per-class report.
predictions = rfc.predict(X_test)
print(confusion_matrix(yTestC,predictions))
print(classification_report(yTestC,predictions))

### The algorithm does not make an improvement; SVM is a very good one. Something noticeable in the metrics is that the data has more instances of the lower price-range labels, which makes sense if you consider that diamonds with the characteristics of an expensive one become increasingly rare. A plot of the price distribution of the diamonds is shown next.

In [None]:
# Histogram of the price column with a kernel density estimate overlaid.
# NOTE(review): distplot is deprecated in newer seaborn releases in favor of
# histplot/displot - confirm the seaborn version this notebook targets.
sns.distplot(diamonds['price'],kde=True)