In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
# Read the diamonds CSV into a DataFrame and preview its first rows.
data = pd.read_csv(filepath_or_buffer='../input/diamonds.csv')
data.head(n=5)

In [None]:
# Show the column labels of the loaded frame.
data.columns

In [None]:
# Show the frame's dimensions as (rows, columns).
data.shape

In [None]:
# Count the missing (NaN) entries in each column.
data.isna().sum()

In [None]:
# Frequency of each distinct 'cut' label, most common first.
data.loc[:, 'cut'].value_counts()

In [None]:
# Bar chart of how many rows fall into each 'cut' category.
sns.countplot(x='cut', data=data)

In [None]:
# Box plot of 'price' grouped by 'cut'.
sns.boxplot(data=data, x='cut', y='price')

In [None]:
# Overlay one kernel-density curve of 'price' per 'cut' value on a single facet.
# `height` is the current name of the facet-size keyword: seaborn renamed the
# old `size` argument in 0.9 and recent releases reject `size` outright.
sns.FacetGrid(data, hue='cut', height=6).map(sns.kdeplot, "price").add_legend()

In [None]:
from sklearn import linear_model, utils, metrics

# Feature matrix: drop the target ('cut'), 'price', and the 'Unnamed: 0',
# 'clarity' and 'color' columns; whatever remains is used as model input
# (presumably the numeric measurement columns -- confirm against data.columns).
X = data.drop(['Unnamed: 0','cut', 'clarity', 'price', 'color'], axis=1)

# Convert to a float array. The previous dtype=int cast truncated every
# fractional value toward zero (e.g. 0.23 -> 0), throwing away most of the
# signal in small-valued features before the model ever saw them.
X = np.array(X, dtype=float)

# Target: the 'cut' label of each row.
y = data['cut']

# Shuffle rows of X and y in unison; the fixed seed makes the order
# reproducible across kernel restarts.
X, y = utils.shuffle(X, y, random_state=0)

In [None]:
# Split the data
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing. random_state pins the split so that
# the reported metrics are reproducible from one run to the next.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.3,
                                                    random_state=0)

In [None]:
# Standardise the features
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply that
# same fitted transform to both partitions.
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Random forest
from sklearn.ensemble import RandomForestClassifier

# Fit a depth-2 forest (seeded) on the training rows and predict the test rows.
rf = RandomForestClassifier(random_state=0, max_depth=2).fit(X_train, y_train)
y_pred = rf.predict(X_test)

# Per-class precision/recall/F1, then the confusion matrix as an annotated heatmap.
print(classification_report(y_true=y_test, y_pred=y_pred))
cf = confusion_matrix(y_true=y_test, y_pred=y_pred)
sns.heatmap(data=cf, fmt="d", annot=True)


In [None]:
# Logistic regression
# Fit with default settings on the training rows, predict the test rows,
# and print the per-class report.
logres = linear_model.LogisticRegression().fit(X_train, y_train)
y_pred = logres.predict(X_test)
print(metrics.classification_report(y_true=y_test, y_pred=y_pred))

In [None]:
# sklearn.cross_validation was removed in scikit-learn 0.20; the same
# cross_val_predict function lives in sklearn.model_selection.
from sklearn import model_selection


# Use the *entire* shuffled dataset for cross-validation (the slice bound is
# the full row count, so nothing is held out here).
# NOTE(review): this rebinds X_train / y_train, replacing the standardised
# training split with the full, unscaled X -- confirm that is intended, or
# wrap the scaler and model in a pipeline so scaling happens inside each fold.
X_train = X[:X.shape[0]]
y_train = y[:X.shape[0]]

# 10-fold cross-validation: each row is predicted by a clone of `logres`
# fitted on the other nine folds.
y_pred = model_selection.cross_val_predict(logres, X_train, y_train, cv=10)

# Per-class report over the out-of-fold predictions.
print(metrics.classification_report(y_train, y_pred))