In [1]:
import os
import random
from pprint import pprint

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential

In [2]:
# Load the raw tables; only the training rows are de-duplicated.
df_train = pd.read_csv("train_wine.csv").drop_duplicates()
df_test = pd.read_csv("test_wine.csv")
# Show the duplicated-flag counts of the de-duplicated training frame.
df_train.duplicated().value_counts()

False    4355
dtype: int64

In [3]:
# mapping = {'poor': 0,'medium': 1, 'good': 2,  'excellent': 3}
# df_train['class'] = [mapping[item] for item in df_train['class']] 

In [4]:
# Columns removed from both the training and the test frame.
DROPPED_COLUMNS = ['condition', 'chlor.class', 'vineyard', 'acidity.variance',
                   'sulfur.taste', 'acid.sulfur', 'acid.taste']


def _encode_color(frame):
    """Replace the 'color' column by its drop-first dummy, stored as 'isWhite'."""
    color_dummies = pd.get_dummies(frame['color'], drop_first = True)
    encoded = frame.drop(['color'], axis=1)
    encoded['isWhite'] = color_dummies
    return encoded


# The training frame additionally loses its 'id' column.
df_train = _encode_color(df_train).drop(DROPPED_COLUMNS + ['id'], axis=1)

# The test frame keeps 'id'; a separate id-free copy is derived from it.
df_test = _encode_color(df_test).drop(DROPPED_COLUMNS, axis=1)
df_test_without_id = df_test.drop(['id'], axis=1)


In [5]:
# Dropping columns above can create new duplicate rows, so de-duplicate again.
df_train = df_train.drop_duplicates()
# Print the duplicated-flag counts, then the class distribution.
for summary in (df_train.duplicated().value_counts(), df_train['class'].value_counts()):
    print(summary)

False    3796
dtype: int64
good         2276
medium       1251
poor          161
excellent     108
Name: class, dtype: int64


In [6]:
# Separate out the rows of the two rarest classes ('poor' and 'excellent').
class_labels = df_train['class']
poor_samples = df_train.loc[class_labels == 'poor']
excellent_samples = df_train.loc[class_labels == 'excellent']

In [7]:
def _feature_ranges(samples):
    """Return {'min_<col>': ..., 'max_<col>': ...} for every column except 'class'."""
    ranges = {}
    for column in samples.drop(['class'], axis=1).columns:
        ranges['min_' + column] = samples[column].min()
        ranges['max_' + column] = samples[column].max()
    return ranges


# Per-feature value ranges observed in each minority class.
dictionary_excellent = _feature_ranges(excellent_samples)
dictionary_poor = _feature_ranges(poor_samples)


In [8]:
# Number of synthetic rows generated for each minority class.
N_SYNTHETIC_EXCELLENT = 648
N_SYNTHETIC_POOR = 966
# Fixed seed so a fresh "Restart & Run All" reproduces the same synthetic rows.
SYNTHETIC_SEED = 42


def make_synthetic_samples(samples, ranges, n_rows, white_threshold, rng):
    """Draw synthetic feature rows uniformly between per-column bounds.

    Parameters
    ----------
    samples : pandas.DataFrame
        Real rows of one class; only its column names (minus 'class') are used.
    ranges : dict
        Maps 'min_<column>' / 'max_<column>' to the bounds to sample between.
    n_rows : int
        Number of rows to generate.
    white_threshold : float
        'isWhite' is set to 1 when its uniform draw is >= this value, else 0.
    rng : random.Random
        Source of randomness.

    Returns
    -------
    pandas.DataFrame
        ``n_rows`` rows with one column per feature, in the order of ``samples``.
    """
    columns = list(samples.drop(['class'], axis=1).columns)
    records = []
    for _ in range(n_rows):
        row = {}
        for column in columns:
            # float() keeps the arithmetic inside uniform() valid even when the
            # bounds are numpy booleans (bool-typed dummy column).
            low = float(ranges['min_' + column])
            high = float(ranges['max_' + column])
            draw = rng.uniform(low, high)
            if column == 'isWhite':
                row[column] = 1 if draw >= white_threshold else 0
            else:
                row[column] = draw
        records.append(row)
    # Build the frame once instead of concatenating one row at a time.
    return pd.DataFrame(records, columns=columns)


synthetic_rng = random.Random(SYNTHETIC_SEED)
df_excellent = make_synthetic_samples(
    excellent_samples, dictionary_excellent, N_SYNTHETIC_EXCELLENT, 0.15, synthetic_rng)
# The poor-class rows use the poor-class ranges for every column, 'isWhite'
# included (it was previously drawn from the excellent-class range).
df_poor = make_synthetic_samples(
    poor_samples, dictionary_poor, N_SYNTHETIC_POOR, 0.33, synthetic_rng)

In [9]:
# Shared column layout for both synthetic frames: label first, then features.
SYNTHETIC_COLUMNS = ['class','fixed.acidity', 'volatile.acidity', 'citric.acid', 'residual.sugar',
                     'chlorides', 'free.sulfur.dioxide', 'total.sulfur.dioxide', 'density',
                     'pH', 'sulphates', 'alcohol', 'isWhite']

# Tag every synthetic row with its class and put the columns in that layout.
df_poor = df_poor.assign(**{'class': 'poor'})[SYNTHETIC_COLUMNS]
df_excellent = df_excellent.assign(**{'class': 'excellent'})[SYNTHETIC_COLUMNS]

In [10]:
# Stack the real training rows with both synthetic minority-class frames.
frames_to_stack = [df_train, df_poor, df_excellent]
df_train_with_extra_samples = pd.concat(frames_to_stack)

In [11]:
# Integer code for each quality label; an unknown label raises KeyError.
mapping = dict(poor=0, medium=1, good=2, excellent=3)
class_codes = list(map(mapping.__getitem__, df_train_with_extra_samples['class']))
df_train_with_extra_samples['class'] = class_codes
print(df_train_with_extra_samples['class'].value_counts())

2    2276
1    1251
0    1127
3     756
Name: class, dtype: int64


In [12]:
# Feature scaler shared by the cells below; it is fitted in a later cell.
sc = StandardScaler()

In [13]:
# Separate the integer-encoded target from the feature columns.
y = df_train_with_extra_samples['class']
features = df_train_with_extra_samples.drop(['class'], axis= 1)

# Hold-out split consumed by the evaluation cells further down
# (X_train / X_test / y_train / y_test). Stratified and seeded so the class
# balance and the split itself are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, y, test_size=0.1, stratify=y, random_state=42)

# Fit the scaler on the training split only, then apply those same statistics
# to every array so they all share one feature space. Calling fit_transform on
# the test set would scale it with different statistics than the training data.
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)
x = sc.transform(features)

# Rebuilt from df_test (not from the previous df_test_without_id) so re-running
# this cell does not scale already-scaled data; columns are aligned to the
# training feature order.
df_test_without_id = sc.transform(df_test.drop(['id'], axis=1)[features.columns])

In [None]:
# rfc = RandomForestClassifier(n_estimators=50)
# rfc.fit(X_train,y_train)
# predictions = rfc.predict(X_test)
# print(classification_report(y_test,predictions))
# print(confusion_matrix(y_test,predictions))


In [None]:
# Hyper-parameter search space for the random-forest randomized search.
# Number of trees in random forest
n_estimators = [int(value) for value in np.linspace(start = 100, stop = 500, num = 3)]
# Number of features to consider at every split. 'auto' is left out: for a
# classifier it is an alias of 'sqrt' (a duplicate candidate) and newer
# scikit-learn releases reject it.
max_features = ['sqrt', 'log2']
# Maximum number of levels in tree (None = grow until leaves are pure)
max_depth = [int(value) for value in np.linspace(10, 100, num = 5)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)
{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [100, 200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}

In [None]:
# Base model whose hyper-parameters are tuned by the randomized search.
rf = RandomForestClassifier()
# Try 100 parameter combinations drawn from random_grid, each scored with
# 3-fold cross validation, using all available cores.
search_options = dict(n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, **search_options)
# Run the search on the training split.
rf_random.fit(X_train, y_train)

In [None]:
# Keep the best estimator found by the randomized search and print its settings.
best_random = rf_random.best_estimator_
print(best_random)

In [None]:
# Random forest with hand-pinned hyper-parameters (presumably copied from the
# randomized-search output -- confirm). Only the settings that matter are
# passed: the full pasted repr also included `min_impurity_split`, which newer
# scikit-learn releases no longer accept, plus a list of plain defaults.
best_random = RandomForestClassifier(
    n_estimators=300,
    max_depth=100,
    max_features='sqrt',
    min_samples_split=5,
    min_samples_leaf=1,
    bootstrap=True,
).fit(X_train,y_train)
# Evaluate on the hold-out split.
predictions = best_random.predict(X_test)
print(classification_report(y_test,predictions))
print(confusion_matrix(y_test,predictions))

In [None]:
# 10-fold cross-validation scores of the tuned forest on the training split.
rfc_eval = cross_val_score(best_random, X_train, y_train, cv=10)

In [None]:
# Summarise every fold: the best fold alone overstates the expected score, so
# show the mean and spread next to the maximum.
pd.Series({'mean': rfc_eval.mean(), 'std': rfc_eval.std(), 'max': rfc_eval.max()})

In [None]:
# Predict class codes for the scaled, id-free test features with the tuned forest.
predictions = best_random.predict(df_test_without_id)

In [None]:
# Translate the integer class codes back into their label strings.
mapping = dict(enumerate(("poor", "medium", "good", "excellent")))
predictions = list(map(mapping.__getitem__, predictions))

In [17]:
import csv
# Write one row per test id with its predicted class; non-numeric fields are quoted.
submission_columns = {"id": df_test['id'], "class": predictions}
output = pd.DataFrame(submission_columns)
output.to_csv('xgb.csv', index=False, quoting=csv.QUOTE_NONNUMERIC)
print("Your submission was successfully saved!")

Your submission was successfully saved!


In [None]:
# Support-vector classifier with default settings, fitted on the training split.
clf = SVC()
clf.fit(X_train, y_train) 

In [None]:
# NOTE: despite the name, these are the SVC predictions for X_test, not for the training data.
ytrain_pred = clf.predict(X_test)

In [None]:
# Print the classification report, then the confusion matrix, for the SVC predictions.
for make_report in (classification_report, confusion_matrix):
    print(make_report(y_test, ytrain_pred))

In [14]:
import xgboost as xgb

In [15]:
# Gradient-boosted tree classifier; every other setting keeps the library default.
xgb_params = {
    'max_depth': 6,
    'subsample': 1,
    'n_estimators': 400,
    'learning_rate': 0.025,
}
xgb_classifier = xgb.XGBClassifier(**xgb_params)
xgb_classifier

XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, gamma=None,
              gpu_id=None, importance_type='gain', interaction_constraints=None,
              learning_rate=0.025, max_delta_step=None, max_depth=6,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=400, n_jobs=None, num_parallel_tree=None,
              objective='binary:logistic', random_state=None, reg_alpha=None,
              reg_lambda=None, scale_pos_weight=None, subsample=1,
              tree_method=None, validate_parameters=False, verbosity=None)

In [None]:
# Fit on the training split only. Assuming X_test was split off from x (as in
# the commented-out split), fitting on the full x/y would let the model see the
# hold-out rows and inflate the hold-out metrics.
xgb_classifier.fit(X_train, y_train)
predictions = xgb_classifier.predict(X_test)
predictions2 = xgb_classifier.predict(X_train)
# Hold-out performance.
print(classification_report(y_test,predictions))
print(confusion_matrix(y_test,predictions))
# Training performance, to compare against the hold-out numbers.
print(classification_report(y_train,predictions2))
print(confusion_matrix(y_train,predictions2))

In [16]:
# Train on the complete scaled training set, then predict the id-free test features.
final_model = xgb_classifier.fit(x, y)
predictions = final_model.predict(df_test_without_id)