# Obtain data and carry out EDA to understand the dataset

In [None]:
# Data from http://archive.ics.uci.edu/ml/datasets/Wine
!git clone https://gist.github.com/tijptjik/9408623

# Read in as a dataframe for EDA
wine_df = pd.read_csv('9408623/wine.csv')
print(wine_df.head())
print(wine_df.describe())
print(wine_df.columns)
print(wine_df.dtypes)

# Read in data as an array to begin analysis. Drop header row as it is # not needed for svm classification.
wine_data = np.genfromtxt('9408623/wine.csv', delimiter=',')
wine_data = wine_data[1:] 

print(wine_data[0])

# Preprocess data and prepare in LIBSVM format

In [None]:
# Re-express every sample as a LIBSVM-style row: the label followed by
# "<feature index>:<value>" entries, with feature indices starting at 1.
data_with_features_index = []

for each_row in wine_data:
    # Column 0 is kept as-is (it is read back later as the class label)
    with_features_index = [each_row[0]]

    # Number the remaining columns from 1. The feature count follows the row
    # width instead of a hard-coded 13, so other column counts also work.
    with_features_index.extend(
        str(feature_no) + ":" + str(feature_value)
        for feature_no, feature_value in enumerate(each_row[1:], start=1)
    )

    data_with_features_index.append(with_features_index)

# Leave the scratch list empty once all rows are collected
with_features_index = []

# Split data into train and test datasets

In [None]:
# Split the prepared rows into train and test sets and turn each split into a Numpy array
train_data, test_data = (
    np.asarray(split_rows)
    for split_rows in train_test_split(data_with_features_index, test_size=0.33, random_state=42)
)

# First column of the test split, as floats, kept to compare against the model result later
y_label = test_data[:, 0].astype(np.float32)

# Write both splits to disk as space-separated text
for out_name, split_array in (('wine_svm_train_data', train_data),
                              ('wine_svm_test_data', test_data)):
    np.savetxt(out_name, split_array, delimiter=" ", fmt="%s")

# Train the model

In [None]:
def svm_model(train_pathname, test_pathname):

  # Validate that train and test files exist
  assert os.path.exists(train_pathname),"training file not found"
  file_name = os.path.split(train_pathname)[1]

  # Create files to store scaled train data, range metadata for scaled data, and trained model
  scaled_file = file_name + ".scale"
  model_file = file_name + ".model"
  range_file = file_name + ".range" # store scale range for train data to be used to scale test data

  file_name = os.path.split(test_pathname)[1]
  assert os.path.exists(test_pathname),"testing file not found"

  # Create file for scaled test data and predicted output
  scaled_test_file = file_name + ".scale"
  predict_test_file = file_name + ".predict"

  # Scaling by range [-1, 1]
  cmd = '{0} -l {4} -u {5} -s "{1}" "{2}" > "{3}"'.format(svmscale_exe, range_file, train_pathname, scaled_file, -1, 1)
  print('Scaling train data')
  Popen(cmd, shell = True, stdout = PIPE).communicate()

  # Tuning c and g hyperparameters using a 5-fold grid search
  cmd = '{0} -v {4} -svmtrain "{1}" -gnuplot "{2}" "{3}"'.format(grid_py, svmtrain_exe, "null", scaled_file, 5)
  print('Cross validation')
  f = Popen(cmd, shell = True, stdout = PIPE).stdout

  line = ''
  while True:
      last_line = line
      line = f.readline()
      if not line: break
  c,g,rate = map(float,last_line.split())

  print('Best c={0}, g={1} CV rate={2}'.format(c,g,rate))

  cmd = '{0} -c {1} -g {2} "{3}" "{4}"'.format(svmtrain_exe,c,g,scaled_file,model_file)
  print('Training model')
  Popen(cmd, shell = True, stdout = PIPE).communicate()

  print('Output model: {0}'.format(model_file))

  cmd = '{0} -l {4} -u {5} -r "{1}" "{2}" > "{3}"'.format(svmscale_exe, range_file, test_pathname, scaled_test_file, -1, 1)
  print('Scaling test data')
  Popen(cmd, shell = True, stdout = PIPE).communicate()

  cmd = '{0} "{1}" "{2}" "{3}"'.format(svmpredict_exe, scaled_test_file, model_file, predict_test_file)
  print('Testing model\n')
  f = Popen(cmd, shell = True, stdout = PIPE).stdout
  result = (str(f.readline()).replace("\\n'", '')).replace("b'", '')
  print("{} \n".format(result))

  print('Output prediction: {0}'.format(predict_test_file))

# Train the model and test it on the test data

In [None]:
# Run the full scale / grid-search / train / predict pipeline on the train and test files.
# NOTE(review): wine_svm_data_train_file and wine_svm_data_test_file are not assigned in the
# cells shown here; presumably they name the 'wine_svm_train_data' and 'wine_svm_test_data'
# files saved by the split cell — confirm they are defined before this cell runs.
svm_model(wine_svm_data_train_file, wine_svm_data_test_file)

# Evaluate the performance of the model

In [None]:
def evaluate_model(y_label, predict_test_file):
  """Plot a confusion-matrix heatmap of the true labels against the predicted labels.

  Each cell of the heatmap is shown as a percentage of all samples.

  Args:
    y_label: true class labels, passed to `confusion_matrix` unchanged.
    predict_test_file: path of a text file of predicted labels, parsed as
      floats with `np.genfromtxt`.
  """

  # Load the predicted labels; the context manager closes the file once it is parsed
  with open(predict_test_file,'r') as f:
    y_pred = np.genfromtxt(f,dtype = 'float')

  # Confusion matrix
  cf_matrix = confusion_matrix(y_label, y_pred)

  # Plot heatmap, dividing by the total so every cell is a share of all samples
  ax = sns.heatmap(cf_matrix/np.sum(cf_matrix), annot=True,
          fmt='.2%', cmap='Blues')

  # Format plot
  # NOTE(review): the axis labels say "action" although the title refers to wine
  # classes — confirm whether they should read "class".
  ax.set_title('Seaborn Confusion Matrix with wine class labels\n\n')
  ax.set_xlabel('\nPredicted action')
  ax.set_ylabel('Actual action ')

  # Tick labels - hard-coded for three classes, listed in sorted order.
  # Assumes every class appears, so the matrix is 3x3 — TODO confirm.
  ax.xaxis.set_ticklabels(['1', '2', '3'])
  ax.yaxis.set_ticklabels(['1', '2', '3'])

  # Display the visualization of the Confusion Matrix.
  plt.show()



In [None]:
 # Plot the confusion matrix of the test labels against the predictions stored in
 # 'wine_svm_test_data.predict' (presumably the file svm_model wrote for the test set).
 # NOTE(review): the single leading space on this cell is not needed; in a plain
 # Python script it would be an "unexpected indent" error.
 evaluate_model(y_label, 'wine_svm_test_data.predict')