In [8]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from scipy.stats import multivariate_normal
import seaborn as sns
from sklearn.mixture import GaussianMixture
from sklearn.metrics import confusion_matrix
%matplotlib inline

In [11]:
# Load the raw training and test tables. A row whose columns are all null is
# treated as a separator, so the running count of such rows gives every row
# the id of the block it belongs to.
df_train = pd.read_csv("Train_Arabic_Digit.csv", header = None)
df_train["block"] = df_train.isnull().all(axis=1).cumsum()

df_test = pd.read_csv("Test_Arabic_Digit.csv", header = None)
df_test["block"] = df_test.isnull().all(axis=1).cumsum()

# Every column except the helper 'block' column.
train_features = df_train.columns.drop('block')

# df2_train: block id -> rows of that block, with incomplete rows removed.
# The last block id equals the last value of the running count, so the range
# needs an inclusive upper bound (+ 1); an exclusive bound drops the final
# block. If the file ends with a separator row, the last key simply maps to
# an empty frame.
df2_train = {i: df_train.loc[df_train.block == i, train_features].dropna()
             for i in range(0, df_train.block.iat[-1] + 1)}

# Collapse block ids into digit ids: every 660 consecutive block ids share one
# value. NOTE: this overwrites df_train["block"]; from here on it holds the
# digit id, not the original block id.
df_train["block"] = np.floor(df_train["block"] / 660)

# df3_train: digit id (0-9) -> all rows belonging to that digit.
df3_train = {i: df_train.loc[df_train.block == i, train_features].dropna()
             for i in range(10)}

# df_cut_train: digit id (0-9) -> complete rows for that digit. All feature
# columns are kept (no column subsetting happens here), so the content matches
# df3_train; it is the training input for the mixture models.
train_complete = df_train.dropna()
df_cut_train = {i: train_complete.loc[train_complete.block == i, train_features]
                for i in range(10)}

# df_cut_test: block id -> complete rows of that test block. Test blocks are
# kept one per key (they are not collapsed into digits) so each can be scored
# separately. The upper bound is inclusive (+ 1) so the final block is kept.
test_complete = df_test.dropna()
test_features = test_complete.columns.drop('block')
df_cut_test = {i: test_complete.loc[test_complete.block == i, test_features]
               for i in range(0, test_complete.block.iat[-1] + 1)}

In [None]:
# Exploratory fit: a 5-component Gaussian mixture on the training rows stored
# under key 0 of df_cut_train. random_state is fixed so the fit (and any score
# computed from it) is the same on every Restart & Run All.
gm0 = GaussianMixture(n_components=5, random_state=0).fit(df_cut_train[0])

In [None]:
# Sanity check: score test block 210 under the exploratory mixture gm0
# (GaussianMixture.score reports the average per-sample log-likelihood).
gm0.score(X=df_cut_test[210])

In [None]:
# Train one 8-component, diagonal-covariance Gaussian mixture per digit.
# gmms[d] is the model fitted on df_cut_train[d], so a model's position in the
# list is the digit it represents.
# random_state is fixed because the EM initialisation is random; without it the
# fitted models, and every accuracy figure derived from them, change per run.
gmms = []
for digit in range(10):
    gmm = GaussianMixture(
        n_components=8,
        covariance_type='diag',
        random_state=0,
    ).fit(df_cut_train[digit])
    gmms.append(gmm)

In [None]:
# Classify each test block as the digit whose mixture scores it highest.
#
# The winner is taken with an arg-max over all model scores rather than by
# comparing against a fixed starting threshold: a threshold such as -10000
# would leave the prediction at an invalid -1 whenever every model scored
# below it. Ties resolve to the lowest digit, as a strict ">" scan would.
#
# NOTE(review): the number of scored blocks is kept at 2199 so that
# `predictions` has the length the rest of the notebook was written for.
# Derive it from len(df_cut_test) once every downstream cell sizes itself
# from len(predictions).
N_TEST_UTTERANCES = 2199

predictions = []
for i in range(N_TEST_UTTERANCES):
    utterance = df_cut_test[i]
    scores = [model.score(utterance) for model in gmms]
    predictions.append(int(np.argmax(scores)))

In [None]:
# Accuracy of the predictions. The true digit of test block i is taken to be
# i // 220 (blocks are assumed to be ordered by digit, 220 per digit).
true_digits = np.arange(len(predictions)) // 220
hits = np.asarray(predictions) == true_digits
correct = int(hits.sum())

# NOTE(review): despite its name, `wrongs` holds the predicted digit at the
# positions that were classified CORRECTLY and -1 everywhere else. That is
# what the original loop computed, so it is preserved here; confirm whether
# the intent was to record the misclassified predictions instead.
wrongs = np.where(hits, np.asarray(predictions, dtype=float), -1.0)

# Divide by the number of predictions actually made rather than a hard-coded
# count, so the figure stays right if the size of the test set changes.
correct / len(predictions) if len(predictions) else float("nan")

In [None]:
# True digit for every prediction: test blocks are assumed to be ordered by
# digit, 220 per digit. The list is sized from `predictions` so the two always
# have equal length (confusion_matrix raises on a length mismatch).
labels = [i // 220 for i in range(len(predictions))]

# Use the class values that actually occur as the row/column labels. This is
# the same sorted union confusion_matrix would infer on its own, but naming it
# lets the DataFrame axes (and therefore the heatmap ticks) carry the real
# class values instead of positional indices.
classes = sorted(set(labels) | set(predictions))
cm = pd.DataFrame(
    confusion_matrix(labels, predictions, labels=classes),
    index=classes,
    columns=classes,
)

fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='g', ax=ax)
ax.set_title('Confusion Matrix for EM Gaussian Mixture Model Classification')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')

fig.savefig("em_confusion_matrix.png", dpi=300)