# Import

In [1]:
import numpy as np
import pandas as pd

# Data

In [2]:
data = pd.read_table("verified_pb.txt", header = None)
data.columns = ['M/F/C', 'SPKR','Phoneme-Number','Phoneme-Ascii','F0','F1','F2','F3']
data

Unnamed: 0,M/F/C,SPKR,Phoneme-Number,Phoneme-Ascii,F0,F1,F2,F3
0,1,1,1,IY,160.0,240.0,2280.0,2850.0
1,1,1,1,IY,186.0,280.0,2400.0,2790.0
2,1,1,2,IH,203.0,390.0,2030.0,2640.0
3,1,1,2,IH,192.0,310.0,1980.0,2550.0
4,1,1,3,EH,161.0,490.0,1870.0,2420.0
...,...,...,...,...,...,...,...,...
1515,3,76,8,UH,322.0,610.0,1550.0,3400.0
1516,3,76,9,UW,345.0,520.0,1250.0,3460.0
1517,3,76,9,UW,334.0,500.0,1140.0,3380.0
1518,3,76,10,ER,308.0,740.0,1850.0,2160.0


# Data preparing

### Starred

In [3]:
male = data[:80]
female = data[660:740]
children = data[1220:1260]
concat = [male, female, children]
test_set_star= pd.concat(concat)
#pd.set_option('display.max_columns', None)
test_set_star

Unnamed: 0,M/F/C,SPKR,Phoneme-Number,Phoneme-Ascii,F0,F1,F2,F3
0,1,1,1,IY,160.0,240.0,2280.0,2850.0
1,1,1,1,IY,186.0,280.0,2400.0,2790.0
2,1,1,2,IH,203.0,390.0,2030.0,2640.0
3,1,1,2,IH,192.0,310.0,1980.0,2550.0
4,1,1,3,EH,161.0,490.0,1870.0,2420.0
...,...,...,...,...,...,...,...,...
1255,3,63,8,UH,294.0,570.0,1450.0,3500.0
1256,3,63,9,UW,333.0,350.0,1280.0,3650.0
1257,3,63,9,UW,290.0,340.0,1160.0,2950.0
1258,3,63,10,*ER,275.0,560.0,1740.0,2460.0


In [4]:
other_male = data[80:660]
other_female = data[740:1220]
other_children = data[1260:]
concat2 = [other_male, other_female, other_children]
train_set_star = pd.concat(concat2)
train_set_star

Unnamed: 0,M/F/C,SPKR,Phoneme-Number,Phoneme-Ascii,F0,F1,F2,F3
80,1,5,1,IY,140.0,310.0,2310.0,2820.0
81,1,5,1,IY,131.0,260.0,2250.0,2850.0
82,1,5,2,IH,137.0,440.0,2060.0,2640.0
83,1,5,2,IH,134.0,430.0,1880.0,2450.0
84,1,5,3,EH,140.0,580.0,1910.0,2500.0
...,...,...,...,...,...,...,...,...
1515,3,76,8,UH,322.0,610.0,1550.0,3400.0
1516,3,76,9,UW,345.0,520.0,1250.0,3460.0
1517,3,76,9,UW,334.0,500.0,1140.0,3380.0
1518,3,76,10,ER,308.0,740.0,1850.0,2160.0


### Unstarred

In [5]:
test_set_nostar = test_set_star[~test_set_star["Phoneme-Ascii"].str.startswith("*")]
test_set_nostar

Unnamed: 0,M/F/C,SPKR,Phoneme-Number,Phoneme-Ascii,F0,F1,F2,F3
0,1,1,1,IY,160.0,240.0,2280.0,2850.0
1,1,1,1,IY,186.0,280.0,2400.0,2790.0
2,1,1,2,IH,203.0,390.0,2030.0,2640.0
3,1,1,2,IH,192.0,310.0,1980.0,2550.0
4,1,1,3,EH,161.0,490.0,1870.0,2420.0
...,...,...,...,...,...,...,...,...
1254,3,63,8,UH,285.0,560.0,1440.0,3500.0
1255,3,63,8,UH,294.0,570.0,1450.0,3500.0
1256,3,63,9,UW,333.0,350.0,1280.0,3650.0
1257,3,63,9,UW,290.0,340.0,1160.0,2950.0


In [6]:
train_set_nostar = train_set_star[~train_set_star["Phoneme-Ascii"].str.startswith("*")]
train_set_nostar

Unnamed: 0,M/F/C,SPKR,Phoneme-Number,Phoneme-Ascii,F0,F1,F2,F3
80,1,5,1,IY,140.0,310.0,2310.0,2820.0
81,1,5,1,IY,131.0,260.0,2250.0,2850.0
82,1,5,2,IH,137.0,440.0,2060.0,2640.0
83,1,5,2,IH,134.0,430.0,1880.0,2450.0
84,1,5,3,EH,140.0,580.0,1910.0,2500.0
...,...,...,...,...,...,...,...,...
1515,3,76,8,UH,322.0,610.0,1550.0,3400.0
1516,3,76,9,UW,345.0,520.0,1250.0,3460.0
1517,3,76,9,UW,334.0,500.0,1140.0,3380.0
1518,3,76,10,ER,308.0,740.0,1850.0,2160.0


# Decision tree

In [7]:
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

In [8]:
feature_cols = ['F0', 'F1', 'F2', 'F2']

In [9]:
X_train_star = train_set_star[feature_cols]
Y_train_star = train_set_star['Phoneme-Number']
X_train_nostar = train_set_nostar[feature_cols]
Y_train_nostar = train_set_nostar['Phoneme-Number']

In [10]:
X_test_star = test_set_star[feature_cols]
Y_test_star = test_set_star['Phoneme-Number']
X_test_nostar = test_set_nostar[feature_cols]
Y_test_nostar = test_set_nostar['Phoneme-Number']

In [11]:
clf = DecisionTreeClassifier()

In [12]:
clf = clf.fit(X_train_star,Y_train_star)
y_pred = clf.predict(X_test_star)
print("Starred/starred Accuracy:",metrics.accuracy_score(Y_test_star, y_pred))

Starred/starred Accuracy: 0.725


In [13]:
clf = clf.fit(X_train_nostar,Y_train_nostar)
y_pred = clf.predict(X_test_nostar)
print("Unstarred/unstarred Accuracy:",metrics.accuracy_score(Y_test_nostar, y_pred))

Unstarred/unstarred Accuracy: 0.8291139240506329


In [14]:
clf = clf.fit(X_train_star,Y_train_star)
y_pred = clf.predict(X_test_nostar)
print("Starred/nostarred Accuracy:",metrics.accuracy_score(Y_test_nostar, y_pred))

Starred/nostarred Accuracy: 0.7531645569620253


In [15]:
clf = clf.fit(X_train_nostar,Y_train_nostar)
y_pred = clf.predict(X_test_star)
print("Nostarred/starred Accuracy:",metrics.accuracy_score(Y_test_star, y_pred))

Nostarred/starred Accuracy: 0.77


# GradientBoosting

In [16]:
from sklearn.ensemble import GradientBoostingClassifier

In [17]:
clf1 = GradientBoostingClassifier(random_state=0)

In [18]:
clf1 = clf1.fit(X_train_star,Y_train_star)
y_pred = clf1.predict(X_test_star)
print("Starred/starred Accuracy:",metrics.accuracy_score(Y_test_star, y_pred))

Starred/starred Accuracy: 0.82


In [19]:
clf1 = clf1.fit(X_train_nostar,Y_train_nostar)
y_pred = clf1.predict(X_test_nostar)
print("Unstarred/unstarred Accuracy:",metrics.accuracy_score(Y_test_nostar, y_pred))

Unstarred/unstarred Accuracy: 0.879746835443038


In [20]:
clf1 = clf1.fit(X_train_star,Y_train_star)
y_pred = clf1.predict(X_test_nostar)
print("Starred/nostarred Accuracy:",metrics.accuracy_score(Y_test_nostar, y_pred))

Starred/nostarred Accuracy: 0.8607594936708861


In [21]:
clf1 = clf1.fit(X_train_nostar,Y_train_nostar)
y_pred = clf1.predict(X_test_star)
print("Nostarred/starred Accuracy:",metrics.accuracy_score(Y_test_star, y_pred))

Nostarred/starred Accuracy: 0.82


# RandomForest

In [22]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification

In [23]:
clf2 = RandomForestClassifier(max_depth=20,random_state=0)

In [24]:
clf2 = clf2.fit(X_train_star,Y_train_star)
y_pred = clf2.predict(X_test_star)
print("Starred/starred Accuracy:",metrics.accuracy_score(Y_test_star, y_pred))

Starred/starred Accuracy: 0.845


In [25]:
clf2 = clf2.fit(X_train_nostar,Y_train_nostar)
y_pred = clf2.predict(X_test_nostar)
print("Unstarred/unstarred Accuracy:",metrics.accuracy_score(Y_test_nostar, y_pred))

Unstarred/unstarred Accuracy: 0.9113924050632911


In [26]:
clf2 = clf2.fit(X_train_star,Y_train_star)
y_pred = clf2.predict(X_test_nostar)
print("Starred/nostarred Accuracy:",metrics.accuracy_score(Y_test_nostar, y_pred))

Starred/nostarred Accuracy: 0.8924050632911392


In [27]:
clf2 = clf2.fit(X_train_nostar,Y_train_nostar)
y_pred = clf2.predict(X_test_star)
print("Nostarred/starred Accuracy:",metrics.accuracy_score(Y_test_star, y_pred))

Nostarred/starred Accuracy: 0.855


# Prototype classifier - Nearest centroid classifier

https://en.wikipedia.org/wiki/Nearest_centroid_classifier

In [28]:
from sklearn.neighbors import NearestCentroid

In [29]:
clf3 = NearestCentroid()

In [30]:
clf3 = clf3.fit(X_train_star,Y_train_star)
y_pred = clf3.predict(X_test_star)
print("Starred/starred Accuracy:",metrics.accuracy_score(Y_test_star, y_pred))

Starred/starred Accuracy: 0.505


In [31]:
clf3 = clf3.fit(X_train_nostar,Y_train_nostar)
y_pred = clf3.predict(X_test_nostar)
print("Unstarred/unstarred Accuracy:",metrics.accuracy_score(Y_test_nostar, y_pred))

Unstarred/unstarred Accuracy: 0.5


In [32]:
clf3 = clf3.fit(X_train_star,Y_train_star)
y_pred = clf3.predict(X_test_nostar)
print("Starred/nostarred Accuracy:",metrics.accuracy_score(Y_test_nostar, y_pred))

Starred/nostarred Accuracy: 0.5189873417721519


In [33]:
clf3 = clf3.fit(X_train_nostar,Y_train_nostar)
y_pred = clf3.predict(X_test_star)
print("Nostarred/starred Accuracy:",metrics.accuracy_score(Y_test_star, y_pred))

Nostarred/starred Accuracy: 0.495


# Classifier comparison

In [34]:
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.ensemble import BaggingClassifier

In [41]:
names = ["Nearest Neighbors", "GradBoost", "Bagging", "Linear SVM", "RBF SVM", "Decision Tree", "Random Forest deep:5", 
         "Random Forest deep:30", "Neural Net", "AdaBoost", "Naive Bayes", "QDA", "NearestCentroid"]

classifiers = [
    KNeighborsClassifier(10),
    GradientBoostingClassifier(random_state=0),
    BaggingClassifier(base_estimator=SVC(), n_estimators=10, random_state=0),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    DecisionTreeClassifier(max_depth=15),
    RandomForestClassifier(max_depth=5, random_state = 0),
    RandomForestClassifier(max_depth=30, random_state = 0),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    NearestCentroid()]

In [42]:
for i in range(len(classifiers)):
    print("Classifier:",names[i])
    clf = classifiers[i]
    clf = clf.fit(X_train_star,Y_train_star)
    y_pred = clf.predict(X_test_star)
    print("\t Starred train/starred test Accuracy:", metrics.accuracy_score(Y_test_star, y_pred))
    clf = clf.fit(X_train_nostar,Y_train_nostar)
    y_pred = clf.predict(X_test_nostar)
    print("\t Unstarred train/unstarred test Accuracy:",metrics.accuracy_score(Y_test_nostar, y_pred))
    clf = clf.fit(X_train_star,Y_train_star)
    y_pred = clf.predict(X_test_nostar)
    print("\t Starred train/nostarred test Accuracy:",metrics.accuracy_score(Y_test_nostar, y_pred))
    clf = clf.fit(X_train_nostar,Y_train_nostar)
    y_pred = clf.predict(X_test_star)
    print("\t Nostarred train/starred test Accuracy:",metrics.accuracy_score(Y_test_star, y_pred), "\n")

Classifier: Nearest Neighbors
	 Starred train/starred test Accuracy: 0.855
	 Unstarred train/unstarred test Accuracy: 0.879746835443038
	 Starred train/nostarred test Accuracy: 0.9050632911392406
	 Nostarred train/starred test Accuracy: 0.835 

Classifier: GradBoost
	 Starred train/starred test Accuracy: 0.82
	 Unstarred train/unstarred test Accuracy: 0.879746835443038
	 Starred train/nostarred test Accuracy: 0.8607594936708861
	 Nostarred train/starred test Accuracy: 0.82 

Classifier: Bagging
	 Starred train/starred test Accuracy: 0.8
	 Unstarred train/unstarred test Accuracy: 0.8354430379746836
	 Starred train/nostarred test Accuracy: 0.8670886075949367
	 Nostarred train/starred test Accuracy: 0.78 

Classifier: Linear SVM
	 Starred train/starred test Accuracy: 0.85
	 Unstarred train/unstarred test Accuracy: 0.8860759493670886
	 Starred train/nostarred test Accuracy: 0.879746835443038
	 Nostarred train/starred test Accuracy: 0.86 

Classifier: RBF SVM
	 Starred train/starred test Ac

