In [2]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn import tree
from sklearn.model_selection import KFold
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# lives in sklearn.model_selection, which this notebook already depends on.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV



In [3]:
def read_corpus(corpus_file, use_sentiment, stop_words=None, stemmer=None):
    """Read a labelled corpus file and return its documents and labels.

    Every non-blank line is split on whitespace. The first token is the
    topic label, the second the sentiment label, the third is skipped, and
    all tokens from the fourth onwards form the document. Only the document
    tokens are stopword-filtered and stemmed; labels are returned exactly as
    they appear in the file.

    Parameters
    ----------
    corpus_file : str
        Path of a UTF-8 encoded corpus file with one document per line.
    use_sentiment : bool
        If true, use the second token of each line as the label
        (2-class problem); otherwise use the first token (6-class problem).
    stop_words : iterable of str, optional
        Tokens removed from each document. Defaults to NLTK's English
        stopword list.
    stemmer : object with a ``stem(word)`` method, optional
        Applied to every token that survives stopword removal. Defaults to
        ``nltk.stem.porter.PorterStemmer()``.

    Returns
    -------
    documents : list of list of str
        Preprocessed tokens of each document, in file order.
    labels : list of str
        One label per document, in the same order.
    """
    # Resolve the NLTK defaults once, not once per line of the corpus.
    if stop_words is None:
        from nltk.corpus import stopwords
        stop_words = stopwords.words('english')
    if stemmer is None:
        from nltk.stem.porter import PorterStemmer
        stemmer = PorterStemmer()
    # set membership is O(1); the NLTK list would be scanned for every token
    stop_words = frozenset(stop_words)

    documents = []
    labels = []
    with open(corpus_file, encoding='utf-8') as f:
        for line in f:
            tokens = line.strip().split()
            if not tokens:
                # blank line: nothing to label, skip instead of raising IndexError
                continue

            # Preprocess the text fields only. Running the stemmer over the
            # whole line would also rewrite the label tokens.
            text = [stemmer.stem(token) for token in tokens[3:]
                    if token not in stop_words]
            documents.append(text)

            if use_sentiment:
                # 2-class problem: positive vs negative
                labels.append(tokens[1])
            else:
                # 6-class problem: books, camera, dvd, health, music, software
                labels.append(tokens[0])

    return documents, labels
    
# pass-through helper: hands its argument straight back
def identity(x):
    """Return ``x`` unchanged."""
    return x

def _as_object_array(items):
    """Return a 1-D object array that holds each element of ``items`` as-is."""
    out = np.empty(len(items), dtype=object)
    for i, item in enumerate(items):
        out[i] = item
    return out

# reads the corpus and split to a training and test set
X, Y = read_corpus('trainset.txt', use_sentiment=False)

# first 75% of the documents (in file order) for training, the rest for testing
split_point = int(0.75*len(X))
Xtrain = X[:split_point]
Ytrain = Y[:split_point]
Xtest = X[split_point:]
Ytest = Y[split_point:]

# Keep the full corpus as arrays as well. The documents are token lists of
# different lengths, and np.array() on such ragged input raises ValueError on
# NumPy >= 1.24, so the object array is allocated and filled explicitly.
X = _as_object_array(X)
Y = np.array(Y)

# let's use the TF-IDF vectorizer
tfidf = True

# we use a dummy function as tokenizer and preprocessor,
# since the texts are already preprocessed and tokenized.
if tfidf:
    vec = TfidfVectorizer(preprocessor = identity,
                          tokenizer = identity)
else:
    vec = CountVectorizer(preprocessor = identity,
                          tokenizer = identity)

In [7]:
# NAIVE BAYES

# candidate alpha values for the 'cls' step of the pipeline
params = {'cls__alpha': np.arange(0.50, 0.65, 0.01)}

# pipeline: the shared vectorizer feeding a Naive Bayes classifier
classifier = Pipeline(steps=[
    ('vec', vec),
    ('cls', MultinomialNB()),
])

# 5-fold cross-validated grid search, scored with 'f1_micro'
GS = GridSearchCV(estimator=classifier, param_grid=params,
                  cv=5, scoring='f1_micro')
GS.fit(X, Y)

print(GS.best_params_)
print(GS.best_score_)

{'cls__alpha': 0.53}
0.9095


In [15]:
# alpha value selected by the most recently fitted grid search (GS)
GS.best_params_['cls__alpha']

0.53

In [None]:
# DECISION TREE

# candidate tree depths: 10 .. 29
params = {'cls__max_depth': range(10,30)}

# combine the vectorizer with a Decision Tree classifier.
# random_state is pinned so that re-running the cell reproduces the same
# trees and scores (same seed as the Random Forest experiment).
classifier = Pipeline( [('vec', vec),
                        ('cls', tree.DecisionTreeClassifier(random_state=0))])

# 5-fold cross-validated grid search, scored with 'f1_micro'
GS = GridSearchCV(classifier, params, cv=5, scoring='f1_micro')

GS.fit(X, Y)

print(GS.best_params_)
print(GS.best_score_)

In [None]:
# Notes on the K-NN scores (leaving numbers in the text):
#   weights='uniform'  -> 0.839
#   weights='distance' -> 0.845

In [10]:
# K-NN

# candidate neighbourhood sizes: 1 .. 39
params = {'cls__n_neighbors': range(1,40)}

# pipeline: the shared vectorizer feeding a distance-weighted K-NN classifier
classifier = Pipeline(steps=[
    ('vec', vec),
    ('cls', KNeighborsClassifier(weights='distance')),
])

# 5-fold cross-validated grid search, scored with 'f1_micro'
GS = GridSearchCV(estimator=classifier, param_grid=params,
                  cv=5, scoring='f1_micro')
GS.fit(X, Y)

print(GS.best_params_)
print(GS.best_score_)

{'cls__n_neighbors': 31}
0.8453333333333334


In [None]:
# complexity evaluation of NB, DT and KNN: time needed to fit each pipeline
import time

# One line is printed per classifier, in the order NB, DT, KNN.
for cls in (MultinomialNB(),
            tree.DecisionTreeClassifier(),
            KNeighborsClassifier(n_neighbors=1)):
    # combine the vectorizer with the classifier under test
    classifier = Pipeline( [('vec', vec), ('cls', cls)] )

    # perf_counter() is a monotonic high-resolution clock meant for measuring
    # short durations; time.time() can jump when the system clock is adjusted.
    t0 = time.perf_counter()
    classifier.fit(X, Y)
    train_time = time.perf_counter() - t0
    print("training time: ", train_time)

0.8036666666666666

In [21]:
# complexity evaluation of NB, DT and KNN: time needed to predict Xtest
import time

# One line is printed per classifier, in the order NB, DT, KNN.
for cls in (MultinomialNB(),
            tree.DecisionTreeClassifier(),
            KNeighborsClassifier(n_neighbors=1)):
    # combine the vectorizer with the classifier under test
    classifier = Pipeline( [('vec', vec), ('cls', cls)] )

    # fitting happens outside the timed region; only predict() is measured
    classifier.fit(X, Y)

    # perf_counter() is a monotonic high-resolution clock meant for measuring
    # short durations; time.time() can jump when the system clock is adjusted.
    t0 = time.perf_counter()
    y_guess = classifier.predict(Xtest)
    test_time = time.perf_counter() - t0
    print("test time: ", test_time)

test time:  0.21404695510864258
test time:  0.2054767608642578
test time:  1.3513238430023193


In [None]:
from sklearn.ensemble import RandomForestClassifier

# grid over tree depth (1 .. 29) and number of trees (5 .. 29)
params = {
    'cls__max_depth': range(1,30),
    'cls__n_estimators': range(5, 30),
}

# pipeline: the shared vectorizer feeding a seeded Random Forest
classifier = Pipeline(steps=[
    ('vec', vec),
    ('cls', RandomForestClassifier(random_state=0)),
])

# 5-fold cross-validated grid search, scored with 'f1_micro'
GS = GridSearchCV(estimator=classifier, param_grid=params,
                  cv=5, scoring='f1_micro')
GS.fit(X, Y)

print(GS.best_params_)
print(GS.best_score_)


0.6453333333333333

In [9]:
# cross-validated score of K-NN for every k in K
K = range(1,50)
f_score = []

for k in K:
    print(k)
    # single-value grid: the search only evaluates this one k
    params = {'cls__n_neighbors': [k]}
    classifier = Pipeline( [('vec', vec),  ('cls', KNeighborsClassifier())])

    GS = GridSearchCV(classifier, params, cv=5, scoring='f1_micro')
    GS.fit(X, Y)
    # store the cross-validated score for this k; best_params_ would only
    # give back k itself
    f_score.append(GS.best_score_)

1
2


KeyboardInterrupt: 

In [None]:
# x axis is the range K, not the loop variable k (a single int). K is trimmed
# to the number of recorded values so the plot still works if the sweep was
# interrupted before it finished.
plt.plot(K[:len(f_score)], f_score)
plt.xlabel('k (n_neighbors)')
plt.ylabel('f_score')
plt.show()