### Import NumPy and NLTK modules


In [12]:
import numpy as np
import nltk

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem.wordnet import WordNetLemmatizer




### Initialize the WordNet lemmatizer and define utility functions


In [13]:

# Module-level WordNetLemmatizer instance; lemmatize() below calls its
# .lemmatize() method for every word, so it is created once and shared.
lemmatizer = WordNetLemmatizer()


def get_wordnet_pos(word):
    """Return the WordNet POS constant to use when lemmatizing ``word``.

    The word is tagged in isolation with ``nltk.pos_tag`` and the first
    letter of the resulting tag is looked up in a small table:

        J -> wordnet.ADJ, N -> wordnet.NOUN, V -> wordnet.VERB, R -> wordnet.ADV

    Any other leading letter falls back to ``wordnet.NOUN``.

    Args:
        word: A single token (string).

    Returns:
        One of the ``wordnet`` POS constants listed above.
    """
    # pos_tag returns [(word, tag)]; only the first character of the tag is
    # needed to pick the WordNet category.
    tag = nltk.pos_tag([word])[0][1][0].upper()

    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)


def lemmatize(word):
    """Lemmatize a single word using its POS tag.

    The POS category is derived from ``word`` exactly as given (via
    ``get_wordnet_pos``); the lemmatizer itself receives the lower-cased
    form of the word.

    Args:
        word: A single token (string).

    Returns:
        The value returned by the module-level ``lemmatizer`` for the
        lower-cased word and its WordNet POS constant.
    """
    return lemmatizer.lemmatize(word.lower(), get_wordnet_pos(word))


def lemmatize_corpus(sentences):
    """Lemmatize every sentence in ``sentences``.

    Each sentence is split into tokens with ``word_tokenize``, every token
    is passed through ``lemmatize``, and the lemmas are joined back together
    with single spaces.

    Args:
        sentences: An iterable of strings.

    Returns:
        A list with one lemmatized string per input sentence, in input order.
    """
    def _lemmatize_sentence(text):
        # Tokenize, lemmatize token by token, then rebuild a flat string.
        return ' '.join(lemmatize(token) for token in word_tokenize(text))

    return [_lemmatize_sentence(text) for text in sentences]




### Initialize Training and Test Data



In [14]:
from sklearn.model_selection import train_test_split


%run ../include/util.ipynb

# /Users/dduru/PythonProjects/data/smsspamcollection/SMSSpamCollection
df = read_csv_frame(delimiter='\t', header=None)

print('Number of spam messages: %s' % df[df[0] == 'spam'][0].count())
print('Number of ham messages: %s\n\n' % df[df[0] == 'ham'][0].count())

X = df[1].values
Y = df[0].values

train_sentences, test_sentences, train_verdicts, test_verticts = train_test_split(X, Y)


# train_sentences = [
#     'I am happy to see you today',
#     'We were excited by the progress',
#     'She could not contain her joy',
#     'They were afraid of the clown',
#     'She died in car crash last week',
#     'We are not happy with the development',
#     'Always optimistic about the future',
#     'We want to do our best and succeed',
#     'Competition broke out and hurt our business',
#     'Keep working hard, things will work out',
#     'Despite all the hard work, we failed',
#     'We don\'t know the way forward and have no plan to continue'
# ]

# test_sentences = [
#     'Jolly good fellows came around today',
#     'We don\'t like the new arrange',
#     'People are skeptical and refuse to commit',
#     'Harry is confident in his chances of success',
#     'Lola broke her neck and cannot continue'
# ]

# train_verdicts = [
#     'positive',
#     'positive',
#     'positive',
#     'negative',
#     'negative', 
#     'negative', 
#     'positive',
#     'positive',
#     'negative',
#     'positive',
#     'negative',
#     'negative'
# ]

# test_verticts = [
#     'positive',
#     'negative',
#     'negative',
#     'positive', 
#     'negative',
# ]






Number of spam messages: 747
Number of ham messages: 4825




In [15]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer


binarizer = LabelBinarizer()
vectorizer = TfidfVectorizer()

# Lemmatize the raw text first so the TF-IDF vocabulary is built from lemmas.
lemmatized_train_sentences = lemmatize_corpus(train_sentences)
lemmatized_test_sentences = lemmatize_corpus(test_sentences)

# Fit the vectorizer on the training text only, then reuse the fitted
# vocabulary/weights for the test text.
x_train = vectorizer.fit_transform(lemmatized_train_sentences)
x_test = vectorizer.transform(lemmatized_test_sentences)

# For two-class labels LabelBinarizer produces an (n_samples, 1) column;
# flatten to 1-D, which is the target shape classifiers expect, to avoid
# scikit-learn's column-vector DataConversionWarning.
# NOTE(review): this assumes exactly two label classes - confirm if the
# label set ever changes.
y_train = binarizer.fit_transform(train_verdicts).ravel()
y_test = binarizer.transform(test_verticts).ravel()


# x_train = np.array(x_train)
# y_train = np.array(y_train)
# x_test = np.array(x_test)
# y_test = np.array(y_test)

In [16]:
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression


# Base learners for the stack; the random forest is seeded so repeated runs
# on the same split produce the same model.
estimators = [
    ('rf', RandomForestClassifier(n_estimators=3, random_state=42)),
    ('knn', KNeighborsClassifier(n_neighbors=3)),
    ('lr', LogisticRegression())
]

# The final estimator combines the base learners' predictions.
classifier = StackingClassifier(
    estimators = estimators, 
    final_estimator=LogisticRegression()
)

# .toarray() returns a plain ndarray. The previous .todense() returned
# np.matrix, which recent scikit-learn releases reject.
classifier.fit(x_train.toarray(), y_train)
print(classifier.score(x_test.toarray(), y_test))

# Interactive check: classify typed-in text until the sentinel '-1' is entered.
# Each text is echoed *before* its label so the two lines belong together
# (previously the echo appeared after the next prompt, so the first text was
# never echoed and the sentinel was).
while True:
    text = input('Enter text: ')
    if text == '-1':
        break
    print(f'Text: {text}')
    # Same preprocessing as training: lemmatize, then TF-IDF transform.
    features = vectorizer.transform(lemmatize_corpus([text])).toarray()
    lbl = binarizer.inverse_transform(classifier.predict(features))[0]
    print(f'Label: {lbl}')




0.9842067480258435
