In [1]:
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble
#!pip install textblob

import pandas as pd
import pandas, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

Using TensorFlow backend.


In [2]:
# load the dataset
trainDF = pd.read_csv('group_data.csv')
trainDF.dropna(axis=1, how='all')
trainDF.head()

Unnamed: 0,name,class,description
0,Clothing Swap And Events,0,"Clothing Swaps are fashionable, fun and philan..."
1,Santa Cruz Women's Clothing Swap,0,"Meet with locals in a women-only, relaxed and ..."
2,San Francisco Organic + Plant-Based Food Meetup,0,"Meet people who are interested in organic, raw..."
3,African Heritage Resources Network,0,"Connections, Innovation, Transformation, Total..."
4,Craft Labs,0,Get your creative juices flowing at Craft Labs...


In [3]:
# split the dataset into training and validation datasets 
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['description'], trainDF['class'])
# label encode the target variable 
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y.values.astype('U'))
valid_y = encoder.fit_transform(valid_y.values.astype('U'))

#User data 
user_Data = pd.read_csv('user_data.csv', encoding= 'utf-8', )
user_x = user_Data['text']


In [4]:
# create a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['description'].values.astype('U'))

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x.values.astype('U'))
xvalid_count =  count_vect.transform(valid_x.values.astype('U'))
#User 
xuser_count =  count_vect.transform(user_x.values.astype('U'))

In [5]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['description'].values.astype('U'))
xtrain_tfidf =  tfidf_vect.transform(train_x.values.astype('U'))
xvalid_tfidf =  tfidf_vect.transform(valid_x.values.astype('U'))


In [6]:
# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['description'].values.astype('U'))
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x.values.astype('U'))
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x.values.astype('U'))

In [7]:
# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(trainDF['description'].values.astype('U'))
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x.values.astype('U')) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x.values.astype('U')) 

In [8]:
# load the pre-trained word-embedding vectors 
embeddings_index = {}
for i, line in enumerate(open('wiki-news-300d-1M.vec/wiki-news-300d-1M.vec',encoding="utf8")):
    values = line.split()
    embeddings_index[values[0]] = numpy.asarray(values[1:], dtype='float32')

In [9]:
# create a tokenizer 
token = text.Tokenizer()
token.fit_on_texts(trainDF['description'].values.astype('U'))
word_index = token.word_index


In [10]:
# convert text to sequence of tokens and pad them to ensure equal length vectors 
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_x.values.astype('U')), maxlen=70)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(valid_x.values.astype('U')), maxlen=70)

# create token-embedding mapping
embedding_matrix = numpy.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [11]:
trainDF['char_count'] = trainDF['description'].apply(str).apply(len)

trainDF['word_count'] = trainDF['description'].apply(str).apply(lambda x: len(x.split()))
trainDF['word_density'] = trainDF['char_count'] / (trainDF['word_count']+1)
trainDF['punctuation_count'] = trainDF['description'].apply(str).apply(lambda x: len("".join(_ for _ in x if _ in string.punctuation))) 
trainDF['title_word_count'] = trainDF['description'].apply(str).apply(lambda x: len([wrd for wrd in x.split() if wrd.istitle()]))
trainDF['upper_case_word_count'] = trainDF['description'].apply(str).apply(lambda x: len([wrd for wrd in x.split() if wrd.isupper()]))

In [12]:
pos_family = {'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}

# function to check and get the part of speech tag count of a words in a given sentence
def check_pos_tag(x, flag):
    cnt = 0
    try:
        wiki = textblob.TextBlob(x)
        for tup in wiki.tags:
            ppo = list(tup)[1]
            if ppo in pos_family[flag]:
                cnt += 1
    except:
        pass
    return cnt

trainDF['noun_count'] = trainDF['description'].apply(lambda x: check_pos_tag(x, 'noun'))
trainDF['verb_count'] = trainDF['description'].apply(lambda x: check_pos_tag(x, 'verb'))
trainDF['adj_count'] = trainDF['description'].apply(lambda x: check_pos_tag(x, 'adj'))
trainDF['adv_count'] = trainDF['description'].apply(lambda x: check_pos_tag(x, 'adv'))
trainDF['pron_count'] = trainDF['description'].apply(lambda x: check_pos_tag(x, 'pron'))

In [13]:
# train a LDA Model
lda_model = decomposition.LatentDirichletAllocation(n_components=20, learning_method='online', max_iter=20)
X_topics = lda_model.fit_transform(xtrain_count)
topic_word = lda_model.components_ 
vocab = count_vect.get_feature_names()

# view the topic models
n_top_words = 4
topic_summaries = []
for i, topic_dist in enumerate(topic_word):
    topic_words = numpy.array(vocab)[numpy.argsort(topic_dist)][:-(n_top_words+1):-1]
    topic_summaries.append(' '.join(topic_words))

In [18]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    #return metrics.accuracy_score(predictions, valid_y),predictions, valid_y , classifier
    return metrics.accuracy_score(predictions, valid_y)

In [19]:
# Naive Bayes on Count Vectors
accuracy= train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)
print ("NB, Count Vectors: ", accuracy)
# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("NB, WordLevel TF-IDF: ", accuracy)

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("NB, N-Gram Vectors: ", accuracy)

# Naive Bayes on Character Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print ("NB, CharLevel Vectors: ", accuracy)


NB, Count Vectors:  0.701943844492
NB, WordLevel TF-IDF:  0.596112311015
NB, N-Gram Vectors:  0.55939524838
NB, CharLevel Vectors:  0.548596112311


In [16]:
# Confusion Matrix Naive Bayes
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
confusionMatrix = confusion_matrix(valid_y_NBcv, predictions_NBcv)
print(precision_score(valid_y_NBcv, predictions_NBcv, average="macro"))
print(recall_score(valid_y_NBcv, predictions_NBcv, average="macro"))
print (confusionMatrix)


NameError: name 'valid_y_NBcv' is not defined

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import itertools
def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Compute confusion matrix
cnf_matrix = confusion_matrix(valid_y,predictions)
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=[0,1,2],
                      title='Confusion matrix, Multinomial Naive Bayes')
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV
import numpy as np
grid = GridSearchCV(estimator=naive_bayes.MultinomialNB(), param_grid=dict(alpha=np.array([1,0.1,0.01,0.001,1.5,0])))


In [20]:
# Linear Classifier on Count Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count)
print ("LR, Count Vectors: ", accuracy)
# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("LR, WordLevel TF-IDF: ", accuracy)

# Linear Classifier on Ngram Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("LR, N-Gram Vectors: ", accuracy)

# Linear Classifier on Character Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print ("LR, CharLevel Vectors: ", accuracy)

LR, Count Vectors:  0.61555075594
LR, WordLevel TF-IDF:  0.66090712743
LR, N-Gram Vectors:  0.555075593952
LR, CharLevel Vectors:  0.624190064795


In [21]:
# SVM on Ngram Level TF IDF Vectors

accuracy = train_model(svm.LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=0, tol=0.0001,
     verbose=0), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("SVM, N-Gram Vectors: ", accuracy)

SVM, N-Gram Vectors:  0.5313174946


In [22]:
# Linear Classifier on Count Vectors

accuracy = train_model(svm.SVC(), xtrain_count, train_y, xvalid_count)
print ("SVM, Count Vectors: ", accuracy)
# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf, train_y, xvalid_tfidf)
print ("SVM, WordLevel TF-IDF: ", accuracy)

# Linear Classifier on Ngram Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print ("SVM, N-Gram Vectors: ", accuracy)

# Linear Classifier on Character Level TF IDF Vectors
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print ("SVM, CharLevel Vectors: ", accuracy)

SVM, Count Vectors:  0.447084233261
SVM, WordLevel TF-IDF:  0.447084233261
SVM, N-Gram Vectors:  0.447084233261
SVM, CharLevel Vectors:  0.447084233261


In [198]:
from sklearn.externals import joblib
joblib.dump(classifier, 'finalized_model.pkl') 

['finalized_model.pkl']

In [233]:
prediction_nb = classifier.predict(xuser_count)

In [234]:
import collections
counter=collections.Counter(prediction_nb)
print(counter)
# Counter({1: 4, 2: 4, 3: 2, 5: 2, 4: 1})
print(counter.values())
# [4, 4, 2, 1, 2]
print(counter.keys())

Counter({0: 108, 1: 85, 2: 7})
dict_values([7, 85, 108])
dict_keys([2, 1, 0])


In [235]:
len(prediction_nb)

200