In [714]:
import pandas as pd
import numpy as np
import string
from string import punctuation
from scipy import sparse
import nltk
from nltk.collocations import *
%matplotlib inline
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.corpus import brown
sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
from sklearn.feature_extraction.text import CountVectorizer  # used for the raw-count n-gram features
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score
from sklearn import cross_validation

# Reading & Preparing Data

In [715]:
# Load the labelled training questions and the unlabelled test questions.
# Both files are expected in the notebook's working directory.
# NOTE(review): low_memory=False asks pandas not to parse the file in chunks,
# which avoids mixed-dtype column inference — confirm it is still needed here.
training_data = pd.read_csv("newtrain.csv", low_memory=False)
testing_data = pd.read_csv("newtest.csv", low_memory=False)

In [716]:
# Randomize the row order of the training data.
# NOTE: no random seed is set, so the resulting order differs on every run.
random_index = np.random.permutation(training_data.index)
# .loc is the label-based replacement for the removed DataFrame.ix indexer;
# the index is then renumbered 0..n-1 without mutating the frame in place.
training_data = training_data.loc[random_index].reset_index(drop=True)

In [717]:
# Peek at the first rows of the training data to make sure nothing went horribly wrong.
training_data.head()

Unnamed: 0,Category,Text
0,7,if light travels faster than sound
1,3,what books would you like to see made into a m...
2,6,what type of mpenis do u think is the best?cut...
3,1,how do i delete search words?
4,6,what is philtrum?


# Features
### Combining Tf-idf & Count Vectorizers

In [718]:
# Hand-picked stopwords on top of the single letters added below.
my_stopwords = [
    'yahoo', 'best', 'know', 'what', 'how', "what's", 'why', "i'm", "xa", "would", 'and', 'or',
    "anyone", 'someone', 'help', 'think', 'find', 'want', 'one', 'the', 'to', 'is', 'of', 'so',
    'in', 'do', 'you', 'can', 'it', 'for', 'my', 'on', 'are', 'have', 'is the', 'that', 'with',
    'if', 'me', 'does', 'be', 'there', 'was', 'this', 'an', 'but', 'about', 'should', 'any',
    'am', 'has', 'just', 'anybody', 'somebody', 'had', 'not', 'some', 'except', 'these', 'those',
    'could', 'over', 'will',
]
# Every single lowercase letter is treated as a stopword as well.
stops = list(string.ascii_lowercase) + my_stopwords

# The tf-idf and the count vectorizer are configured identically:
# 1- to 3-grams, min_df of 2, the stopword list above, and a token pattern
# that also keeps one-character tokens.
shared_vectorizer_options = dict(
    ngram_range=(1, 3),
    min_df=2,
    stop_words=stops,
    token_pattern=r'\b\w+\b',
)
tfvec = TfidfVectorizer(**shared_vectorizer_options)
countVec = CountVectorizer(**shared_vectorizer_options)

### Adding more training data just for Category One
We experimented with adding more training data for category one only versus adding training data for all other categories, since the confusion matrix indicates that category one is where most errors occur. It improved the accuracy only slightly.

In [None]:
# Oversample category 1 by adding a second copy of each of its rows.
# NOTE(review): the copies keep their original index labels, re-running this
# cell adds yet another copy, and duplicated rows can end up on both sides of
# a cross-validation split — confirm this is intended.
categoryOne = training_data[training_data.Category == 1]
# pd.concat replaces DataFrame.append, which newer pandas releases no longer provide.
training_data = pd.concat([training_data, categoryOne])

###Apply Vectorizers and combine the features

In [719]:
# Each vectorizer is fitted on the training questions and then applied,
# unchanged, to the test questions. Dense copies are kept alongside the
# sparse matrices because the feature blocks are stacked with NumPy later.

# tf-idf features
arr_train_feature_sparse = tfvec.fit_transform(training_data.Text)
arr_test_feature_sparse = tfvec.transform(testing_data.Text)
arr_train_feature = arr_train_feature_sparse.toarray()
arr_test_feature = arr_test_feature_sparse.toarray()

# raw count features
arr_train_feature_sparse_Count = countVec.fit_transform(training_data.Text)
arr_test_feature_sparse_Count = countVec.transform(testing_data.Text)
arr_train_feature_Count = arr_train_feature_sparse_Count.toarray()
arr_test_feature_Count = arr_test_feature_sparse_Count.toarray()

# shown as the cell output: (test rows, count-feature columns)
arr_test_feature_Count.shape

(1874, 3463)

In [720]:
# Combining both vectorizers: tf-idf columns first, then the raw-count columns.
# The tf-idf block is rebuilt from its sparse matrix rather than read back from
# arr_train_feature / arr_test_feature, so re-running this cell cannot stack the
# count columns onto an already-combined matrix a second time.
arr_train_feature = np.hstack((arr_train_feature_sparse.toarray(), arr_train_feature_Count))
arr_test_feature = np.hstack((arr_test_feature_sparse.toarray(), arr_test_feature_Count))
arr_test_feature.shape

(1874, 6926)

## Additional Feature Experiments (We didn't end up using these features)
### Feature application method

In [25]:
def apply_feature(fn, train_texts=None, test_texts=None):
    """Apply a per-question feature function to the training and testing texts.

    Parameters
    ----------
    fn : callable
        Called once per question text; each return value becomes one row.
    train_texts, test_texts : iterable of str, optional
        Texts to featurize. When omitted, the notebook-level
        ``training_data.Text`` and ``testing_data.Text`` columns are used.

    Returns
    -------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``(train_feats, test_feats)``, each with one row per input text.
    """
    if train_texts is None:
        train_texts = training_data.Text
    if test_texts is None:
        test_texts = testing_data.Text

    def _featurize(texts):
        # Going through a DataFrame turns scalar results into an (n, 1)
        # column, which is the 2-D shape np.hstack needs downstream.
        return np.array(pd.DataFrame([fn(text) for text in texts]))

    return _featurize(train_texts), _featurize(test_texts)

### Number of words in question *(not used)*
tanks nb accuracy (did not end up using)

In [16]:
def countLength(text):
    """Return the number of whitespace-separated words in ``text``.

    The count is returned as an int, like the other counting features in
    this notebook, so that stacking it next to numeric feature columns keeps
    the combined matrix numeric instead of turning it into strings.
    """
    return len(text.split())

### Number of non-alpha characters in a question *(not used)*
kills nb accuracy (did not end up using)

In [17]:
def nonNumeric(string):
    """Count the characters of ``string`` that are not alphabetic.

    Anything for which ``str.isalpha()`` is false is counted, so spaces,
    digits and punctuation all contribute.
    """
    return sum(1 for symbol in string if not symbol.isalpha())

### Number of words longer than 9 characters in a question *(not used)*
maybe for logistic regression, but not nb (did not end up using)

In [18]:
def longWord(string):
    """Count the tokens of ``string`` that are longer than 9 characters.

    Tokens come from ``word_tokenize``.
    """
    long_tokens = [token for token in word_tokenize(string) if len(token) > 9]
    return len(long_tokens)

### Number of punctuation marks in a question *(not used)*
not effective (did not end up using)

In [206]:
def puncts(quest):
    """Return how many characters of ``quest`` appear in ``string.punctuation``."""
    # Build the lookup set once per call instead of once per character.
    marks = set(string.punctuation)
    return sum(1 for char in quest if char in marks)

### Length of longest word in a question *(not used)*
decreased accuracy for nb, same for lr (did not end up using)

In [207]:
def longest_word_length(quest):
    """Return the length of the longest token in ``quest``.

    Returns 0 when tokenizing ``quest`` yields no tokens (for example an
    empty string), where a bare ``max()`` would raise ``ValueError``.
    """
    token_lengths = [len(token) for token in word_tokenize(quest)]
    return max(token_lengths) if token_lengths else 0

###Number of verbs in a question, with stopwords removed *(not used)*
questionably effective (did not end up using)

In [19]:
# A part-of-speech tagger is needed for the verb/adjective/noun features below.
def build_backoff_tagger(train_sents):
    """Build a bigram tagger that backs off to a unigram tagger, then to a default.

    ``train_sents`` is passed to both the unigram and the bigram tagger.
    Tokens that neither tagger can label receive the tag ``'X'``.
    """
    fallback = nltk.DefaultTagger('X')
    unigram = nltk.UnigramTagger(train_sents, backoff=fallback)
    return nltk.BigramTagger(train_sents, backoff=unigram)
# hopefully these brown categories will be good enough; I spot checked some results
# Category ids must match brown.categories() exactly: the science-fiction id is
# spelled 'science_fiction' (with an underscore), and Brown has no 'novel'
# category, so that entry was dropped.
# NOTE(review): a list containing unknown ids appears to be accepted silently by
# NLTK, which is why the misspelling went unnoticed — confirm on your version.
brown_tagged_sents = brown.tagged_sents(tagset='universal',
                                        categories=['reviews', 'news', 'romance', 'adventure', 'fiction',
                                                    'hobbies', 'religion', 'science_fiction'])
tagger = build_backoff_tagger(brown_tagged_sents)

In [20]:
def num_verbs(quest):
    """Count the tokens of ``quest`` tagged ``'VERB'`` that are not English stopwords.

    The text is split into sentences, each sentence is tokenized and tagged
    with the notebook-level ``tagger``, and the matches are summed.
    The stopword comparison is case-sensitive.
    """
    # Build the stopword set once per call so each token check is a set lookup.
    english_stopwords = set(stopwords.words('english'))
    verb_count = 0
    for sentence in sent_tokenizer.tokenize(quest):
        tagged = tagger.tag(word_tokenize(sentence))
        verb_count += sum(1 for word, tag in tagged
                          if tag == 'VERB' and word not in english_stopwords)
    return verb_count

###Number of adjectives in a question, with stopwords removed *(not used)*
nothing or worse :( (did not end up using)

In [21]:
def num_adjs(quest):
    """Count the tokens of ``quest`` tagged ``'ADJ'`` that are not English stopwords.

    The text is split into sentences, each sentence is tokenized and tagged
    with the notebook-level ``tagger``, and the matches are summed.
    The stopword comparison is case-sensitive.
    """
    # Build the stopword set once per call so each token check is a set lookup.
    english_stopwords = set(stopwords.words('english'))
    adj_count = 0
    for sentence in sent_tokenizer.tokenize(quest):
        tagged = tagger.tag(word_tokenize(sentence))
        adj_count += sum(1 for word, tag in tagged
                         if tag == 'ADJ' and word not in english_stopwords)
    return adj_count

### Percentage of adjectives (with stopwords) in a question
eh, doesn't do much, might include this (did not end up using)

In [22]:
def percent_adjs(quest):
    """Return the share of tokens in ``quest`` that are tagged ``'ADJ'``.

    Stopwords are not removed. Despite the name, the result is a fraction
    between 0 and 1 rather than a percentage. Returns 0.0 when the text
    yields no tokens.
    """
    adj_count = 0
    word_count = 0
    for sentence in sent_tokenizer.tokenize(quest):
        tagged = tagger.tag(word_tokenize(sentence))
        adj_count += sum(1 for _, tag in tagged if tag == 'ADJ')
        word_count += len(tagged)
    if word_count == 0:
        # No tokens at all (e.g. an empty question): avoid ZeroDivisionError.
        return 0.0
    # float() keeps this a true division under Python 2 as well.
    return adj_count / float(word_count)

###Percentage of verbs (with stopwords) in a question
helps a little on nb (did not end up using)

In [23]:
def percent_verbs(quest):
    """Return the share of tokens in ``quest`` that are tagged ``'VERB'``.

    Stopwords are not removed. Despite the name, the result is a fraction
    between 0 and 1 rather than a percentage. Returns 0.0 when the text
    yields no tokens.
    """
    verb_count = 0
    word_count = 0
    for sentence in sent_tokenizer.tokenize(quest):
        tagged = tagger.tag(word_tokenize(sentence))
        verb_count += sum(1 for _, tag in tagged if tag == 'VERB')
        word_count += len(tagged)
    if word_count == 0:
        # No tokens at all (e.g. an empty question): avoid ZeroDivisionError.
        return 0.0
    # float() keeps this a true division under Python 2 as well.
    return verb_count / float(word_count)

### Percentage of nouns (with stopwords) in a question *(not in use)*
don't use (did not end up using)

In [213]:
def percent_nouns(quest):
    """Return the share of tokens in ``quest`` that are tagged ``'NOUN'``.

    Stopwords are not removed. Despite the name, the result is a fraction
    between 0 and 1 rather than a percentage. Returns 0.0 when the text
    yields no tokens.
    """
    noun_count = 0
    word_count = 0
    for sentence in sent_tokenizer.tokenize(quest):
        tagged = tagger.tag(word_tokenize(sentence))
        noun_count += sum(1 for _, tag in tagged if tag == 'NOUN')
        word_count += len(tagged)
    if word_count == 0:
        # No tokens at all (e.g. an empty question): avoid ZeroDivisionError.
        return 0.0
    # float() keeps this a true division under Python 2 as well.
    return noun_count / float(word_count)

##Apply Selected Features Here

In [26]:
# Compute the selected hand-crafted feature for the training and testing questions.
train_feats, test_feats = apply_feature(percent_verbs)

# Attach the new column(s) to the right-hand side of the existing feature matrices.
# NOTE: this cell extends the matrices it reads, so running it twice adds the
# feature twice.
arr_train_feature = np.concatenate((arr_train_feature, train_feats), axis=1)
arr_test_feature = np.concatenate((arr_test_feature, test_feats), axis=1)

## Feature View/Debugging

In [706]:
# Sanity check: (rows, columns) of the training feature matrix used for modelling.
arr_train_feature.shape

(2698, 6926)

#Testing

In [707]:
def crossvalidation(func, arr_train_feature, predictions_train, cv=10):
    """Cross-validate an estimator and print a one-line score summary.

    Parameters
    ----------
    func : estimator
        Forwarded to ``cross_validation.cross_val_score``.
    arr_train_feature : array-like
        Feature matrix.
    predictions_train : array-like
        Target labels, one per row of ``arr_train_feature``.
    cv : int, optional
        Forwarded as the ``cv`` argument (default 10).

    Prints the mean score and twice the standard deviation of the per-fold
    scores; returns nothing.
    """
    fold_scores = cross_validation.cross_val_score(func, arr_train_feature, predictions_train, cv=cv)
    mean_score = fold_scores.mean()
    spread = fold_scores.std() * 2
    print("Accuracy: %0.3f (+/- %0.3f)" % (mean_score, spread))

In [708]:
# Multinomial naive Bayes: fit on the full training matrix, predict the test
# set, then report the cross-validated score on the training data.
nb = MultinomialNB()
nb_model = nb.fit(X=arr_train_feature, y=training_data.Category)
nb_predictions = nb_model.predict(X=arr_test_feature)
crossvalidation(
    func=nb_model,
    arr_train_feature=arr_train_feature,
    predictions_train=training_data.Category,
)

Accuracy: 0.563 (+/- 0.044)


In [709]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings, evaluated the same way as the
# naive Bayes model: fit, predict the test set, then cross-validate.
logreg = LogisticRegression()
logreg_model = logreg.fit(X=arr_train_feature, y=training_data.Category)
log_predictions = logreg_model.predict(X=arr_test_feature)
crossvalidation(
    func=logreg_model,
    arr_train_feature=arr_train_feature,
    predictions_train=training_data.Category,
)

Accuracy: 0.537 (+/- 0.046)


#Output

In [688]:
# First half of the submission: the test-set ids under an "ID" column.
# The naive Bayes predictions are the ones that get submitted.
_id = testing_data['Id']
_category = nb_predictions
final_d = dict(ID=_id)
final_df = pd.DataFrame(final_d)

In [689]:
# Second half of the submission: the predicted labels under a "Category" column.
final_d2 = dict(Category=_category)
final_df2 = pd.DataFrame(final_d2)

In [691]:
# Place the ID and Category frames side by side (axis=1 concatenates column-wise,
# aligning rows on the index).
df_new = pd.concat([final_df, final_df2], axis = 1)

In [692]:
# Check the first rows of the submission frame before writing it out.
df_new.head()

Unnamed: 0,ID,Category
0,1,4
1,2,4
2,3,1
3,4,2
4,5,2


In [693]:
# Write the submission to disk; index=False leaves the row index out of the file.
df_new.to_csv("result8.csv", index=False)