# Kaggle-based Text Classification


In [1]:
import nltk
from urllib.request import urlopen
import urllib.parse
import string
from nltk.util import ngrams
import re
from nltk.corpus import brown
from nltk.stem.wordnet import WordNetLemmatizer
import random
from nltk.collocations import *

import pandas as pd
# import cPickle as pickle
import matplotlib.pyplot as plt
# import seaborn as sns
import numpy as np
from pandas.tools.plotting import scatter_matrix
from ast import literal_eval 
from collections import defaultdict
import datetime

%matplotlib inline

In [2]:
# Load the pipe-delimited training file and pair each review text with its label.
raw_data = pd.read_csv('yelp_data_official_training.csv', sep='|', low_memory=False)
reviews_tmp = list(raw_data['Review Text'])
categories = list(raw_data['Category'])
data_list = list(zip(reviews_tmp, categories))

# Seed the shuffle: the train/dev split is later taken from the order of
# data_list, so an unseeded shuffle makes the reported accuracy change on
# every Restart & Run All.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
random.shuffle(data_list)
data_list[0]

('All I can say is I love this place!  Everyone here is absolutely amazing.  The entire staff is friendly, caring and treat you as if you and your furry friend are like extended family.  \nI recently moved about 35 miles away from here and I would NEVER change vets.  I still drive here every week.  \nIf you ever need an honest and caring vet this is the place to come.',
 5)

In [51]:
def tokenize_text(corpus):
    '''Break a text into sentences, then each sentence into word tokens.

    Every U+FEFF character is removed from ``corpus`` before the Punkt
    English sentence tokenizer is applied. The result is a list with one
    entry per sentence, each entry being the token list produced by
    ``nltk.word_tokenize`` for that sentence.
    '''
    punkt = nltk.data.load('tokenizers/punkt/english.pickle')
    cleaned_text = corpus.replace("\ufeff", "")
    tokenized_sentences = []
    for sentence in punkt.tokenize(cleaned_text):
        tokenized_sentences.append(nltk.word_tokenize(str(sentence)))
    return tokenized_sentences


def build_stop_words(path="SmartStoplist.txt"):
    '''Load the SMART stop-word list (Salton, 1971), one word per line.

    The list is available at ftp://ftp.cs.cornell.edu/pub/smart/english.stop

    Args:
        path: File to read. Defaults to "SmartStoplist.txt", the file the
            zero-argument call has always used.

    Returns:
        A list of the file's lines with surrounding whitespace removed and
        blank lines skipped. If the file cannot be read, a message is
        printed and an empty list is returned, so callers simply filter
        nothing.
    '''
    try:
        with open(path, 'r') as fp:
            stripped_lines = (line.strip() for line in fp)
            return [word for word in stripped_lines if word]
    except (OSError, UnicodeDecodeError):
        # Only read failures are treated as "no stop words"; anything else
        # (including KeyboardInterrupt) is allowed to propagate.
        print("Can't open specified file: {0}".format(path))
        return []

def extract_unigram(sents):
    '''Extract unigram candidates from tokenized sentences and prune them.

    Args:
        sents: Iterable of sentences, each a sequence of token strings.

    Returns:
        List of the surviving tokens, in input order, each lemmatized as a
        verb (pos 'v'). A token is pruned when the start of it matches the
        pruning pattern, when it is found in ``string.punctuation``, when it
        is in the list returned by build_stop_words(), or when it is shorter
        than two characters.
    '''
    lm = WordNetLemmatizer()
    unigram_raw_candidates = []
    for sent in sents:
        unigram_raw_candidates += list(ngrams(sent, 1))

    # The stop list is consulted once per token, so use a set for O(1)
    # membership instead of scanning the list every time.
    stopwords = set(build_stop_words())

    # .match() anchors at the start of the token: this rejects tokens that
    # begin with digits, quotes, backticks or dots, tokens starting with a
    # capital letter, and tokens beginning with word-apostrophe-word.
    unigram_pattern = re.compile(r"\d+|\'+|\`+|^[A-Z]\w*|\w+\'\w+|\.+")

    unigram_candidates = []
    for element in unigram_raw_candidates:
        token = element[0]
        if (unigram_pattern.match(token)
                or token in string.punctuation
                or token in stopwords
                or len(token) < 2):
            continue
        unigram_candidates.append(lm.lemmatize(token, 'v'))
    return unigram_candidates

def extract_bigram(sents):
    '''Return the space-joined bigrams of every sentence in ``sents``.

    A bigram is left out when its joined text matches the punctuation
    pattern below (the text contains at least one of the listed
    punctuation characters).
    '''
    punctuation_pattern = r".*[\!\"\#\$\%\&\'\(\)\*\+,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\`\{\|\}\~]+.*"

    # Collect all raw pairs first, then filter on the joined text.
    raw_pairs = [pair for sent in sents for pair in ngrams(sent, 2)]
    joined_pairs = [" ".join(pair) for pair in raw_pairs]
    return [text for text in joined_pairs if not re.match(punctuation_pattern, text)]


def build_backoff_tagger(train_sents):
    '''Build a bigram tagger that backs off to a unigram tagger, which in
    turn backs off to a default tagger assigning 'NN'.

    Both n-gram taggers are trained on ``train_sents``.
    '''
    tagger_chain = nltk.DefaultTagger('NN')
    for tagger_class in (nltk.UnigramTagger, nltk.BigramTagger):
        tagger_chain = tagger_class(train_sents, backoff=tagger_chain)
    return tagger_chain

def train_tagger(already_tagged_sents):
    '''Train and return the backoff tagger on pre-tagged sentences.

    No evaluation is performed here; the tagger from build_backoff_tagger
    is returned as-is.
    '''
    return build_backoff_tagger(already_tagged_sents)

def clean_output(candidates):
    '''Turn (keyphrase, count) pairs into a list of cleaned keyphrase strings.

    Each keyphrase is split on whitespace, every second token (starting
    with the first) is kept, and the characters "(", "'" and "," are removed
    from the kept tokens before they are re-joined with single spaces.
    Presumably the input keyphrases are stringified (word, tag) tuples, so
    this keeps the words and drops the tags - confirm against the caller.

    Args:
        candidates: Iterable of (keyphrase, count) pairs. The counts are
            ignored and no length limit is applied.

    Returns:
        List of cleaned keyphrases, in input order.
    '''
    keyphrases = []
    for keyphrase, _count in candidates:
        kept_tokens = keyphrase.split()[0::2]
        cleaned = [token.replace("(", "").replace("'", "").replace(",", "") for token in kept_tokens]
        keyphrases.append(" ".join(cleaned))
    return keyphrases

# Train the backoff POS tagger on the Brown corpus categories listed below.
brown_training_categories = [
    'adventure', 'belles_lettres', 'editorial', 'fiction', 'government',
    'hobbies', 'humor', 'learned', 'lore', 'mystery', 'religion', 'reviews',
    'romance', 'science_fiction',
]
brown_tagged_sents = brown.tagged_sents(categories=brown_training_categories)

tagger = train_tagger(brown_tagged_sents)

### Generate features

In [53]:
# unigram feature
def review_features1(review):
    '''Unigram count features for one review.

    Args:
        review: Raw review text.

    Returns:
        Dict mapping each unigram kept by extract_unigram to its count in
        the review. If extraction raises (for example because ``review`` is
        not a string), a message is printed and an empty dict is returned.
    '''
    feature_dict = {}
    try:
        words = extract_unigram(tokenize_text(review))
        feature_dict = dict(nltk.FreqDist(words).most_common())
    except Exception:
        # Best effort per review. Catching Exception rather than using a bare
        # except lets KeyboardInterrupt still stop a long feature run.
        print("Error occurred for review ")
    return feature_dict

# unigram feature by Nihar
def review_features2(review):
    '''Unigram count features using a regexp tokenizer and NLTK stop words.

    The review is tokenized with the verbose pattern below. Tokens longer
    than two characters whose lower-cased form is not an NLTK English stop
    word are lower-cased and lemmatized, then counted.

    Args:
        review: Raw review text.

    Returns:
        Dict mapping each lemmatized word to its count. If processing
        raises, a message is printed and an empty dict is returned.
    '''
    # Bound before the try so the return below can never hit an unbound name.
    feature_dict = {}
    try:
        pattern = r'''(?x)          # set flag to allow verbose regexps
            (?:[A-Z]\.)+        # abbreviations, e.g. U.S.A.
             | \w+(?:['-]\w+)*        # words with optional internal hyphens
             | \$?\d+(?:\.\d+)?%?  # currency and percentages, e.g. $12.40, 82%
             | \.\.\.              # ellipsis
             | [][.,;"'?():_`-]    # these are separate tokens; includes ], ['''
        words = nltk.regexp_tokenize(review, pattern)
        # A set gives O(1) membership for the per-token stop-word test.
        stopwords = set(nltk.corpus.stopwords.words('english'))
        norm_words = [word.lower() for word in words if len(word) > 2 and word.lower() not in stopwords]
        wnl = nltk.WordNetLemmatizer()
        lemma_words = [wnl.lemmatize(t) for t in norm_words]
        feature_dict = dict(nltk.FreqDist(lemma_words).most_common())
    except Exception:
        # Best effort per review. Catching Exception rather than using a bare
        # except lets KeyboardInterrupt still stop a long feature run.
        print("Error occurred for review ")
    return feature_dict

# keyphrase chunking feature
def review_features3(review):
    '''Keyphrase-chunk count features for one review.

    Each sentence is lemmatized, lower-cased, tagged with the module-level
    ``tagger`` and chunked with the 'DK' grammar below. Every 'DK' chunk
    becomes a keyphrase made of the chunk's words.

    Args:
        review: Raw review text.

    Returns:
        Dict mapping each keyphrase to the number of times it was chunked.
        If processing raises, a message is printed and the features
        collected so far (normally none) are returned.
    '''
    feature_dict = {}
    # Lemmatize words. Without it, top keyphrases consist of phrases with the
    # same structure but different word forms.
    lm = WordNetLemmatizer()
    try:
        sents = tokenize_text(review)
        # Note: lemmatization runs before lower-casing.
        sents_stemmed = [[lm.lemmatize(word).lower() for word in sent] for sent in sents]
        txt_tagged = [tagger.tag(sent) for sent in sents_stemmed]

        # descriptive keyphrases pattern
        dkp = nltk.RegexpParser('DK: {(<JJ>|<NN>)+(<NN>|<IN>)|<NN>}')

        descriptive_keyphrases = []
        for sent in txt_tagged:
            for subtree in dkp.parse(sent).subtrees():
                if subtree.label() == 'DK':
                    # Each chunk element is stringified as "('word', 'TAG')".
                    descriptive_keyphrases.append(" ".join(str(e) for e in subtree[0:]))

        for (keyphrase, count) in nltk.FreqDist(descriptive_keyphrases).most_common():
            # After splitting on whitespace, every second token is the word
            # part of a stringified tuple; strip the tuple punctuation.
            word_tokens = keyphrase.split()[0::2]
            kp = " ".join(token.replace("(", "").replace("'", "").replace(",", "") for token in word_tokens)
            feature_dict[kp] = count
    except Exception:
        # Best effort per review. Catching Exception rather than using a bare
        # except lets KeyboardInterrupt still stop a long feature run.
        print("Error occurred for review ")
    return feature_dict

# collocation feature
def review_features4(review):
    '''Collocation (tagged bigram) features for one review.

    Each sentence is lemmatized, lower-cased and tagged with the
    module-level ``tagger``; the bigrams returned by a per-sentence
    BigramCollocationFinder (up to 10000, ranked by PMI) are pooled and
    counted across sentences. A bigram is kept when its second word's tag
    starts with 'NN', its first word is not found in ``string.punctuation``,
    and neither word is in the build_stop_words() list.

    Args:
        review: Raw review text.

    Returns:
        Dict mapping "first second" to the pooled count of that tagged
        bigram. If processing raises, a message is printed and the features
        collected so far (normally none) are returned.
    '''
    feature_dict = {}
    try:
        sents = tokenize_text(review)
        bigram_measures = nltk.collocations.BigramAssocMeasures()

        lm = WordNetLemmatizer()
        tagged_bigrams = []
        for sent in sents:
            # Note: lemmatization runs before lower-casing.
            lemmas = [lm.lemmatize(word).lower() for word in sent]
            # tag collocation words with trained brown tagger
            finder = BigramCollocationFinder.from_words(tagger.tag(lemmas))
            tagged_bigrams += finder.nbest(bigram_measures.pmi, 10000)

        stopwords = set(build_stop_words())
        for bigram, count in nltk.FreqDist(tagged_bigrams).most_common():
            (first_word, _first_tag), (second_word, second_tag) = bigram
            # keep only bigrams that end in a noun
            if not re.match(r'NN.*', second_tag):
                continue
            # filter out punctuation, stopwords
            if first_word in string.punctuation or first_word in stopwords or second_word in stopwords:
                continue
            feature_dict[first_word + " " + second_word] = count
    except Exception:
        # Best effort per review. Catching Exception rather than using a bare
        # except lets KeyboardInterrupt still stop a long feature run.
        print("Error occurred for review")
    return feature_dict

# bigram feature
def review_features5(review):
    '''Bigram count features for one review.

    Args:
        review: Raw review text.

    Returns:
        Dict mapping each bigram kept by extract_bigram to its count in the
        review. If extraction raises (for example because ``review`` is not
        a string), a message is printed and an empty dict is returned.
    '''
    feature_dict = {}
    try:
        words = extract_bigram(tokenize_text(review))
        feature_dict = dict(nltk.FreqDist(words).most_common())
    except Exception:
        # Best effort per review. Catching Exception rather than using a bare
        # except lets KeyboardInterrupt still stop a long feature run.
        print("Biagram feature error.")
    return feature_dict



def review_features(review):
    '''Combined feature dict for a review.

    Starts from the unigram counts of review_features1 and overlays the
    bigram counts of review_features5; on a key clash the bigram entry
    wins. Collocation features (review_features4) are not included.
    '''
    unigram_counts = review_features1(review)
    bigram_counts = review_features5(review)
    combined = dict(unigram_counts)
    combined.update(bigram_counts)
    return combined

# review_features5("This is what I am going to test. This is a descriptive sentence.")

{'I am': 1,
 'This is': 2,
 'a descriptive': 1,
 'am going': 1,
 'descriptive sentence': 1,
 'going to': 1,
 'is a': 1,
 'is what': 1,
 'to test': 1,
 'what I': 1}

In [63]:
# Preview: unigram-only feature dicts paired with their labels.
featuresets = [(review_features1(text), label) for text, label in data_list]
featuresets[:10]

Error occurred for review 
Error occurred for review 


[({'absolutely': 1,
   'amaze': 1,
   'care': 2,
   'change': 1,
   'drive': 1,
   'entire': 1,
   'extend': 1,
   'family': 1,
   'friend': 1,
   'friendly': 1,
   'furry': 1,
   'honest': 1,
   'love': 1,
   'miles': 1,
   'move': 1,
   'place': 2,
   'recently': 1,
   'staff': 1,
   'treat': 1,
   'vet': 2,
   'week': 1},
  5),
 ({'absolutely': 1,
   'amaze': 1,
   'amount': 1,
   'ca': 1,
   'careful': 1,
   'choose': 1,
   'clean': 1,
   'comfortable': 1,
   'courteous': 1,
   'dermatologists': 1,
   'drop': 1,
   'experience': 1,
   'expertise': 1,
   'facility': 1,
   'feel': 2,
   'felt': 1,
   'good': 1,
   'great': 1,
   'group': 1,
   'hand': 1,
   'health': 1,
   'high': 1,
   'knowledgeable': 1,
   'level': 1,
   'mother': 1,
   'pick': 1,
   'prior': 1,
   'professional': 2,
   'quality': 1,
   'receive': 1,
   'recommendation': 1,
   'referrals': 1,
   'research': 1,
   'service': 2,
   'situations': 1,
   'slack': 1,
   'staff': 1,
   'start': 2,
   'trust': 1,
   'unco

In [64]:
def create_training_sets(feature_function, items, return_items=False, train_fraction=0.75):
    '''Featurize labelled items and split them, in order, into train and dev sets.

    Args:
        feature_function: Callable applied to the first element of each item
            to produce its feature dict.
        items: Sequence of (key, value) pairs; the value is kept as the label.
            The sequence is split by position, no shuffling happens here.
        return_items: When true, also return the raw items of each split.
        train_fraction: Share of the items placed in the training set.
            Defaults to 0.75, the split this function has always used.

    Returns:
        (train_set, dev_set), or (train_set, dev_set, train_items, dev_items)
        when ``return_items`` is true.

    Raises:
        ValueError: If ``train_fraction`` is not between 0 and 1.
    '''
    if not 0 <= train_fraction <= 1:
        raise ValueError("train_fraction must be between 0 and 1")

    # Create the feature sets by calling the function that was passed in.
    featuresets = [(feature_function(key), value) for (key, value) in items]

    # The first train_fraction of the items train the model; the rest are dev.
    split = int(len(featuresets) * train_fraction)

    train_set, dev_set = featuresets[:split], featuresets[split:]
    if return_items:
        return train_set, dev_set, items[:split], items[split:]
    return train_set, dev_set

In [65]:
# Build train/dev feature sets (unigram + bigram counts) from the shuffled reviews.
train_set, dev_set = create_training_sets(review_features, data_list)

Error occurred for review 
Biagram feature error.
Error occurred for review 
Biagram feature error.


In [62]:
# Fit Naive Bayes on the training features, then report accuracy on the dev set.
cl = nltk.NaiveBayesClassifier.train(train_set)
dev_accuracy = nltk.classify.accuracy(cl, dev_set)
print("%.3f" % dev_accuracy)

0.669


In [71]:
# Load the unlabelled, pipe-delimited test file.
test_data = pd.read_csv(
    'yelp_data_official_test_nocategories.csv',
    sep='|',
    low_memory=False,
)

In [72]:
# Pair every test ID with its review text, in file order.
reviews_test = list(test_data['Review Text'])
id_test = list(test_data['ID'])
test_list = list(zip(id_test, reviews_test))

In [73]:
# Classify each test review and write one "Id,Category" row per review.
with open("yelp_data_official_test_submission_leon.csv", "w") as f:
    f.write("Id,Category\n")
    for review_id, review_text in test_list:
        predicted = cl.classify(review_features(review_text))
        row = ",".join((str(review_id), str(predicted)))
        f.write(row + "\n")