In [1]:
import pandas as pd
import numpy as np
import os, glob 

os.chdir('/Users/daniellee/Desktop/Kaggle/data/stackoverflow_data/')

train = pd.read_csv('cleaned/topic_model_df_train.csv'); train.head(3)
test = pd.read_csv('cleaned/topic_model_df_test.csv'); train.head(3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,id,title,content,tags,category,combined,content_size,title_size,tags_size,TitleNTags,ContentNTags,title_pred,content_pred,combined_pred,title_nouns,content_nouns
0,0,0,0,1,criticality ribosome binding site relative sta...,prokaryotic translation critical efficient tra...,"['ribosome', 'binding-sites', 'translation', '...",biology,criticality ribosome binding site relative sta...,24,9,4,2,2,criticality ribosome prokaryotic codon,translation 7b observable prokaryotic,translation prokaryotic ribosome codon,criticality ribosome binding site relative sta...,prokaryotic translation critical efficient tra...
1,1,1,1,2,rnase contamination rna based experiments prev...,anyone suggestions prevent rnase contamination...,"['rna', 'biochemistry']",biology,rnase contamination rna based experiments prev...,21,6,2,1,1,prevented rnase contamination experiments,rnase depc pipette degradation,rnase contamination rna depc,rnase contamination rna based experiment preve...,doe anyone suggestion prevent rnase contaminat...
2,2,2,2,3,lymphocyte sizes clustered two groups,tortora writes principles anatomy physiology l...,"['immunology', 'cell-biology', 'hematology']",biology,lymphocyte sizes clustered two groups tortora ...,31,5,3,0,0,clustered lymphocyte groups sizes,lymphocytes diameter tortora 14,lymphocytes clustered groups sizes,lymphocyte size clustered group,tortora principle anatomy physiology lymphocyt...


## Create Train and Test Partitions of Data

In [2]:
from sklearn.cross_validation import train_test_split

partitions = train_test_split(train.drop('category',axis=1), train['category'])

os.chdir('/Users/daniellee/Desktop/Kaggle/data/stackoverflow_data/experiment_test_set/')
for name, part in zip(['trainX', 'testX', 'trainy', 'testy'], partitions):
    part.to_csv(name+'.csv')

In [3]:
trainX, testX, trainy, testy = [df.reset_index() for df in partitions] 
trainX = trainX.ix[:,5:]
testX = testX.ix[:,5:]

## Perform Text Cleaning 

### Remove NA values

In [4]:
# Remove blanks
trainX['content'] = trainX['content'].fillna('')
testX['content'] = testX['content'].fillna('')

trainBlankLoc = trainX['content'] == ''
testBlankLoc = testX['content'] == ''
 
trainX.ix[trainBlankLoc, 'combined'] = trainX.ix[trainBlankLoc, 'title'] + trainX.ix[trainBlankLoc, 'content']
testX.ix[testBlankLoc, 'combined'] = testX.ix[testBlankLoc, 'title'] + testX.ix[testBlankLoc, 'content']

### Stemmer

In [5]:
trainX['stemmed'] = ''
testX['stemmed'] = ''

In [6]:
from nltk.stem import SnowballStemmer

snow = SnowballStemmer('english')
def stemmer(x):
    return ' '.join([snow.stem(word) for word in x.split()]) 

trainX['combined'] = trainX['combined'].map(stemmer)

### Numerics

In [7]:
def is_digit(text):
    try:
        float(text)
        return True 
    except ValueError:
        return False

def numeric_replacer(x):
    new_set = []
    for word in x.split():
        if is_digit(word):
            new_set.append('digitstring')
        else:
            new_set.append(word)
    
    return ' '.join(new_set)

trainX['combined'] = trainX['combined'].map(numeric_replacer)

## TF-IDF Calculation

In [18]:
from operator import itemgetter
from itertools import chain

import pandas as pd
import numpy as np

class GetBowDummies(object):

    """
    Inputs: (1) Series with a text vector (2) Bag of Words for features
    Output: Dataframe with dummy variables indicating whether a feature word is present in a row.

    Examples
    --------

    train = pd.Series(['I dont know','polish dont','fire','healthcare know','healthcare'])
    test  = pd.Series(['I dont know','healthcare know'])
    feats = ['dont','know','healthcare']

    train_bow_dummies = GetBowDummies(train, feats).get_bow_dummies()

    test_bow_dummies = GetBowDummies(test, feats).get_bow_dummies()

    test_bow_dummies
     >> dont  know  healthcare
        0     0     0           0
        1     0     0           0

    """

    # Initialize
    def __init__(self, series, features):
        """
        :param series: A column containing raw text
        :param features: A list of feature words
        """
        self.series = series
        self.index  = self.series.index
        self.features = features

        # Define dimension
        self.nrows = series.shape[0]
        self.ncols = len(features)
        self.dim   = (self.nrows, self.ncols)

    def index_feats_dict(self):
        """
        For every document row, features present in doc
        identified.
        """
        doc_features_dict = {}

        for index, doc in zip(self.index, self.series):
            # Sets for a doc and feature words
            
            doc_set = set(doc.split())
            feat_set = set(self.features)

            # Shared words between the two sets
            interset_words = doc_set.intersection(feat_set)

            # Append to doc_features_dict
            doc_features_dict[index] = list(interset_words)

        return doc_features_dict

    def get_bow_dummies(self):
        """
        Replace 0's with 1 in positions of a bow dataframe
        to indicate that feature words are present in docs
        """

        # Get an np matrix of zeros based on defined dim
        zero_matrix = np.zeros(self.dim, np.int)

        # Create a dataframe containing feature columns and 0's
        self.zero_df = pd.DataFrame(zero_matrix, columns=self.features)

        # Get a dictionary of index and features per doc
        doc_features_dict = self.index_feats_dict()
        doc_ids = doc_features_dict.keys()
        doc_feats = doc_features_dict.values()

        # For each row in zero_df, indicate 1 for every
        # feature word present in a doc of a dataframe
        for index, feats in zip(doc_ids, doc_feats):
            self.zero_df.ix[index, feats] = 1

        return self.zero_df

In [9]:
class GetBowDummies_Array(object):

    """
    Inputs: (1) Series with a text vector (2) Bag of Words for features
    Output: Dataframe with dummy variables indicating whether a feature word is present in a row.

    Examples
    --------

    train = pd.Series(['I dont know','polish dont','fire','healthcare know','healthcare'])
    test  = pd.Series(['I dont know','healthcare know'])
    feats = ['dont','know','healthcare']

    train_bow_dummies = GetBowDummies(train, feats).get_bow_dummies()

    test_bow_dummies = GetBowDummies(test, feats).get_bow_dummies()

    test_bow_dummies
     >> dont  know  healthcare
        0     0     0           0
        1     0     0           0

    """

    # Initialize
    def __init__(self, series, features):
        """
        :param series: A column containing raw text
        :param features: A list of feature words
        """
        features.sort()
        
        self.series = series
        self.index  = self.series.index
        self.features = np.asarray(features)

        # Define dimension
        self.nrows = series.shape[0]
        self.ncols = len(features)
        self.dim   = (self.nrows, self.ncols)

    def index_feats_dict(self):
        """
        For every document row, features present in doc
        identified.
        """
        doc_features_list = []

        for i, doc in enumerate(self.series):
            # Sets for a doc and feature words
            
            doc_set = set(doc.split())
            feat_set = set(self.features)

            # Shared words between the two sets
            interset_words = np.asarray(list(doc_set.intersection(feat_set)))
            
            if len(interset_words) != 0: 
                ndx = np.searchsorted(self.features, interset_words)
                doc_features_list.append(np.asarray([1 if i in ndx else 0 for i in range(self.ncols)]))
            else:
                doc_features_list.append(np.asarray([0 for i in range(self.ncols)])) 

        return np.matrix(doc_features_list)

        # feat1 feat2 feat3
        # Find element location of feat and generate sparse one row at a time
        # append it to another array 
        
        #return # self.zero_df

In [19]:
class GetBowDummies_Array2(object):

    """
    Inputs: (1) Series with a text vector (2) Bag of Words for features
    Output: Dataframe with dummy variables indicating whether a feature word is present in a row.

    Examples
    --------

    train = pd.Series(['I dont know','polish dont','fire','healthcare know','healthcare'])
    test  = pd.Series(['I dont know','healthcare know'])
    feats = ['dont','know','healthcare']

    train_bow_dummies = GetBowDummies(train, feats).get_bow_dummies()

    test_bow_dummies = GetBowDummies(test, feats).get_bow_dummies()

    test_bow_dummies
     >> dont  know  healthcare
        0     0     0           0
        1     0     0           0

    """

    # Initialize
    def __init__(self, series, features):
        """
        :param series: A column containing raw text
        :param features: A list of feature words
        """
        features.sort()
        
        self.series = series
        self.index  = self.series.index
        self.features = np.asarray(features)

        # Define dimension
        self.nrows = series.shape[0]
        self.ncols = len(features)
        self.dim   = (self.nrows, self.ncols)

    def index_feats_dict(self):
        """
        For every document row, features present in doc
        identified.
        """
        # doc_features_list = []
        zero_matrix = np.zeros(self.dim, np.int)

        for i, doc in enumerate(self.series):
            # Sets for a doc and feature words
            
            doc_set = set(doc.split())
            feat_set = set(self.features)

            # Shared words between the two sets
            interset_words = np.asarray(list(doc_set.intersection(feat_set)))
            
            if len(interset_words) != 0: 
                ndx = np.searchsorted(self.features, interset_words)
                zero_matrix[i,ndx] = 1
            else:
                continue
                
        return zero_matrix

        # feat1 feat2 feat3
        # Find element location of feat and generate sparse one row at a time
        # append it to another array 
        
        #return # self.zero_df

# Performance

In [375]:
k_to_test = [100]#, 100, 500] #, 1000, 5000]
trainF1 = []
testF1  = []
k_model_dict = {}

le = LabelEncoder()
y_encode = le.fit(trainy['category'])
train_true_y = le.transform(trainy['category'])
test_true_y = le.transform(testy['category'])

for k in k_to_test:
    
    top_k_features_per_topic = {}
    for topic in topics:
        top_k_features = tfidf_at_k(mod, topic_vector[topic], word_key, k)
        top_k_features_per_topic[topic] = top_k_features

    # Create Bow dummies
    feats = list(chain.from_iterable(top_k_features_per_topic.values()))
    #train_bow = GetBowDummies_Array(trainX['combined'], feats).index_feats_dict()
    #test_bow = GetBowDummies(testX['combined'], feats).get_bow_dummies()

In [382]:
start_time = time.time()
_1 = GetBowDummies(trainX['combined'], feats).get_bow_dummies()
print("--- %s seconds ---" % (time.time() - start_time))

--- 29.993335962295532 seconds ---


In [377]:
start_time = time.time()
_2 = GetBowDummies_Array(trainX['combined'], feats).index_feats_dict()
print("--- %s seconds ---" % (time.time() - start_time))

--- 58.2199809551239 seconds ---


In [378]:
start_time = time.time()
_3 = GetBowDummies_Array2(trainX['combined'], feats).index_feats_dict()
print("--- %s seconds ---" % (time.time() - start_time))

--- 17.118018865585327 seconds ---


## Evaluation

In [384]:
# 1
mnb = MultinomialNB()
mnb.fit(X=_1, y=trainy['category'])
train_pred_y = mnb.predict(_1)
train_pred_y = le.transform(train_pred_y)
f1_score(train_pred_y, train_true_y, average='macro')

0.88096212692135933

In [385]:
# 2
mnb = MultinomialNB()
mnb.fit(X=_2, y=trainy['category'])
train_pred_y = mnb.predict(_2)
train_pred_y = le.transform(train_pred_y)
f1_score(train_pred_y, train_true_y, average='macro')

0.88097379044320878

In [386]:
# 3
mnb = MultinomialNB()
mnb.fit(X=_3, y=trainy['category'])
train_pred_y = mnb.predict(_3)
train_pred_y = le.transform(train_pred_y)
f1_score(train_pred_y, train_true_y, average='macro')

0.88097379044320878

## Delte above

In [20]:
from gensim import corpora, models, similarities
#from operator import itemgetter
#from itemgetter import chain

def topic_index(series, topic):
    return series.ix[series == topic].index

def text_joiner(x):
    return ' '.join(x)

def corpusCreator(df, labelSeries):
    
    text = {}
    for topic in labelSeries.unique():
        index = topic_index(labelSeries, topic)
        topicText = []
        for sent in df[index]:
            topicText += sent.split()
        text[topic] = topicText 
    
    return text
    
def tfidf(bags_of_words):
    
    """ Fetches the top words in k based on TF-IDF scores.
        Returns a list of words with top K.
    """
    
    # Fetch id for each word
    idDict = corpora.Dictionary(bags_of_words)
    
    # Get the reverse key-vaule mapping
    inv_Dict = {v:k for v, k in idDict.items()}
    
    # Transform tCorpus into vector form
    vCorpus = [idDict.doc2bow(tokens) for tokens in bags_of_words]

    # Fit TFIDF
    tfidf = models.TfidfModel(vCorpus)
    
    return tfidf, inv_Dict, vCorpus

def tfidf_at_k(tfidf, doc_vector, word_key, k):
    
    top_k_id_scores = sorted(tfidf[doc_vector], key=itemgetter(1), reverse=True)[:k]
    
    return [word_key[key] for key, word in top_k_id_scores] 

## Conduct Experiment

In [22]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.externals import joblib
import time

## Perform Feature Selection using top K TF-IDF per Class

# Choose kth threshold

class_dictionary = corpusCreator(trainX['combined'], trainy['category'])
topics = class_dictionary.keys()
mod, word_key, vCorpus = tfidf(list(class_dictionary.values()))
topic_vector = {topic: document for topic, document in zip(topics, vCorpus)}

k_to_test = [50, 100, 500, 1000, 3000, 5000, 7500, 10000]
model_result_dict = {}
k_model_dict = {}

le = LabelEncoder()
y_encode = le.fit(trainy['category'])
train_true_y = le.transform(trainy['category'])
test_true_y = le.transform(testy['category'])

for k in k_to_test:
    
    start_time = time.time()
    top_k_features_per_topic = {}
    for topic in topics:
        top_k_features = tfidf_at_k(mod, topic_vector[topic], word_key, k)
        top_k_features_per_topic[topic] = top_k_features

    # Create Bow dummies
    feats = list(chain.from_iterable(top_k_features_per_topic.values()))
    train_bow = GetBowDummies_Array2(trainX['combined'], feats).index_feats_dict()
    test_bow = GetBowDummies_Array2(testX['combined'], feats).index_feats_dict()
    
    # Fit Model 
    mnb = MultinomialNB()
    mnb.fit(X=train_bow, y=trainy['category'])
    k_model_dict[k] = mnb
    joblib.dump(mnb, str(k)+'_'+'mnb_model.pkl') 
    
    # Make train and test predictions
    train_pred_y = mnb.predict(train_bow)
    test_pred_y = mnb.predict(test_bow)
    
    # Measure time
    duration = time.time() - start_time
    
    # Apply encoder 
    train_pred_y = le.transform(train_pred_y)
    test_pred_y = le.transform(test_pred_y)
    
    # Evaluate F1
    trainF1 = f1_score(train_pred_y, train_true_y, average='macro') 
    testF1 = f1_score(test_pred_y, test_true_y, average='macro')
    
    # Store the result in dictionary 
    model_result = {'duration': duration, 'trainF1': trainF1, 'testF1': testF1, \
                    'train_pred_y': train_pred_y, 'test_pred_y': test_pred_y}

    model_result_dict[k] = model_result
    
    print(k, 'completed')

50 completed
100 completed
500 completed
1000 completed
3000 completed
5000 completed
7500 completed
10000 completed


### Feature Columns for Train and Test

In [24]:
model_result_dict

{50: {'duration': 12.492578029632568,
  'testF1': 0.71763763281824777,
  'test_pred_y': array([3, 1, 3, ..., 3, 1, 3]),
  'trainF1': 0.83320617933125707,
  'train_pred_y': array([3, 3, 2, ..., 3, 3, 5])},
 100: {'duration': 22.529353141784668,
  'testF1': 0.77021222471766493,
  'test_pred_y': array([3, 1, 3, ..., 3, 1, 3]),
  'trainF1': 0.87302681194389875,
  'train_pred_y': array([3, 3, 2, ..., 3, 3, 5])},
 500: {'duration': 106.49285817146301,
  'testF1': 0.86393227552622731,
  'test_pred_y': array([4, 1, 1, ..., 3, 1, 3]),
  'trainF1': 0.93847142014563634,
  'train_pred_y': array([3, 3, 2, ..., 3, 3, 5])},
 1000: {'duration': 867.7104859352112,
  'testF1': 0.88319030556919287,
  'test_pred_y': array([4, 1, 1, ..., 3, 1, 3]),
  'trainF1': 0.95267765337301091,
  'train_pred_y': array([3, 3, 2, ..., 3, 3, 5])},
 3000: {'duration': 891.8288180828094,
  'testF1': 0.9030717877691804,
  'test_pred_y': array([4, 1, 1, ..., 3, 1, 3]),
  'trainF1': 0.96321019421099818,
  'train_pred_y': array

## Assesing Prediction

In [14]:
result = pd.DataFrame({'pred':test_pred_y,'true':test_true_y})
result['pred'] = result['pred'].map(lambda x: le.inverse_transform(x)) 
result['true'] = result['true'].map(lambda x: le.inverse_transform(x)) 

In [15]:
# Create a crosstab of true/false prediction counts per category
pd.crosstab(result['pred'], result['true'])

true,biology,cooking,crypto,diy,robotics,travel
pred,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
biology,177,192,130,346,34,252
cooking,845,1015,713,1708,163,1305
crypto,379,449,309,753,90,552
diy,877,1026,632,1616,172,1236
robotics,319,410,261,673,76,496
travel,698,838,541,1316,138,1013


DIY is overpowering prediction, which can be partially explained given that from the train features contain mostly diy topics. Another reason could be that words in a lot of words in DIY are commonly found in other topics so it might be difficult for the classification to clearly distinguish whether a topic is DIY or any other topic.

In [16]:
pd.Series(train_true_y).value_counts()

3    19506
5    14425
1    11474
0     9901
2     7846
4     2098
dtype: int64

In [17]:
# Are there a lot of overlapping features indicated between DIY vs Biology in train matrix such that it's
# difificult for the model can't distinguish between the two vs if it was robotics vs crypto? 

# K-Features 
# Features in Train
# Features in Test
# Model Accuracy is poor given that the data isn't Gaussian?

# Distribution of mutual information???? 

#### Conclusion

Rare occurrences are what's disrupting accurate prediction. For instance, the term polar bear is a rare term that would result in low TF-IDF score in biology, so when the k-th TIFDF were chosen, the term was weeded out from the features for classifying biology. Some combined contents contain feature words found in other topic, but not in the ones that are actually from the topic itself. 

One possible solution is to expand the feature set costing more computation time or re-design feature selection and engineering or even possibly dimensionality reduction.

Other feature suggestions:

- Number of paragraphs
- Capitalized acronyms
- Dimensionality reduction