In [1]:
import pandas as pd
import numpy as np
import os, glob 

os.chdir('/Users/daniellee/Desktop/Kaggle/data/stackoverflow_data/')

# Load the cleaned topic-model frames; both paths are relative to the
# working directory selected above.
train = pd.read_csv('cleaned/topic_model_df_train.csv')
test = pd.read_csv('cleaned/topic_model_df_test.csv')

# NOTE(review): a cell renders only its final expression, so this preview is
# of the *training* frame. Use display(test.head(3)) to inspect the test frame.
train.head(3)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,id,title,content,tags,category,combined,content_size,title_size,tags_size,TitleNTags,ContentNTags,title_pred,content_pred,combined_pred,title_nouns,content_nouns
0,0,0,0,1,criticality ribosome binding site relative sta...,prokaryotic translation critical efficient tra...,"['ribosome', 'binding-sites', 'translation', '...",biology,criticality ribosome binding site relative sta...,24,9,4,2,2,criticality ribosome prokaryotic codon,translation 7b observable prokaryotic,translation prokaryotic ribosome codon,criticality ribosome binding site relative sta...,prokaryotic translation critical efficient tra...
1,1,1,1,2,rnase contamination rna based experiments prev...,anyone suggestions prevent rnase contamination...,"['rna', 'biochemistry']",biology,rnase contamination rna based experiments prev...,21,6,2,1,1,prevented rnase contamination experiments,rnase depc pipette degradation,rnase contamination rna depc,rnase contamination rna based experiment preve...,doe anyone suggestion prevent rnase contaminat...
2,2,2,2,3,lymphocyte sizes clustered two groups,tortora writes principles anatomy physiology l...,"['immunology', 'cell-biology', 'hematology']",biology,lymphocyte sizes clustered two groups tortora ...,31,5,3,0,0,clustered lymphocyte groups sizes,lymphocytes diameter tortora 14,lymphocytes clustered groups sizes,lymphocyte size clustered group,tortora principle anatomy physiology lymphocyt...


## Create Train and Test Partitions of Data

In [2]:
try:
    # Current home of train_test_split (sklearn.cross_validation was removed
    # in scikit-learn 0.20).
    from sklearn.model_selection import train_test_split
except ImportError:
    # Fallback for very old scikit-learn installations.
    from sklearn.cross_validation import train_test_split

# Fixed seed so that the partitions written to disk below can be regenerated
# exactly on a fresh kernel.
SPLIT_SEED = 42

# partitions == [trainX, testX, trainy, testy]; 'category' is the label column.
partitions = train_test_split(train.drop('category', axis=1), train['category'],
                              random_state=SPLIT_SEED)

# Persist every partition under its own name in the experiment directory.
os.chdir('/Users/daniellee/Desktop/Kaggle/data/stackoverflow_data/experiment_test_set/')
for name, part in zip(['trainX', 'testX', 'trainy', 'testy'], partitions):
    part.to_csv(name+'.csv')

In [3]:
# reset_index() moves the shuffled row labels into an 'index' column and gives
# every partition a fresh 0..n-1 index (the label Series become DataFrames
# with 'index' and 'category' columns).
trainX, testX, trainy, testy = [df.reset_index() for df in partitions]

# Drop the first five columns by position: the 'index' column added above plus
# the four leftover 'Unnamed: ...' columns from earlier CSV round-trips.
# .iloc replaces the removed .ix indexer; the slice was positional already.
trainX = trainX.iloc[:, 5:]
testX = testX.iloc[:, 5:]

## Perform Text Cleaning 

### Remove NA values

In [4]:
# Remove blanks: a missing 'content' becomes the empty string
trainX['content'] = trainX['content'].fillna('')
testX['content'] = testX['content'].fillna('')

# Boolean masks of the rows whose content is empty
trainBlankLoc = trainX['content'] == ''
testBlankLoc = testX['content'] == ''

# For those rows rebuild 'combined' from title + (empty) content.
# .loc replaces the removed .ix indexer for boolean-mask / label selection.
trainX.loc[trainBlankLoc, 'combined'] = trainX.loc[trainBlankLoc, 'title'] + trainX.loc[trainBlankLoc, 'content']
testX.loc[testBlankLoc, 'combined'] = testX.loc[testBlankLoc, 'title'] + testX.loc[testBlankLoc, 'content']

### Stemmer

In [5]:
# Add an empty-string 'stemmed' column to both partitions.
for frame in (trainX, testX):
    frame['stemmed'] = ''

In [6]:
from nltk.stem import SnowballStemmer

snow = SnowballStemmer('english')
def stemmer(x):
    """Stem every whitespace-separated word of ``x`` with the module-level
    ``snow`` stemmer and join the results back with single spaces."""
    stemmed_words = map(snow.stem, x.split())
    return ' '.join(stemmed_words)

# Stem both partitions: features are later selected from the stemmed training
# text, so the held-out text must be in the same form or its words will never
# match those features.
trainX['combined'] = trainX['combined'].map(stemmer)
testX['combined'] = testX['combined'].map(stemmer)

### Numerics

In [7]:
def is_digit(text):
    """Return True when ``text`` parses as a finite number.

    :param text: A single token (normally a string).
    :return: True for tokens such as '3', '-2.5' or '1e4'; False otherwise.

    ``float()`` also accepts the words 'nan', 'inf' and 'infinity'; these are
    ordinary words in free text, so they are deliberately not treated as
    numbers. Non-string inputs that ``float()`` rejects with a TypeError
    (e.g. None) are reported as non-numeric instead of raising.
    """
    try:
        value = float(text)
    except (TypeError, ValueError):
        return False

    # NaN is the only float that differs from itself; infinities have an
    # infinite magnitude.
    return value == value and abs(value) != float('inf')

def numeric_replacer(x):
    """Replace every whitespace-separated token of ``x`` that ``is_digit``
    accepts with the placeholder 'digitstring'; other tokens are kept as-is.
    The tokens are re-joined with single spaces."""
    return ' '.join(
        'digitstring' if is_digit(token) else token
        for token in x.split()
    )

# Apply the same numeric normalisation to both partitions so that the
# 'digitstring' placeholder means the same thing in train and test text.
trainX['combined'] = trainX['combined'].map(numeric_replacer)
testX['combined'] = testX['combined'].map(numeric_replacer)

## TF-IDF Calculation

In [8]:
from operator import itemgetter
from itertools import chain

import pandas as pd
import numpy as np

class GetBowDummies(object):

    """
    Build a table of 0/1 dummies marking which feature words occur in each text.

    Inputs: (1) Series with a text vector (2) Bag of Words for features
    Output: Dataframe with dummy variables indicating whether a feature word is present in a row.

    The result has one row per entry of the series (carrying the series'
    index) and one column per entry of ``features``, in the given order.
    A feature listed more than once yields one column per listing.

    Examples
    --------

    train = pd.Series(['I dont know','polish dont','fire','healthcare know','healthcare'])
    test  = pd.Series(['I dont know','healthcare know'])
    feats = ['dont','know','healthcare']

    train_bow_dummies = GetBowDummies(train, feats).get_bow_dummies()

    test_bow_dummies = GetBowDummies(test, feats).get_bow_dummies()

    test_bow_dummies
     >>    dont  know  healthcare
        0     1     1           0
        1     0     1           1

    """

    # Initialize
    def __init__(self, series, features):
        """
        :param series: A column containing raw text
        :param features: A list of feature words
        """
        self.series = series
        self.index  = self.series.index
        self.features = features

        # Define dimension
        self.nrows = series.shape[0]
        self.ncols = len(features)
        self.dim   = (self.nrows, self.ncols)

    @staticmethod
    def _tokens(doc):
        """Return the set of whitespace-separated words in ``doc``.

        Values without a ``split`` method (NaN / None placeholders for
        missing text) are treated as containing no words.
        """
        if hasattr(doc, 'split'):
            return set(doc.split())
        return set()

    def index_feats_dict(self):
        """
        For every document row, identify the feature words present in it.

        :return: dict mapping each index label of the series to a list
                 (arbitrary order) of the feature words found in that row.
        """
        # The feature set is the same for every row, so build it once.
        feat_set = set(self.features)

        doc_features_dict = {}
        for index, doc in zip(self.index, self.series):
            # Shared words between the document and the feature words
            doc_features_dict[index] = list(self._tokens(doc) & feat_set)

        return doc_features_dict

    def get_bow_dummies(self):
        """
        Build the dummy dataframe: 1 where a feature word is present
        in a document, 0 elsewhere.

        :return: DataFrame of shape ``self.dim`` indexed like the input
                 series; it is also stored on ``self.zero_df``.
        """

        # Start from an integer matrix of zeros with the defined dim
        matrix = np.zeros(self.dim, dtype=int)

        # Column position(s) of every feature word. Filling by position
        # (rather than by label) works for any series index and for
        # features that are listed more than once.
        columns_of = {}
        for position, feature in enumerate(self.features):
            columns_of.setdefault(feature, []).append(position)

        # Flag every feature word present in each document
        for row, doc in enumerate(self.series):
            for word in self._tokens(doc):
                hits = columns_of.get(word)
                if hits:
                    matrix[row, hits] = 1

        self.zero_df = pd.DataFrame(matrix, index=self.index, columns=self.features)

        return self.zero_df

In [9]:
from gensim import corpora, models, similarities

def topic_index(series, topic):
    """Return the index labels of the entries of ``series`` equal to ``topic``.

    :param series: Series of labels.
    :param topic: The label value to look for.
    :return: pandas Index of the matching rows (empty when nothing matches).
    """
    # .loc replaces the removed .ix indexer for boolean-mask selection.
    return series.loc[series == topic].index

def text_joiner(x):
    """Concatenate the strings in ``x`` with a single space between them."""
    separator = ' '
    return separator.join(x)

def corpusCreator(df, labelSeries):
    """Collect, for every distinct label, all words of the texts carrying it.

    :param df: Series of texts, indexed consistently with ``labelSeries``.
    :param labelSeries: Series holding one label per text.
    :return: dict mapping each label (in order of first appearance) to the
             flat list of whitespace-separated words of its texts.
    """
    return {
        label: [
            token
            for sentence in df[topic_index(labelSeries, label)]
            for token in sentence.split()
        ]
        for label in labelSeries.unique()
    }
    
def tfidf(bags_of_words):

    """ Fit a TF-IDF model over a collection of bags of words.

        :param bags_of_words: iterable of token lists, one per document.
        :return: tuple of (fitted TfidfModel, mapping built from the
                 dictionary's items, list of doc2bow vectors - one per
                 input bag, in input order).

        The returned mapping keeps the orientation of the dictionary's
        ``items()`` (it is not inverted); presumably that is
        token id -> token, which is how ``tfidf_at_k`` looks words up.
    """

    # Assign an id to every distinct word
    dictionary = corpora.Dictionary(bags_of_words)

    # Plain-dict snapshot of the dictionary's (key, value) pairs
    id_to_word = dict(dictionary.items())

    # Transform each bag of words into the dictionary's vector form
    bow_vectors = list(map(dictionary.doc2bow, bags_of_words))

    # Fit TFIDF
    model = models.TfidfModel(bow_vectors)

    return model, id_to_word, bow_vectors

def tfidf_at_k(tfidf, doc_vector, word_key, k):
    """Return the words behind the ``k`` highest-scoring entries of a document.

    :param tfidf: object whose ``tfidf[doc_vector]`` yields (key, score) pairs.
    :param doc_vector: the document to score.
    :param word_key: mapping from a pair's key to its word.
    :param k: number of top-scoring entries to keep.
    :return: list of words ordered by descending score (ties keep their
             original relative order).
    """
    scored_pairs = tfidf[doc_vector]
    ranked = sorted(scored_pairs, key=lambda pair: pair[1], reverse=True)

    return [word_key[token_id] for token_id, _score in ranked[:k]]

## Conduct Experiment

In [None]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.externals import joblib

## Perform Feature Selection using top K TF-IDF per Class

# One bag of words per category, a TF-IDF model fitted over those bags, and
# the vector of each category keyed by its name.
class_dictionary = corpusCreator(trainX['combined'], trainy['category'])
topics = list(class_dictionary.keys())
mod, word_key, vCorpus = tfidf(list(class_dictionary.values()))
topic_vector = {topic: document for topic, document in zip(topics, vCorpus)}

# Choose kth threshold: number of top TF-IDF words kept per category
k_to_test = [50, 100, 500, 1000, 5000, 8000, 10000]
trainF1 = []
testF1  = []
k_model_dict = {}

# Integer-encode the true labels once; the encoder is fitted on the training labels.
le = LabelEncoder()
y_encode = le.fit(trainy['category'])
train_true_y = le.transform(trainy['category'])
test_true_y = le.transform(testy['category'])

for k in k_to_test:

    # Top-k TF-IDF words of every category
    top_k_features_per_topic = {}
    for topic in topics:
        top_k_features = tfidf_at_k(mod, topic_vector[topic], word_key, k)
        top_k_features_per_topic[topic] = top_k_features

    # Create Bow dummies
    # A word can rank in the top k of several categories; de-duplicate so it
    # gets exactly one column instead of being counted once per category.
    # Sorting keeps the column order deterministic.
    feats = sorted(set(chain.from_iterable(top_k_features_per_topic.values())))
    train_bow = GetBowDummies(trainX['combined'], feats).get_bow_dummies()
    test_bow = GetBowDummies(testX['combined'], feats).get_bow_dummies()

    # Fit Model and keep it both in memory and on disk (current directory)
    mnb = MultinomialNB()
    mnb.fit(X=train_bow, y=trainy['category'])
    k_model_dict[k] = mnb
    joblib.dump(mnb, str(k)+'_'+'mnb_model.pkl')

    # Make train and test predictions
    train_pred_y = mnb.predict(train_bow)
    test_pred_y = mnb.predict(test_bow)

    # Apply encoder so predictions use the same integer codes as the truth
    train_pred_y = le.transform(train_pred_y)
    test_pred_y = le.transform(test_pred_y)

    # Evaluate F1 -- scikit-learn's signature is f1_score(y_true, y_pred)
    trainF1.append(f1_score(train_true_y, train_pred_y, average='macro'))
    testF1.append(f1_score(test_true_y, test_pred_y, average='macro'))

### F1 Scores for Train and Test (one per k)

In [None]:
# Macro-averaged F1 on the training partition, one entry per value in k_to_test
trainF1

In [None]:
# Macro-averaged F1 on the held-out partition, one entry per value in k_to_test
testF1