# Load Data and Prep Data

In [174]:
# Required modules

# Common
import numpy as np
import pandas as pd
import time
import os
import glob

# Special tools
from operator import itemgetter
from itertools import chain

# Sklearn 
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from sklearn.externals import joblib

In [183]:
# Fetch data
# Switch the working directory to the folder holding the prepared CSV files;
# every relative path below is resolved against it.
os.chdir('/Users/daniellee/Desktop/Kaggle/data/stackoverflow_data/part1_approach1_output/')
# NOTE(review): glob.glob() returns matches in arbitrary (filesystem) order, so
# which two files end up in testX and trainX depends on the directory listing.
# Later cells read trainX['combined'] and testX['combined'], so a swapped or
# wrong file breaks them -- TODO confirm the feature file names and load them
# explicitly, the same way the target files are loaded below.
testX, trainX = map(pd.read_csv, glob.glob('*.csv')[:2])
# Target tables; later cells read their 'category' column.
testy = pd.read_csv('testy.csv')
trainy = pd.read_csv('trainy.csv')

In [179]:
# Load Dictionary
# tfidf_selection is indexed by topic name and sliced as
# tfidf_selection[topic][:n] further down to pick feature words
# (presumably ordered best-first by TF-IDF score -- confirm against the
# notebook that produced the file).
os.chdir('/Users/daniellee/Desktop/Kaggle/data/stackoverflow_data/')
# The .npy file holds a pickled Python object wrapped in a 0-d object array
# (hence the .item() call). NumPy >= 1.16.3 refuses to unpickle unless
# allow_pickle=True is passed explicitly.
# SECURITY: unpickling can execute arbitrary code -- only load files that were
# produced by this project.
tfidf_selection = np.load('top_k_features.npy', allow_pickle=True).item()

# 1) K Topic Chooser Algorithm

In [180]:
def k_number_chooser(target_series, k_per_class):
    """Split a feature budget across classes, giving rarer classes more features.

    The total budget is ``k_per_class * n_classes``. Each class is weighted by
    ``1 - (documents in class / total documents)`` (rounded to two decimals),
    so a class with fewer documents receives a larger share. Shares are rounded
    to whole numbers; any surplus or deficit caused by the rounding is absorbed
    by one randomly chosen class so the allocations always sum to the budget.

    :param target_series: pandas Series holding one class label per document
    :param k_per_class: average number of features to select per class
    :return: integer pandas Series indexed by class label (in ``value_counts``
             order) with the number of features to select for that class

    Notes
    -----
    * The random pick uses NumPy's global RNG (``np.random.choice``); seed it
      for reproducible results.
    * Missing labels (NaN) are not counted as a class.
    * A single class cannot be weighted by inverse frequency (its weight is
      zero), so it simply receives the whole budget.
    """
    n_docs = len(target_series)
    class_counts = target_series.value_counts()
    n_classes = len(class_counts)
    total_features = k_per_class * n_classes

    # Nothing to allocate for an empty target column.
    if n_classes == 0:
        return class_counts.astype('int')

    # Inverse-frequency weight per class.
    weights = class_counts.map(lambda count: round(1 - float(count) / n_docs, 2))
    weight_sum = sum(weights)
    if weight_sum == 0:
        # Only happens when one class holds every document: fall back to
        # uniform weights instead of dividing by zero.
        weights = weights + 1.0
        weight_sum = sum(weights)

    allocation = weights.map(
        lambda weight: round(weight / weight_sum * total_features)
    ).astype('int')

    # Rounding can leave the allocations slightly above or below the budget.
    # correction > 0 means features are missing, < 0 means too many were given.
    correction = total_features - int(allocation.sum())
    if correction != 0:
        # Only adjust a class that stays non-negative after the correction.
        candidates = np.flatnonzero(allocation.values + correction >= 0)
        if len(candidates) == 0:
            candidates = np.arange(n_classes)
        position = np.random.choice(candidates)
        # .iloc: `position` is positional, while the Series is label-indexed.
        allocation.iloc[position] += correction

    return allocation

In [160]:
# Sanity check: how many entries are stored for the 'diy' topic
len(tfidf_selection['diy'])

10000

In [181]:
class GetBowDummies_Array2(object):

    """
    Build a binary bag-of-words indicator matrix for a column of raw text.

    Inputs: (1) Series with a text vector (2) Bag of Words for features
    Output: 2-D integer NumPy array with one row per document and one column
            per feature word; a cell is 1 when the word occurs in the
            whitespace-split document and 0 otherwise.

    Columns follow the *sorted* feature words, available as ``self.features``.
    The list passed in by the caller is not modified.

    Examples
    --------

    train = pd.Series(['I dont know','polish dont','fire','healthcare know','healthcare'])
    test  = pd.Series(['I dont know','healthcare know'])
    feats = ['dont','know','healthcare']

    train_bow_dummies = GetBowDummies_Array2(train, feats).index_feats_dict()

    test_bow_dummies = GetBowDummies_Array2(test, feats).index_feats_dict()

    test_bow_dummies
     >> array([[1, 0, 1],      # columns: dont, healthcare, know
               [0, 1, 1]])

    """

    # Initialize
    def __init__(self, series, features):
        """
        :param series: A column containing raw text
        :param features: A list of feature words
        """
        # sorted() builds a new list, so the caller's list keeps its order.
        ordered_features = sorted(features)

        self.series = series
        self.index  = self.series.index
        self.features = np.asarray(ordered_features)

        # Define dimension
        self.nrows = series.shape[0]
        self.ncols = len(ordered_features)
        self.dim   = (self.nrows, self.ncols)

    def index_feats_dict(self):
        """
        Mark, for every document row, which feature words it contains.

        :return: integer array of shape ``(nrows, ncols)`` holding 0/1 flags.

        Documents that are not strings (e.g. NaN for a missing text) yield an
        all-zero row. If a feature word appears more than once in the feature
        list, only its first (leftmost) column is ever set; the duplicate
        columns stay zero.
        """
        # Builtin int replaces the np.int alias, which NumPy 1.24 removed.
        indicator = np.zeros(self.dim, int)

        # Built once: feature word -> leftmost column in the sorted features.
        column_of = {}
        for col, word in enumerate(self.features.tolist()):
            column_of.setdefault(word, col)

        for row, doc in enumerate(self.series):
            if not isinstance(doc, str):
                # Missing / non-text document: leave the row at zero.
                continue

            # Unique words of the document that are also feature words.
            cols = [column_of[word] for word in set(doc.split()) if word in column_of]
            if cols:
                indicator[row, cols] = 1

        return indicator

In [None]:
## Perform Feature Selection using top K TF-IDF per Class

# Candidate values of k (average number of features kept per class).
k_to_test = [50, 100, 500, 1000, 3000, 5000, 7500, 10000]
model_result_dict = {}   # k -> timing, F1 scores and encoded predictions
k_model_dict = {}        # k -> fitted MultinomialNB model

# k_number_chooser draws from NumPy's global RNG; seed it so a re-run picks
# the same feature counts.
np.random.seed(0)

# Encode the string categories as integers for scoring.
le = LabelEncoder()
y_encode = le.fit(trainy['category'])  # fit() returns the encoder itself
train_true_y = le.transform(trainy['category'])
test_true_y = le.transform(testy['category'])

for k in k_to_test:

    start_time = time.time()

    # Number of features to take from each topic (computed once per k).
    topic_k_groupby = k_number_chooser(trainy['category'], k)

    top_k_features_per_topic = {}
    for topic in trainy['category'].unique():
        feats_k_count = topic_k_groupby[topic]
        # Slicing silently truncates if a topic has fewer words than requested.
        top_k_features_per_topic[topic] = tfidf_selection[topic][:feats_k_count]

    # Create Bow dummies
    # A word selected for several topics is kept once: a repeated word would
    # only add an all-zero column to the indicator matrix.
    feats = sorted(set(chain.from_iterable(top_k_features_per_topic.values())))
    train_bow = GetBowDummies_Array2(trainX['combined'], feats).index_feats_dict()
    test_bow = GetBowDummies_Array2(testX['combined'], feats).index_feats_dict()

    # Fit Model
    mnb = MultinomialNB()
    mnb.fit(X=train_bow, y=trainy['category'])
    k_model_dict[k] = mnb

    # Make train and test predictions
    train_pred_y = mnb.predict(train_bow)
    test_pred_y = mnb.predict(test_bow)

    # Measure time (feature selection + matrix building + fit + predict)
    duration = time.time() - start_time

    # Apply encoder so predictions use the same integer codes as the truth
    train_pred_y = le.transform(train_pred_y)
    test_pred_y = le.transform(test_pred_y)

    # Evaluate F1 -- scikit-learn's signature is f1_score(y_true, y_pred)
    trainF1 = f1_score(train_true_y, train_pred_y, average='macro')
    testF1 = f1_score(test_true_y, test_pred_y, average='macro')

    # Store the result in dictionary
    model_result = {'duration': duration, 'trainF1': trainF1, 'testF1': testF1,
                    'train_pred_y': train_pred_y, 'test_pred_y': test_pred_y}

    model_result_dict[k] = model_result

    print(k, 'completed')

50 completed
100 completed
500 completed
1000 completed
3000 completed
5000 completed
7500 completed


In [None]:
# Show one row per k with the scalar metrics only; the stored prediction
# arrays are left out because displaying them would flood the cell output.
pd.DataFrame.from_dict(
    {k: {metric: result[metric] for metric in ('duration', 'trainF1', 'testF1')}
     for k, result in model_result_dict.items()},
    orient='index',
)