In [274]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from predictNew import CUSTOM_STOP
from sklearn.metrics import log_loss
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [44]:
chris_complete = pd.read_csv('../data/chris_complete_training.csv', index_col=0)

In [35]:
chris_sample = pd.read_csv('../data/chris_training.csv', index_col=0)

In [41]:
sample_df = pd.DataFrame(chris_sample.iloc[:48,:])

In [43]:
sample_df.to_csv('../data/chris_model_eval.csv')

In [58]:
sample_complete = pd.DataFrame(chris_complete.iloc[:48,:])

In [183]:
X = sample_df.jobs

In [185]:
y = sample_complete.labels

In [187]:
clean_y = y.replace(-1, 0)

In [312]:
vectorizer = TfidfVectorizer(stop_words=CUSTOM_STOP)
vectorizer.fit(X)
transformed_matrix = vectorizer.transform(X)
tfidf_df = pd.DataFrame(transformed_matrix.toarray())

In [313]:
forest = RandomForestClassifier(n_estimators=1000, criterion='entropy', n_jobs=-1)
forest.fit(tfidf_df, clean_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [320]:
class PruneForest():
    def __init__(self, param=None):
        self.param = param

    def get_feature_importances(self, model):
        feature_importances = model.feature_importances_
        idxs = np.nonzero(feature_importances)[0]
        return idxs
    
    def get_reverse_term_dict(self, vectorizer):
        word_dict = vectorizer.vocabulary_
        reverse_dict = {value: key for key, value in word_dict.items()}
        return reverse_dict
    
    def get_word_list(self, reverse_dict, idxs):
        word_list = [reverse_dict[key] for key in list(idxs)]
        return word_list
    
    def get_vocabulary(self, model, vectorizer):
        indices = self.get_feature_importances(model)
        reverse_dict = self.get_reverse_term_dict(vectorizer)
        word_list = self.get_word_list(reverse_dict, indices)
        return word_list

In [315]:
pruned_forest = PruneForest()

In [316]:
vocabulary = pruned_forest.get_vocabulary(forest, vectorizer)

In [336]:
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3)

In [406]:
forest_loss = []
for train_index, test_index in sss.split(X, clean_y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    i = 0
    
    vectorizer = TfidfVectorizer(stop_words=CUSTOM_STOP)
    vectorizer.fit(X_train)
    transformed_matrix = vectorizer.transform(X_train)
    tfidf_df = pd.DataFrame(transformed_matrix.toarray())
    
    forest = RandomForestClassifier(n_estimators=2000, criterion='entropy', n_jobs=-1)
    forest.fit(tfidf_df, y_train)
    
    pruned_forest = PruneForest()
    vocabulary = pruned_forest.get_vocabulary(forest, vectorizer)
    
    improved_vectorizer = TfidfVectorizer(stop_words=CUSTOM_STOP, vocabulary=vocabulary)
    improved_vectorizer.fit(X_train)
    improved_transformed_matrix = improved_vectorizer.transform(X_train)
    improved_tfidf_df = pd.DataFrame(improved_transformed_matrix.toarray())
    
    grad_boost = GradientBoostingClassifier(learning_rate=0.001, n_estimators=500, subsample=0.5)
    grad_boost.fit(improved_tfidf_df, y_train)
    tfidf_X_test = improved_vectorizer.transform(X_test)
    y_pred = grad_boost.predict_proba(tfidf_X_test)
    forest_loss.append(log_loss(y_test, y_pred))
    i += 1

In [407]:
forest_loss

[0.42413966297658345,
 0.43003535914764146,
 0.4198280033421206,
 0.3848275526302786,
 0.47633649805361106,
 0.43278766568051336,
 0.42712084243306486,
 0.4678194142783675,
 0.4303735543764162,
 0.39113662811166006]

In [408]:
np.mean(forest_loss)

0.4284405181030257

In [270]:
grad_loss = []
for train_index, test_index in sss.split(X, clean_y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = clean_y[train_index], clean_y[test_index]
    i = 0
    vectorizer = TfidfVectorizer(stop_words=CUSTOM_STOP)
    model = GradientBoostingClassifier(learning_rate=0.001, n_estimators=500, subsample=0.5)
    vectorizer.fit(X_train)
    transformed_matrix = vectorizer.transform(X_train)
    tfidf_df = pd.DataFrame(transformed_matrix.toarray())
    model.fit(tfidf_df, y_train)
    tfidf_X_test = vectorizer.transform(X_test)
    y_pred = model.predict_proba(tfidf_X_test)
    grad_loss.append(log_loss(y_test, y_pred))
    i += 1

In [271]:
np.mean(grad_loss)

0.47205696292003446