In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from predictNew import CUSTOM_STOP
from sklearn.metrics import log_loss
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin

In [2]:
# Load the "complete" training CSV; its first column is used as the row index.
chris_complete = pd.read_csv(
    '../data/chris_complete_training.csv',
    index_col=0,
)

In [3]:
# Load the second training CSV; its first column is used as the row index.
chris_sample = pd.read_csv(
    '../data/chris_training.csv',
    index_col=0,
)

In [4]:
# Keep the first 48 rows (all columns) of chris_sample as a new DataFrame.
sample_df = pd.DataFrame(data=chris_sample.iloc[:48])

In [5]:
# Persist sample_df to disk (side effect: overwrites the file if it exists).
sample_df.to_csv(
    '../data/chris_model_eval.csv',
)

In [6]:
# Keep the first 48 rows (all columns) of chris_complete as a new DataFrame.
sample_complete = pd.DataFrame(data=chris_complete.iloc[:48])

In [7]:
# Model input: the `jobs` column of sample_df.
X = sample_df['jobs']

In [8]:
# Target: the `labels` column of sample_complete.
# NOTE(review): X is taken from sample_df while y comes from sample_complete;
# this presumes both frames list the same rows in the same order -- confirm.
y = sample_complete['labels']

In [30]:
class PruneForest():
    """Derive a reduced vocabulary from a fitted model and its vectorizer.

    The terms kept are those whose column in the vectorized matrix received a
    non-zero value in the model's ``feature_importances_``.
    """

    def __init__(self, param=None):
        # Stored on the instance; no method of this class reads it.
        self.param = param

    def get_feature_importances(self, model):
        """Return the positions of the non-zero entries of
        ``model.feature_importances_`` (first axis of ``np.nonzero``)."""
        return np.nonzero(model.feature_importances_)[0]

    def get_reverse_term_dict(self, vectorizer):
        """Invert ``vectorizer.vocabulary_`` so that each value maps back to
        its key (``{column index: term}`` for a term -> index vocabulary)."""
        term_to_column = vectorizer.vocabulary_
        return dict(zip(term_to_column.values(), term_to_column.keys()))

    def get_word_list(self, reverse_dict, idxs):
        """Look up every entry of ``idxs`` in ``reverse_dict`` and return the
        results as a list, in the order of ``idxs``.

        Raises ``KeyError`` if an index is missing from ``reverse_dict``.
        """
        terms = []
        for column in idxs:
            terms.append(reverse_dict[column])
        return terms

    def get_vocabulary(self, model, vectorizer):
        """Return the list of terms whose feature importance in ``model`` is
        non-zero, using ``vectorizer.vocabulary_`` to name the columns."""
        kept_columns = self.get_feature_importances(model)
        column_to_term = self.get_reverse_term_dict(vectorizer)
        return self.get_word_list(column_to_term, kept_columns)

In [10]:
# 10 stratified random 70/30 train/test splits. random_state is fixed so the
# folds -- and therefore the losses computed from them -- are reproducible
# across kernel restarts.
sss = StratifiedShuffleSplit(n_splits=10, test_size=0.3, random_state=42)

In [11]:
# Evaluate the two-stage model on each stratified split:
#   1. TF-IDF + random forest on the training fold,
#   2. keep only terms with non-zero forest importance,
#   3. TF-IDF restricted to those terms + gradient boosting,
#   4. log loss of the boosted model on the held-out fold.
forest_loss = []
for train_index, test_index in sss.split(X, y):
    # split() yields positional indices, so rows must be selected with .iloc;
    # plain [] on a Series looks up index *labels* and picks the wrong rows
    # (or raises) whenever the index is not exactly 0..n-1.
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Stage 1: full-vocabulary TF-IDF features feeding a random forest.
    vectorizer = TfidfVectorizer(stop_words=CUSTOM_STOP)
    tfidf_df = pd.DataFrame(vectorizer.fit_transform(X_train).toarray())

    forest = RandomForestClassifier(n_estimators=2000, criterion='entropy', n_jobs=-1)
    forest.fit(tfidf_df, y_train)

    # Keep only the terms the forest actually used (non-zero importance).
    pruned_forest = PruneForest()
    vocabulary = pruned_forest.get_vocabulary(forest, vectorizer)

    # Stage 2: TF-IDF restricted to the pruned vocabulary, fitted on the
    # training fold only, feeding gradient boosting.
    improved_vectorizer = TfidfVectorizer(stop_words=CUSTOM_STOP, vocabulary=vocabulary)
    improved_tfidf_df = pd.DataFrame(improved_vectorizer.fit_transform(X_train).toarray())

    grad_boost = GradientBoostingClassifier(learning_rate=0.001, n_estimators=500, subsample=0.5)
    grad_boost.fit(improved_tfidf_df, y_train)

    tfidf_X_test = improved_vectorizer.transform(X_test)
    y_pred = grad_boost.predict_proba(tfidf_X_test)
    # Pass the classifier's own class order: the columns of y_pred follow
    # grad_boost.classes_, and the test fold is not guaranteed to contain
    # every class, in which case inferring labels from y_test would fail.
    forest_loss.append(log_loss(y_test, y_pred, labels=grad_boost.classes_))

In [69]:
class FitModel():
    """Two-stage text classifier trained from a labelled CSV.

    Stage 1 fits a TF-IDF vectorizer and a random forest on the full
    vocabulary. The terms with non-zero forest importance are then used as
    the fixed vocabulary of a second TF-IDF vectorizer, whose output trains a
    gradient-boosting classifier (stage 2).

    Attributes:
        vectorizer: full-vocabulary TF-IDF vectorizer (stage 1).
        forest: random forest fitted on the stage-1 features.
        grad_boost: gradient-boosting classifier fitted on the pruned features.
        pruned_forest: PruneForest helper that extracts the reduced vocabulary.
        improved_vectorizer: vocabulary-restricted TF-IDF vectorizer set by
            ``improve``; ``None`` until then. It is required to vectorize new
            text the same way the stage-2 training data was vectorized.
    """

    def __init__(self):
        self.vectorizer = TfidfVectorizer(stop_words=CUSTOM_STOP)
        self.forest = RandomForestClassifier(n_estimators=2000, criterion='entropy', n_jobs=-1)
        self.grad_boost = GradientBoostingClassifier(learning_rate=0.001, n_estimators=500, subsample=0.5)
        self.pruned_forest = PruneForest()
        self.improved_vectorizer = None

    def get_Xy(self, name):
        """Read the CSV at ``name`` and return ``(X, y)`` as two Series.

        Rows whose ``labels`` value equals 0.0 are dropped. X is the column at
        position 1 and y the column at position 2 (positions counted after the
        first CSV column has been consumed as the index).
        """
        df = pd.read_csv(name, index_col=0)
        culled_df = df[df.labels != 0.0]
        # NOTE(review): iloc[1:] also discards the first remaining row; it is
        # not clear from this file whether that is intentional -- confirm.
        X = culled_df.iloc[1:, 1]
        y = culled_df.iloc[1:, 2]
        return X, y

    def fit_primary(self, X, y):
        """Fit the stage-1 vectorizer and forest on ``X``/``y``.

        Returns:
            ``(forest, vectorizer)``, both fitted.
        """
        transformed_matrix = self.vectorizer.fit_transform(X)
        tfidf_df = pd.DataFrame(transformed_matrix.toarray())
        self.forest.fit(tfidf_df, y)
        return self.forest, self.vectorizer

    def prune_forest(self, forest, vectorizer):
        """Return the list of terms with non-zero importance in ``forest``."""
        return self.pruned_forest.get_vocabulary(forest, vectorizer)

    def improve(self, X, vocabulary):
        """Vectorize ``X`` with a TF-IDF vectorizer limited to ``vocabulary``.

        The fitted vectorizer is kept on ``self.improved_vectorizer`` so that
        unseen text can later be transformed consistently.

        Returns:
            DataFrame of dense TF-IDF features, one column per vocabulary term.
        """
        self.improved_vectorizer = TfidfVectorizer(stop_words=CUSTOM_STOP, vocabulary=vocabulary)
        transformed_matrix = self.improved_vectorizer.fit_transform(X)
        return pd.DataFrame(transformed_matrix.toarray())

    def fit_secondary(self, X, y):
        """Fit the gradient-boosting classifier and return the result of its
        ``fit`` call."""
        return self.grad_boost.fit(X, y)

    def transform(self, name):
        """Run the whole training pipeline on the CSV at ``name``.

        Despite its name this method *fits* both stages; it returns whatever
        ``fit_secondary`` returns (the fitted gradient-boosting classifier).
        """
        X, y = self.get_Xy(name)
        forest, vectorizer = self.fit_primary(X, y)
        vocabulary = self.prune_forest(forest, vectorizer)
        tfidf_df = self.improve(X, vocabulary)
        return self.fit_secondary(tfidf_df, y)

    def predict_proba(self, X):
        """Return class probabilities for an iterable of raw documents ``X``.

        Raises:
            RuntimeError: if ``improve``/``transform`` has not been run yet,
                so no pruned vectorizer is available.
        """
        improved_vectorizer = getattr(self, 'improved_vectorizer', None)
        if improved_vectorizer is None:
            raise RuntimeError('FitModel must be fitted with transform() before predict_proba()')
        # Same dense DataFrame representation as used for stage-2 training.
        features = pd.DataFrame(improved_vectorizer.transform(X).toarray())
        return self.grad_boost.predict_proba(features)


In [81]:
# Build the two-stage pipeline object; its constructor only instantiates the
# estimators -- nothing is fitted until transform() is called.
test = FitModel()

In [82]:
# transform() reads the labelled CSV, fits both stages and returns the result
# of fitting the gradient-boosting classifier.
model = test.transform(name='../data/LABELEDTEST.csv')