In [2]:
from utils import *
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
def tfidf_featurize(X_train, X_test, max_features, only_positives=True):
    """Build word-level TF-IDF features by delegating to ``get_features``.

    Args:
        X_train: Forwarded unchanged to ``get_features``.
        X_test: Forwarded unchanged to ``get_features``.
        max_features: Forwarded unchanged to ``get_features``.
        only_positives: Forwarded as the ``only_positives`` keyword.

    Returns:
        The 3-tuple produced by ``get_features``: the fitted vectorizer, the
        transformed training matrix and the transformed test matrix.
    """
    features = get_features(
        X_train,
        X_test,
        max_features,
        only_positives=only_positives,
    )
    # Unpack explicitly so a result that is not a 3-tuple fails here.
    vectorizer, train_matrix, test_matrix = features
    return vectorizer, train_matrix, test_matrix

In [7]:
def get_features(X_train, X_test, max_features, only_positives=True, ngram=False):
    """Fit a word-level TF-IDF vectorizer and transform the train/test texts.

    NOTE(review): ``X_train`` and ``X_test`` are accepted but never read; the
    texts come from the pickles loaded via ``load_pickle(window_path, ...)``.
    Confirm whether callers expect their arguments to be used.

    Args:
        X_train: Unused; kept so existing callers keep working.
        X_test: Unused; kept so existing callers keep working.
        max_features: Passed to ``TfidfVectorizer(max_features=...)``.
        only_positives: When true, the vectorizer is fitted only on training
            texts whose ``g_truth`` entry equals 1; otherwise on every
            training text.
        ngram: When true, n-grams of length 2 to 3 are used; otherwise
            unigrams only.

    Returns:
        ``(tfidf_vect, xtrain_tfidf, xtest_tfidf)``: the fitted vectorizer and
        the transformed ``clean_text`` columns of the train and test data.

    Raises:
        ValueError: If ``only_positives`` is true and no training row has
            ``g_truth == 1`` (the vectorizer cannot be fitted on no text).

    Side effects:
        Calls ``remove_pickle`` for ``train_file`` and ``test_file`` under
        ``feats_path`` (presumably dropping stale cached features; verify
        against ``utils``) and saves the fitted vectorizer as
        ``tfidf_vectorizer.pkl`` under ``fp.pickles_path``.
    """
    ngram_range = (2, 3) if ngram else (1, 1)

    train_x = load_pickle(window_path, fp.train_x_filename)
    test_x = load_pickle(window_path, fp.test_x_filename)

    remove_pickle(feats_path, train_file)
    remove_pickle(feats_path, test_file)

    # Word-level TF-IDF; the token pattern keeps single-character tokens.
    tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}',
                                 max_features=max_features, ngram_range=ngram_range)

    if only_positives:
        # The former `g_truth = train_g_truth` assignment was dead code: the
        # comprehension variable shadowed it and `train_g_truth` is not
        # defined in this notebook, so it has been dropped.
        train_x_positives = [
            text
            for text, label in zip(train_x['clean_text'], train_x['g_truth'])
            if label == 1
        ]
        if not train_x_positives:
            raise ValueError(
                "only_positives=True but no training row has g_truth == 1")
        tfidf_vect.fit(train_x_positives)
    else:
        # Open question from the original author: should only the positive
        # examples be passed here as well?
        tfidf_vect.fit(train_x['clean_text'])

    save_pickle(fp.pickles_path, "tfidf_vectorizer.pkl", tfidf_vect)

    xtrain_tfidf = tfidf_vect.transform(train_x["clean_text"])
    xtest_tfidf = tfidf_vect.transform(test_x["clean_text"])

    return tfidf_vect, xtrain_tfidf, xtest_tfidf

In [6]:
from scipy.sparse import hstack

def combine_features(train_feats, test_feats):
    """Horizontally stack several feature matrices into one.

    Args:
        train_feats: List of training feature matrices to combine; the first
            entry must be a scipy sparse matrix (it is compared against the
            stacked result for the diagnostic print).
        test_feats: List of test feature matrices to combine.

    Returns:
        ``(train_combined_features, test_combined_features)``: the results of
        ``scipy.sparse.hstack`` over each list.
    """
    train_combined_features = hstack(train_feats)
    test_combined_features = hstack(test_feats)

    # Diagnostic: does stacking actually change the first feature block?
    # Compare shapes first, then compare sparsely. Densifying both matrices
    # and using `==` wastes memory and fails on a shape mismatch with recent
    # NumPy, which is exactly the case when extra columns were added.
    first_block = train_feats[0]
    if first_block.shape != train_combined_features.shape:
        is_different = True
    else:
        mismatches = first_block.tocsr() != train_combined_features.tocsr()
        is_different = mismatches.nnz > 0

    print("Is the combined different from tfidf: {}".format(is_different))

    return train_combined_features, test_combined_features

In [None]:
# Training and evaluating traditional classifiers (SVM, random forest, XGBoost, naive Bayes)

In [5]:
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn import naive_bayes, ensemble
import sklearn.metrics as metrics    
import xgboost
from utils import *





In [4]:
def train_and_evaluate(X_train, y_train, X_test, y_test, train_weights, classifier_name="svm", strategy="weights"):
    """Fit a classifier with ``train`` and score it with ``evaluate``.

    Args:
        X_train: Training features, forwarded to ``train``.
        y_train: Training labels, forwarded to ``train``.
        X_test: Test features, forwarded to ``evaluate``.
        y_test: Test labels, forwarded to ``evaluate``.
        train_weights: Per-sample training weights, forwarded to ``train``.
        classifier_name: Classifier selector, forwarded to ``train``.
        strategy: Weighting strategy, forwarded to ``train``.

    Returns:
        ``(y_pred, classifier)``: the test-set predictions and the fitted
        classifier.
    """
    fitted_model = train(
        X_train,
        y_train,
        train_weights,
        classifier_name=classifier_name,
        strategy=strategy,
    )
    predictions = evaluate(fitted_model, X_test, y_test)
    return predictions, fitted_model
    

def train(x_train, y_train, train_weights, classifier_name="svm", strategy="weights"):
    """Build the requested classifier and fit it on the training data.

    Args:
        x_train: Training feature matrix; must expose ``tocsc()`` (for
            example a scipy sparse matrix).
        y_train: Training labels, passed unchanged to ``fit``.
        train_weights: Per-sample weights. Only used when ``strategy`` is
            ``"weights"``; discarded for every other strategy.
        classifier_name: ``"svm"``, ``"linear_svm"``, ``"forest"`` or
            ``"xgboost"``. Any other value selects multinomial naive Bayes.
        strategy: ``"weights"`` fits with ``train_weights`` and no class
            weighting; ``"normal"`` uses neither; any other value is used as
            the estimator's ``class_weight`` setting.

    Returns:
        The fitted classifier.
    """
    if strategy == 'weights':
        class_weight = None
    elif strategy == 'normal':
        class_weight = None
        train_weights = None
    else:
        class_weight = strategy
        train_weights = None

    if classifier_name == "svm":
        classifier = SVC(class_weight=class_weight)
    elif classifier_name == "linear_svm":
        classifier = LinearSVC(class_weight=class_weight)
    elif classifier_name == "forest":
        classifier = ensemble.RandomForestClassifier(class_weight=class_weight)
    elif classifier_name == "xgboost":
        # XGBClassifier has no `class_weight` parameter: passing it as a
        # keyword is swallowed by **kwargs and the strategy is silently
        # ignored. Express the class weighting as per-sample weights instead.
        classifier = xgboost.XGBClassifier()
        if class_weight is not None:
            from sklearn.utils.class_weight import compute_sample_weight
            train_weights = compute_sample_weight(class_weight, y_train)
    else:
        # MultinomialNB takes no `class_weight`; as before, a class-weight
        # strategy is not applied for this fallback classifier.
        classifier = naive_bayes.MultinomialNB()

    classifier.fit(x_train.tocsc(), y_train, sample_weight=train_weights)

    return classifier

def evaluate(classifier, x_test, y_test):
    """Predict on the test set and log the classification metrics.

    Args:
        classifier: Fitted estimator exposing ``predict``.
        x_test: Test feature matrix; must expose ``tocsc()``.
        y_test: True test labels.

    Returns:
        The predictions returned by ``classifier.predict``.
    """
    predictions = classifier.predict(x_test.tocsc())

    # Compute both summaries before logging either one.
    report_text = metrics.classification_report(y_test, predictions)
    matrix = metrics.confusion_matrix(y_test, predictions)

    for summary in (report_text, matrix):
        logger(summary)

    return predictions
    