In [None]:
import sys, os 
import pandas as pd 
import numpy as np
import utils

from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_recall_fscore_support, classification_report, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer


from sklearn.neural_network import MLPClassifier

In [None]:
def load_data(data_dir="data"):
    """Load the train and test splits of the toxic-comment CSV data.

    Reads ``train.csv``, ``test.csv`` and ``test_labels.csv`` from
    ``data_dir``. The test comments are joined to their labels on ``id``,
    and a test row is kept when at least one of its six label columns is
    not ``-1`` (rows whose labels are all ``-1`` are dropped).

    Parameters
    ----------
    data_dir : str, optional
        Directory containing the three CSV files. Defaults to ``"data"``
        (resolved against the current working directory), which matches
        the previously hard-coded location.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_test, y_test)`` as DataFrames. The ``X_*``
        frames hold the ``id`` and ``comment_text`` columns; the ``y_*``
        frames hold the six label columns. The test frames are re-indexed
        from 0 after filtering.
    """
    feature_cols = ['id', 'comment_text']
    label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

    train = pd.read_csv(os.path.join(data_dir, "train.csv"))
    X_train = train[feature_cols]
    y_train = train[label_cols]

    test_comments = pd.read_csv(os.path.join(data_dir, "test.csv"))
    test_labels = pd.read_csv(os.path.join(data_dir, "test_labels.csv"))
    test = test_comments.merge(test_labels, on='id')

    # Keep a row if ANY label differs from -1 (same semantics as OR-ing the
    # six per-column comparisons).
    has_label = (test[label_cols] != -1).any(axis=1)
    test = test[has_label].reset_index(drop=True)

    X_test = test[feature_cols]
    y_test = test[label_cols]

    return X_train, y_train, X_test, y_test

In [None]:
# Constants: feature-extraction and LogisticRegressionCV settings used by later cells.

# Upper n-gram size; consumed as TfidfVectorizer(ngram_range=(1, ngrams)).
ngrams = 1
# The remaining values are passed as same-named keyword arguments to
# LogisticRegressionCV inside run_model().
penalty = 'l2'
Cs = 10
cv = 5
max_iter = 1000 
scoring = 'accuracy'

In [None]:
# Load the train/test splits through the project-local `utils` module, then
# seed NumPy's global random number generator.
# NOTE(review): this notebook also defines its own load_data() in an earlier
# cell, but the call below uses utils.load_data() - confirm which is intended.
X_train, y_train, X_test, y_test = utils.load_data()
np.random.seed(42)

In [None]:
# Display y_train as returned by utils.load_data().
y_train

In [None]:
# Module-level TF-IDF vectorizer over n-grams of size 1..ngrams.
# tfidf_transform() fits and applies this single shared instance.
vectorizer = TfidfVectorizer(ngram_range=(1, ngrams))

In [None]:
def tfidf_transform(corpus, is_train=False):
    """Turn a corpus into TF-IDF features with the module-level ``vectorizer``.

    Parameters
    ----------
    corpus
        The documents handed to the vectorizer.
    is_train : bool, optional
        When true, the shared vectorizer is fitted on ``corpus`` before
        transforming it (``fit_transform``); otherwise the vectorizer is
        only applied (``transform``).

    Returns
    -------
    The value returned by the vectorizer call.
    """
    vectorize = vectorizer.fit_transform if is_train else vectorizer.transform
    return vectorize(corpus)

In [None]:
def print_results(model, X, y):
    """Evaluate ``model`` on ``(X, y)``, print the metrics, and return them.

    Predictions come from ``model.predict(X)``. The summary line and the
    scikit-learn classification report are both printed.

    Returns
    -------
    tuple
        ``(precision, recall, fbeta_score, support, accuracy)``, where the
        first four come from ``precision_recall_fscore_support`` and the
        last from ``accuracy_score``.
    """
    y_pred = model.predict(X)

    prf_and_support = precision_recall_fscore_support(y, y_pred)
    metrics = (*prf_and_support, accuracy_score(y, y_pred))

    summary = "Precision: {}\nRecall: {}\nF-Score: {}\nSupport: {}\nAccuracy {}\n"
    print(summary.format(*metrics))

    print(classification_report(y, y_pred))

    return metrics

In [None]:
# Convert the 'comment_text' column of each split into a plain Python list,
# the form passed to the vectorizer.
train_corpus = X_train['comment_text'].values.tolist()
test_corpus = X_test['comment_text'].values.tolist()

# Fit the shared vectorizer on the training corpus only, then reuse the
# fitted vectorizer to transform the test corpus.
X_train_tfidf = tfidf_transform(train_corpus, is_train=True)
X_test_tfidf = tfidf_transform(test_corpus, is_train=False)

In [None]:
def run_model():
    """Fit and evaluate one ``LogisticRegressionCV`` per label column.

    For each of the six label names, a class-weight-balanced model is
    fitted on the notebook-level ``X_train_tfidf`` against that column of
    ``y_train``, and its metrics are printed via ``print_results`` for both
    the training features and ``X_test_tfidf`` / ``y_test``.

    Reads the notebook-level settings ``Cs``, ``cv``, ``penalty``,
    ``scoring`` and ``max_iter``. Returns nothing.
    """
    estimator_options = dict(
        class_weight="balanced",
        Cs=Cs,
        cv=cv,
        penalty=penalty,
        scoring=scoring,
        max_iter=max_iter,
    )

    for label in ('toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'):
        print("Train model on the \"{}\" label.\n".format(label))
        train_targets = y_train[label].values.tolist()
        test_targets = y_test[label].values.tolist()

        classifier = LogisticRegressionCV(**estimator_options)
        classifier.fit(X_train_tfidf, train_targets)

        # Report on the training split first, then the held-out split.
        evaluations = (
            ("Train Results\n", X_train_tfidf, train_targets),
            ("Test Results\n", X_test_tfidf, test_targets),
        )
        for heading, features, targets in evaluations:
            print(heading)
            print_results(classifier, features, targets)

In [None]:
# Fit and evaluate one model per label; metrics are printed, nothing is returned.
run_model()



In [None]:
# Rebind X_train, y_train, X_test and y_test to the splits returned by
# utils.load_reddit_data(), replacing the ones loaded earlier, then re-seed
# NumPy's global random number generator.
X_train, y_train, X_test, y_test = utils.load_reddit_data()
np.random.seed(42)

In [None]:
# Convert the 'body' column of each split into a plain Python list, the form
# passed to the vectorizer.
train_corpus = X_train['body'].values.tolist()
test_corpus = X_test['body'].values.tolist()

# is_train=True refits the shared module-level vectorizer on this corpus, and
# X_train_tfidf / X_test_tfidf are overwritten with the new features.
X_train_tfidf = tfidf_transform(train_corpus, is_train=True)
X_test_tfidf = tfidf_transform(test_corpus, is_train=False)

In [None]:
# Display the training feature matrix produced by tfidf_transform().
X_train_tfidf

In [None]:
# Display the test feature matrix produced by tfidf_transform().
X_test_tfidf

In [7]:
# NOTE(review): repeats the utils.load_reddit_data() call made in an earlier
# cell - confirm whether this duplicate cell is still needed.
X_train, y_train, X_test, y_test = utils.load_reddit_data()



In [6]:
# NOTE(review): this cell's recorded execution count (6) is lower than the
# preceding cell's (7), and the saved output below lists label columns -
# the output may be stale; re-run from a fresh kernel to confirm.
X_train

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
...,...,...,...,...,...,...
63973,0,0,0,0,0,0
63974,0,0,0,0,0,0
63975,0,0,0,0,0,0
63976,1,0,1,0,1,0
