In [1]:
OPINION_FILE = "../scripts/data/plastic_bags_so_opinions.txt"
RESPONSE_FILE = "../data/raw/plastic_bags_so.txt"

OPINION_FILE = "../scripts/data/eating_meat_so_opinions.txt"
RESPONSE_FILE = "../data/raw/methane_so.tsv"

with open(OPINION_FILE) as i:
    opinionated_responses = set([line.strip() for line in i])
    
with open(RESPONSE_FILE) as i:
    all_responses = set([line.strip() for line in i])

objective_responses = set([r for r in all_responses if r not in opinionated_responses])
    
print("Opinionated:", len(opinionated_responses))
print("Non-opinionated:", len(objective_responses))
print("All:", len(all_responses))

Opinionated: 550
Non-opinionated: 286
All: 836


In [2]:
from sklearn.model_selection import train_test_split

text_and_labels = []

for text in opinionated_responses:
    text_and_labels.append((text, "opinionated"))

for text in objective_responses:
    text_and_labels.append((text, "objective"))
    
texts, labels = zip(*text_and_labels)

train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.3, random_state=1)

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline

preprocessing = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer())
])
  
print("Preprocessing training data...")
train_preprocessed = preprocessing.fit_transform(train_texts)

print("Preprocessing test data...")
test_preprocessed = preprocessing.transform(test_texts)

Preprocessing training data...
Preprocessing test data...


In [4]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

nb_classifier = MultinomialNB()
svm_classifier = LinearSVC()
lr_classifier = LogisticRegression(multi_class="ovr")

print("Training Naive Bayes classifier...")
nb_classifier.fit(train_preprocessed, train_labels)

print("Training SVM classifier...")
svm_classifier.fit(train_preprocessed, train_labels)

print("Training Logistic Regression classifier...")
lr_classifier.fit(train_preprocessed, train_labels)

Training Naive Bayes classifier...
Training SVM classifier...
Training Logistic Regression classifier...


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [5]:
nb_predictions = nb_classifier.predict(test_preprocessed)
svm_predictions = svm_classifier.predict(test_preprocessed)
lr_predictions = lr_classifier.predict(test_preprocessed)

In [6]:
import numpy as np

print("NB Accuracy:", np.mean(nb_predictions == test_labels))
print("SVM Accuracy:", np.mean(svm_predictions == test_labels))
print("LR Accuracy:", np.mean(lr_predictions == test_labels))

NB Accuracy: 0.8087649402390438
SVM Accuracy: 0.8924302788844621
LR Accuracy: 0.8565737051792829


In [7]:
import eli5

eli5.explain_weights(svm_classifier, 
                     feature_names = preprocessing.named_steps["vect"].get_feature_names(),
                     target_names = ["opinionated", "objective"]
                    )



Weight?,Feature
+2.406,we
+2.222,should
+1.225,need
+1.222,could
+1.179,reduces
+1.079,reducing
+1.031,find
… 418 more positive …,… 418 more positive …
… 304 more negative …,… 304 more negative …
-0.984,argue


In [8]:
import spacy

def simple_test(tokens):
    return "should" in tokens or "need" in tokens or "must" in tokens or "needs" in tokens or "important" in tokens or "we" in tokens or "find" in tokens
    

def test_simple_classifier(texts, labels):
    
    nlp = spacy.load("en")
    predicted_labels = []
    correct = 0
    
    for text, label in zip(texts, labels):
        tokens = set([t.text.lower() for t in nlp(text)])
        if simple_test(tokens) and label == "opinionated":
            correct += 1
        elif not simple_test(tokens) and label == "objective":
            correct += 1
            
    return correct/len(texts)

print(test_simple_classifier(texts, labels))

0.7990430622009569
