In [47]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
import os
import numpy as np
import requests

In [24]:
# Load the sentence-level polarity corpus: one sentence per line, one file per
# class. The file names indicate UTF-8, so decode explicitly instead of
# depending on the platform's default encoding.
with open('data/sentences/rt-polarity-utf8.neg', encoding='utf-8') as f:
    neg = f.read().splitlines()
with open('data/sentences/rt-polarity-utf8.pos', encoding='utf-8') as f:
    pos = f.read().splitlines()

In [25]:
# Number of negative and positive sentences, in that order.
tuple(map(len, (neg, pos)))

(5349, 5346)

In [26]:
# Full labelled corpus: positive sentences (label 1) followed by negative
# sentences (label -1); `target` is aligned index-for-index with `data`.
target = [1] * len(pos) + [-1] * len(neg)
data = pos + neg

In [27]:
# Per-class training size: 80% of each class, truncated to a whole number.
neg_train_size, pos_train_size = (
    int(len(sentences) * 0.8) for sentences in (neg, pos)
)

In [28]:
# The head of each class list is used for training, the tail for testing
# (the lists are sliced in file order, without shuffling).
neg_train = neg[:neg_train_size]
neg_test = neg[neg_train_size:]
pos_train = pos[:pos_train_size]
pos_test = pos[pos_train_size:]

In [29]:
# Both splits list the negative sentences first, then the positive ones.
train_data = list(neg_train)
train_data.extend(pos_train)
test_data = list(neg_test)
test_data.extend(pos_test)

In [30]:
# Sanity check: the two splits together account for every sentence.
sum(map(len, (train_data, test_data))) == sum(map(len, (neg, pos)))

True

In [31]:
# One label per sentence in each split: -1 for negative, 1 for positive.
neg_target_train = [-1] * len(neg_train)
neg_target_test = [-1] * len(neg_test)
pos_target_train = [1] * len(pos_train)
pos_target_test = [1] * len(pos_test)

In [32]:
# Labels in the same negative-then-positive order as train_data / test_data.
target_train = [
    label for labels in (neg_target_train, pos_target_train) for label in labels
]
target_test = [
    label for labels in (neg_target_test, pos_target_test) for label in labels
]

In [33]:
# Baseline evaluation: token counts -> TF-IDF -> linear SVM (hinge loss trained
# with SGD), fitted on the training split and scored on the held-out split.
# `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21;
# `max_iter=5` with `tol=None` runs the same fixed five epochs.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              max_iter=5, tol=None, random_state=42)),
])

text_clf_svm = text_clf_svm.fit(train_data, target_train)
predicted_svm = text_clf_svm.predict(test_data)
# Accuracy: fraction of test sentences whose predicted label matches the truth.
np.mean(predicted_svm == np.asarray(target_test))



0.73598130841121501

In [34]:
# Refit the same pipeline (token counts -> TF-IDF -> linear SVM) on the whole
# labelled corpus (`data` / `target`). No evaluation happens in this cell; the
# resulting `text_clf_svm` replaces the model fitted on the training split.
# `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21;
# `max_iter=5` with `tol=None` runs the same fixed five epochs.
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                              max_iter=5, tol=None, random_state=42)),
])

text_clf_svm = text_clf_svm.fit(data, target)



In [36]:
# Directories holding the document-level reviews, one directory per class.
pos_documents_path, neg_documents_path = "data/documents/pos", "data/documents/neg"

In [37]:
def _read_documents(directory):
    """Read every regular file in `directory`.

    Returns a list with one entry per file; each entry is that file's content
    as a list of lines (line terminators stripped).

    Files are visited in sorted name order because os.listdir() returns
    entries in arbitrary order, which would make the document order differ
    between runs and machines. Entries that are not regular files are skipped
    so that a stray sub-directory cannot make open() fail.
    """
    documents = []
    for file_name in sorted(os.listdir(directory)):
        file_path = os.path.join(directory, file_name)
        if not os.path.isfile(file_path):
            continue
        with open(file_path) as f:
            documents.append(f.read().splitlines())
    return documents


pos_documents = _read_documents(pos_documents_path)
neg_documents = _read_documents(neg_documents_path)

In [38]:
# Number of positive and negative documents loaded, in that order.
tuple(map(len, (pos_documents, neg_documents)))

(1000, 1000)

In [74]:
def add_pos_to_sentence(sentence, timeout=30):
    """Tag `sentence` with the text-processing.com tagging API.

    The service is asked for IOB output. For every line of the returned text,
    the first two space-separated fields (presumably the token and its
    part-of-speech tag -- confirm against the API documentation) are joined
    with "_", and the resulting items are joined with single spaces.

    Parameters
    ----------
    sentence : str
        Text to send to the tagger.
    timeout : float, optional
        Seconds to wait for the HTTP request before giving up. Without a
        timeout a stalled connection would block the notebook indefinitely.

    Returns
    -------
    str or None
        The tagged sentence, or None when the request fails, the service
        answers with a status other than 200, or the body does not contain a
        textual "text" field.
    """
    try:
        response = requests.post(
            "http://text-processing.com/api/tag/",
            data={"text": sentence, "output": "iob"},
            timeout=timeout,
        )
    except requests.RequestException:
        # Network-level failure: report it the same way as a bad status code.
        return None
    if response.status_code != 200:
        return None
    try:
        tagged_text = response.json().get("text")
    except ValueError:
        # The body was not valid JSON.
        return None
    if not isinstance(tagged_text, str):
        return None
    return " ".join(
        "_".join(line.split(" ")[0:2]) for line in tagged_text.split("\n")
    )

In [75]:
def predict_and_add_pos_documents(documents):
    """Label and tag every sentence of every document.

    Each document is a list of sentences. The sentences are classified with
    `text_clf_svm`, then each one is tagged with `add_pos_to_sentence`.

    Parameters
    ----------
    documents : list of list of str
        One inner list of sentences per document.

    Returns
    -------
    list of str
        One "<label>\\t<tagged sentence>" line per sentence, with an empty
        string after the last sentence of each document. If tagging fails for
        a sentence, the index of the current document is printed and the lines
        collected so far are returned, so the result may be partial.
    """
    output = []
    for i, doc in enumerate(documents):
        # A document without sentences has nothing to classify; skipping
        # predict() avoids passing an empty sample set to the estimator, which
        # may reject it.
        results = text_clf_svm.predict(doc) if doc else []
        for sentence, label in zip(doc, results):
            posed_sentence = add_pos_to_sentence(sentence)
            if posed_sentence is None:
                # Tagging failed: say where processing stopped and hand back
                # whatever has been produced up to this point.
                print("Document: {}".format(i))
                return output
            output.append("{}\t{}".format(label, posed_sentence))
        # Empty line marks the end of a document.
        output.append("")
    return output

In [None]:
# Label and tag the positive documents first, then the negative ones.
pos_output, neg_output = (
    predict_and_add_pos_documents(docs) for docs in (pos_documents, neg_documents)
)

In [None]:
# Write the positive results, one line per entry. The `with` block closes the
# file, so buffered data is flushed to disk even if a write fails part-way.
with open('../data/pos.txt', 'w') as thefile:
    for sentence in pos_output:
        thefile.write("%s\n" % sentence)

In [None]:
# Write the negative results, one line per entry. The `with` block closes the
# file, so buffered data is flushed to disk even if a write fails part-way.
with open('../data/neg.txt', 'w') as thefile:
    for sentence in neg_output:
        thefile.write("%s\n" % sentence)