In [41]:
import os
import pandas as pd

from sklearn.datasets import load_files


def _collect_text_records(directory_path: str, encoding: str, records: list) -> None:
    """Recursively append one ``{'text', 'filename'}`` record per ``.txt`` file to ``records``."""
    # Sorted so the resulting row order is reproducible across file systems.
    for filename in sorted(os.listdir(directory_path)):
        path = os.path.join(directory_path, filename)

        if os.path.isdir(path):
            _collect_text_records(path, encoding, records)
        # ``elif``: a directory whose name ends in ".txt" must not be opened as a file.
        elif filename.endswith(".txt"):
            with open(path, encoding=encoding) as f:
                records.append({'text': f.read(), 'filename': path})


def load_data(directory_path: str, encoding: str = "utf-8") -> pd.DataFrame:
    """
    Loads all ``.txt`` files below a directory (recursively) into a Pandas Data Frame.

    :param directory_path: The directory to read.
    :param encoding: Text encoding used to decode every file (defaults to UTF-8).
    :return: A Data Frame with one row per file and the columns ``text`` (file content)
        and ``filename`` (path of the file, prefixed with ``directory_path``). Rows are
        ordered by sorted directory listing; the frame is empty (but still has both
        columns) when no text file is found.
    :raises NotADirectoryError: If ``directory_path`` is not an existing directory.
    """
    if not os.path.isdir(directory_path):
        raise NotADirectoryError(f"Expected an existing directory, got: {directory_path!r}")

    # Collect plain records first and build the frame once; growing a DataFrame
    # row by row is quadratic and ``DataFrame.append`` no longer exists in pandas 2.
    records = []
    _collect_text_records(directory_path, encoding, records)

    return pd.DataFrame(records, columns=['text', 'filename'])


def _load_labelled_split(split: str, data_root: str) -> pd.DataFrame:
    """Load ``<data_root>/<split>/pos`` and ``<data_root>/<split>/neg`` into one labelled frame."""
    # ``assign`` returns a new frame, so the frames returned by ``load_data`` are not mutated.
    positive = load_data(os.path.join(data_root, split, 'pos')).assign(sentiment=1)
    negative = load_data(os.path.join(data_root, split, 'neg')).assign(sentiment=0)

    # ``pd.concat`` replaces ``DataFrame.append``, which was removed in pandas 2.0.
    return pd.concat([positive, negative], ignore_index=True)


def load_training(data_root: str = './data') -> pd.DataFrame:
    """
    Loads the training reviews.

    :param data_root: Directory that contains the ``train/pos`` and ``train/neg`` folders.
    :return: A Data Frame with the columns ``text``, ``filename`` and ``sentiment``
        (1 for reviews from ``pos``, 0 for reviews from ``neg``), positive rows first,
        with a fresh 0..n-1 index.
    """
    return _load_labelled_split('train', data_root)


def load_test(data_root: str = './data') -> pd.DataFrame:
    """
    Loads the test reviews.

    :param data_root: Directory that contains the ``test/pos`` and ``test/neg`` folders.
    :return: A Data Frame with the columns ``text``, ``filename`` and ``sentiment``
        (1 for reviews from ``pos``, 0 for reviews from ``neg``), positive rows first,
        with a fresh 0..n-1 index.
    """
    return _load_labelled_split('test', data_root)

# Labelled training reviews; later cells read the `text` and `sentiment` columns of `df`.
df = load_training()
# Labelled test reviews, used only to score the fitted pipeline.
df_test = load_test()


In [7]:
# pip install spacy
# python -m spacy download en_core_web_sm

import spacy

# Load English tokenizer, tagger, parser, NER and word vectors
# (requires the `en_core_web_sm` model, see the download command at the top of this cell)
nlp = spacy.load("en_core_web_sm")

# Process whole documents
# A single movie review used as the sample input for the demo below.
text = ("Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.")

# Run the loaded pipeline over the review; `doc` carries the annotations read below.
doc = nlp(text)

# Analyze syntax
# Text of every noun chunk found in the review.
print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
# Lemma of every token whose part-of-speech tag is VERB.
print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

# Find named entities, phrases and concepts
# Prints one line per entity: its text followed by its label.
for entity in doc.ents:
    print(entity.text, entity.label_) 

Noun phrases: ['Story', 'a man', 'who', 'unnatural feelings', 'a pig', 'a opening scene', 'a terrific example', 'absurd comedy', 'A formal orchestra audience', 'an insane, violent mob', 'the crazy chantings', 'it', 'singers', 'it', 'no general narrative', 'it', 'the era', 'The cryptic dialogue', 'Shakespeare', 'a third grader', 'a technical level', 'it', 'you', 'some good cinematography', 'future great Vilmos Zsigmond', 'Future stars Sally Kirkland', 'Frederic Forrest']
Verbs: ['start', 'turn', 'stay', 'make', 'put', 'should', 'turn', 'would', 'make', 'seem', 'may', 'think', 'can', 'see']
Shakespeare PERSON
third ORDINAL
Sally Kirkland PERSON
Frederic Forrest PERSON


In [51]:
from spacy import displacy

# Visualise one training review. The row is looked up directly by its index label
# instead of iterating over every row of `df` just to find it.
SAMPLE_INDEX = 1

# Guard keeps the original behaviour of rendering nothing when the label is absent.
if SAMPLE_INDEX in df.index:
    doc = nlp(df.loc[SAMPLE_INDEX, "text"])
    # Named entities highlighted inline in the review text.
    displacy.render(doc, style="ent", jupyter=True)
    # Dependency parse of the same review.
    displacy.render(doc, style="dep", jupyter=True)

In [56]:
# Imported in this cell so it also runs on a fresh kernel (Restart & Run All);
# otherwise `English` is only defined once a later cell has been executed.
from spacy.lang.en import English

# Calling the English language object on a string returns a Doc;
# printing the Doc shows its text.
parser = English()
print(parser("I love Scorsese's films, but this was one, is just one too many."))

I love Scorsese's films, but this was one, is just one too many.


In [48]:
import string
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC


# Create our list of punctuation marks
# NOTE: this is one string, so `word not in punctuations` in the tokenizer is a
# substring test; it therefore also drops empty tokens ("" is a substring of any string).
punctuations = string.punctuation

# Create our list of stopwords
# Use the full model name (same as the earlier cell); the 'en' shortcut is not
# available in spaCy v3.
nlp = spacy.load("en_core_web_sm")
# STOP_WORDS is already imported at the top of this cell.
stop_words = STOP_WORDS

# English language object used by the tokenizer below.
parser = English()


# Creating our tokenizer function
def spacy_tokenizer(sentence):
    """
    Split ``sentence`` into normalised tokens for the scikit-learn vectorizers.

    Every token is replaced by its lowercased, stripped lemma (pronouns, whose lemma
    is reported as "-PRON-", keep their lowercased surface form instead). Stop words
    and punctuation are then removed.

    :param sentence: The raw text to tokenize.
    :return: The list of remaining token strings, in document order.
    """
    # Creating our token object, which is used to create documents with linguistic annotations.
    mytokens = parser(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens ]

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens


# The vectorizers are created only after `spacy_tokenizer` exists; referencing the
# function before its definition raises NameError on a fresh kernel.
bow_vector = CountVectorizer(tokenizer = spacy_tokenizer, ngram_range=(2,2))

tfvectorizer = TfidfVectorizer(tokenizer = spacy_tokenizer)

# Pipeline step that normalises raw text before it is vectorized
class predictors(TransformerMixin):
    """Stateless transformer that applies ``clean_text`` to every input text."""

    def fit(self, X, y=None, **fit_params):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding the cleaned version of each text in ``X``."""
        cleaned = []
        for text in X:
            cleaned.append(clean_text(text))
        return cleaned

    def get_params(self, deep=True):
        """This transformer has no parameters, so an empty dict is returned."""
        return dict()

# Minimal text normalisation helper
def clean_text(text):
    """Return ``text`` without leading/trailing whitespace and converted to lowercase."""
    trimmed = text.strip()
    return trimmed.lower()

# NOTE: train_test_split is imported but not called in this cell; the train and
# test sets come from the separately loaded `df` and `df_test` frames.
from sklearn.model_selection import train_test_split

# NOTE: X and ylabels select the same columns as X_train / y_train below and are
# not used again in this cell.
X = df['text'] # the features we want to analyze
ylabels = df['sentiment'] # the labels, or answers, we want to test against

# The whole training frame is used for fitting.
X_train = df.text
y_train = df.sentiment

# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(max_iter=300)

# Create the pipeline: text cleaning -> TF-IDF features (`tfvectorizer`, not the
# bag-of-words `bow_vector`) -> logistic regression
pipe = Pipeline([("cleaner", predictors()),
                 ('vectorizer', tfvectorizer),
                 ('classifier', classifier)])

# model generation
# Fits every step of the pipeline on the training reviews and their sentiment labels.
pipe.fit(X_train,y_train)

  " = {}.".format(self.n_jobs))


Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x1a3d72b9e8>), ('vectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngra...ty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))])

In [49]:
from sklearn import metrics
# Predicting with a test dataset
# `pipe` is the pipeline fitted in the training cell; `df_test` holds the test reviews.
predicted = pipe.predict(df_test.text)

# Model accuracy, precision and recall
# Each score compares the true test labels (first argument) with the predictions.
print("Logistic Regression Accuracy:",metrics.accuracy_score(df_test.sentiment, predicted))
print("Logistic Regression Precision:",metrics.precision_score(df_test.sentiment, predicted))
print("Logistic Regression Recall:",metrics.recall_score(df_test.sentiment, predicted))

Logistic Regression Accuracy: 0.88124
Logistic Regression Precision: 0.880418296479604
Logistic Regression Recall: 0.88232


In [36]:
# A single hand-picked review used to try out the fitted pipeline.
text = "I love Scorsese's films, but this was one, is just one too many. It lacked tight editing, scenes lingered far too long to make artistic points. The technology of that allowed the actors to be made younger was not impressive given the actors could not move as if they were younger. The one scene that made this point, was when De Niro's character roughs up a corner grocer for shoving his daughter. The scene is difficult to watch given De Niro seemed to struggle as much with his own lack of mobility to carry off the fight scene realistically despite having the willing and compliant younger actor. Al Pacino's character looked nothing like Hoffa. Yes the acting was good in general, but the story was just too drawn out and slow and the characters were just not that interesting to hold your attention. As a man of 58, the hardest fact is recognizing the increasing limitation as we age, this movie opens with De Niro's character in an assisted care facility. It is where this movie should have stayed. The Actors and Directors are all legends, but this movie only took away from some of their carrier luster."
# predict expects an iterable of texts, hence the one-element list; [0] takes the single
# prediction. The label uses the training encoding: 1 = positive review, 0 = negative review.
pipe.predict([text])[0]



0