In [3]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from nltk.corpus import gutenberg, stopwords
from collections import Counter
import nltk
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.svm import SVC
from sklearn.naive_bayes import BernoulliNB
from nltk.corpus import gutenberg, stopwords

# Challenge 1

### A model was given to us that identified features of a sentences from alice in wonderland and persuasion. Our target was to identify the author of the sentence by using these features. Try to improve model by adding more features

In [4]:
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

In [6]:
# Utility function for standard text cleaning.
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = re.sub(r'--',' ',text)
    text = re.sub(r'\[.*\]', "", text)
    text = ' '.join(text.split())
    return text
    
# Load and clean the data.
persuasion = gutenberg.raw('austen-persuasion.txt')
alice = gutenberg.raw('carroll-alice.txt')

# The Chapter indicator is idiosyncratic
persuasion = re.sub(r'Chapter \d+', '', persuasion)
alice = re.sub(r'CHAPTER .*', '', alice)
    
alice = text_cleaner(alice[:int(len(alice)/3)])
persuasion = text_cleaner(persuasion[:int(len(persuasion)/3)])

In [7]:
# Parse the cleaned novels. This can take a bit.
nlp = spacy.load('en')
alice_doc = nlp(alice)
persuasion_doc = nlp(persuasion)

In [8]:
alice_sents = [[sent, 'Caroll'] for sent in list(alice_doc.sents)]
persuasion_sents = [[sent, 'Austen'] for sent in list(persuasion_doc.sents)]

In [9]:
sentences = pd.DataFrame(alice_sents + persuasion_sents, columns=['sent', 'author'])

In [10]:
# Utility function to create a list of the 2000 most common words.
def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    all_words = [token.lemma_ for token in text if token.is_punct == False and token.is_stop == False]
    
    # Return the most common words.
    return [item[0] for item in Counter(all_words).most_common(2000) if item[1] > 1]

# Creates a data frame with features for each word in our common word set.
# Each value is the count of the times the word appears in each sentence.
def bow_features(sentences):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame()
    df['sentence'] = sentences['sent']
    
    def sent_length(s):
        return len(s['sentence'])
    
    df['sent_length'] = df.apply(sent_length, axis=1)
    
    def punct(s):
        punct = [token.text for token in s['sentence']
                 if token.is_punct == True]
        return Counter(punct)
        
    df['punct'] = df.apply(punct, axis=1)
    
    def lemma(s):
        words = [token.lemma_ for token in s['sentence']
                 if token.is_punct == False
                 and token.is_stop == False
                 and token.lemma_ in common_words]
        return Counter(words)
    
    df['lemmas'] = df.apply(lemma, axis=1)
    
    def pos(s):
        pos = [token.pos_ for token in s['sentence'] if token.is_punct == False and token.is_stop == False]
        return Counter(pos)
    
    df['pos'] = df.apply(pos, axis=1)
    
    def dep(s):
        dep = [token.dep_ for token in s['sentence'] if token.is_punct == False and token.is_stop == False]
        return Counter(dep)
    
    df['dependencies'] = df.apply(dep, axis=1)
    
    for index in df.index:
        words = [x for x in df.loc[index, 'lemmas']]
        for word in words:
            df.loc[index, 'word_' + word] = df.loc[index, 'lemmas'][word]

        pos = [x for x in df.loc[index, 'pos']]
        for part in pos:
            df.loc[index, 'pos_' + part] = df.loc[index, 'pos'][part]

        puncts = [x for x in df.loc[index, 'punct']]
        for punctuation in puncts:
            df.loc[index, 'punct_' + punctuation] = df.loc[index, 'punct'][punctuation]
            
        denpendencies = [x for x in df.loc[index, 'dependencies']]
        for depend in denpendencies:
            df.loc[index, 'dep_' + depend] = df.loc[index, 'dependencies'][depend]

    analyzer = SentimentIntensityAnalyzer()
    
    def postive_sentiment(s):
        return analyzer.polarity_scores(s['sentence'].text)['pos']

    def negative_sentiment(s):
        return analyzer.polarity_scores(s['sentence'].text)['neg']

    def compound_sentiment(s):
        return analyzer.polarity_scores(s['sentence'].text)['compound']
    
    df['postive_sentiment'] = df.apply(postive_sentiment, axis=1)
    df['negative_sentiment'] = df.apply(negative_sentiment, axis=1)
    df['compound_sentiment'] = df.apply(compound_sentiment, axis=1)
    
                
    df['text_source'] = sentences['author']
    
    return df

# Set up the bags.
alicewords = bag_of_words(alice_doc)
persuasionwords = bag_of_words(persuasion_doc)

# Combine bags to create a set of unique words.
common_words = set(alicewords + persuasionwords)

In [11]:
# Create our data frame with features. This can take a while to run.
df = bow_features(sentences)

In [12]:
df.fillna(int(0), inplace=True)

In [13]:
def text_source(s):
    if s['text_source'] == 'Caroll':
        return 0
    if s['text_source'] == 'Austen':
        return 1
    
df['author'] = df.apply(text_source, axis=1)

In [14]:
features = df.drop(['author', 'text_source', 'sentence', 'lemmas', 'pos', 'punct', 'dependencies'], axis=1)

In [17]:
Y = df['author']
X = features

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=200)

In [18]:
rfc = ensemble.RandomForestClassifier()
rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.9952107279693486

Test set score: 0.9008620689655172


In [19]:
lr = LogisticRegression(penalty='l2') # No need to specify l2 as it's the default. But we put it for demonstration.
lr.fit(X_train, y_train)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

Training set score: 0.9636015325670498

Test set score: 0.9181034482758621


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# Challenge 2:

### Find out whether your new model is good at identifying Alice in Wonderland vs any other work

In [22]:
hamlet = gutenberg.raw('shakespeare-hamlet.txt')
alice = gutenberg.raw('carroll-alice.txt')

def text_cleaner(text):
    text = re.sub(r'\[.*\]', '', text)
    text = ' '.join(text.split())
    return text

alice = re.sub(r'CHAPTER .*', '', alice)
alice = re.sub(r'\*', '', alice)
hamlet = re.sub(r'Actus.*\.', '', hamlet)
hamlet = re.sub(r'Scoena.*\.', '', hamlet)

hamlet = text_cleaner(hamlet[:int(len(hamlet)/3)])
alice = text_cleaner(alice[:int(len(alice)/3)])

In [23]:
hamlet_doc = nlp(hamlet)
alice_doc = nlp(alice)

In [24]:
hamlet_sents = [[sent, 'Shakespear'] for sent in list(hamlet_doc.sents)]
alice_sents = [[sent, 'Carroll'] for sent in list(alice_doc.sents)]

all_sents = hamlet_sents + alice_sents

In [25]:
sentences_authors = pd.DataFrame(all_sents, columns=['sentence', 'author'])

In [26]:
def BoW(text):
    
    all_words = [token.lemma_ for token in text if token.is_punct == False and token.is_stop == False]
    
    return [item[0] for item in Counter(all_words).most_common(2000) if item[1] > 1]

hamlet_words = BoW(hamlet_doc)
alice_words = BoW(alice_doc)

all_words = set(hamlet_words + alice_words)

In [27]:
def BoW_features(sentences, all_words):
    df = pd.DataFrame()
    df['sentence'] = sentences['sentence']
    
    def words(s):
        words = [token.lemma_ for token in s['sentence']
                 if token.is_punct == False
                 and token.is_stop == False
                 and token.lemma_ in all_words]
        return Counter(words)
    
    def pos(s):
        pos = [token.pos_ for token in s['sentence'] if token.is_punct == False and token.is_stop == False]
        return Counter(pos)
    
    def dep(s):
        dep = [token.dep_ for token in s['sentence'] if token.is_punct == False and token.is_stop == False]
        return Counter(dep)
    
    def punct(s):
        punct = [token.text for token in s['sentence'] if token.is_punct == True]
        return Counter(punct)
    
    df['sent_length'] = df['sentence'].apply(lambda x: len(x))
    df['words'] = df.apply(words, axis=1)
    df['parts_of_speech'] = df.apply(pos, axis=1)
    df['dependencies'] = df.apply(dep, axis=1)
    df['punctuation'] = df.apply(punct, axis=1)
    
    analyzer = SentimentIntensityAnalyzer()
    df['sentiment'] = df['sentence'].apply(lambda x: analyzer.polarity_scores(x.text)['compound'])
    
    for index in df.index:
        
        words = [x for x in df.loc[index, 'words']]
        for word in words:
            df.loc[index, word + '_count'] = df.loc[index, 'words'][word]
        
        pos = [x for x in df.loc[index, 'parts_of_speech']]
        for part in pos:
            df.loc[index, part + '_pos'] = df.loc[index, 'parts_of_speech'][part]
        
        dep = [x for x in df.loc[index, 'dependencies']]
        for depend in dep:
            df.loc[index, depend + '_dep'] = df.loc[index, 'dependencies'][depend]
        
        punct = [x for x in df.loc[index, 'punctuation']]
        for punc in punct:
            df.loc[index, punc + '_punct'] = df.loc[index, 'punctuation'][punc]
    
    df['author'] = sentences['author'].apply(lambda x: 1 if x == 'Carroll' else 0)
    
    return df

In [28]:
df = BoW_features(sentences_authors, all_words)

In [29]:
df = df.fillna(int(0))

In [30]:
features = df.drop(['author', 'sentence', 'words', 'parts_of_speech', 'punctuation', 'dependencies'], axis=1)

In [31]:
Y = df['author']
X = features

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=200)

In [32]:
lr = LogisticRegression(penalty='l2') # No need to specify l2 as it's the default. But we put it for demonstration.
lr.fit(X_train, y_train)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

Training set score: 0.9766970618034447

Test set score: 0.939209726443769


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [37]:
rfc = ensemble.RandomForestClassifier(n_estimators=500, max_depth=16)
rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.9645390070921985

Test set score: 0.9224924012158054
