In [2]:
%load_ext autoreload
%autoreload 2

import warnings; warnings.simplefilter('ignore')
import os, codecs, string, random
import numpy as np
from numpy.random import seed as random_seed
from numpy.random import shuffle as random_shuffle
import matplotlib.pyplot as plt
%matplotlib inline  

seed = 42
random.seed(seed)
np.random.seed(seed)

#NLP libraries
import spacy, nltk, gensim, sklearn
import pyLDAvis.gensim

#Vader
import vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

#Scikit imports
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

#The data
corpus_root = 'books/'

09 - Handling text 1


Implementing the natural language processing pipeline  
Solving four typical language processing tasks:  
Sentiment analysis  
Document classification  
Topic detection  
Semantic analysis  
Part 1  

load the books    
Remove the new lines  
put in raw text, get a Spacy object  
create our own NLP pipeline with Spacy  
Step 1: Sentence splitting  
Step 2: Tokenization  
Step 3: Part of speech tagging  
Step 4: Named entity recognition  
Step 5: Removing stop words  
Step 6: Lemmatization  
Step 7: Chunking (shallow parsing)  
Step 8: Dependency parsing  
Counting word occurrences  
The NLP pipeline with Spacy  

# Sentiment

In [5]:
# VADER sentiment scorer; its polarity_scores() is applied to each sentence in the next cell.
analyzer = SentimentIntensityAnalyzer()

# Sample sentence that is run through the spaCy pipeline below.
example = 'I am already far north of London, and as I walk in the streets of Petersburgh, I feel a cold northern breeze play upon my cheeks, which braces my nerves and fills me with delight.'

# NOTE(review): the 'en' shortcut name looks spaCy-2.x specific; newer releases presumably need a
# full package name such as 'en_core_web_sm' — confirm against the installed spaCy version.
nlp = spacy.load('en')
# Parse the example; `doc.sents` is iterated by the sentiment cell.
doc = nlp(example)

  and should_run_async(code)


## Get polarity score

In [7]:
# Score every sentence exactly once with VADER, then pull out the components of interest.
# (The previous version used list comprehensions purely for their append() side effects,
# re-scored each sentence three times and made the cell display a list of None.)
sentence_scores = [analyzer.polarity_scores(sent.text) for sent in doc.sents]

positive_sent = [scores['pos'] for scores in sentence_scores]
negative_sent = [scores['neg'] for scores in sentence_scores]
total_sent = [scores['compound'] for scores in sentence_scores]

  and should_run_async(code)


[None]

# Document classification

### load our corpus via NLTK this time

In [None]:
from nltk.corpus import PlaintextCorpusReader

# The second argument is a regular expression over file ids. The dot before "txt" is escaped
# so it only matches a literal "."; the unescaped '.*.txt' also accepted names like "notes_txt".
our_books = PlaintextCorpusReader(corpus_root, r'.*\.txt')
print(our_books.fileids())

# Segment the books into equally long chunks

### Yield successive n-sized chunks from l

In [None]:
def get_chunks(l, n):
    """Yield successive n-sized chunks from l.

    Parameters
    ----------
    l : sequence
        Anything that supports ``len()`` and slicing (list, str, corpus view, ...).
    n : int
        Chunk size; must be at least 1.

    Yields
    ------
    Slices ``l[i:i + n]`` for i = 0, n, 2n, ...; the last one may be shorter than n.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1. Because this is a generator, the error surfaces
        when iteration starts, not when the function is called.
    """
    # A negative step would make range() empty and silently produce no chunks at all.
    if n < 1:
        raise ValueError("chunk size n must be a positive integer")
    for start in range(0, len(l), n):
        yield l[start:start + n]

### dictionary of books

In [None]:
# Integer label for every corpus file: its position in the reader's file listing.
book_id = {fileid: label for label, fileid in enumerate(our_books.fileids())}

### segment the books into equally long chunks

In [None]:
chunks = []
chunk_class = []  # for every chunk, the id (from book_id) of the book it was cut from; used for evaluation
limit = 500  # maximum number of chunks kept per book
size = 50  # how many sentences per chunk/page

for f in our_books.fileids():
    sentences = our_books.sents(f)
    print(f, ":")
    print('Number of sentences:', len(sentences))

    # Each sentence is a list of tokens. Flatten every group of `size` sentences into one
    # space-joined string. This is done once per book: the earlier version repeated the
    # regrouping loop, which appended every chunk to `chs` a second time.
    chs = [
        " ".join(token for sentence in group for token in sentence)
        for group in get_chunks(sentences, size)
    ]
    print("Number of chunks:", len(chs), '\n')

    # Cap at `limit` so that no single book contributes more than `limit` chunks.
    kept = chs[:limit]
    chunks.extend(kept)
    chunk_class.extend([book_id[f]] * len(kept))

# Representing the chunks with bag-of-words

In [None]:
vectorizer = CountVectorizer()
# Alternative initialisation: set a minimum number of occurrences, e.g. CountVectorizer(min_df=2),
# to avoid an intractable number of features.
# Learn the vocabulary from the chunks and build their bag-of-words feature matrix.
X = vectorizer.fit_transform(chunks)

### Fit the regularized logistic regression and find c using cross_val

In [None]:
# Binary target: 1 for chunks whose book id is 1, 0 for all other books.
# NOTE(review): the original comment says id 1 is Frankenstein — confirm against book_id.
Y = (np.array(chunk_class) == 1).astype(int)

#shuffle the data
X, Y = shuffle(X, Y, random_state=0)

#split into training and test set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

#the grid of regularization parameter
grid = [0.01, 0.1, 1, 10, 100, 1000, 10000]

# Mean 10-fold cross-validated accuracy on the training set for every candidate C.
accs = []
for c in grid:
    candidate = LogisticRegression(random_state=0, solver='lbfgs', C=c)
    accs.append(np.mean(cross_val_score(candidate, X_train, Y_train, cv=10)))

# cross_val_score only fits clones of the estimator it receives, so no fitted model survives
# the loop. Refit on the whole training set with the best-scoring C so that `clf` (and its
# coef_) can be used by the following cells.
best_c = grid[int(np.argmax(accs))]
clf = LogisticRegression(random_state=0, solver='lbfgs', C=best_c)
clf.fit(X_train, Y_train)

### Interpret bag of words

In [None]:
# Coefficients of the binary classifier, one per vocabulary entry.
coefs = clf.coef_[0]
# Indices of the 20 largest coefficients (argpartition returns them in no particular order).
# Renamed from `top_three`, which was misleading for a selection of 20.
top_indices = np.argpartition(coefs, -20)[-20:]

# Newer scikit-learn releases expose get_feature_names_out() in place of get_feature_names();
# use whichever this installation provides.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()
print(np.array(feature_names)[top_indices])

### word embeddings

In [None]:
# First ten components of the vector spaCy assigns to the example sentence.
list(nlp(example).vector[:10])

# Topic detection

### doc processing

In [None]:
STOPWORDS = spacy.lang.en.stop_words.STOP_WORDS

docs = []
# `n_threads` is no longer passed: it is deprecated in spaCy and rejected by current releases.
# The loop variable is not called `doc`, so the parsed example from the sentiment section is
# not overwritten and no name switches from a spaCy Doc to a list.
for parsed in nlp.pipe(chunks, batch_size=10):

    # Keep only words (no numbers, no punctuation): lemmas of alphabetic, non-stop-word tokens.
    lemmas = [token.lemma_ for token in parsed if token.is_alpha and not token.is_stop]

    # Filter the lemmas against the stop-word list as well and keep only words of length 3 or more.
    lemmas = [lemma for lemma in lemmas if lemma not in STOPWORDS and len(lemma) > 2]

    # Add named entities, but only if they span more than one token.
    lemmas.extend(str(entity) for entity in parsed.ents if len(entity) > 1)

    docs.append(lemmas)

### add bigrams

In [None]:
# Add bigrams too
from gensim.models.phrases import Phrases

# Learn bigrams from the processed documents (only ones that appear 15 times or more).
bigram = Phrases(docs, min_count=15)

# A token containing '_' in the transformed document is a bigram; append those
# to the document they were detected in.
for tokens in docs:
    tokens.extend([token for token in bigram[tokens] if '_' in token])

### Create a dictionary representation of the documents, and filter out frequent and rare words.

In [None]:
from gensim.corpora import Dictionary
# Build the gensim dictionary from all processed documents.
dictionary = Dictionary(docs)

# Remove rare and common tokens.
# Filter out words that occur too frequently or too rarely.
# max_freq is handed to filter_extremes as `no_above`, min_wordcount as `no_below`.
max_freq = 0.5
min_wordcount = 5
dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq)

### Bag-of-words representation of the documents.

In [None]:
# Convert every processed document to its bag-of-words form using the filtered dictionary.
corpus = [dictionary.doc2bow(doc) for doc in docs]
# Optional persistence step, currently disabled: MmCorpus.serialize("models/corpus.mm", corpus)
# Report the vocabulary size and the number of documents.
print('Number of unique tokens: %d' % len(dictionary))
print('Number of chunks: %d' % len(corpus))

### Model LDA

In [None]:
from gensim.models import LdaMulticore

# Training options forwarded unchanged to the LDA constructor.
params = {'passes': 10, 'random_state': seed}
base_models = {}

# Four-topic LDA over the bag-of-words corpus, trained with six workers.
model = LdaMulticore(
    corpus=corpus,
    id2word=dictionary,
    num_topics=4,
    workers=6,
    **params,
)

### Show topics

In [None]:
# Display the topics of the fitted LDA model, limited to five words per topic.
model.show_topics(num_words=5)

### Plot topics

In [None]:
# Prepare the pyLDAvis data from the fitted model, corpus and dictionary, then render it in the notebook.
# NOTE(review): `data` is rebound to a plain list in the script-parsing cell further down.
data =  pyLDAvis.gensim.prepare(model, corpus, dictionary)
pyLDAvis.display(data)

### Accuracy

In [None]:
from collections import Counter

# Most probable topic for every chunk. A chunk whose bag-of-words is empty gets None instead
# of being skipped, so this list stays aligned position by position with chunk_class.
sent_to_cluster = [
    max(model[bow], key=lambda topic_prob: topic_prob[1])[0] if bow else None
    for bow in corpus
]

# accuracy: for every book, the share of its chunks that fall into its most frequent topic
for book, label in book_id.items():
    assigned = [given for real, given in zip(chunk_class, sent_to_cluster) if real == label]
    if not assigned:
        # Nothing to evaluate; avoids the IndexError of most_common(1)[0] on an empty list.
        print(book, ": no chunks")
        print("------")
        continue
    topic_counts = Counter(topic for topic in assigned if topic is not None)
    most_common, num_most_common = topic_counts.most_common(1)[0] if topic_counts else (None, 0)
    print(book, ":", most_common, "-", num_most_common)
    # Divide by the number of chunks this book really has; dividing by `limit`
    # under-reports accuracy for books with fewer than `limit` chunks.
    print("Accuracy:", num_most_common / len(assigned))
    print("------")

# Semantic analysis based on lexical categories

### Sem study

In [None]:
# NOTE(review): `books` and `lexicon` are not defined in any cell visible here — presumably
# `books` holds the raw book texts and `lexicon` is an Empath lexicon; confirm they are created earlier.
nlp = spacy.load('en')
# Parse the text stored at books[3].
doc = nlp(books[3])
# Score the whole text against five lexical categories.
empath_features = lexicon.analyze(doc.text,categories = ["disappointment", "pain", "joy", "beauty", "affection"])

### evolution of topics

In [None]:
# Window boundaries: consecutive 150000-character slices of the book text.
# Only windows between two boundaries are analysed, so the text after the last
# boundary is not included.
bins = range(0, len(doc.text), 150000)
love, pain, beauty, affection = [], [], [], []

for start, end in zip(bins[:-1], bins[1:]):
    # Normalised category scores for the current window.
    empath_features = lexicon.analyze(
        doc.text[start:end],
        categories = ["love", "pain", "joy", "beauty", "affection"],
        normalize = True,
    )
    love.append(empath_features["love"])
    pain.append(empath_features["pain"])
    beauty.append(empath_features["beauty"])
    affection.append(empath_features["affection"])

# One line per category, drawn in the same order as before.
for series, name in ((love, "love"), (beauty, "beauty"), (affection, "affection"), (pain, "pain")):
    plt.plot(series, label = name)

plt.xlabel("progression in the book")
plt.ylabel("frequency of a category")
plt.legend()

### create custom categories based on seed terms

In [None]:
# Create a custom category called "healthy_food" from four seed terms, using the "nytimes" model.
lexicon.create_category("healthy_food", ["healthy_food","low_carb","kale","avocado"], model="nytimes")

The `model` argument can be `reddit` or `nytimes`.

# Handling text 2

### reading txt to df

In [None]:
# Parse the script file into one row per spoken line.
# Lines starting with ">> " are episode headers, lines starting with "> " are scene headers,
# and every other line has the form "<character>: <utterance>".
# NOTE(review): `pd` is not imported in the import cell visible at the top — confirm pandas is imported.
season = ""
episode = ""
scene = ""
data = []
with open("data/all_scripts.txt") as f:
    # Iterate the file directly instead of materialising every line with readlines().
    for raw_line in f:
        # Strip only the line terminator: slicing off the last character would cut a real
        # character when the final line has no trailing newline.
        text = raw_line.rstrip("\n")
        if not text:
            # A blank line carries no utterance and would break the unpacking below.
            continue
        if text.startswith(">> "):
            # Episode header: characters 10-11 hold the season number, the rest is the title.
            season = int(text[10:12])
            episode = text[3:]
            continue
        if text.startswith("> "):
            scene = text[2:]
            continue
        # Split on the first ": " only, so colons inside the utterance are preserved.
        character, utterance = text.split(": ", 1)
        data.append([season, episode, scene, character, utterance])
lines = pd.DataFrame(data, columns=["Season", "Episode", "Scene", "Character", "Line"])

### Replace punctuation marks and lowercase all the text

In [None]:
def clean_line(line):
    """Return `line` lower-cased, with every entry of EXCLUDE_CHARS replaced by a space."""
    cleaned = line
    for unwanted in EXCLUDE_CHARS:
        cleaned = cleaned.replace(unwanted, ' ')
    return cleaned.lower()

# Normalise every spoken line in place and preview the result.
lines["Line"] = lines["Line"].apply(clean_line)
lines.head()

### Count  and plot word frequency

In [None]:
# One entry per word occurrence. str.split() with no argument splits on any run of whitespace,
# so the empty strings that split(' ') produces for consecutive spaces are not counted as words.
word_occurrences = pd.Series(
    [word for text in lines["Line"] for word in text.split()],
    dtype=object,
    name="Word",
)

# Frame indexed by word with a single "Frequency" column holding the occurrence count.
corpus_frequency = word_occurrences.to_frame().groupby("Word").size().to_frame("Frequency")

# `by="Frequency"` is dropped: older pandas ignored it, but since pandas 1.4 it groups the
# histogram by that column instead of plotting it. The frame has one column, so a plain hist suffices.
corpus_frequency.plot.hist(bins=100, title="Frequency histogram")
corpus_frequency.plot.hist(loglog=True, bins=np.logspace(0, 6, 100),
                           title="Frequency histogram (loglog scale)");

### Count the number of words per character

In [None]:
# Words per line. split() without an argument ignores repeated spaces, whereas split(' ')
# counts an empty string for each of them (and counts 1 for an empty line).
lines["Words"] = lines["Line"].apply(lambda text: len(text.split()))
# Select the column before aggregating so that only "Words" is summed rather than every column.
words_per_char = lines.groupby("Character")["Words"].sum()
# NOTE(review): `recurrent_chars` is not defined in the visible cells — presumably a frame or
# series indexed by character name; confirm it is created earlier.
words_per_char[recurrent_chars.index]

### TfidfVectorizer with stopwords and tokenizer

In [None]:
with open("helpers/stopwords.txt") as f:
    # Strip only the line terminator: x[:-1] cut the last character of the final stop word
    # whenever the file did not end with a newline.
    stop_words = [entry.rstrip("\n") for entry in f]
# Tf-idf features with the custom stop-word list and tokenizer, built with min_df=2.
tfidf = TfidfVectorizer(stop_words=stop_words, tokenizer=simple_tokeniser, min_df=2)
# Fit on the training lines only; the test lines are transformed with the fitted vocabulary.
train_vectors = tfidf.fit_transform(train_set["Line"])
test_vectors = tfidf.transform(test_set["Line"])

### Find the set of all words in the training set that are only uttered by Sheldon

In [None]:
# Map every word of the TRAINING lines to the set of characters who utter it.
# The rule is derived from train_set (not from all `lines`) so that no information about
# the test set leaks into the predictor evaluated below.
speakers_by_word = {}
for character, text in zip(train_set["Character"], train_set["Line"]):
    for word in text.split():
        speakers_by_word.setdefault(word, set()).add(character)

# Series indexed by word whose values are sets of character names.
words_for_chars = (
    pd.Series(speakers_by_word, dtype=object, name="Character")
    .rename_axis("Word")
    .sort_index()
)
# Words whose only speaker is Sheldon.
is_sheldon_only = words_for_chars.map(lambda speakers: speakers == {"Sheldon"}).astype(bool)
sheldon_words = words_for_chars[is_sheldon_only].index

# Set for constant-time membership tests inside the predictor.
sheldon_word_set = frozenset(sheldon_words)

def contains_sheldon_words(line):
    """Return True if at least one whitespace-separated token of `line` is a Sheldon-only word.

    Tokens are compared as whole words; the earlier `word in line` test was a substring
    match, so a short word also matched inside longer, unrelated words.
    """
    return not sheldon_word_set.isdisjoint(line.split())

test_pred = test_set["Line"].apply(contains_sheldon_words)
test_true = test_set["Character"] == "Sheldon"

### Use SVD

In [None]:
# Reduce the tf-idf vectors to 25 components. The decomposition is seeded explicitly so that
# re-running this cell gives the same components regardless of how much of the global NumPy
# random stream earlier cells have consumed.
svd = TruncatedSVD(n_components=25, random_state=seed)
# Fit on the training vectors only, then project the test vectors with the same components.
train_svd = svd.fit_transform(train_vectors)
test_svd = svd.transform(test_vectors)

### Logistic Regression

In [None]:
# Binary target: True for lines spoken by Sheldon.
train_labels = train_set["Character"] == "Sheldon"

# Logistic regression with built-in 10-fold cross-validation, fitted on the SVD features.
model = LogisticRegressionCV(cv=10).fit(train_svd, train_labels)

# Predictions on both splits, used by the metrics cell.
train_pred = model.predict(train_svd)
test_pred = model.predict(test_svd)

### Metrics

In [None]:
def confusion_matrix(test, pred, positive=1):
    """Build a 2x2 confusion matrix for binary labels.

    Parameters
    ----------
    test : array-like
        True labels (pandas Series, NumPy array or list) with values 0/1 or False/True.
    pred : array-like
        Predicted labels, same length as `test`.
    positive : int, default 1
        Label treated as the positive class; the other one of {0, 1} is the negative class.

    Returns
    -------
    pandas.DataFrame
        Integer counts. Rows are "Positive Prediction" / "Negative Prediction",
        columns are the true classes "Positive" / "Negative".

    Raises
    ------
    ValueError
        If `test` and `pred` do not have the same shape.
    """
    negative = 0 if positive == 1 else 1
    # np.asarray accepts Series, arrays and lists alike (the former `test.values` required a Series).
    actual = np.asarray(test)
    predicted = np.asarray(pred)
    if actual.shape != predicted.shape:
        # Mismatched shapes would otherwise fail obscurely or broadcast into wrong counts.
        raise ValueError("test and pred must have the same shape")

    cm = np.zeros((2, 2))
    cm[0, 0] = np.logical_and(predicted == positive, actual == positive).sum()
    cm[0, 1] = np.logical_and(predicted == positive, actual == negative).sum()
    cm[1, 0] = np.logical_and(predicted == negative, actual == positive).sum()
    cm[1, 1] = np.logical_and(predicted == negative, actual == negative).sum()
    df = pd.DataFrame(cm.astype(int), columns=["Positive", "Negative"])
    df.index = ["Positive Prediction", "Negative Prediction"]
    return df

def accuracy(confusion_matrix):
    """Share of all samples that lie on the main diagonal of a 2x2 confusion array."""
    correct = confusion_matrix[0, 0] + confusion_matrix[1, 1]
    total = confusion_matrix.sum()
    return correct / total

def precision(confusion_matrix):
    """True positives divided by all positive predictions (first row of the array).

    Returns 1 when the first row sums to zero, i.e. nothing was predicted positive.
    """
    predicted_positive = confusion_matrix[0, :].sum()
    if predicted_positive == 0:
        return 1
    return confusion_matrix[0, 0] / predicted_positive

def recall(confusion_matrix):
    """True positives divided by all actual positives (first column of the array).

    Returns 1 when the first column sums to zero, i.e. there are no actual positives.
    """
    actual_positive = confusion_matrix[:, 0].sum()
    if actual_positive == 0:
        return 1
    return confusion_matrix[0, 0] / actual_positive

def fscore(confusion_matrix):
    """Harmonic mean of precision and recall for a 2x2 confusion array.

    Returns 0.0 when precision and recall are both zero; the plain formula would
    divide zero by zero and produce NaN in that case.
    """
    p = precision(confusion_matrix)
    r = recall(confusion_matrix)
    if p + r == 0:
        return 0.0
    return 2 * p * r / (p + r)

def stats(confusion_matrix):
    """Return accuracy, precision, recall and fscore for a confusion-matrix DataFrame.

    The frame is reduced to its underlying array first, since the metric helpers
    index it positionally.
    """
    cm = confusion_matrix.values
    metrics = (
        ("accuracy", accuracy),
        ("precision", precision),
        ("recall", recall),
        ("fscore", fscore),
    )
    return {name: metric(cm) for name, metric in metrics}

# Report the metrics on the train and test predictions, treating class 1 and then class 0 as the positive class.
print("Statistics for class 1 on train set:\n", stats(confusion_matrix(train_labels, train_pred, positive=1)))
print("Statistics for class 0 on train set:\n", stats(confusion_matrix(train_labels, train_pred, positive=0)))
print("Statistics for class 1 on test set:\n", stats(confusion_matrix(test_true, test_pred, positive=1)))
print("Statistics for class 0 on test set:\n", stats(confusion_matrix(test_true, test_pred, positive=0)))