# Exploratory Text Analysis

In [78]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
nltk.download('stopwords')
stopwords = stopwords.words('english')
remove_terms = ['the', '``', "''", "'d", "'ll", "'re", "'s", "'ve", 'could', 'might', 'must', "n't", 'need', 'sha', 'wo', 'would']
stopwords = stopwords + remove_terms + list(string.punctuation)
print(stopwords)
lemmatizer = nltk.WordNetLemmatizer()

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/carlostezna/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [125]:
def preprocess(document):
        """
        Tokenizes documents then normalizes and lemmatizes tokens
        """
        from nltk.tokenize import word_tokenize
        words = word_tokenize(document)
        words_clean = []
        for word in words: # Go through every word in your tokens list
            w = word.lower()
            if (w not in stopwords):  # remove stopwords and punctuation
                words_clean.append(lemmatizer.lemmatize(w))
        return words_clean

In [11]:
def load_document(file):
    f = open(file)
    try:
        raw = f.read()
        return file.split('/')[-1], raw
    except:
        print(file)
        pass

def load_collection(files):
    texts = []
    for file in files:
        doc_id, text = load_document(file)
        texts.append(text)
    return texts

In [118]:
import os
COLLECTION_DIR = './dataset/newDataset/'
files = [COLLECTION_DIR + file for file in os.listdir(COLLECTION_DIR)]

corpus = load_collection(files)

# Feature Extraction

In [528]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# using default tokenizer in TfidfVectorizer
tfidf = TfidfVectorizer(token_pattern='(\S+)', min_df=3, max_df=0.8, 
                        ngram_range=(1, 2), stop_words=stopwords, tokenizer=word_tokenize)
features = tfidf.fit_transform(corpus)
df = pd.DataFrame(
        features.todense(),
        columns=tfidf.get_feature_names()
    )

In [529]:
df.head()

Unnamed: 0,'best,'bot,'bot nets,'cold,'cold calls,'could,'do,'eu,'goodbye,'goodbye said,...,£8.5bn,£800m,£800m 1.5bn,£80m,£857m,£8bn,£8m,£9.4m,£99,£9m
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [530]:
len(tfidf.vocabulary_)

31809

In [340]:
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
doc_set = [preprocess(doc) for doc in corpus]
finder = BigramCollocationFinder.from_words(doc_set[0])
print(finder.nbest(bigram_measures.pmi, 10))
#finder.apply_freq_filter(10)

[('16-year-old', 'depressed'), ('1990', '2004'), ('1999', '2003'), ('2', 'european'), ('2004', '25'), ('address', 'issue'), ('alerted', 'experience'), ('also', 'highlighted'), ('amount', 'serious'), ('appearance', 'series')]


In [341]:
word_fd = nltk.FreqDist(doc_set[0])
bigram_fd = nltk.FreqDist(nltk.bigrams(doc_set[0]))
finder = BigramCollocationFinder(word_fd, bigram_fd)
finder.score_ngrams(bigram_measures.raw_freq)[0:10]

[(('vulnerable', 'people'), 0.014705882352941176),
 (('taking', 'life'), 0.011029411764705883),
 (('alcohol', 'problem'), 0.007352941176470588),
 (('bar', 'cell'), 0.007352941176470588),
 (('custody', 'death'), 0.007352941176470588),
 (('death', 'custody'), 0.007352941176470588),
 (('death', 'rate'), 0.007352941176470588),
 (('drug', 'alcohol'), 0.007352941176470588),
 (('highly', 'vulnerable'), 0.007352941176470588),
 (('human', 'right'), 0.007352941176470588)]

In [348]:
trigram_measures = nltk.collocations.TrigramAssocMeasures()
finder = TrigramCollocationFinder.from_words(doc_set[0])
finder.score_ngrams(trigram_measures.raw_freq)[0:10]

[(('drug', 'alcohol', 'problem'), 0.007352941176470588),
 (('life', 'highly', 'vulnerable'), 0.007352941176470588),
 (("'shocks", 'mp', 'death'), 0.003676470588235294),
 (('16-year-old', 'depressed', 'exhibiting'), 0.003676470588235294),
 (('1990', '2004', '25'), 0.003676470588235294),
 (('1999', '2003', 'mp'), 0.003676470588235294),
 (('2', 'european', 'convention'), 0.003676470588235294),
 (('2002', 'urged', 'home'), 0.003676470588235294),
 (('2003', 'mp', 'said'), 0.003676470588235294),
 (('2004', '25', 'child'), 0.003676470588235294)]

# Topic Modeling

In [126]:
import gensim
from gensim import corpora, models
dictionary = corpora.Dictionary(preprocess(doc) for doc in corpus)
bow = [dictionary.doc2bow(preprocess(doc)) for doc in corpus]

In [130]:
ldamodel = models.ldamodel.LdaModel(bow, num_topics=20, id2word=dictionary, passes=50, minimum_probability=0.1)

[(0, '0.025*"wale" + 0.018*"england" + 0.018*"zealand" + 0.017*"new" + 0.014*"game"'), (1, '0.023*"band" + 0.019*"music" + 0.017*"best" + 0.016*"award" + 0.013*"album"'), (2, '0.028*"game" + 0.020*"said" + 0.015*"china" + 0.013*"lending" + 0.012*"sony"'), (3, '0.017*"ireland" + 0.012*"j" + 0.011*"minute" + 0.009*"g" + 0.009*"o\'gara"'), (4, '0.015*"said" + 0.014*"show" + 0.011*"people" + 0.009*"bbc" + 0.009*"u"'), (5, '0.015*"said" + 0.010*"system" + 0.010*"mobile" + 0.010*"technology" + 0.009*"people"'), (6, '0.021*"drug" + 0.016*"test" + 0.014*"sport" + 0.014*"also" + 0.014*"greek"'), (7, '0.021*"said" + 0.018*"michael" + 0.018*"film" + 0.011*"different" + 0.010*"life"'), (8, '0.022*"film" + 0.013*"number" + 0.012*"one" + 0.009*"year" + 0.007*"award"'), (9, '0.010*"said" + 0.010*"game" + 0.009*"player" + 0.008*"time" + 0.006*"world"'), (10, '0.017*"said" + 0.011*"program" + 0.009*"microsoft" + 0.008*"virus" + 0.007*"software"'), (11, '0.015*"said" + 0.009*"year" + 0.007*"carry" + 0.0

In [533]:
print(ldamodel.print_topics(num_topics=1, num_words=15))

[(8, '0.022*"film" + 0.013*"number" + 0.012*"one" + 0.009*"year" + 0.007*"award" + 0.007*"best" + 0.007*"director" + 0.006*"new" + 0.006*"chart" + 0.006*"single" + 0.006*"week" + 0.006*"festival" + 0.006*"said" + 0.006*"first" + 0.005*"also"')]


# Word Embeddings

In [115]:
wv_embeddings = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', 
                                                                binary = True)

In [536]:
analogy = wv_embeddings.most_similar(positive=['man', 'woman'], negative=['king'])
print(analogy)

[('teenage_girl', 0.626004159450531), ('girl', 0.5984843969345093), ('teenager', 0.5653390884399414), ('boy', 0.5254422426223755), ('policewoman', 0.5163928866386414), ('Woman', 0.5034411549568176), ('person', 0.5024771690368652), ('teenaged_girl', 0.4996837377548218), ('female_jogger', 0.49290722608566284), ('motorist', 0.4890908896923065)]


In [542]:
similar_word = wv_embeddings.most_similar('colombia')
print(similar_word)

[('lenguaje', 0.6201727986335754), ('debajo_de', 0.6004170775413513), ('largo_de', 0.5995573997497559), ('precio_windows_7', 0.5937506556510925), ('otros', 0.592975914478302), ('Aunque', 0.5886745452880859), ('de_datos', 0.5863443613052368), ('peru', 0.5859358906745911), ('diferentes', 0.5850200653076172), ('para_nosotros', 0.5848908424377441)]


# Word vectors

In [209]:
import re
from gensim.models import Word2Vec
from gensim.models.phrases import Phraser, Phrases

In [175]:
# Cleaning data - remove punctuation from every text
texts = corpus.copy()
sentences = []
# Go through each text in turn
for ii in range(len(texts)):
    sentences = [re.sub(pattern=r'[\!"#$%&\*+,-./:;<=>?@^_`()|~=]', 
                        repl='', 
                        string=x
                       ).strip().split(' ') for x in texts[ii].split('\n') 
                      if not x.endswith('writes:')]
    sentences = [x for x in sentences if x != ['']]
    texts[ii] = sentences

In [178]:
# concatenate all sentences from all texts into a single list of sentences
all_sentences = []
for text in texts:
    all_sentences += text

In [179]:
# Phrase Detection
# Give some common terms that can be ignored in phrase detection
# For example, 'state_of_affairs' will be detected because 'of' is provided here: 
common_terms = ["of", "with", "without", "and", "or", "the", "a"]
# Create the relevant phrases from the list of sentences:
phrases = Phrases(all_sentences, common_terms=common_terms)
# The Phraser object is used from now on to transform sentences
bigram = Phraser(phrases)
# Applying the Phraser to transform our sentences is simply
all_sentences = list(bigram[all_sentences])

In [280]:
import time
start = time.time()
model = Word2Vec(all_sentences, 
                 min_count=2,   # Ignore words that appear less than this
                 size=200,      # Dimensionality of word embeddings
                 workers=8,     # Number of processors (parallelisation)
                 window=5,      # Context window for words during training
                 iter=100)       # Number of epochs training over corpus
end = time.time()
print('Train time: ', end - start)

Train time:  50.80308508872986


In [544]:
model.wv.most_similar('Apple')

[('iTunes', 0.41001155972480774),
 ('Motorola', 0.3779086172580719),
 ('Microsoft', 0.3739736080169678),
 ('Nintendo', 0.3733164370059967),
 ('iPod', 0.3729538917541504),
 ("Apple's", 0.36621707677841187),
 ('tool', 0.3646322786808014),
 ('Internet_Explorer', 0.3623224198818207),
 ('operating_system', 0.3539520502090454),
 ('Firefox', 0.34643125534057617)]

# Text Summarization

In [465]:
sentences = nltk.sent_tokenize(corpus[0])
total_documents = len(sentences)

In [466]:
def sentence_frequency_matrix(sentences, stopWords):
    frequency_matrix = {}
    lem = lemmatizer

    for sent in sentences:
        freq_table = {}
        words = word_tokenize(sent)
        for word in words:
            word = word.lower()
            word = lem.lemmatize(word)
            if word in stopWords:
                continue
    
            if word in freq_table:
                freq_table[word] += 1
            else:
                freq_table[word] = 1

        frequency_matrix[sent[:15]] = freq_table

    return frequency_matrix

In [467]:
freq_matrix = sentence_frequency_matrix(sentences, stopwords)

In [468]:
def sentence_tf_matrix(freq_matrix):
    tf_matrix = {}

    for sent, f_table in freq_matrix.items():
        tf_table = {}

        count_words_in_sentence = len(f_table)
        for word, count in f_table.items():
            tf_table[word] = count / count_words_in_sentence

        tf_matrix[sent] = tf_table

    return tf_matrix

In [469]:
tf_matrix = sentence_tf_matrix(freq_matrix)

In [470]:
def sentence_df_matrix(freq_matrix):
    df_matrix = {}

    for sent, f_table in freq_matrix.items():
        for word, count in f_table.items():
            if word in df_matrix:
                df_matrix[word] += 1
            else:
                df_matrix[word] = 1

    return df_matrix

In [471]:
df_matrix = sentence_df_matrix(freq_matrix)

In [472]:
def sentence_idf_matrix(freq_matrix, df_matrix, total_documents):
    import math
    idf_matrix = {}

    for sent, f_table in freq_matrix.items():
        idf_table = {}

        for word in f_table.keys():
            idf_table[word] = math.log10(total_documents / float(df_matrix[word]))

        idf_matrix[sent] = idf_table

    return idf_matrix

In [473]:
idf_matrix = sentence_idf_matrix(freq_matrix, df_matrix, total_documents)

In [474]:
def sentence_tf_idf_matrix(tf_matrix, idf_matrix):
    tf_idf_matrix = {}

    for (sent1, tf_table), (sent2, idf_table) in zip(tf_matrix.items(), idf_matrix.items()):

        tf_idf_table = {}

        for (word1, value1), (word2, value2) in zip(tf_table.items(), 
                                                    idf_table.items()):  # keys are same: word1 == word2
            tf_idf_table[word1] = float(value1 * value2)

        tf_idf_matrix[sent1] = tf_idf_table

    return tf_idf_matrix

In [475]:
tf_idf_matrix = sentence_tf_idf_matrix(tf_matrix, idf_matrix)

In [476]:
def score_sentences(tf_idf_matrix):
    sentence_scores = {}

    for sent, f_table in tf_idf_matrix.items():
        total_score_per_sentence = 0

        count_words_in_sentence = len(f_table)
        for word, score in f_table.items():
            total_score_per_sentence += score

        sentence_scores[sent] = total_score_per_sentence / count_words_in_sentence

    return sentence_scores

In [477]:
sentence_scores = score_sentences(tf_idf_matrix)

In [478]:
print(sentence_scores)

{'Custody death r': 0.10179248221200314, 'The joint commi': 0.06977472553820856, 'Members urged t': 0.07481870532259614, 'There was one p': 0.0903398240544809, 'The report, whi': 0.056499799137089766, 'Many of those w': 0.08823013392966018, 'It questioned w': 0.10289222174044234, 'Increased resou': 0.089557625403835, 'Committee chair': 0.08243265509226147, '"Yet throughout': 0.05409358357349983, '"These highly v': 0.09649459673957755, '"Crime levels a': 0.14957262692375967, 'The misplaced o': 0.10152967460208544, '"Until we chang': 0.06723165486153318, 'The committee a': 0.0888845243928637, 'Between 1990 an': 0.09034930525444314, 'It picked out t': 0.058903018563800966, 'It revealed tha': 0.06826391952557925, 'Even though the': 0.07132881808537661, 'Nine days into ': 0.12751924344770646}


In [479]:
def calc_average_score(sentence_scores):
    sumValues = 0
    for entry in sentence_scores:
        sumValues += sentence_scores[entry]

    # Average value of a sentence from original summary_text
    average = (sumValues / len(sentence_scores))

    return average

In [480]:
threshold = calc_average_score(sentence_scores)

In [481]:
threshold

0.08652545692004016

In [482]:
def generate_summary(sentences, sentence_scores, threshold):
    sentence_count = 0
    summary = ''

    for sentence in sentences:
        if sentence[:15] in sentence_scores and sentence_scores[sentence[:15]] >= (threshold):
            summary += " " + sentence
            sentence_count += 1

    return summary

In [483]:
original_text = generate_summary(sentences, sentence_scores, 0.0 * threshold)
print(original_text)

 Custody death rate 'shocks' MPs

Deaths in custody have reached "shocking" levels, a committee of MPs and peers has warned. The joint committee on human rights found those committing suicide were mainly the most vulnerable, with mental health, drugs or alcohol problems. Members urged the government to set up a task force to tackle deaths in prisons, police cells, detention centres and special hospitals. There was one prison suicide every four days between 1999 and 2003, MPs said. The report, which followed a year-long inquiry by the committee, found the high death rate "amounts to a serious failure to protect the right to life of a highly vulnerable group". Many of those who ended up taking their own lives had "presented themselves" to the authorities with these problems before they even offended, the report said. It questioned whether prison was the most appropriate place for them to be kept and whether earlier intervention would have meant custody could have been avoided. Increased 

In [524]:
summary = generate_summary(sentences, sentence_scores, 1.2 * threshold)
print((summary))

 "Crime levels are falling but we are holding more people in custody than ever before. Nine days into his sentence, Joseph hung himself from the bars of his cell window with a sheet.
