# Clean text for analysis 

## Import modules

In [3]:
from connect_to_mongo import connect_to_mongo
import nltk
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import porter

## Connect to Mongo database 

In [57]:
# connect_to_mongo is a project-local helper; presumably it returns a
# (database, collection) pair for the given names — confirm against its source.
# NOTE: the collection variable is spelled `transcipts` (sic) and the cursor
# cell further down refers to it by that exact spelling.
db, transcipts = connect_to_mongo('x-files', 'transcripts')

## Format transcripts for basic text analysis 

Treat each episode as a separate text entry: within an episode, all of Scully's lines are joined into one entry and all of Mulder's lines into another. 

In [61]:
doc_cursor = transcipts.find()

In [62]:
# Build two parallel lists with one text entry per transcript document:
# each entry is that character's lines joined with single spaces.
# NOTE(review): doc_cursor is iterated directly here; a database cursor is
# presumably exhausted afterwards, so re-run the `find()` cell before
# re-running this one — confirm.
scully_corpus = []
mulder_corpus = []
for i in doc_cursor:
    # 'scully_lines' / 'mulder_lines' are passed to str.join, so they are
    # presumably lists of strings — verify against the collection schema.
    scully = i['scully_lines']
    scully_concat = (' ').join(scully)
    scully_corpus.append(scully_concat)
    mulder = i['mulder_lines']
    mulder_concat = (' ').join(mulder)
    mulder_corpus.append(mulder_concat)
# NOTE: after the loop, `scully`, `mulder`, `scully_concat` and `mulder_concat`
# remain bound to the values from the LAST document iterated. Later cells in
# this notebook read those leftover names.

In [72]:
len(scully_corpus)

204

In [73]:
scully_corpus[0]

'Agent Dana Scully. Yes, sir. Well, sir, I was recruited out of medical school. Um, my parents still think it was an act of rebellion, but, uh... I saw the F.B.I. as a place where I could distinguish myself. Yes, I am. By reputation. He\'s an Oxford educated Psychologist, who wrote a monograph on serial killers and the occult, that helped to catch Monty Props in 1988. Generally thought of as the best analyst in the violent crimes section. He had a nickname at the academy... Spooky Mulder. I believe they have to do with unexplained phenomena. Am I to understand that you want me to debunk the X-Files project, sir? Agent Mulder. I\'m Dana Scully, I\'ve been assigned to work with you. Actually, I\'m looking forward to working with you. I\'ve heard a lot about you. If you have any doubt about my qualifications or credentials, th... Did you bother to read it? Needle punctures, maybe. An animal bite. Electrocution of some kind. It\'s organic. I don\'t know, is it some kind of synthetic protei

In [9]:
scully_concat

'Agent Dana Scully. Yes, sir. Well, sir, I was recruited out of medical school. Um, my parents still think it was an act of rebellion, but, uh... I saw the F.B.I. as a place where I could distinguish myself. Yes, I am. By reputation. He\'s an Oxford educated Psychologist, who wrote a monograph on serial killers and the occult, that helped to catch Monty Props in 1988. Generally thought of as the best analyst in the violent crimes section. He had a nickname at the academy... Spooky Mulder. I believe they have to do with unexplained phenomena. Am I to understand that you want me to debunk the X-Files project, sir? Agent Mulder. I\'m Dana Scully, I\'ve been assigned to work with you. Actually, I\'m looking forward to working with you. I\'ve heard a lot about you. If you have any doubt about my qualifications or credentials, th... Did you bother to read it? Needle punctures, maybe. An animal bite. Electrocution of some kind. It\'s organic. I don\'t know, is it some kind of synthetic protei

In [10]:
mulder_concat = (" ").join(mulder)

## Process all lines as one text entry 

### Lowercase

In [74]:
lower_case = scully_corpus[0].lower()

In [75]:
lower_case[:400]

"agent dana scully. yes, sir. well, sir, i was recruited out of medical school. um, my parents still think it was an act of rebellion, but, uh... i saw the f.b.i. as a place where i could distinguish myself. yes, i am. by reputation. he's an oxford educated psychologist, who wrote a monograph on serial killers and the occult, that helped to catch monty props in 1988. generally thought of as the bes"

### Important words 

In [87]:
important_words = {'f.b.i.': 'fbi', 'x-files': 'xfiles', 'x-file': 'xfile'}

In [77]:
# Rewrite each special token to its punctuation-free spelling so that it
# survives the punctuation-removal step as a single word.
for key, replacement in important_words.items():
    lower_case = lower_case.replace(key, replacement)

In [78]:
lower_case

'agent dana scully. yes, sir. well, sir, i was recruited out of medical school. um, my parents still think it was an act of rebellion, but, uh... i saw the fbi as a place where i could distinguish myself. yes, i am. by reputation. he\'s an oxford educated psychologist, who wrote a monograph on serial killers and the occult, that helped to catch monty props in 1988. generally thought of as the best analyst in the violent crimes section. he had a nickname at the academy... spooky mulder. i believe they have to do with unexplained phenomena. am i to understand that you want me to debunk the xfiles project, sir? agent mulder. i\'m dana scully, i\'ve been assigned to work with you. actually, i\'m looking forward to working with you. i\'ve heard a lot about you. if you have any doubt about my qualifications or credentials, th... did you bother to read it? needle punctures, maybe. an animal bite. electrocution of some kind. it\'s organic. i don\'t know, is it some kind of synthetic protein? d

### Remove punctuation 

Replace each punctuation character with a space rather than deleting it. This makes later stopword removal easier for contractions such as "I've": the stopword corpus lists the pieces as separate entries ('i' and 've'), so splitting the contraction at the apostrophe lets both pieces be matched and removed. 

In [38]:
import re
import string
# Substitute every character in string.punctuation with a single space (not an
# empty string), so a token such as "i've" becomes the two tokens "i" and "ve".
punc_removed = re.sub('[%s]' % re.escape(string.punctuation), ' ', lower_case)
# NOTE: this cell does not remove extra whitespace; the substitution above
# leaves runs of spaces behind. They are collapsed in the later split/join cell.
# (Alternative noted here: a regex for 2 or more whitespaces, replaced with 1 whitespace.)

In [39]:
punc_removed[:100]

'agent dana scully  yes  sir  well  sir  i was recruited out of medical school  um  my parents still '

### Remove words with digits 

In [40]:
# Delete every token that contains at least one digit (e.g. "1988"); the spaces
# around it are left in place. The pattern is a raw string so that \w and \d
# reach the regex engine as written instead of depending on Python passing
# unrecognised string escapes through (which emits a warning).
no_digits = re.sub(r'\w*\d\w*', '', punc_removed)

In [41]:
no_digits[:400]

'agent dana scully  yes  sir  well  sir  i was recruited out of medical school  um  my parents still think it was an act of rebellion  but  uh    i saw the f b i  as a place where i could distinguish myself  yes  i am  by reputation  he s an oxford educated psychologist  who wrote a monograph on serial killers and the occult  that helped to catch monty props in   generally thought of as the best an'

### Replace 2 or more whitespaces with a single space.

In [42]:
space_removed = " ".join(no_digits.split())

In [43]:
space_removed[:400]

'agent dana scully yes sir well sir i was recruited out of medical school um my parents still think it was an act of rebellion but uh i saw the f b i as a place where i could distinguish myself yes i am by reputation he s an oxford educated psychologist who wrote a monograph on serial killers and the occult that helped to catch monty props in generally thought of as the best analyst in the violent '

### Stem & Stopwords

Stopwords can be removed as part of count vectorizer, but we'll do it here separately. 

In [92]:
my_stopwords = stopwords.words('english')

In [93]:
stemmer = nltk.stem.porter.PorterStemmer()

In [102]:
# Drop stopwords, stem whatever is left, and rebuild one space-separated string.
# A set makes each membership test constant-time, and the token list has its
# own name so that `stemmed_text` is only ever a string.
stopword_set = set(my_stopwords)
stemmed_tokens = [
    stemmer.stem(word)
    for word in space_removed.split()
    if word not in stopword_set
]
stemmed_text = ' '.join(stemmed_tokens)

In [103]:
stemmed_text

'agent dana sculli ye sir well sir recruit medic school um parent still think act rebellion uh saw f b place could distinguish ye reput oxford educ psychologist wrote monograph serial killer occult help catch monti prop gener thought best analyst violent crime section nicknam academi spooki mulder believ unexplain phenomena understand want debunk x file project sir agent mulder dana sculli assign work actual look forward work heard lot doubt qualif credenti th bother read needl punctur mayb anim bite electrocut kind organ know kind synthet protein theori logic would say girl obvious die someth natur caus plausibl someth miss post mortem murder plausibl sloppi investig find fantast notion answer beyond realm scienc answer know look mention yesterday case alreadi investig found someth autopsi report first three victim show unidentifi mark tissu sampl report sign differ medic examin latest victim better expect better hope medic examin suspect know never pleasur hi hello insinu anyth sir r

## Preprocessing pipeline 

In [114]:
import re
import string
special_words = {'f.b.i.': 'fbi', 'x-files': 'xfiles', 'x-file': 'xfile'}

def clean_text(corpus):
    """Normalise every document in ``corpus`` for bag-of-words analysis.

    Each text is lower-cased, has the ``special_words`` substitutions applied,
    has punctuation replaced by spaces, loses every token containing a digit,
    is stripped of stopwords, and has its remaining tokens stemmed.

    Relies on the notebook-level names ``special_words``, ``my_stopwords``
    and ``stemmer``.

    Parameters
    ----------
    corpus : iterable of str
        One raw text per document.

    Returns
    -------
    list of str
        One cleaned, space-separated string per input document, in input order.
    """
    # Loop-invariant work is done once, not once per document.
    punctuation_pattern = re.compile('[%s]' % re.escape(string.punctuation))
    digit_word_pattern = re.compile(r'\w*\d\w*')
    stopword_set = set(my_stopwords)
    # Longest keys first, so a key such as 'x-files' is rewritten before its
    # own prefix 'x-file' gets a chance to match inside it.
    replacements = sorted(
        special_words.items(), key=lambda pair: len(pair[0]), reverse=True
    )

    cleaned_text = []
    for text in corpus:
        preserved_words = text.lower()
        # Apply every replacement cumulatively to the same string. Restarting
        # from the lower-cased text on each pass would keep only the last
        # key's substitution (and leave the name unbound for an empty dict).
        for key, value in replacements:
            preserved_words = preserved_words.replace(key, value)
        punc_removed = punctuation_pattern.sub(' ', preserved_words)
        no_digits = digit_word_pattern.sub('', punc_removed)
        # str.split() with no argument already collapses runs of whitespace.
        stemmed_tokens = [
            stemmer.stem(word)
            for word in no_digits.split()
            if word not in stopword_set
        ]
        cleaned_text.append(' '.join(stemmed_tokens))
    return cleaned_text

In [115]:
scully_cleaned = clean_text(scully_corpus)

In [116]:
mulder_cleaned = clean_text(mulder_corpus)

In [117]:
len(scully_cleaned) == len(scully_corpus)

True

In [122]:
scully_cleaned[15]

'still get us thank happen barnett theori sort fox guard chicken coop never caught happen barnett think escap sure mulder got clear shot book mulder shoot barnett probabl lot sentenc judg gave barnett henderson come must clever copycat pull print barnett lot time hand prison mayb plan someon outsid plan get right thing mulder sorri fax got phone prison call hunch john barnett die heart attack right least say death certif well fax medic record barnett admit prison infirmari infect right hand indic diagnosi coronari complic fact physic six month earlier given clean bill health left everyth must known better pretti well death certif doctor could tell barnett dead go mean ghost john barnett hello yeah minut right right lost barnett mean hope guy brought fine tooth comb want everi piec lint collect analyz noth turn run sorri mulder still hundr percent sure tri find listen accord doctor ridley sign barnett death certif offici doctor sinc membership expir renow state maryland revok medic lice

# Basic Counts

## Top words for Scully and Mulder over the whole show 

Append all the episodes together and see what each character says most often. 

## Adding labels to documents for classification - testing if Scully or Mulder said it 

Make a function that gets the length of the scully or mulder list and creates an array with the data labels for each record. 

In [71]:
import numpy as np

# Integer class label used for each character's documents.
label_dict = {'Scully': 0, 'Mulder': 1}


def get_labels(list_of_docs, character):
    """Return one class label per document for a single character.

    Parameters
    ----------
    list_of_docs : sized collection
        Only its length is used.
    character : str
        Key into ``label_dict``; a name that is not a key raises ``KeyError``.

    Returns
    -------
    numpy.ndarray
        1-D array holding ``len(list_of_docs)`` copies of the character's label.
    """
    doc_count = len(list_of_docs)
    label = label_dict[character]
    return np.array([label for _ in range(doc_count)])

In [72]:
scully_list = [scully_concat]

In [73]:
scully_list.append('test')

In [74]:
get_labels(scully_list, 'Scully')

array([0, 0])

#### Count vectorize 

In [47]:
# Fit a fresh vocabulary on a single document: whatever text `scully_concat`
# currently holds. The vectorizer is asked to lower-case and to filter English
# stopwords itself (lowercase=True, stop_words='english').
# NOTE(review): the input is the raw joined text, not the output of clean_text,
# so no stemming or special-word handling is applied here.
cv = CountVectorizer(lowercase=True, stop_words='english')
cv_data = cv.fit_transform([scully_concat])

In [48]:
# Newer scikit-learn releases replaced CountVectorizer.get_feature_names() with
# get_feature_names_out(); use whichever one this installation provides so the
# cell runs on both old and current versions.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)
# One row (the single fitted document), one column per vocabulary term.
scully_count_vect = pd.DataFrame(cv_data.toarray(), columns=feature_names)

In [49]:
scully_count_vect

Unnamed: 0,1988,academy,accident,act,actually,adolescent,advance,afraid,agenda,agent,...,watch,weighing,wheelchair,woods,work,working,wrote,years,yes,yesterday
0,1,1,1,1,1,1,1,1,1,3,...,1,1,1,4,1,1,1,1,5,1


In [57]:
scully_count_vect.sort_values(by=0, axis=1, ascending=False)

Unnamed: 0,mulder,know,ve,don,going,kind,yes,did,body,sir,...,heard,helped,hi,hiding,high,honestly,hoped,house,human,yesterday
0,10,9,8,8,6,6,5,5,5,5,...,1,1,1,1,1,1,1,1,1,1


In [58]:
# Count-vectorize the text held in `mulder_concat` as a single document, with
# the vectorizer doing its own lower-casing and English stopword filtering.
cv = CountVectorizer(lowercase=True, stop_words='english')
cv_data = cv.fit_transform([mulder_concat])
# Newer scikit-learn releases replaced get_feature_names() with
# get_feature_names_out(); use whichever one this installation provides.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, 'get_feature_names_out')
    else cv.get_feature_names()
)
# One row (the single fitted document), one column per vocabulary term.
mulder_count_vect = pd.DataFrame(cv_data.toarray(), columns=feature_names)
mulder_count_vect

Unnamed: 0,abducted,abduction,able,academy,access,actually,agenda,ah,alien,allowed,...,wonder,wondering,work,worked,write,ya,yeah,year,yes,zipcode
0,1,2,3,1,1,1,1,1,3,1,...,1,1,2,1,1,1,5,1,2,1


In [62]:
list(mulder_count_vect.columns)

['abducted',
 'abduction',
 'able',
 'academy',
 'access',
 'actually',
 'agenda',
 'ah',
 'alien',
 'allowed',
 'amazing',
 'answers',
 'apart',
 'applying',
 'aptitude',
 'arrange',
 'arranged',
 'attempts',
 'authorities',
 'autopsy',
 'aware',
 'beats',
 'bedpan',
 'behavioural',
 'better',
 'big',
 'billy',
 'bites',
 'blocking',
 'blood',
 'bodies',
 'body',
 'bound',
 'boy',
 'boys',
 'brakes',
 'bright',
 'bureau',
 'buried',
 'called',
 'calls',
 'came',
 'campfire',
 'care',
 'case',
 'cases',
 'cause',
 'causing',
 'cemetery',
 'changed',
 'chemistry',
 'circumstances',
 'city',
 'class',
 'classified',
 'clock',
 'close',
 'closely',
 'come',
 'committed',
 'comprehend',
 'confessed',
 'confirm',
 'congress',
 'connection',
 'connections',
 'continue',
 'control',
 'controls',
 'convention',
 'corpse',
 'couldn',
 'county',
 'coventional',
 'crazy',
 'crime',
 'crimes',
 'criminal',
 'cursory',
 'damn',
 'day',
 'death',
 'deaths',
 'deep',
 'degree',
 'dell',
 'destroy',
 

In [59]:
mulder_count_vect.sort_values(by=0, axis=1, ascending=False)

Unnamed: 0,know,scully,ve,think,billy,just,did,oh,night,yeah,...,hadn,guy,great,graves,gravedigging,grave,graduating,gotten,gotta,zipcode
0,16,11,11,8,7,6,6,6,5,5,...,1,1,1,1,1,1,1,1,1,1


## Immediate next steps:
* Grab necessary libraries, dictionaries, etc. for preprocessing
* Label Scully and Mulder data
* Review other stemmers
* consider using ne_chunk to find important compound words like F.B.I., United States, etc.

## To dos: 
* consider a lemmatizer
* Functions for cleaning text
* How to label the records as Scully or Mulder - add get_labels into pipeline
* Concatenate all the episodes for Scully, and get word counts for all of them
* Concatenate all the episodes for Mulder's lines, and get word counts for all of them
* Get count vectorized data per line for Scully and Mulder
* Clean the text
    * compound words, hyphenated words
* Special words to deal with: 
    * F.B.I. (changes to F B I after replacing punctuation) - as a dictionary
    * X-files
    * Names (Mulder says 'Scully' a lot, vice versa)
    * Could also use a regex that removes only certain punctuation (e.g. `\W\s+`), and drop apostrophes as well