## Preprocessing

This document is to iron out what pre-processing steps I want to take before modeling. Those pre-processing steps will be duplicated in my modeling notebook once decided upon.

In [1]:
from sklearn.decomposition import LatentDirichletAllocation, TruncatedSVD
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_union, make_pipeline, Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC

In [2]:
from nltk import text
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [3]:
import pandas as pd
import string

In [4]:
df = pd.read_csv('./merged_df.csv')

In [6]:
common_chars = df['character'].value_counts()[:10].index

common_chars_df = df.loc[df['character'].isin(common_chars)]

In [16]:
common_chars_df['character'].unique()

array(['SISKO', "O'BRIEN", 'KIRA', 'ODO', 'QUARK', 'BASHIR', 'DAX',
       'DUKAT', 'GARAK', 'WORF'], dtype=object)

In [8]:
# drop all sentences shorter than five words
count_array = [len(word_tokenize(line)) > 5 for line in common_chars_df['text']]
df = common_chars_df[count_array]

In [17]:
df['character'].unique()

array(['SISKO', "O'BRIEN", 'KIRA', 'ODO', 'QUARK', 'BASHIR', 'DAX',
       'DUKAT', 'GARAK', 'WORF'], dtype=object)

In [55]:
def cleaner(text):
    stemmer = PorterStemmer()
    stop = stopwords_list
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.translate(str.maketrans('', '', string.digits))
    text = text.lower().strip()
    final_text = []
    for w in text.split():
        if w.strip() not in stop:
            final_text.append(stemmer.stem(w.strip()))
    return ' '.join(final_text)

In [11]:
stopwords_list = stopwords.words('english')

In [30]:
for each in df[df['character'] == 'BASHIR']['text'][5:15]:
    print(cleaner(each))

adventur
hero made
wilder
didnt mean
mean construct
theyv listen reason havent
warn happen
believ cardassian ever attack feder outpost
look attent



In [23]:
cv = CountVectorizer(preprocessor=cleaner)
cv.fit(df['text'])
to_dense = cv.transform(df['text']).todense()
to_dense_df = pd.DataFrame(to_dense, columns=cv.get_feature_names())
to_dense_df.sum().sort_values(ascending=False)[:50]

im            2743
go            2584
dont          2437
know          2211
get           1985
one           1913
want          1804
well          1708
your          1635
think         1557
like          1512
time          1259
us            1167
would         1160
take          1133
that          1103
see           1066
could         1056
right         1056
way           1052
look           999
make           995
ill            987
say            954
ive            929
tell           904
back           898
let            862
come           860
need           851
station        833
ship           825
thing          822
cant           804
tri            773
two            747
someth         747
there          739
command        730
find           719
cardassian     706
never          688
talk           673
peopl          663
got            647
didnt          634
sure           632
he             630
id             602
work           581
dtype: int64

In [24]:
to_dense_df.sum().sort_values(ascending=False).index[:50]

Index(['im', 'go', 'dont', 'know', 'get', 'one', 'want', 'well', 'your',
       'think', 'like', 'time', 'us', 'would', 'take', 'that', 'see', 'could',
       'right', 'way', 'look', 'make', 'ill', 'say', 'ive', 'tell', 'back',
       'let', 'come', 'need', 'station', 'ship', 'thing', 'cant', 'tri', 'two',
       'someth', 'there', 'command', 'find', 'cardassian', 'never', 'talk',
       'peopl', 'got', 'didnt', 'sure', 'he', 'id', 'work'],
      dtype='object')

In [28]:
stopwords_list.extend(['im', 'go', 'dont', 'know', 'get', 'one', 'want', 'well', 'your',
       'think', 'like', 'us', 'would', 'take', 'that', 'see', 'could',
       'right', 'way', 'make', 'ill', 'say', 'ive', 'tell', 'back',
       'let', 'come', 'thing', 'cant', 'tri', 'two',
       'someth', 'there', 'find', 'talk', 'got', 'didn\t', 'sure', 'he', 'id', 'work'])

In [31]:
#stopwords_list

In [32]:
cv = CountVectorizer(preprocessor=cleaner)
cv.fit(df['text'])
to_dense = cv.transform(df['text']).todense()
to_dense_df = pd.DataFrame(to_dense, columns=cv.get_feature_names())
to_dense_df.sum().sort_values(ascending=False)[:50]

go            1805
your          1635
time          1259
that          1103
look           999
need           851
station        833
ship           825
tri            773
someth         747
there          739
command        730
cardassian     706
never          688
peopl          663
didnt          634
he             630
theyr          580
doctor         577
day            567
good           566
much           558
use            555
still          544
mean           538
klingon        537
help           535
give           532
first          529
chief          516
littl          513
three          509
major          506
even           504
better         496
believ         489
mayb           489
long           484
thought        481
captain        477
starfleet      474
bajoran        473
oh             471
might          453
anyth          449
year           443
odo            438
man            438
ask            435
youv           430
dtype: int64

In [33]:
to_dense_df.sum().sort_values(ascending=False).index[:50]

Index(['go', 'your', 'time', 'that', 'look', 'need', 'station', 'ship', 'tri',
       'someth', 'there', 'command', 'cardassian', 'never', 'peopl', 'didnt',
       'he', 'theyr', 'doctor', 'day', 'good', 'much', 'use', 'still', 'mean',
       'klingon', 'help', 'give', 'first', 'chief', 'littl', 'three', 'major',
       'even', 'better', 'believ', 'mayb', 'long', 'thought', 'captain',
       'starfleet', 'bajoran', 'oh', 'might', 'anyth', 'year', 'odo', 'man',
       'ask', 'youv'],
      dtype='object')

In [39]:
stopwords_list.extend(to_dense_df.sum()[to_dense_df.sum() < 5].index)

In [44]:
stopwords_list.extend(['go', 'your', 'theyr', 'day', 'much', 'use', 'still', 'mean', 'thought', 'oh', 'anyth'])

In [48]:
cv = CountVectorizer(preprocessor=cleaner)
cv.fit(df['text'])
to_dense = cv.transform(df['text']).todense()
to_dense_df = pd.DataFrame(to_dense, columns=cv.get_feature_names())
to_dense_df.sum().sort_values(ascending=False)[:25]

go            1805
your          1635
time          1259
that          1103
look           999
need           851
station        833
ship           825
tri            773
someth         747
there          739
command        730
cardassian     706
never          688
peopl          663
didnt          634
he             630
theyr          580
doctor         577
good           566
klingon        537
help           535
give           532
first          529
chief          516
dtype: int64

In [57]:
for each in df[df['character'] == 'BASHIR']['text'][5:50]:
    print(cleaner(each))

adventur
hero made
wilder
didnt
construct
theyv listen reason havent
warn happen
believ cardassian ever attack feder outpost
look attent

thirteen injur command fatal
walk along odo someon practic phaser around


quark found delight dri champagn estat bottl korri
cold hand warm heart
exot your wear
ye ye
free
guess competit
nice dinner command sisko
said tri rise suggest alway succeed
champagn ice
caus death mysteri
knife directli left thorac vertebra perfor lower ventricl heart
murder appar decent knowledg bajoran anatomi
sweep hair follicl skin cellular remnant dna fragment
dna sequenc analys cellular spectrograph particul matter trace
ask lieuten dax confirm find afraid concur
apart bodi discov dna present weve identifi ibudan
curiou found seofuran fragment near matter reclam unit
appear tri rid
exactli theyr
time
analys fragment detect trace complex organ structur
suggest ibudan may conduct sort medic experi board ship
let standard electrophoret analysi tell
complex protein break d

In [56]:
'go' in stopwords_list

True