In [33]:
import sys
import nltk
import sklearn
import pandas as pd
import numpy as np

In [34]:
# Read the training captions; the path is relative to the working directory.
train_csv_path = 'processed-train.csv'
text_data = pd.read_csv(train_csv_path, encoding='utf-8')
text_data.head()

Unnamed: 0,cap,location
0,It's a little eerie.,['room']
1,It just sits there and collects rainwater.,['room']
2,"I wanted to get out, but I couldn't because th...",['room']
3,The barriers were featureless and the floor wa...,['room']
4,The only sound was the occasional creak from t...,['room']


In [35]:
# Summarise the loaded frame: column dtypes, non-null counts and memory usage.
text_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49186 entries, 0 to 49185
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   cap       49186 non-null  object
 1   location  49186 non-null  object
dtypes: object(2)
memory usage: 768.7+ KB


In [36]:
# Count how often each distinct caption string occurs in the 'cap' column.
text_data['cap'].value_counts()

Beyond lay another dull wilderness of bricks and mortar, its silence broken only by the heavy, regular footfall of the policeman, or the songs and shouts of some belated party of revellers.                                                                                                                                                                                                                                                                                                                                                                                                                                                                               7
Imagine, then, my thrill of terror when last night, as I lay awake, thinking over her terrible fate, I suddenly heard in the silence of the night the low whistle which had been the herald of her own death.                                                                                                                                             

In [37]:
from sklearn.preprocessing import LabelEncoder

# Encode every 'location' string as an integer class id. The fitted encoder
# is kept in `enc` so the mapping stays available after this cell.
enc = LabelEncoder()
label = enc.fit(text_data['location']).transform(text_data['location'])

# Peek at the first ten encoded targets and the captions they belong to.
print(label[:10])
print(text_data['cap'][:10])

[473 473 473 473 473 473 473 473 473 473]
0                                 It's a little eerie.
1           It just sits there and collects rainwater.
2    I wanted to get out, but I couldn't because th...
3    The barriers were featureless and the floor wa...
4    The only sound was the occasional creak from t...
5    The only sound was the ticking of the clock on...
6    The barriers were a dull gray, the floor a col...
7    The barriers are a light blue color and the fl...
8    The only light came from the small window at t...
9    The barriers were plateau and there was nothin...
Name: cap, dtype: object


In [38]:
# Work with the caption column on its own and preview its first ten rows.
text = text_data['cap']
text.head(10)

0                                 It's a little eerie.
1           It just sits there and collects rainwater.
2    I wanted to get out, but I couldn't because th...
3    The barriers were featureless and the floor wa...
4    The only sound was the occasional creak from t...
5    The only sound was the ticking of the clock on...
6    The barriers were a dull gray, the floor a col...
7    The barriers are a light blue color and the fl...
8    The only light came from the small window at t...
9    The barriers were plateau and there was nothin...
Name: cap, dtype: object

In [39]:
# Normalise the raw captions with three regex passes.
#
# `regex=True` is passed explicitly on every call: pandas 2.0 changed the
# default of Series.str.replace to regex=False, under which these patterns
# would be treated as literal text and the cleaning would silently do nothing.

# 1. Replace anything that is not a word character or whitespace with a space.
processed = text.str.replace(r'[^\w\d\s]', ' ', regex=True)
# 2. Collapse each run of whitespace into a single space.
processed = processed.str.replace(r'\s+', ' ', regex=True)
# 3. Trim leading and trailing whitespace.
processed = processed.str.replace(r'^\s+|\s+?$', '', regex=True)

  processed = text.str.replace(r'[^\w\d\s]', ' ')
  processed = processed.str.replace(r'\s+', ' ')
  processed = processed.str.replace(r'^\s+|\s+?$', '')


In [40]:
# Lower-case every caption, then display the resulting Series.
processed = processed.str.lower()
processed

0                                      it s a little eerie
1                it just sits there and collects rainwater
2        i wanted to get out but i couldn t because the...
3        the barriers were featureless and the floor wa...
4        the only sound was the occasional creak from t...
                               ...                        
49181    an eleven year old male child his road to hapi...
49182    grace runs away from her evil mother she has n...
49183    the night of the funeral changes everything fo...
49184    the boy ran away but got lost he accidentally ...
49185    alicia lost her family two years ago she neede...
Name: cap, Length: 49186, dtype: object

In [41]:
from nltk.corpus import stopwords
nltk.download('stopwords')

# Keep the English stop words in a set for the membership tests below.
stop_words = set(stopwords.words('english'))


def _without_stop_words(sentence):
    """Return `sentence` with every whitespace-separated stop word removed."""
    kept = [term for term in sentence.split() if term not in stop_words]
    return ' '.join(kept)


processed = processed.apply(_without_stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\IMHost\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [42]:
ps = nltk.PorterStemmer()


def _stem_terms(sentence):
    """Return `sentence` with each whitespace-separated term Porter-stemmed."""
    return ' '.join(map(ps.stem, sentence.split()))


processed = processed.apply(_stem_terms)

In [43]:
# Display the current state of the processed captions.
processed

0                                               littl eeri
1                                      sit collect rainwat
2                                       want get door lock
3              barrier featureless floor cold hard concret
4                            sound occasion creak age wood
                               ...                        
49181            eleven year old male child road hapi hard
49182    grace run away evil mother father support wand...
49183    night funer chang everyth ingrid smith 25 year...
49184    boy ran away got lost accident found undergrou...
49185    alicia lost famili two year ago need money fou...
Name: cap, Length: 49186, dtype: object

In [44]:
from nltk.tokenize import word_tokenize

# Tally every token across all processed captions in a single pass.
all_words = nltk.FreqDist(
    token
    for message in processed
    for token in word_tokenize(message)
)

# Print the result
print('Number of words: {}'.format(len(all_words)))
print('Most common words: {}'.format(all_words.most_common(15)))

Number of words: 14665
Most common words: [('place', 5426), ('peopl', 4184), ('use', 2916), ('great', 2519), ('alway', 2261), ('shine', 2228), ('also', 2189), ('go', 2067), ('one', 1994), ('beauti', 1810), ('mani', 1782), ('get', 1737), ('time', 1675), ('way', 1619), ('see', 1568)]


In [45]:
# use the 1500 most common words as features
word_features = [word for word, _count in all_words.most_common(1500)]

In [46]:
def find_features(message):
    """Build the word-presence feature dict for one message.

    Args:
        message: The text to featurise; it is tokenised with
            ``word_tokenize``.

    Returns:
        A dict with one key per entry of the module-level ``word_features``
        list, in that order. Each value is ``True`` if the word is among the
        message's tokens and ``False`` otherwise.
    """
    # A set makes each of the len(word_features) membership checks O(1)
    # instead of rescanning the token list once per feature word.
    tokens = set(word_tokenize(message))
    return {word: (word in tokens) for word in word_features}

In [47]:
# Sanity check: list the feature words that are present in the first caption.
features = find_features(processed[0])
present_words = [word for word, is_present in features.items() if is_present]
for word in present_words:
    print(word)

littl


In [48]:
# Inspect the first ten (word, present?) pairs of the feature dict.
list(features.items())[:10]

[('place', False),
 ('peopl', False),
 ('use', False),
 ('great', False),
 ('alway', False),
 ('shine', False),
 ('also', False),
 ('go', False),
 ('one', False),
 ('beauti', False)]

In [50]:
# Pair each processed caption with its encoded location label.
messages = list(zip(processed, label))

# Seed NumPy's global RNG so the in-place shuffle is reproducible.
np.random.seed(1)
np.random.shuffle(messages)

# Call find_features function for each text_data message
feature_set = [
    (find_features(caption), target)
    for caption, target in messages
]

In [51]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the feature set for evaluation; random_state fixes the split.
training, test = train_test_split(feature_set, test_size=0.25, random_state=1)

In [52]:
# Report the size of each split (training first, then test).
for split in (training, test):
    print(len(split))

36889
12297


In [53]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB

# Display names, index-aligned with `classifiers` below.
names = ['K Nearest Neighbors', 'Decision Tree', 'Naive Bayes']

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    MultinomialNB()
]

# Materialise the pairs as a list: a bare zip() is a one-shot iterator that
# is exhausted by the loop below, so any later reuse of `models` would
# silently iterate over nothing.
models = list(zip(names, classifiers))

# Wrap each scikit-learn estimator in NLTK's adapter, train it on the
# training split, and print its accuracy on the held-out test split.
for name, model in models:
    nltk_model = SklearnClassifier(model)
    nltk_model.train(training)
    accuracy = nltk.classify.accuracy(nltk_model, test)
    print("{} model Accuracy: {}".format(name, accuracy))

KeyboardInterrupt: 

K Nearest Neighbors model Accuracy: 0.3689517768561438 <br>
Decision Tree model Accuracy: 0.42408717573391885 <br>
Naive Bayes model Accuracy: 0.4424656420265105 <br>