In [None]:
import sys
import nltk
import sklearn
import pandas as pd
import numpy as np

In [None]:
# Load the training CSV (UTF-8 encoded) into a DataFrame and preview the first rows
text_data = pd.read_csv('processed-train.csv', encoding='utf-8')
text_data.head()

In [None]:
# Print a summary of the loaded frame (columns, dtypes, non-null counts)
text_data.info()

In [None]:
# Frequency of each distinct value in the 'cap' column.
# NOTE(review): 'cap' is the text column that is cleaned in later cells, while the
# column encoded as the target is 'location' — confirm which one this check should inspect.
text_data['cap'].value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the 'location' column as integer class ids; `enc` holds the fitted encoder
enc = LabelEncoder()
label = enc.fit_transform(text_data['location'])
# Show the first ten encoded labels, then the first ten entries of the 'cap' column
print(label[:10])
print(text_data['cap'][:10])

In [None]:
# Work on the 'cap' column as the raw text input; preview its first ten entries
text = text_data['cap']
text[:10]

In [None]:
# Normalise the raw text with regular expressions.
# regex=True is passed explicitly: since pandas 2.0 `Series.str.replace` treats the
# pattern as a literal string by default, which would make all three calls no-ops.

# Replace every character that is neither a word character nor whitespace with a space
processed = text.str.replace(r'[^\w\d\s]', ' ', regex=True)
# Collapse each run of whitespace into a single space
processed = processed.str.replace(r'\s+', ' ', regex=True)
# Remove leading and trailing whitespace
processed = processed.str.replace(r'^\s+|\s+?$', '', regex=True)

In [None]:
# Lower-case every entry, then display the resulting Series
processed = processed.str.lower()
processed

In [None]:
from nltk.corpus import stopwords
nltk.download('stopwords')

# Set membership keeps the per-token lookup cheap
stop_words = set(stopwords.words('english'))


def _drop_stop_words(sentence):
    """Return `sentence` with every whitespace-separated token found in `stop_words` removed."""
    kept_tokens = [token for token in sentence.split() if token not in stop_words]
    return ' '.join(kept_tokens)


processed = processed.apply(_drop_stop_words)

In [None]:
ps = nltk.PorterStemmer()


def _stem_sentence(sentence):
    """Return `sentence` with each whitespace-separated token replaced by its Porter stem."""
    return ' '.join(map(ps.stem, sentence.split()))


processed = processed.apply(_stem_sentence)

In [None]:
# Display the text Series as it stands after the preceding cleaning cells
processed

In [None]:
from nltk.tokenize import word_tokenize

# Count how often each token occurs across all entries of `processed`
all_words = nltk.FreqDist(
    token
    for message in processed
    for token in word_tokenize(message)
)

# Print the result
print('Number of words: {}'.format(len(all_words)))
print('Most common words: {}'.format(all_words.most_common(15)))

In [None]:
# Feature vocabulary: the 1500 most frequent tokens, most common first
word_features = [token for token, _count in all_words.most_common(1500)]

In [None]:
def find_features(message):
    """Build a word-presence feature dict for one message.

    Parameters
    ----------
    message : str
        Text to tokenize with ``word_tokenize``.

    Returns
    -------
    dict
        Maps every entry of the notebook-level ``word_features`` list (in list
        order) to ``True`` if that word is among the message's tokens, else
        ``False``.
    """
    # A set gives O(1) membership tests; a token list would be rescanned
    # once for every feature word.
    tokens = set(word_tokenize(message))
    return {word: word in tokens for word in word_features}

In [None]:
# Sanity check: list the feature words present in the entry at index label 0
features = find_features(processed[0])
for word, is_present in features.items():
    if is_present:
        print(word)

In [None]:
# Peek at the first ten (word, present?) pairs of the feature dict built above
list(features.items())[:10]

In [None]:
# Pair each cleaned text entry with its encoded label
messages = list(zip(processed, label))

# Seed NumPy's global RNG so the in-place shuffle is repeatable
np.random.seed(1)
np.random.shuffle(messages)

# Call find_features function for each text_data message
feature_set = [
    (find_features(caption), target)
    for caption, target in messages
]

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the feature set for testing; random_state pins the split
training, test = train_test_split(feature_set, test_size=0.25, random_state=1)

In [None]:
# Sizes of the training and test partitions, one per line
print(len(training), len(test), sep='\n')

In [None]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB

# Display name -> untrained scikit-learn estimator, all with default hyper-parameters
candidates = {
    'K Nearest Neighbors': KNeighborsClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'Naive Bayes': MultinomialNB(),
}
names = list(candidates)
classifiers = list(candidates.values())

# zip() yields a one-shot iterator; it is consumed by the loop below
models = zip(names, classifiers)

for name, model in models:
    # Wrap the estimator for NLTK-style (feature dict, label) pairs and fit it
    nltk_model = SklearnClassifier(model).train(training)
    accuracy = nltk.classify.accuracy(nltk_model, test)
    print("{} model Accuracy: {}".format(name, accuracy))

K Nearest Neighbors model Accuracy: 0.3689517768561438 <br>
Decision Tree model Accuracy: 0.42408717573391885 <br>
Naive Bayes model Accuracy: 0.4424656420265105 <br>