In [38]:
# --- IMPORTS ---
import nltk
import random
from nltk.classify.scikitlearn import SklearnClassifier
import pickle
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode
from nltk.tokenize import word_tokenize
import re
import os

In [39]:
# open each file in directory
files_pos = os.listdir('pos')
files_pos = [open('pos/'+f, 'r').read() for f in files_pos]
files_neg = os.listdir('neg')
files_neg = [open('neg/'+f, 'r').read() for f in files_neg]

In [40]:
# empty lists for all word strings and document sorting tuples
all_words = []
documents = []

# --- IMPORTS ---
from nltk.corpus import stopwords
from nltk.corpus import wordnet
import re

# stopwords and word types (adjectives, nouns, adverbs)
stopwords = list(set(stopwords.words('english')))
wordtypes = ["J", "N", "R"]

In [41]:
for p in files_pos:
    
    # create a list of tuples where the first element of each tuple is the file name, the second element is the label "pos"
    documents.append( (p, "pos") )
    
    # remove punctuation
    cleaned = re.sub(r'[^(a-zA-Z)\s]',' ', p)
    
    # tokenize contents of pos posts
    tokenized = word_tokenize(cleaned)
    
    # remove stopwords 
    stopped = [w for w in tokenized if not w in stopwords]
    
    # tags type of word for each word
    pos = nltk.pos_tag(stopped)
    
    # make a list of all words identified as adjectives, nouns, or adverbs
    for w in pos:
        if w[1][0] in wordtypes:
            all_words.append(w[0].lower())

In [42]:
for p in files_neg:
    # create a list of tuples where the first element of each tuple is the file name, the second element is the label "neg"
    documents.append( (p, "neg") )
    
    # remove punctuation
    cleaned = re.sub(r'[^(a-zA-Z)\s]',' ', p)
    
    # tokenize contents of neg posts
    tokenized = word_tokenize(cleaned)
    
    # remove stopwords 
    stopped = [w for w in tokenized if not w in stopwords]
    
    # tags type of word for each word
    neg = nltk.pos_tag(stopped)
    
    # make a list of all words identified as adjectives, nouns, or adverbs
    for w in neg:
        if w[1][0] in wordtypes:
            all_words.append(w[0].lower())

In [43]:
# create a frequency distribution of each word
all_words = nltk.FreqDist(all_words)

# list the 1000 most frequent words
word_features = list(all_words.keys())[:1000]

print(word_features)

['someone', 'recovery', 'certain', 'way', 'mean', 'needs', 'friends', 'your', 'body', 'honor', 'trust', 'base', 'plans', 'blogs', 'never', 'calorie', 'goal', 'meal', 'plan', 'whilst', 'realistic', 'knowing', 'much', 'eat', 'full', 'team', 'great', 'cool', 'almost', 'new', 'years', 'lot', 'people', 'resolution', 'weight', 'really', 'good', 'help', 'world', 'better', 'place', 'person', 'year', 'please', 'smaller', 'something', 'brings', 'instead', 'fatphobia', 'support', 'industry', 'profits', 'insecurities', 'unfollow', 'thinspo', 'fat', 'negative', 'accounts', 'hungry', 'stop', 'content', 'food', 'self', 'talk', 'positive', 'accountsor', 'maybe', 'compliment', 'often', 'away', 'compliments', 'send', 'kind', 'message', 'day', 'need', 'comments', 'languagethere', 'resolutions', 'influence', 'outside', 'thinking', 'life', 'unhappy', 'many', 'times', 'things', 'smile', 'mirror', 'skin', 'empty', 'space', 'nights', 'torn', 'apart', 'enough', 'heart', 'felt', 'wrong', 'terrible', 'happy', 't

In [44]:
# create feature for each of the 1000 most frequent words
def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

# create feature set for each posts determining if the post contains the feature (word) or not
featuresets = [(find_features(post), category) for (post, category) in documents]

In [45]:
print(featuresets[:1])

[({'someone': True, 'recovery': True, 'certain': True, 'way': True, 'mean': True, 'needs': True, 'friends': True, 'your': True, 'body': True, 'honor': True, 'trust': True, 'base': False, 'plans': False, 'blogs': False, 'never': False, 'calorie': False, 'goal': False, 'meal': False, 'plan': False, 'whilst': False, 'realistic': False, 'knowing': False, 'much': False, 'eat': False, 'full': False, 'team': False, 'great': False, 'cool': False, 'almost': False, 'new': False, 'years': False, 'lot': False, 'people': False, 'resolution': False, 'weight': False, 'really': False, 'good': False, 'help': False, 'world': False, 'better': False, 'place': False, 'person': False, 'year': False, 'please': True, 'smaller': False, 'something': False, 'brings': False, 'instead': False, 'fatphobia': False, 'support': False, 'industry': False, 'profits': False, 'insecurities': False, 'unfollow': False, 'thinspo': False, 'fat': False, 'negative': False, 'accounts': False, 'hungry': False, 'stop': False, 'cont

In [48]:
# shuffles documents 
random.shuffle(featuresets)

# creates training and testing set; 750 in training, 250 in testing
training_set = featuresets[:750]
testing_set = featuresets[750:]

In [49]:
# trains NLP with the Naive Bayes Classifier
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(100)

Classifier accuracy percent: 72.50996015936255
Most Informative Features
                recovery = True              pos : neg    =     51.1 : 1.0
                 deserve = True              pos : neg    =     23.4 : 1.0
                 illness = True              pos : neg    =     13.2 : 1.0
                disorder = True              pos : neg    =     12.0 : 1.0
                 recover = True              pos : neg    =      9.8 : 1.0
                 anymore = True              pos : neg    =      8.7 : 1.0
                 thinspo = True              neg : pos    =      8.6 : 1.0
                   alone = True              pos : neg    =      8.0 : 1.0
                    away = True              pos : neg    =      7.9 : 1.0
                   years = True              pos : neg    =      7.5 : 1.0
                    give = True              pos : neg    =      7.4 : 1.0
                   happy = True              pos : neg    =      7.4 : 1.0
                   point = 