# Document classifier

## Daten
- Wir brauchen zuerst daten um unser Modell zu trainieren

In [1]:
from textblob.classifiers import NaiveBayesClassifier

train = [
    ('I love this sandwich.', 'pos'),
    ('This is an amazing place!', 'pos'),
    ('I feel very good about these beers.', 'pos'),
    ('This is my best work.', 'pos'),
    ("What an awesome view", 'pos'),
    ('I do not like this restaurant', 'neg'),
    ('I am tired of this stuff.', 'neg'),
    ("I can't deal with this", 'neg'),
    ('He is my sworn enemy!', 'neg'),
    ('My boss is horrible.', 'neg')
]
test = [
    ('The beer was good.', 'pos'),
    ('I do not enjoy my job', 'neg'),
    ("I ain't feeling dandy today.", 'neg'),
    ("I feel amazing!", 'pos'),
    ('Gary is a friend of mine.', 'pos'),
    ("I can't believe I'm doing this.", 'neg')
]

## Training

In [2]:
cl = NaiveBayesClassifier(train)

## Test
- Wie gut performed unser Modell bei Daten die es noch nie gesehen hat?

In [3]:
cl.accuracy(test)

0.8333333333333334

- Zu 80% korrekt, ok für mich :)

## Features
- Welche wörter sorgen am meisten dafür dass etwas positiv oder negativ klassifiziert wird?

In [9]:
cl.show_informative_features(5)

Most Informative Features
          contains(this) = True              neg : pos    =      2.3 : 1.0
          contains(this) = False             pos : neg    =      1.8 : 1.0
          contains(This) = False             neg : pos    =      1.6 : 1.0
            contains(an) = False             neg : pos    =      1.6 : 1.0
             contains(I) = False             pos : neg    =      1.4 : 1.0


Er ist der meinung wenn "this" vorkommt ist es eher positiv, was natürlich quatsch ist, aber das hat er nun mal so gelernt, deswegen braucht ihr gute trainingsdaten. 

## Klassifizierung

In [4]:
cl.classify("Their burgers are amazing")  # "pos"

'pos'

In [5]:
cl.classify("I don't like their pizza.")  # "neg"

'neg'

In [6]:
cl.classify("I hate cars.")

'neg'

In [7]:
cl.classify("Zurich is beautiful.")

'pos'

In [8]:
cl.classify("Zurich")

'pos'

### Klassizierung nach Sätzen

In [10]:
from textblob import TextBlob
blob = TextBlob("The beer was amazing. "
                "But the hangover was horrible. My boss was not happy.",
                classifier=cl)


In [12]:
for sentence in blob.sentences:
    print(("%s (%s)") % (sentence,sentence.classify()))

The beer was amazing. (pos)
But the hangover was horrible. (neg)
My boss was not happy. (neg)


## Mit schweizer Songtexten Kommentare klassifizieren

In [17]:
import os,glob
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize


from io import open


train = []
countries = ["schweiz", "deutschland"]
for country in countries:
    out = []
    folder_path = 'songtexte/%s' % country 
    for filename in glob.glob(os.path.join(folder_path, '*.txt')):#alle Dateien einlesen
      with open(filename, 'r') as f:
        text = f.read()
        words = word_tokenize(text)
        words=[word.lower() for word in words if word.isalpha()]
        for word in words:
            out.append(word)
    out = set(out)
    for word in out:
        train.append((word,country))
        #print (filename)
        #print (len(text))
train

[('isch', 'schweiz'),
 ('d', 'schweiz'),
 ('ding', 'schweiz'),
 ('see', 'schweiz'),
 ('einzigs', 'schweiz'),
 ('chöme', 'schweiz'),
 ('i', 'schweiz'),
 ('si', 'schweiz'),
 ('ich', 'schweiz'),
 ('höö', 'schweiz'),
 ('wei', 'schweiz'),
 ('wer', 'schweiz'),
 ('eim', 'schweiz'),
 ('ohni', 'schweiz'),
 ('schue', 'schweiz'),
 ('stärn', 'schweiz'),
 ('himmel', 'schweiz'),
 ('ring', 'schweiz'),
 ('härzli', 'schweiz'),
 ('ds', 'schweiz'),
 ('gumpe', 'schweiz'),
 ('schlaus', 'schweiz'),
 ('chöpfli', 'schweiz'),
 ('wisli', 'schweiz'),
 ('de', 'schweiz'),
 ('bürschteli', 'schweiz'),
 ('als', 'schweiz'),
 ('schell', 'schweiz'),
 ('nand', 'schweiz'),
 ('ja', 'schweiz'),
 ('chindli', 'schweiz'),
 ('schwanz', 'schweiz'),
 ('ga', 'schweiz'),
 ('sing', 'schweiz'),
 ('ine', 'schweiz'),
 ('süsch', 'schweiz'),
 ('ids', 'schweiz'),
 ('gseh', 'schweiz'),
 ('hei', 'schweiz'),
 ('füess', 'schweiz'),
 ('schwänzli', 'schweiz'),
 ('mir', 'schweiz'),
 ('wie', 'schweiz'),
 ('gange', 'schweiz'),
 ('uftue', 'schweiz'

In [14]:
from textblob.classifiers import NaiveBayesClassifier
c2 = NaiveBayesClassifier(train)

In [15]:
c2.classify("Ich gehe durch den Wald")  # "deutsch"

'deutschland'

In [16]:
c2.classify("Häsch es guet")  # "deutsch"

'schweiz'

In [18]:
c2.classify("Wötsch da?") 

'deutschland'

In [19]:
c2.show_informative_features(5)

Most Informative Features
           contains(zur) = True           schwei : deutsc =      1.3 : 1.0
          contains(froh) = True           schwei : deutsc =      1.3 : 1.0
           contains(wer) = True           schwei : deutsc =      1.3 : 1.0
           contains(das) = True           schwei : deutsc =      1.3 : 1.0
         contains(macht) = True           schwei : deutsc =      1.3 : 1.0


## Hardcore Beispiel mit Film-review daten mit NLTK
- https://www.nltk.org/book/ch06.html
- Wir nutzen nur noch die 100 häufigsten Wörter in den Texten und schauen ob sie bei positiv oder negativ vorkommen

In [27]:
import random
import nltk
nltk.download('movie_reviews')
review = (" ").join(train[0][0])
print(review)

[nltk_data] Downloading package movie_reviews to
[nltk_data]     /Users/edzardschade/nltk_data...
[nltk_data]   Unzipping corpora/movie_reviews.zip.


i s c h


In [28]:
from nltk.corpus import movie_reviews
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]
random.shuffle(documents)

In [29]:
(" ").join(documents[0][0])



In [35]:
(" ").join(documents[1][1])

'p o s'

In [36]:
#ist hier ein Zwischenschritt
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000] #wir nehmen die 2000 häufigsten Wörter
word_features 

['plot',
 ':',
 'two',
 'teen',
 'couples',
 'go',
 'to',
 'a',
 'church',
 'party',
 ',',
 'drink',
 'and',
 'then',
 'drive',
 '.',
 'they',
 'get',
 'into',
 'an',
 'accident',
 'one',
 'of',
 'the',
 'guys',
 'dies',
 'but',
 'his',
 'girlfriend',
 'continues',
 'see',
 'him',
 'in',
 'her',
 'life',
 'has',
 'nightmares',
 'what',
 "'",
 's',
 'deal',
 '?',
 'watch',
 'movie',
 '"',
 'sorta',
 'find',
 'out',
 'critique',
 'mind',
 '-',
 'fuck',
 'for',
 'generation',
 'that',
 'touches',
 'on',
 'very',
 'cool',
 'idea',
 'presents',
 'it',
 'bad',
 'package',
 'which',
 'is',
 'makes',
 'this',
 'review',
 'even',
 'harder',
 'write',
 'since',
 'i',
 'generally',
 'applaud',
 'films',
 'attempt',
 'break',
 'mold',
 'mess',
 'with',
 'your',
 'head',
 'such',
 '(',
 'lost',
 'highway',
 '&',
 'memento',
 ')',
 'there',
 'are',
 'good',
 'ways',
 'making',
 'all',
 'types',
 'these',
 'folks',
 'just',
 'didn',
 't',
 'snag',
 'correctly',
 'seem',
 'have',
 'taken',
 'pretty',


In [33]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000] #wir nehmen die 2000 häufigsten Wörter

def document_features(document): 
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [31]:
print(document_features(movie_reviews.words('pos/cv957_8737.txt'))) 

{'contains(plot)': True, 'contains(:)': True, 'contains(two)': True, 'contains(teen)': False, 'contains(couples)': False, 'contains(go)': False, 'contains(to)': True, 'contains(a)': True, 'contains(church)': False, 'contains(party)': False, 'contains(,)': True, 'contains(drink)': False, 'contains(and)': True, 'contains(then)': True, 'contains(drive)': False, 'contains(.)': True, 'contains(they)': True, 'contains(get)': True, 'contains(into)': True, 'contains(an)': True, 'contains(accident)': False, 'contains(one)': True, 'contains(of)': True, 'contains(the)': True, 'contains(guys)': False, 'contains(dies)': False, 'contains(but)': True, 'contains(his)': True, 'contains(girlfriend)': True, 'contains(continues)': False, 'contains(see)': False, 'contains(him)': True, 'contains(in)': True, 'contains(her)': False, 'contains(life)': False, 'contains(has)': True, 'contains(nightmares)': False, 'contains(what)': True, "contains(')": True, 'contains(s)': True, 'contains(deal)': False, 'contains

In [37]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [38]:
classifier.classify(document_features("a movie with bad actors".split(" ")))

'neg'

In [39]:
classifier.classify(document_features("an uplifting movie with russel crowe".split(" "))) #split: er nimmt nur Wörtelisten

'neg'

In [40]:
classifier.show_most_informative_features(10)

Most Informative Features
        contains(turkey) = True              neg : pos    =      8.5 : 1.0
 contains(unimaginative) = True              neg : pos    =      7.7 : 1.0
     contains(atrocious) = True              neg : pos    =      6.6 : 1.0
    contains(schumacher) = True              neg : pos    =      6.6 : 1.0
        contains(shoddy) = True              neg : pos    =      6.4 : 1.0
       contains(singers) = True              pos : neg    =      6.3 : 1.0
        contains(justin) = True              neg : pos    =      5.8 : 1.0
           contains(ugh) = True              neg : pos    =      5.8 : 1.0
        contains(canyon) = True              neg : pos    =      5.7 : 1.0
        contains(suvari) = True              neg : pos    =      5.7 : 1.0
