In [1]:
import pandas as pd
import sqlite3
import re
import numpy as np
from nltk.probability import FreqDist
from nltk.classify import SklearnClassifier
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
import random


# Load the raw Amazon data: build a lookup from review ID to its rating.
amazon_sql = 'data/amazon/amazon.db' #db -> id+rating

conn = sqlite3.connect(amazon_sql)
try:
    cur = conn.cursor()
    # Only ID and rating are used below, so the review text column is not
    # fetched; this avoids streaming every review body just to discard it.
    cur.execute('SELECT ID, rating FROM dvd')
    getrating = {ID: rating for ID, rating in cur}
finally:
    # Release the database handle even when the query raises.
    conn.close()
########################


# Load the pre-processed review text: map each review ID to its token list.
text_sql = 'data/text_processing/text_processing.db' 

conn = sqlite3.connect(text_sql)
try:
    cur = conn.cursor()
    cur.execute('SELECT ID,text FROM stopwords')
    # Per row: lowercase, drop full stops, collapse whitespace runs to a
    # single space, then split into tokens. Rows whose text is NULL or empty
    # once full stops and surrounding whitespace are removed are skipped.
    # The regex is a raw string so that "\s" is not parsed as a (invalid)
    # string escape sequence.
    data = {
        ID: re.sub(r'\s+', ' ', text.lower().replace('.', '')).strip().split(' ')
        for ID, text in cur
        if text is not None and len(text.replace('.', '').strip()) > 0
    }
finally:
    # Release the database handle even when the query raises.
    conn.close()
########################

# Classifier: train a TF-IDF + multinomial Naive Bayes model on half of the
# reviews and evaluate it on the other half.

# Seed the RNG before shuffling so the split below does not change per run.
random.seed(42)
data = list(data.items())
random.shuffle(data)


def add_label(lst, lab):
    """Pair every feature set in ``lst`` with the class label ``lab``."""
    return [(x, lab) for x in lst]


# TF-IDF weighting -> chi2 feature selection (k='all' keeps every feature)
# -> multinomial Naive Bayes, wrapped so NLTK-style feature dicts can be used.
pipeline = Pipeline([
    ('tfidf', TfidfTransformer()),
    ('chi2', SelectKBest(chi2, k='all')),
    ('nb', MultinomialNB()),
])
classif = SklearnClassifier(pipeline)

# Token counts per review, split by rating. The class is decided by the first
# element of the stored rating (presumably a string such as "4.0" - TODO
# confirm): above 3 is positive, below 3 is negative, exactly 3 is left out.
pos = [
    FreqDist(text)
    for ID, text in data
    if int(getrating[ID][0]) > 3 and text
]
neg = [
    FreqDist(text)
    for ID, text in data
    if int(getrating[ID][0]) < 3 and text
]

# First half of each class is used for training, the rest for evaluation.
lpos = len(pos) // 2
lneg = len(neg) // 2

classif.train(add_label(pos[:lpos], 'pos') + add_label(neg[:lneg], 'neg'))

# Predicted labels for the held-out positive and negative reviews.
l_pos = np.array(classif.classify_many(pos[lpos:]))
l_neg = np.array(classif.classify_many(neg[lneg:]))

# Confusion-matrix cells, named <true class><predicted class>.
pospos = (l_pos == 'pos').sum()
posneg = (l_pos == 'neg').sum()
negpos = (l_neg == 'pos').sum()
negneg = (l_neg == 'neg').sum()

# Metrics with 'pos' treated as the positive class.
accuracy = float(pospos + negneg) / (pospos + posneg + negpos + negneg)
recall = float(pospos) / (pospos + posneg)
precision = float(pospos) / (pospos + negpos)
f1 = (2.0 * precision * recall) / (precision + recall)

accuracy, recall, precision, f1

(0.8578916073906054,
 0.8727834142037936,
 0.8471847570113467,
 0.8597935904399784)

In [2]:
from collections import Counter
def word_feats(words):
    """Turn a whitespace-separated string into a ``{token: True}`` feature dict.

    Duplicate tokens collapse into a single key; an empty or all-whitespace
    string yields an empty dict.
    """
    return dict.fromkeys(words.split(), True)

# Flatten all reviews into one token list. str.split() without arguments
# collapses whitespace runs and never yields empty tokens, so no regex pass
# over the joined corpus is needed.
words = [tok for ID, text in data for tok in ' '.join(text).split()]

# Score every distinct token as if it were a one-word review. All tokens are
# classified in a single batched call instead of one classifier call per
# token, which avoids re-running the vectoriser/pipeline for each word.
vocab = sorted(set(words))
dists = classif.prob_classify_many([word_feats(tok) for tok in vocab])

# Read the probabilities by label name rather than by position in a sorted
# list, so the 'neg'/'pos' columns cannot be silently swapped.
data2 = [
    (tok, dist.prob('neg'), dist.prob('pos'))
    for tok, dist in zip(vocab, dists)
]

# vocab is already sorted, so the rows come out ordered by token.
df = pd.DataFrame(data2, columns=['tok', 'neg', 'pos'])

# Attach how often each token occurs in the whole corpus.
c = Counter(words)
n = [c[tok] for tok in df['tok']]
df['n'] = n

In [3]:
# Ten tokens with the highest probability of the 'pos' class.
df.sort_values('pos', ascending=False).head(10)

Unnamed: 0,tok,neg,pos,n
3677,wunderbares,0.075404,0.924596,87
3306,unvergesslich,0.097485,0.902515,288
1746,kelly,0.102241,0.897759,205
3222,uneingeschränkt,0.10833,0.89167,404
3055,süchtig,0.115998,0.884002,270
3268,unschlagbar,0.120619,0.879381,221
2652,rundum,0.130845,0.869155,281
547,bravour,0.133974,0.866026,76
3584,warmherzig,0.135036,0.864964,104
1169,freundschaft,0.135765,0.864235,736


In [4]:
# Ten tokens with the highest probability of the 'neg' class.
df.sort_values('neg', ascending=False).head(10)

Unnamed: 0,tok,neg,pos,n
3705,zeitverschwendung,0.980669,0.019331,426
3747,zumutung,0.962605,0.037395,458
1257,geldverschwendung,0.960321,0.039679,145
3375,vergeuden,0.955355,0.044645,161
2189,mogelpackung,0.946674,0.053326,138
2156,miserabel,0.941189,0.058811,744
1371,grottenschlecht,0.9411,0.0589,748
1393,gähnen,0.940304,0.059696,637
1157,frechheit,0.939924,0.060076,976
754,dümpeln,0.937849,0.062151,141


In [9]:
# Look up a single token. Filter first, then sort: applying a boolean mask
# built from the unsorted frame to a re-sorted frame makes pandas reindex the
# mask and emit a "Boolean Series key will be reindexed" UserWarning.
df.loc[df['tok'] == 'staffel'].sort_values('pos', ascending=False)

  """Entry point for launching an IPython kernel.


Unnamed: 0,tok,neg,pos,n
2958,staffel,0.280922,0.719078,12294
