In [410]:
import csv
import re
import random
from collections import defaultdict
from math import log10

In [411]:
data=[]
with open('games-train.csv','r', newline='') as f:
    raw = csv.reader(f, delimiter='\t')
    for row in raw:
        data.append(row)

In [412]:
## Cleaning Extracting the Dataset
data_set=[]
for num, dat in enumerate(data):
    dat[3] = re.sub(r'[^\w\säöüß]', ' ', dat[3].lower())
    data_set.append([num, dat[1],[token for token in dat[3].lower().split()]])

In [554]:
class Naive_Bayes:
    def __init__(self):
        self.c_prob = None
        self.feature_types={}
        self.class_term_freq ={} 
        self.token_counts = {}
        self.vocab = defaultdict(int)
        self.idf = {}
        
    def train_test_split(self,dataset, split=0.85, random_state=3):
        random.seed(random_state)
        training_size = int(len(dataset)*split)
        training_set = []
        testing = dataset.copy()
        while len(training_set) < training_size:
            index = random.randrange(len(testing))
            training_set.append(testing.pop(index))
        x_train = []
        y_train = []
        x_test = []
        y_test = []
        for i in training_set:
            x_train.append([i[0], i[2]])
            y_train.append([i[0], i[1]])
        for i in testing:
            x_test.append([i[0], i[2]])
            y_test.append([i[0], i[1]])
        return x_train, x_test, y_train, y_test
    
    def _idf(self,data):
        for line in data:
            for term in line[1]:
                self.vocab[term] += 1
        idf= self.vocab.copy()
        for term in idf.keys():
            idf[term]= log10(len(data)/idf[term])
        self.idf = idf
        return idf           
    
    def train(self,data,labels):
        idf = self._idf(data)
        
        classes = defaultdict(int)
        features = self.feature_types
        class_terms = self.class_term_freq
        
        for item in labels:
            classes[item[1]] +=1
        c_prob = {}
        for key, val in classes.items():
            c_prob[key] = val/len(labels)
        self.c_prob = c_prob
        
        for key in c_prob.keys():
            feature_count = 0
            terms = defaultdict(int)
            for num, item in enumerate(data):
                if labels[num][1] == key:
                    for word in item[1]:
                            feature_count +=1
                            terms[word] += 1
         
            
            self.token_counts[key] = feature_count
            for term,val in terms.items():
                terms[term] = val+1
            class_terms[key] = terms

        for key in self.c_prob.keys():
            weights = {}
            for term, val in class_terms[key].items():
                weights[term] = (val/(self.token_counts[key]))
            self.feature_types[key] = weights
        return
    
    
    def predict(self,train):
        predictions = {}
        for item in train:
            c_pred=[]
            total=1
            for key in self.c_prob.keys():
                for word in item[1]:
                    try:
                        total = total*self.feature_types[key][word]
                    except:
                        total = total*(1/(self.token_counts[key]+len(self.vocab)))
                        continue
                c_pred.append((total*self.c_prob[key],key))
            predictions[item[0]] = sorted(c_pred, reverse=True)[0][1]
        return predictions
    
    
    def evaluate(self,pred,test):
        def _cl_count(test):
            classes=defaultdict(int)
            for item in test:
                classes[item[1]] +=1
            return classes
        
        def _correct(pred,test, classes):
            evals={}
            for key,val in classes.items():
                scores = {}
                TP = 0
                FP = 0
                FN = 0
                TN = 0
                for item in test:
                    if pred[item[0]] == key and item[1] == key:
                        TP += 1 
                    elif pred[item[0]] == key and item[1] != key:
                        FP += 1
                    elif pred[item[0]] != key and item[1] == key:
                        FN += 1
                    else:
                        TN += 1     
                scores['P'] = TP/(TP+FP)
                scores['R'] = TP/(TP+FN)
                scores['F1']= (2*scores['P']*scores['R'])/(scores['P']+scores['R'])
                evals[key]=[scores, TP, FP, FN, TN]
                print("Class:", key, '\n','TP:', TP, 'FP:', FP, 'FN:', FN,
                      '\n', 'Precision:', scores['P'], 'Recall:', scores['R'],
                      '\n',"F1-score:", scores['F1'] )
            return evals
            
        def macro_f1(evals):
            mac_P = 0
            mac_R = 0
            for key in evals.keys():
                mac_P += evals[key][0]['P']
                mac_R += evals[key][0]['R']
            mac_P = mac_P/len(evals.keys())
            mac_R = mac_R/len(evals.keys())
            mac_F1 = (2*mac_P*mac_R)/(mac_P+mac_R)
            return mac_P, mac_R, mac_F1

        def micro_f1(evals):
            TP=0
            FP=0
            FN=0
            for item in  evals.values():
                TP += item[1]
                FP += item[2]
                FN += item[3]
            mic_P = TP/(TP+FP)
            mic_R = TP/(TP+FN)
            mic_F1 = (2*mic_P*mic_R)/(mic_P+mic_R)
            return mic_P, mic_R, mic_F1
        
        classes = _cl_count(test)
        evals = _correct(pred,test,classes)
        mac_P,mac_R, mac_F1 = macro_f1(evals)
        mic_P,mic_R, mic_F1 = micro_f1(evals)
        
        print("Macro Precision:",mac_P,'\tMacro Recall:',mac_R,'\tMacro F1 measure:',mac_F1)
        print("Micro Precision:",mic_P,'\tMicro Recall:',mic_R,'\tMicro F1 measure:',mic_F1)
            

In [555]:
clf = Naive_Bayes()

In [556]:
x_train, x_test, y_train, y_test = clf.train_test_split(data_set,0.90, random_state=42)

In [557]:
print('train:', len(x_train),"\n","test:",len(x_test))

train: 111443 
 test: 12383


In [558]:
clf.train(x_train, y_train)

In [559]:
pred = clf.predict(x_train)

# Evaluation of Training Set

In [560]:
gut=0
schlecht=0
for val in pred.values():
    if val == 'gut':
        gut += 1
    else:
        schlecht += 1
print('gut:',gut,'schlecht:',schlecht)

gut: 111328 schlecht: 115


In [561]:
evals = clf.evaluate(pred, y_train)

Class: gut 
 TP: 91681 FP: 19647 FN: 37 
 Precision: 0.8235214860592124 Recall: 0.9995965895462178 
 F1-score: 0.9030564502625021
Class: schlecht 
 TP: 78 FP: 37 FN: 19647 
 Precision: 0.6782608695652174 Recall: 0.003954372623574145 
 F1-score: 0.007862903225806453
Macro Precision: 0.7508911778122149 	Macro Recall: 0.501775481084896 	Macro F1 measure: 0.6015627211166565
Micro Precision: 0.8233715890634674 	Micro Recall: 0.8233715890634674 	Micro F1 measure: 0.8233715890634674


# Most Important Words (Top 15)

In [562]:

sorted([(val,key) for key, val in clf.feature_types['gut'].items()], reverse=True)[:15]

[(0.034387872576877775, 'spiel'),
 (0.030234265648949786, 'es'),
 (0.028848990601424537, 'das'),
 (0.02636680841058266, 'ist'),
 (0.025264355730646632, 'ich'),
 (0.02162648370832252, 'und'),
 (0.016435861432567384, 'cool'),
 (0.013791749572439126, 'macht'),
 (0.013160667505151803, 'aber'),
 (0.012553985798989826, 'die'),
 (0.012009414102965087, 'man'),
 (0.011617899217635693, 'super'),
 (0.010763886578248632, 'nicht'),
 (0.010089549325330014, 'geil'),
 (0.010008584490686825, 'nur')]

In [563]:
sorted([(val,key) for key, val in clf.feature_types['schlecht'].items()], reverse =True)[:15]

[(0.029711842254933597, 'ich'),
 (0.028883095658714258, 'das'),
 (0.026619712097611027, 'nicht'),
 (0.024474720907396275, 'es'),
 (0.023968651557323962, 'und'),
 (0.02003848912763394, 'spiel'),
 (0.015196009016391539, 'ist'),
 (0.01395172841254402, 'mehr'),
 (0.012986018093139975, 'die'),
 (0.009603710556188787, 'man'),
 (0.008958356063894737, 'aber'),
 (0.008777285379006478, 'kann'),
 (0.00841746542826699, 'wieder'),
 (0.00757479031782548, 'nur'),
 (0.007423898080418598, 'auf')]

# Development Set

In [564]:
t_pred = clf.predict(x_test)

In [565]:
clf.evaluate(t_pred, y_test)

Class: gut 
 TP: 10198 FP: 2176 FN: 4 
 Precision: 0.8241474058509779 Recall: 0.9996079200156832 
 F1-score: 0.9034372785258683
Class: schlecht 
 TP: 5 FP: 4 FN: 2176 
 Precision: 0.5555555555555556 Recall: 0.0022925263640531865 
 F1-score: 0.0045662100456621
Macro Precision: 0.6898514807032667 	Macro Recall: 0.5009502231898681 	Macro F1 measure: 0.5804178010433477
Micro Precision: 0.823952192522006 	Micro Recall: 0.823952192522006 	Micro F1 measure: 0.823952192522006


# Cleaning and Extracting Test Dataset

In [566]:
data=[]
with open('games-test.csv','r', newline='') as f:
    raw = csv.reader(f, delimiter='\t')
    for row in raw:
        data.append(row)

In [567]:
labels = []
test = []
for num, dat in enumerate(data):
    dat[3] = re.sub(r'[^\w\säöüß]', ' ', dat[3].lower())
    labels.append((num, dat[1]))
    test.append((num,[token for token in dat[3].lower().split()]))

In [568]:
test_pred = clf.predict(test)

In [569]:
gut=0
schlecht=0
for val in test_pred.values():
    if val == 'gut':
        gut += 1
    else:
        schlecht += 1
print('gut:',gut,'schlecht:',schlecht)

gut: 44187 schlecht: 46


In [570]:
clf.evaluate(test_pred, labels)

Class: schlecht 
 TP: 26 FP: 20 FN: 7791 
 Precision: 0.5652173913043478 Recall: 0.0033260841755149034 
 F1-score: 0.006613251939463309
Class: gut 
 TP: 36396 FP: 7791 FN: 20 
 Precision: 0.8236811731957363 Recall: 0.99945079086116 
 F1-score: 0.9030929369874572
Macro Precision: 0.6944492822500421 	Macro Recall: 0.5013884375183374 	Macro F1 measure: 0.5823346007692746
Micro Precision: 0.8234123844188728 	Micro Recall: 0.8234123844188728 	Micro F1 measure: 0.8234123844188728
