In [164]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import networkx as nx
import nltk
import pickle
import string
import random
import csv 
import sys
import plotly.plotly as py
from plotly.graph_objs import *
import json
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from scipy import spatial
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.cluster import KMeans
from sklearn.cross_validation import train_test_split
from sklearn.cross_validation import cross_val_score

In [None]:
df = pd.read_csv('sr20.csv', header=0)
df

In [None]:
# cleaning rows with null body
df = df[df.selftext.notnull()]

# clean out comments which were deleted
df = df[df.selftext != '[deleted]']

# select text only posts
df = df[df.apply(lambda r: r.domain[:5] == 'self.', axis=1)]


all_words = []
def clean_body(row):
    # clean unprintable chars
    body = filter(lambda x: x in string.printable, row.selftext)
    # maybe we dont want to just replace it with space. some words might have an apost in it and this 
    # will just split those words. but maybe these words are not that imp either?
    body = re.sub("[^a-zA-Z]", " ", body ).lower()
    
    all_words.extend(word_tokenize(body))
    return body

df['body'] = df.apply(clean_body, axis=1)


In [None]:
def computeWordCounts(all_words):
    wordCounts = {}
    for word in all_words:
        if word in wordCounts:
            wordCounts[word] += 1 
        else: 
            wordCounts[word] = 1
    return wordCounts

wordCounts = computeWordCounts(all_words)
sortedWords = sorted(wordCounts, key=wordCounts.get, reverse=True)
mostFreqWords = sortedWords[0:1000]
freqWordCounts = [wordCounts[word] for word in mostFreqWords]

In [None]:
def get_n_frequent_words(df, n):
    return pd.Series(' '.join(df['body']).lower().split()).value_counts()[:n]

In [None]:
print len(df)
df.subreddit.value_counts() #* 100.0/len(df)

In [233]:
#cls = 'positive'
#cls = 'score_class'
#cls = 'score_class_subreddit'
#cls = 'subreddit'
#cls = 'subreddit_implicit'
cls = 'our_model'

def get_score_class(row):
    score = row.score
    if score < 10:
        return 1
    if score < 100:
        return 2
    return 3

if cls == 'positive':
    df[cls] =  df.apply(lambda r: r.score > 0, axis=1)
elif cls == 'score_class' or cls == 'subreddit_implicit' or cls == 'score_class_subreddit' or cls == 'our_model':
    df[cls] =  df.apply(get_score_class, axis=1)
    
# if its baseline model eval or our model, we want to have the subreddit and the corresponding class labels together
if cls == 'subreddit_implicit' or cls == 'our_model':
    df[cls] = zip(df[cls], df.subreddit)


if cls == 'score_class_subreddit':
    sr_split = {}
    vectorizer = CountVectorizer(analyzer = "word", max_features = 1000, stop_words='english')
    #vectorizer = TfidfVectorizer(max_df=0.5, max_features=1000, min_df=2, stop_words='english')
    train_data_features_all = vectorizer.fit_transform(df.body.values)
    train_data_features_all = train_data_features_all.toarray()
    for sr in df.subreddit.unique():
        bool_idx = (df.subreddit == sr).values
        data_train, data_test, labels_train, labels_test = train_test_split(
    train_data_features_all[bool_idx], df[cls].values[bool_idx], test_size=0.33, random_state=1121)
        if len(data_train) == 0 or len(data_test) == 0:
            continue
        sr_split[sr] = data_train, data_test, labels_train, labels_test
elif cls == 'our_model':
    sr_train = {}
    vectorizer = CountVectorizer(analyzer = "word", max_features = 1000, stop_words='english')
    #vectorizer = TfidfVectorizer(max_df=0.5, max_features=1000, min_df=2, stop_words='english')
    train_data_features_all = vectorizer.fit_transform(df.body.values)
    train_data_features_all = train_data_features_all.toarray()
    data_train_agg = []
    labels_train_agg = []
    data_test_agg = []
    labels_test_agg = []
    for subr in df.subreddit.unique():
        bool_idx = (df.subreddit == subr).values
        sr_data_train, sr_data_test, sr_labels_train, sr_labels_test = train_test_split(
    train_data_features_all[bool_idx], df[cls].values[bool_idx], test_size=0.33, random_state=1121)
        #print subr, len(sr_data_train), len(sr_data_test)
        
        if len(sr_data_train) == 0 or len(sr_data_test) == 0:
            print 'Skipped {0} due to no training or testing instances'.format(subr)
            continue
        
        # need for training subreddit classifiers. here labels should be score classes
        sr_train[subr] = (sr_data_train, [score_class for (score_class, sr) in sr_labels_train])
        
        # need for training base subreddit classifer. here labels should be subreddits
        data_train_agg += list(sr_data_train)
        labels_train_agg += [sr for (score_class, sr) in sr_labels_train]
        
        # for the testing model, we only want insatnces of high score posts
        test_instances = zip(sr_data_test, sr_labels_test)
        #print subr, len(sr_data_train), len(sr_data_test), test_instances[0]
        test_instances = [(fv, sr) for fv, (score_class, sr) in test_instances if score_class == 2 ]
        
        if len(test_instances) == 0:
            print 'Skipped {0} due to no high scoring posts in test data'.format(subr)
            continue
        #print subr, len(sr_data_train), len(sr_data_test), test_instances[0]
        sr_data_test, sr_labels_test = zip(*test_instances)
    
        # need for testing combined model. here instances are high scoring posts and labels should be subreddits 
        data_test_agg += sr_data_test
        labels_test_agg += sr_labels_test
        
        
else:
    vectorizer = CountVectorizer(analyzer = "word", max_features = 1000, stop_words='english')
    #vectorizer = TfidfVectorizer(max_df=0.5, max_features=1000, min_df=2, stop_words='english')
    train_data_features = vectorizer.fit_transform(df.body.values)
    train_data_features = train_data_features.toarray()

    data_train, data_test, labels_train, labels_test = train_test_split(
    train_data_features, df[cls].values, test_size=0.33, random_state=1121)

# if its implicit eval of baseline model, for the testing rows, we only want insatnces of high score posts
if cls == 'subreddit_implicit':
    test_instances = zip(data_test, labels_test)
    test_instances = [(fv, sr) for fv, (score_class, sr) in test_instances if score_class == 2 ]
    data_test, labels_test = zip(*test_instances)
    labels_train = [sr for (score_class, sr) in labels_train]
    
        
    

Skipped news due to no high scoring posts in test data
Skipped animenews due to no high scoring posts in test data
Skipped betternews due to no training or testing instances


In [234]:
if cls == 'our_model':
    allCounts = np.zeros(shape=(len(mostFreqWords) ,len(sr_train)))
    subInd = 0
    sr_map = {}
    for sr in sr_train: 

        # get posts 
        posts = df[df.subreddit == sr].body

        # convert to all words
        allPosts = " ".join(posts)
        allWords = allPosts.split()
        allWords = [word.lower() for word in allWords]

        # count words 
        wordCounts = computeWordCounts(allWords)

        # get total words 
        totWords = float(len(allWords))
        #print totWords

        # get freqWordCounts 
        freqWordCounts = [wordCounts[word] if word in wordCounts else 0 for word in mostFreqWords ]
        freqWordCounts = [i/totWords for i in freqWordCounts]

        # concatenate 
        allCounts[:,subInd] = freqWordCounts

        sr_map[subInd] = sr
        subInd += 1
        
        
""" get pairwise distance """
normMax = np.max(allCounts, axis=1)
normCounts = np.transpose(np.divide(np.transpose(allCounts), normMax))
distVec = spatial.distance.pdist(np.transpose(normCounts), 'euclidean')
distMat = spatial.distance.squareform(distVec)

normCounts = np.transpose(normCounts)

n = 5
km = KMeans(n_clusters=n, init='k-means++', max_iter=100, n_init=10)               

print("Clustering data with %s" % km)
km.fit(normCounts)
print(km.labels_)

cluster = {i:set() for i in range(n)}
sr_cluster = {}
for idx in range(len(km.labels_)):
    cluster[km.labels_[idx]].add(sr_map[idx])
    sr_cluster[sr_map[idx]] = km.labels_[idx]
for label, st in cluster.iteritems():
    print label, st
     

Clustering data with KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=5, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)
[0 1 3 3 0 2 2 4 0 1 2 2 1 1 1 1]
0 set(['relationships', 'Divorce', 'Marriage'])
1 set(['TrueAnime', 'anime', 'news', 'nba', 'NBA_Draft', 'manga'])
2 set(['weightroom', 'weightlifting', 'Stronglifts5x5', 'Fitness'])
3 set(['fantasybball', 'animenews'])
4 set(['inthenews'])


In [235]:
print df[cls].unique()
df[cls].value_counts() * 100.0 /len(df)

[(1, 'Fitness') (1, 'nba') (1, 'news') (1, 'relationships') (2, 'Fitness')
 (2, 'relationships') (1, 'manga') (2, 'nba') (1, 'anime') (2, 'manga')
 (2, 'anime') (3, 'relationships') (1, 'Stronglifts5x5') (1, 'Divorce')
 (3, 'Fitness') (2, 'Marriage') (2, 'weightlifting') (2, 'weightroom')
 (2, 'fantasybball') (3, 'nba') (1, 'weightlifting') (3, 'anime')
 (2, 'Divorce') (1, 'animenews') (1, 'Marriage') (2, 'Stronglifts5x5')
 (1, 'TrueAnime') (2, 'inthenews') (1, 'inthenews') (1, 'weightroom')
 (2, 'TrueAnime') (1, 'NBA_Draft') (3, 'weightroom') (3, 'manga')
 (2, 'NBA_Draft') (1, 'fantasybball') (1, 'betternews') (3, 'news')
 (3, 'inthenews')]


(1, relationships)     31.063048
(1, Fitness)           24.297174
(1, nba)                8.988680
(2, relationships)      6.064922
(1, news)               5.926231
(1, anime)              4.633031
(2, anime)              2.912512
(1, manga)              2.571407
(2, nba)                2.413974
(3, relationships)      1.990404
(2, Fitness)            1.668041
(3, nba)                1.315691
(1, Divorce)            1.117025
(3, anime)              1.109528
(1, weightlifting)      0.940850
(2, manga)              0.753430
(3, Fitness)            0.502287
(1, Stronglifts5x5)     0.386086
(1, Marriage)           0.266137
(2, TrueAnime)          0.138691
(2, Marriage)           0.127446
(2, weightroom)         0.108704
(1, NBA_Draft)          0.104955
(2, Divorce)            0.104955
(1, inthenews)          0.089962
(1, TrueAnime)          0.086213
(2, weightlifting)      0.074968
(2, NBA_Draft)          0.052478
(2, Stronglifts5x5)     0.048729
(1, weightroom)         0.037484
(1, fantas

In [236]:
if cls == 'positive':
    all_classes = np.array([False, True])
elif cls == 'score_class' or cls == 'score_class_subreddit':
    all_classes = np.array([1, 2, 3])
elif cls == 'subreddit' or cls == 'subreddit_implicit':
    all_classes = np.array(['Fitness', 'nba', 'news', 'relationships', 'manga', 'anime', 'Stronglifts5x5',
 'Divorce', 'Marriage', 'weightlifting', 'weightroom', 'fantasybball',
 'animenews', 'TrueAnime', 'inthenews', 'NBA_Draft', 'betternews'])


if cls == 'our_model':
    # we need to train 2 models. subreddit classifier and the inndivual subreddit scorers
    subreddit_model = MultinomialNB(alpha=0.01)
    #labels_train_agg = [sr_cluster[sr] for sr in labels_train_agg]
    subreddit_model.fit(data_train_agg, labels_train_agg)
    
    sr_model = {}
    for sr, (data_train, labels_train) in sr_train.iteritems():
        model = MultinomialNB(alpha=0.01)
        model.fit(data_train, labels_train)
        sr_model[sr] = model
        

elif cls == 'score_class_subreddit':
    sr_model = {}
    for sr, split in sr_split.iteritems():
        data_train, data_test, labels_train, labels_test = split
        model = MultinomialNB(alpha=0.01)
        model.fit(data_train, labels_train)
        sr_model[sr] = model
else:
    model = MultinomialNB(alpha=0.01)
    #model = SGDClassifier()

    training = 'offline'
    if training == 'online':
        slice_size = len(train_data_features)/1000
        for start in range(0, len(train_data_features), slice_size):
            slice_train_data = data_train[start:start+slice_size]
            slice_label_data = labels_train[start:start+slice_size]
            model.partial_fit(slice_train_data, slice_label_data, classes=all_classes)
    else:
        model.fit(data_train, labels_train)


In [230]:
def our_model(subreddit_model, cluster, sr_model, data_test_agg):
    predicted = []
    for f in data_test_agg:
        label = subreddit_model.predict([f])[0]
#         sr = subreddit_model.predict([f])[0]
#         for label, st in cluster.iteritems():
#             if sr in st:
#                 break
        best_sr = ''
        best_score = 0
        for sr in cluster[label]:
            score = sr_model[sr].predict([f])[0]
            if score > best_score:
                best_score = score
                best_sr = sr
                
        predicted.append(best_sr)
    return predicted
                

if cls == 'our_model':
    predicted = our_model(subreddit_model, cluster, sr_model, data_test_agg)
    correct = 0
    for i in range(len(predicted)):
        if predicted[i] == labels_test_agg[i]:
            correct += 1
    print correct * 100.0 / len(predicted)
elif cls == 'score_class_subreddit':
    for sr, split in sr_split.iteritems():
        data_train, data_test, labels_train, expected = split
        model = sr_model[sr]
        predicted = model.predict(data_test)
        print 'Results fro subreddit classifier for ' + sr
        print(metrics.classification_report(expected, predicted))
        print(metrics.confusion_matrix(expected, predicted))
else:
    expected = labels_test
    predicted = model.predict(data_test)
    if cls == 'subreddit_implicit':
        correct = 0
        for i in range(len(predicted)):
            if predicted[i] == expected[i]:
                correct += 1
        print correct * 100.0 / len(predicted)
    else:
        # summarize the fit of the model
        print(metrics.classification_report(expected, predicted))
        print(metrics.confusion_matrix(expected, predicted))

46.5944272446


In [237]:
#x = [sr_cluster[sr] for sr in labels_test_agg]
x = labels_test_agg
p = subreddit_model.predict(data_test_agg)
correct = 0
for i in range(len(predicted)):
    if p[i] == x[i]:
        correct += 1
print correct * 100.0 / len(p)

80.8823529412


In [None]:
get_n_frequent_words(df, 20)

In [None]:
word_count = {}
for word in vectorizer.get_feature_names():
    word_count[word] = all_words.count(word)

word_count

In [None]:
vectorizer = CountVectorizer(analyzer = "word", max_features = 10, stop_words='english')
#vectorizer = TfidfVectorizer(max_df=0.5, max_features=1000, min_df=2, stop_words='english')
x = vectorizer.fit_transform(['i am just testing how this thing', 'works. whether or not', 'it stores state every time i do', 'a fit transform'])

print vectorizer.get_feature_names()
print x.toarray()
x = vectorizer.fit_transform(['fit fit fit fit', 'works. every every every or not', 'it stores state every time i do', 'a fit transform'])
print vectorizer.get_feature_names()
print x.toarray()

In [None]:
nb_model = MultinomialNB(alpha=0.01)
nb_model.fit(data_train, labels_train)

expected = labels_test
predicted = nb_model.predict(data_test)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

In [None]:
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier
df['train'] = zip(df.body, df.positive)
cl = NaiveBayesClassifier(df.train.values)


In [None]:
count = {}
for w in all_words:
    count[w] = count.setdefault(w,0) + 1

features = sorted(count, key=count.get, reverse=True)[:2000]
feature_map = {features[i]:i for i in range(len(features))}

def generate_training_tuple(row):
    words = set(row.body.split())
    features_set = {}
    for feature in features:
        features_set[feature_map[feature]] = (feature in words)
    return features_set, row.positive

data = df.apply(generate_training_tuple, axis=1)


In [None]:
s='''news
inthenews
worldnews
betternews
Fitness
weightlifting
weightroom
Stronglifts5x5
relationships
Divorce
Marriage
Dating
nba
NBA_Draft
fantasybball
wtfnba
anime
animenews
TrueAnime
manga'''
for sr in s.split('\n'):
    print 'select count(*) from posts where subreddit="' + sr + '";'

In [None]:
s='''news
inthenews
worldnews
betternews
fitness
weightlifting
weightroom
Stronglifts5x5
relationships
Divorce
Marriage
Dating
nba
NBA_Draft
fantasybball
wtfnba
anime
animenews
TrueAnime
manga'''
"' or subreddit='".join(s.split('\n'))

################################################
# OLDER CLEAN CODE WIHTOUT LEMMATIZATION
# cleaning rows with null body
df = df[df.selftext.notnull()]

# clean out comments which were deleted
df = df[df.selftext != '[deleted]']

# select text only posts
df = df[df.apply(lambda r: r.domain[:5] == 'self.', axis=1)]

stop_words = set(nltk.corpus.stopwords.words("english"))
all_words = []
def clean_body(row):
    # clean unprintable chars
    body = filter(lambda x: x in string.printable, row.selftext)
    body = re.sub("[^a-zA-Z]", " ", body )
    words = body.lower().split()   
    # remove stop words
    words = [w for w in words if (w not in stop_words) and (len(w) > 3)]
    all_words.extend(words)
    return " ".join( words )

df['body'] = df.apply(clean_body, axis=1)




#########################################
# code for extracting features
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer(analyzer = "word", max_features = 1000) 

train_data_features = vectorizer.fit_transform(df.body.values)

train_data_features = train_data_features.toarray()



# top 20 words with this method
like      41581
would     31031
don't     27578
know      26772
really    26746
want      23404
time      21831
even      18897
it's      18772
feel      18115
game      17866
think     17726
i've      15449
going     15015
people    14642
still     14501
said      13889
could     13800
never     13721
much      13622


#### results for topic modelling with most 1000 tf-idf features

             precision    recall  f1-score   support

    Divorce       0.80      0.04      0.07       102
    Fitness       0.89      0.94      0.91      2348
   Marriage       0.00      0.00      0.00        29
  NBA_Draft       0.00      0.00      0.00        18
Stronglifts5x5       0.00      0.00      0.00        29
  TrueAnime       0.00      0.00      0.00        23
      anime       0.80      0.74      0.77       780
  animenews       0.00      0.00      0.00         1
 betternews       0.00      0.00      0.00         1
fantasybball       0.00      0.00      0.00         7
  inthenews       0.00      0.00      0.00         8
      manga       0.91      0.50      0.65       318
        nba       0.90      0.75      0.82      1085
       news       0.86      0.48      0.62       526
relationships       0.81      1.00      0.89      3424
weightlifting       1.00      0.07      0.14        94
 weightroom       1.00      0.09      0.17        11

avg / total       0.84      0.84      0.82      8804


#### results for topic modelling with 1000 most frequent words

             precision    recall  f1-score   support

    Divorce       0.41      0.63      0.50       102
    Fitness       0.91      0.87      0.89      2348
   Marriage       0.22      0.17      0.19        29
  NBA_Draft       0.18      0.11      0.14        18
Stronglifts5x5       0.03      0.07      0.04        29
  TrueAnime       0.58      0.65      0.61        23
      anime       0.82      0.60      0.69       780
  animenews       0.00      0.00      0.00         1
 betternews       0.00      0.00      0.00         1
fantasybball       0.00      0.00      0.00         7
  inthenews       0.25      0.12      0.17         8
      manga       0.58      0.74      0.65       318
        nba       0.86      0.76      0.81      1085
       news       0.83      0.61      0.70       526
relationships       0.86      0.97      0.91      3424
weightlifting       0.37      0.37      0.37        94
 weightroom       0.14      0.27      0.18        11

avg / total       0.84      0.83      0.83      8804


#######################################
##### SCORE CLSSIFER WITH 1000 TF IDF
             precision    recall  f1-score   support

          1       0.82      0.99      0.90      7079
          2       0.61      0.07      0.12      1287
          3       0.44      0.14      0.21       438

avg / total       0.77      0.81      0.75      8804

[[7018   39   22]
 [1144   85   58]
 [ 361   15   62]]

### WITH 100 TFIDF FEATURES
             precision    recall  f1-score   support

          1       0.81      1.00      0.89      7079
          2       0.69      0.02      0.05      1287
          3       0.00      0.00      0.00       438

avg / total       0.75      0.81      0.73      8804

[[7071    6    2]
 [1249   31    7]
 [ 430    8    0]]


###### WITH MOST FREQUENT 1000 WORDS
             precision    recall  f1-score   support

          1       0.85      0.86      0.86      7079
          2       0.29      0.23      0.26      1287
          3       0.26      0.37      0.30       438

avg / total       0.74      0.74      0.74      8804

[[6077  676  326]
 [ 840  296  151]
 [ 214   60  164]]

# MOST FREQUENT 100 WORDS
             precision    recall  f1-score   support

          1       0.84      0.89      0.86      7079
          2       0.25      0.20      0.22      1287
          3       0.33      0.18      0.24       438

avg / total       0.73      0.75      0.74      8804

[[6308  676   95]
 [ 961  259   67]
 [ 267   91   80]]


# subreddit classifiers with diff features 
Results fro subreddit classifier for relationships
             precision    recall  f1-score   support

          1       0.86      0.78      0.82      2735
          2       0.22      0.27      0.24       535
          3       0.30      0.53      0.38       174

avg / total       0.73      0.69      0.71      3444

[[2140  457  138]
 [ 307  144   84]
 [  38   43   93]]
Results fro subreddit classifier for TrueAnime
             precision    recall  f1-score   support

          1       0.75      0.33      0.46         9
          2       0.62      0.91      0.74        11

avg / total       0.68      0.65      0.62        20

[[ 3  6]
 [ 1 10]]
Results fro subreddit classifier for fantasybball
             precision    recall  f1-score   support

          1       0.00      0.00      0.00         3
          2       0.25      0.50      0.33         2

avg / total       0.10      0.20      0.13         5

[[0 3]
 [1 1]]
Results fro subreddit classifier for animenews
             precision    recall  f1-score   support

          1       1.00      1.00      1.00         1

avg / total       1.00      1.00      1.00         1

[[1]]
Results fro subreddit classifier for Divorce
             precision    recall  f1-score   support

          1       0.94      1.00      0.97       102
          2       0.00      0.00      0.00         6

avg / total       0.89      0.94      0.92       108

[[102   0]
 [  6   0]]
Results fro subreddit classifier for weightroom
             precision    recall  f1-score   support

          1       1.00      0.17      0.29         6
          2       0.54      1.00      0.70         7
          3       0.00      0.00      0.00         1

avg / total       0.70      0.57      0.47        14

[[1 5 0]
 [0 7 0]
 [0 1 0]]
Results fro subreddit classifier for Stronglifts5x5
             precision    recall  f1-score   support

          1       0.89      0.94      0.92        35
          2       0.00      0.00      0.00         4

avg / total       0.80      0.85      0.82        39

[[33  2]
 [ 4  0]]
Results fro subreddit classifier for inthenews
             precision    recall  f1-score   support

          1       0.86      0.75      0.80         8
          2       0.33      1.00      0.50         1
          3       0.00      0.00      0.00         1

avg / total       0.72      0.70      0.69        10

[[6 2 0]
 [0 1 0]
 [1 0 0]]
Results fro subreddit classifier for Marriage
             precision    recall  f1-score   support

          1       0.68      1.00      0.81        23
          2       1.00      0.08      0.15        12

avg / total       0.79      0.69      0.58        35

[[23  0]
 [11  1]]
Results fro subreddit classifier for anime
             precision    recall  f1-score   support

          1       0.66      0.88      0.75       389
          2       0.64      0.31      0.41       272
          3       0.46      0.50      0.48       101

avg / total       0.62      0.63      0.60       762

[[343  32  14]
 [143  83  46]
 [ 35  15  51]]
Results fro subreddit classifier for Fitness
             precision    recall  f1-score   support

          1       0.92      0.92      0.92      2124
          2       0.08      0.09      0.09       148
          3       0.16      0.12      0.14        59

avg / total       0.85      0.85      0.85      2331

[[1961  131   32]
 [ 129   14    5]
 [  31   21    7]]
Results fro subreddit classifier for weightlifting
             precision    recall  f1-score   support

          1       0.93      0.96      0.95        84
          2       0.00      0.00      0.00         6

avg / total       0.87      0.90      0.88        90

[[81  3]
 [ 6  0]]
Results fro subreddit classifier for news
             precision    recall  f1-score   support

          1       1.00      0.98      0.99       523
          3       0.00      0.00      0.00         0

avg / total       1.00      0.98      0.99       523

[[513  10]
 [  0   0]]
Results fro subreddit classifier for nba
             precision    recall  f1-score   support

          1       0.74      0.81      0.77       790
          2       0.23      0.17      0.19       222
          3       0.35      0.29      0.31       108

avg / total       0.60      0.63      0.61      1120

[[641 108  41]
 [168  37  17]
 [ 60  17  31]]
Results fro subreddit classifier for NBA_Draft
             precision    recall  f1-score   support

          1       0.71      0.45      0.56        11
          2       0.14      0.33      0.20         3

avg / total       0.59      0.43      0.48        14

[[5 6]
 [2 1]]
Results fro subreddit classifier for manga
             precision    recall  f1-score   support

          1       0.82      0.87      0.85       231
          2       0.38      0.29      0.33        63
          3       0.00      0.00      0.00         1

avg / total       0.72      0.75      0.73       295

[[202  29   0]
 [ 44  18   1]
 [  1   0   0]]


##### RESULTS WITH SAME FEATURES FOR ALL SR CLASSIFIERS
Results fro subreddit classifier for relationships
             precision    recall  f1-score   support

          1       0.86      0.79      0.82      2735
          2       0.22      0.25      0.23       535
          3       0.30      0.53      0.38       174

avg / total       0.73      0.69      0.71      3444

[[2151  449  135]
 [ 314  136   85]
 [  40   41   93]]
Results fro subreddit classifier for TrueAnime
             precision    recall  f1-score   support

          1       0.75      0.33      0.46         9
          2       0.62      0.91      0.74        11

avg / total       0.68      0.65      0.62        20

[[ 3  6]
 [ 1 10]]
Results fro subreddit classifier for fantasybball
             precision    recall  f1-score   support

          1       0.00      0.00      0.00         3
          2       0.25      0.50      0.33         2

avg / total       0.10      0.20      0.13         5

[[0 3]
 [1 1]]
Results fro subreddit classifier for animenews
             precision    recall  f1-score   support

          1       1.00      1.00      1.00         1

avg / total       1.00      1.00      1.00         1

[[1]]
Results fro subreddit classifier for Divorce
             precision    recall  f1-score   support

          1       0.94      0.99      0.97       102
          2       0.00      0.00      0.00         6

avg / total       0.89      0.94      0.91       108

[[101   1]
 [  6   0]]
Results fro subreddit classifier for weightroom
             precision    recall  f1-score   support

          1       0.00      0.00      0.00         6
          2       0.50      1.00      0.67         7
          3       0.00      0.00      0.00         1

avg / total       0.25      0.50      0.33        14

[[0 6 0]
 [0 7 0]
 [0 1 0]]
Results fro subreddit classifier for Stronglifts5x5
             precision    recall  f1-score   support

          1       0.89      0.97      0.93        35
          2       0.00      0.00      0.00         4

avg / total       0.80      0.87      0.84        39

[[34  1]
 [ 4  0]]
Results fro subreddit classifier for inthenews
             precision    recall  f1-score   support

          1       0.80      0.50      0.62         8
          2       0.20      1.00      0.33         1
          3       0.00      0.00      0.00         1

avg / total       0.66      0.50      0.53        10

[[4 4 0]
 [0 1 0]
 [1 0 0]]
Results fro subreddit classifier for Marriage
             precision    recall  f1-score   support

          1       0.66      0.91      0.76        23
          2       0.33      0.08      0.13        12

avg / total       0.55      0.63      0.55        35

[[21  2]
 [11  1]]
Results fro subreddit classifier for anime
             precision    recall  f1-score   support

          1       0.67      0.90      0.77       389
          2       0.67      0.34      0.45       272
          3       0.48      0.49      0.48       101

avg / total       0.65      0.65      0.62       762

[[350  27  12]
 [137  93  42]
 [ 34  18  49]]
Results fro subreddit classifier for Fitness
             precision    recall  f1-score   support

          1       0.92      0.93      0.93      2124
          2       0.10      0.09      0.09       148
          3       0.14      0.12      0.13        59

avg / total       0.85      0.86      0.85      2331

[[1984  103   37]
 [ 130   13    5]
 [  38   14    7]]
Results fro subreddit classifier for weightlifting
             precision    recall  f1-score   support

          1       0.93      1.00      0.97        84
          2       0.00      0.00      0.00         6

avg / total       0.87      0.93      0.90        90

[[84  0]
 [ 6  0]]
Results fro subreddit classifier for news
             precision    recall  f1-score   support

          1       1.00      0.99      0.99       523
          3       0.00      0.00      0.00         0

avg / total       1.00      0.99      0.99       523

[[516   7]
 [  0   0]]
Results fro subreddit classifier for nba
             precision    recall  f1-score   support

          1       0.74      0.86      0.79       790
          2       0.21      0.11      0.14       222
          3       0.42      0.37      0.39       108

avg / total       0.61      0.66      0.63      1120

[[676  78  36]
 [179  24  19]
 [ 57  11  40]]
Results fro subreddit classifier for NBA_Draft
             precision    recall  f1-score   support

          1       0.80      0.36      0.50        11
          2       0.22      0.67      0.33         3

avg / total       0.68      0.43      0.46        14

[[4 7]
 [1 2]]
Results fro subreddit classifier for manga
             precision    recall  f1-score   support

          1       0.82      0.86      0.84       231
          2       0.37      0.30      0.33        63
          3       0.00      0.00      0.00         1

avg / total       0.72      0.74      0.73       295

[[199  32   0]
 [ 44  19   0]
 [  0   1   0]]