In [1]:
# import warnings filter
from warnings import simplefilter
# ignore all warnings
# NOTE(review): category=Warning is the base class of every warning, so this
# also hides deprecation/future warnings emitted by the libraries used below.
simplefilter(action='ignore', category=Warning)

import numpy as np
import pandas as pd

#import sklearn algorithms
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, ComplementNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import LinearSVC
from sklearn import metrics

#useful text storage and processing imports
import re
import nltk
import unicodedata
from string import punctuation
from nltk import corpus

#might need to download if necessary
#nltk.download("stopwords")
#nltk.download("porter_test")
#nltk.download("punkt")
#nltk.download("wordnet")

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Shared NLTK objects used by the text-cleaning code: the English stop-word
# list, a WordNet lemmatizer, and Porter / Snowball (English) stemmers.
# In the cells shown here only `stop_words` and `sStemmer` are used by live
# code; `pStemmer` appears only in commented-out lines.
stop_words = stopwords.words("english")
lemmatizer = WordNetLemmatizer()
pStemmer = PorterStemmer()
sStemmer = SnowballStemmer("english")

In [2]:
# Subreddit name -> integer class id. Ids are 1-based and follow the order of
# the names listed here.
_SUBREDDIT_NAMES = (
    "anime",
    "AskReddit",
    "baseball",
    "canada",
    "conspiracy",
    "europe",
    "funny",
    "gameofthrones",
    "GlobalOffensive",
    "hockey",
    "leagueoflegends",
    "movies",
    "Music",
    "nba",
    "nfl",
    "Overwatch",
    "soccer",
    "trees",
    "worldnews",
    "wow",
)
labels = {name: code for code, name in enumerate(_SUBREDDIT_NAMES, start=1)}

def clean_text(text):
    """Normalize a raw comment into a space-separated string of stemmed tokens.

    Steps, in order:
      1. lowercase the text;
      2. strip HTML tags, URLs, ampersand entities and line breaks;
      3. drop digit and ASCII punctuation characters;
      4. tokenize, drop English stop words, and Snowball-stem what is left.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.

    Notes
    -----
    Relies on the module-level ``stop_words`` list and ``sStemmer`` stemmer.
    """
    lowered = text.lower()

    # HTML tags.
    cleaned = re.sub(r'<[^<]+?>', '', lowered)
    # URLs: match only the URL token itself (\S+). The previous pattern used a
    # greedy `.*`, which also deleted every word that followed a URL on the
    # same line.
    cleaned = re.sub(r'https?://\S+|www\.\S+', '', cleaned)
    # Ampersand entities (e.g. "&amp") and stray ampersands.
    cleaned = re.sub(r'\s&\w+|&\w+|&', "", cleaned)
    # Line breaks become plain spaces so words on adjacent lines stay separate.
    cleaned = re.sub(r'\n', ' ', cleaned)

    # Drop digits and ASCII punctuation in a single pass.
    pure_text = ''.join(
        c for c in cleaned if not c.isdigit() and c not in punctuation
    )

    # Tokenize once; a set makes each stop-word membership test O(1).
    # NOTE(review): punctuation is stripped before this point, so stop words
    # that contain an apostrophe can no longer match here - confirm intended.
    stop_set = set(stop_words)
    kept_tokens = [tok for tok in nltk.word_tokenize(pure_text) if tok not in stop_set]

    return ' '.join(sStemmer.stem(tok) for tok in kept_tokens)

def k_fold(dataframe, k):
    """Split ``dataframe`` into ``k`` contiguous chunks and build train/test pairs.

    Rows are taken in their existing order (no shuffling). When the row count
    is not divisible by ``k``, the first ``rows % k`` chunks get one extra row.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to split.
    k : int
        Number of folds; must be at least 1.

    Returns
    -------
    list of pandas.DataFrame
        A flat list of length ``2 * k`` laid out as
        ``[train_0, test_0, train_1, test_1, ...]``. For fold ``i`` the
        training frame is chunks ``i, i+1, ..., i+k-2`` (wrapping around)
        concatenated with a fresh 0..n-1 index, and the test frame is the
        remaining chunk ``(i + k - 1) % k`` with its original index.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer, got {!r}".format(k))

    n_rows = dataframe.shape[0]
    base_size, extra = divmod(n_rows, k)

    # Cut the frame into k contiguous chunks.
    chunks = []
    start = 0
    for i in range(k):
        size = base_size + 1 if i < extra else base_size
        chunks.append(dataframe.iloc[start:start + size])
        start += size

    folds = []
    for i in range(k):
        train_parts = [chunks[i]] + [chunks[(i + j) % k] for j in range(1, k - 1)]
        if len(train_parts) == 1:
            # k <= 2: the training set is a single chunk, returned as-is.
            train = train_parts[0]
        else:
            # DataFrame.append was removed in pandas 2.0; pd.concat is the
            # supported equivalent and joins all parts in one call.
            train = pd.concat(train_parts, ignore_index=True)
        folds.append(train)
        folds.append(chunks[(i + k - 1) % k])
    return folds

def main(classes, train_path="reddit_train.csv", n_folds=5, random_state=None):
    """Cross-validate several text classifiers and print their mean accuracy.

    Loads the training CSV, drops rows with a duplicated ``id``, encodes the
    ``subreddits`` column with ``classes``, splits the data with ``k_fold``
    and, for every fold, fits each classifier on TF-IDF features of the
    ``comments`` column. A ``-`` is printed after each fold and the mean
    accuracy (in percent) of every model is printed at the end.

    Parameters
    ----------
    classes : dict
        Mapping from subreddit name to integer class id.
    train_path : str, optional
        Path of the training CSV (columns ``id``, ``comments``, ``subreddits``
        are read).
    n_folds : int, optional
        Number of cross-validation folds. The loop bounds and the averaging
        divisor are both derived from this value.
    random_state : int or None, optional
        Passed to the estimators that accept it; ``None`` keeps the
        estimators' default behaviour.

    Returns
    -------
    None
    """
    train_data = pd.read_csv(train_path)
    # `subset` is documented as a column label or a sequence of labels; a list
    # is used instead of an unordered set.
    train_data = train_data.drop_duplicates(subset=["id"])
    train_data = train_data.drop(['id'], axis=1)
    train_data['subreddits'] = train_data['subreddits'].map(classes)

    # Optional preprocessing, currently disabled:
    #train_data['comments'] = train_data['comments'].apply(clean_text)
    #train_data.to_csv("reddit_train_clean.csv")

    # k_fold returns [train_0, test_0, train_1, test_1, ...].
    folds = k_fold(train_data, n_folds)

    # (report label, estimator factory), listed in the order models are fitted.
    model_factories = [
        ("LR", lambda: LogisticRegression(random_state=random_state)),
        ("CNB", lambda: ComplementNB(alpha=0.81)),
        ("MNB", lambda: MultinomialNB(alpha=0.17)),
        ("SVC", lambda: LinearSVC(random_state=random_state)),
        # Decision trees are horrible for text classification
        ("DT", lambda: DecisionTreeClassifier(random_state=random_state)),
    ]
    accuracy_totals = {name: 0 for name, _ in model_factories}

    for fold in range(n_folds):
        train_fold = folds[2 * fold]
        test_fold = folds[2 * fold + 1]
        X_train = train_fold['comments']
        X_test = test_fold['comments']
        y_train = train_fold['subreddits']
        y_test = test_fold['subreddits']

        # The vectorizer is fitted on the training fold only, so the test
        # fold never influences the vocabulary or IDF weights.
        tf_idf_vectorizer = TfidfVectorizer(smooth_idf=True, sublinear_tf=True, norm='l2', max_df=0.42, strip_accents='unicode')
        vectors_train_idf = tf_idf_vectorizer.fit_transform(X_train)
        vectors_test_idf = tf_idf_vectorizer.transform(X_test)

        for name, make_model in model_factories:
            clf = make_model()
            clf.fit(vectors_train_idf, y_train)
            y_pred = clf.predict(vectors_test_idf)
            accuracy_totals[name] = accuracy_totals[name] + metrics.accuracy_score(y_test, y_pred)
        print("-")

    # Report order is kept as LR, CNB, SVC, DT, MNB.
    for name in ("LR", "CNB", "SVC", "DT", "MNB"):
        print(name, 100 * accuracy_totals[name] / n_folds)

# Run the cross-validated model comparison using the subreddit label encoding.
main(labels)

-
-
-
-
-
LR 54.36142857142856
CNB 58.19571428571429
SVC 55.33142857142857
DT 26.192857142857143
MNB 57.261428571428574


In [3]:
def decode_labels(num):
    """Translate an integer class id back into its subreddit name.

    Ids are 1-based and follow the order of the names listed below
    (1 -> "anime", ..., 20 -> "wow"). Any value that equals none of the ids
    yields ``None``.
    """
    subreddit_names = (
        "anime",
        "AskReddit",
        "baseball",
        "canada",
        "conspiracy",
        "europe",
        "funny",
        "gameofthrones",
        "GlobalOffensive",
        "hockey",
        "leagueoflegends",
        "movies",
        "Music",
        "nba",
        "nfl",
        "Overwatch",
        "soccer",
        "trees",
        "worldnews",
        "wow",
    )
    # Compare against each id in ascending order and return the first match.
    for code, name in enumerate(subreddit_names, start=1):
        if num == code:
            return name
    return None

In [4]:
# Train ComplementNB on the full de-duplicated training set and write the
# decoded test-set predictions to prediction.csv.
train_data = pd.read_csv("reddit_train.csv")
test_data = pd.read_csv("reddit_test.csv")
# `subset` takes a column label or a sequence of labels, so pass a list
# rather than an unordered set.
train_data = train_data.drop_duplicates(subset=["id"])
train_data = train_data.drop(['id'], axis=1)
test_data = test_data.drop(['id'], axis=1)
train_data['subreddits'] = train_data['subreddits'].map(labels)

X_train = train_data['comments']
X_test = test_data['comments']
y_train = train_data['subreddits']

# Fit the vectorizer on the training comments only, then reuse it for the test set.
tf_idf_vectorizer = TfidfVectorizer(smooth_idf=True, sublinear_tf=True, norm='l2', max_df=0.42, strip_accents='unicode')
vectors_train_idf = tf_idf_vectorizer.fit_transform(X_train)
vectors_test_idf = tf_idf_vectorizer.transform(X_test)

clf = ComplementNB(alpha = 0.81)
clf.fit(vectors_train_idf, y_train)
y_pred = clf.predict(vectors_test_idf)

# Decode every predicted class id in one pass and build the frame from the
# decoded names. The previous row-by-row `y_pred[0][i] = ...` loop used
# chained-indexing assignment, which is not guaranteed to write through to
# the frame, and it stored strings into an integer column.
y_pred = pd.DataFrame([decode_labels(code) for code in y_pred])

# Same layout as before: default integer index plus a single column named 0.
# to_csv returns None when given a path, so its result is not bound to a name.
y_pred.to_csv('prediction.csv')