<a href="https://colab.research.google.com/github/ebenajayi/Reddit-Comment-Classification/blob/main/Project_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Notebook-wide imports and session setup. Besides importing, this cell
# changes global state: plotting style, notebook width, warning filters,
# pandas float formatting, NLTK data on disk and NumPy's random seed.
import numpy as np
import pandas as pd
import time
import re 

'''Features'''
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import label_binarize
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import chi2, SelectKBest

'''Classifiers'''
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

'''Metrics/Evaluation'''
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, roc_auc_score
# NOTE(review): `scipy.interp` is a deprecated alias in recent SciPy releases;
# confirm the installed version still provides it.
from scipy import interp
from itertools import cycle

'''Plotting'''
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('darkgrid')

'''Display'''
from IPython.core.display import display, HTML
# Inject CSS so the notebook container uses 95% of the browser width.
display(HTML("<style>.container { width:95% !important; }</style>"))
import warnings
# Silences every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# Render floats with a thousands separator and two decimals in pandas output.
pd.options.display.float_format = '{:,.2f}'.format

# The names below were already imported above (vectorisers and model-selection
# helpers); the 'Classifiers' label is only a section marker.
'''Classifiers'''
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.feature_extraction.text import TfidfTransformer  
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import GridSearchCV

'''Lemmatizer'''
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

'''Stemmer'''
from nltk.stem.snowball import SnowballStemmer
#from nltk.stem import PorterStemmer

'''Progress  Bar'''
from tqdm import tqdm
# Adds the progress_* methods (e.g. progress_map) to pandas objects.
tqdm.pandas(desc="progress-bar")

'''Stopwords Corpus'''
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords, wordnet
# English stop words as a set, for fast membership tests.
stop_words = set(stopwords.words('english'))


import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc

from nlpaug.util import Action

# Seed NumPy's global RNG so steps that draw from it are repeatable when the
# cells run in the same order.
np.random.seed(1)

[nltk_data] Downloading package stopwords to /home/hamza/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Data Import

In [None]:
# Load the training and test CSV files from the current working directory.
# Later cells read the 'body' and 'subreddit' columns of `df` and the 'body'
# column of `test`.
df = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [None]:
from collections import Counter

# Comment text (features) and subreddit names (labels) for training, plus the
# comment text of the test set.
X_datatrain = df['body']
y_datatrain = df['subreddit']
X_datatest = test['body']
# Show the distinct subreddit labels found in the training data.
print(y_datatrain.unique())

['rpg' 'anime' 'datascience' 'hardware' 'cars' 'gamernews' 'gamedev'
 'computers']


# Preprocessing

## Drop Duplicates

In [None]:
# Keep only the first occurrence of every distinct comment body.
df = df.drop_duplicates(subset='body', keep='first')

# X_datatrain / y_datatrain were taken from `df` before this cell, so they
# still hold the duplicated rows. Re-derive them here, otherwise the
# de-duplication never reaches the data that is cleaned and trained on.
X_datatrain = df['body']
y_datatrain = df['subreddit']

## Filter Functions

In [None]:
# Function to remove noise from a raw comment and normalise it to plain words
def cleaning_data(comment):
    """Return ``comment`` reduced to lower-case ASCII words of two or more letters.

    Steps, in order: drop non-ASCII characters, lower-case, turn every
    character other than letters, digits, ``!``, ``?``, ``'`` and backtick
    into a space, apply a few token rewrites, split words around ``!``/``?``,
    remove digits and remaining punctuation, and finally drop one-letter
    words.

    Parameters
    ----------
    comment : str
        Raw comment text. ``None`` and float NaN (what pandas yields for an
        empty CSV cell) are treated as an empty comment; any other
        non-string value is converted with ``str()``.

    Returns
    -------
    str
        The cleaned words joined by single spaces (possibly empty).
    """
    # Missing values would otherwise make re.sub() raise TypeError.
    if comment is None or (isinstance(comment, float) and comment != comment):
        return ""
    comment = str(comment)

    # Remove non-ASCII characters (non-breaking spaces become plain spaces first)
    comment = re.sub(r"\xa0", r" ", comment)
    comment = comment.encode("ascii", errors="ignore").decode()

    # Lower case the data
    comment = comment.lower()

    # Replace special characters with spaces, keeping ! ? ' ` for the steps below
    comment = re.sub(r"[^A-Za-z0-9!?\'\`]", " ", comment)

    # Token rewrites. The '.' in "youtu.be" is deliberately left unescaped:
    # the original dot has already been turned into a space above, so the
    # wildcard is what lets "youtu be" match.
    comment = re.sub(r"youtu.be", " youtube", comment)
    comment = re.sub(r"https", " http", comment)
    comment = re.sub(r"i'm", " i am", comment)
    comment = re.sub(r"\'s", " ", comment)

    # Pad ! and ? with spaces so the words on either side stay separate once
    # punctuation is stripped.
    comment = re.sub(r"!", " ! ", comment)
    comment = re.sub(r"\?", " ? ", comment)

    # Removing all the numbers
    comment = re.sub(r"[0-9]+", " ", comment)

    # Removing all remaining punctuation (this joins e.g. "don't" into "dont")
    comment = re.sub(r"[^\w\s]", "", comment)

    # split() already collapses runs of whitespace, so no separate
    # space-squeezing pass is needed; single-letter words are dropped.
    return " ".join(word for word in comment.split() if len(word) >= 2)
  
# Strip stop words from an already-cleaned comment
def stopwords_data(comment):
    """Return ``comment`` without English stop words and a few extra tokens.

    The extra tokens are added to the module-level ``stop_words`` set for the
    lookup only; ``stop_words`` itself is not modified.
    """
    extra_tokens = ("wa", "gt", "amp", "u", "ha", "le", "doe", "don", 've')
    blocked = stop_words.union(extra_tokens)
    kept = []
    for token in comment.split():
        if token in blocked:
            continue
        kept.append(token)
    return ' '.join(kept)

In [None]:
# Clean a whole collection of comments while showing a progress bar
def process_data(data):
    """Apply ``preprocess_data`` to every element of ``data`` via tqdm's ``progress_map``."""
    return data.progress_map(preprocess_data)

# Full preprocessing of a single comment
def preprocess_data(document):
    """Clean ``document`` and then remove its stop words.

    Only these two stages run; no lemmatisation or stemming is applied.
    """
    cleaned = cleaning_data(document)
    return stopwords_data(cleaned)

In [None]:
# Run the preprocessing pipeline over the training comments, then over the
# test comments; each pass shows its own tqdm progress bar.
print("Cleaning of Training data on process...")
cleandata_Train = process_data(X_datatrain)
print("Task completed")
print("Cleaning of Testing data on process...")
cleandata_Test = process_data(X_datatest)
print("Task completed")

progress-bar:   6%|▌         | 707/11582 [00:00<00:03, 3513.61it/s]

Cleaning of Training data on process...


progress-bar: 100%|██████████| 11582/11582 [00:02<00:00, 4274.62it/s]
progress-bar:  16%|█▌        | 452/2898 [00:00<00:00, 4500.51it/s]

Task completed
Cleaning of Testing data on process...


progress-bar: 100%|██████████| 2898/2898 [00:00<00:00, 4776.27it/s]

Task completed





In [None]:
# Hold out 10% of the cleaned training comments for validation, stratified so
# each subreddit keeps its share in both parts. No random_state is passed, so
# the split follows NumPy's global RNG state.
X_train, X_val, y_train, y_val = train_test_split(cleandata_Train, y_datatrain, test_size=0.1, stratify=y_datatrain)
# The cleaned test comments are used as they are.
X_test = cleandata_Test

In [None]:
# Disabled experiment: synonym-based augmentation of the training split with
# nlpaug. The code sits inside a string literal, so it is never executed; the
# cell only evaluates (and displays) the string.
"""from random import shuffle

def augment_data(data, n=3):
    aug = naw.SynonymAug(aug_src='wordnet')
    #aug = naw.AntonymAug()
        
    augmented_text = []

    for i in range(len(data)):
        tmp = aug.augment(data[i][0], n=n)
        for j in range(len(tmp)):
            augmented_text.append([tmp[j], data[i][1]])
        augmented_text.append(data[i].tolist())
    return augmented_text

augmented_data = augment_data(np.column_stack((X_train, y_train)), n=3)

shuffle(augmented_data)
augmented_data = np.array(augmented_data)
X_train, y_train = augmented_data[:,0], augmented_data[:,1]"""

"from random import shuffle\n\ndef augment_data(data, n=3):\n    aug = naw.SynonymAug(aug_src='wordnet')\n    #aug = naw.AntonymAug()\n        \n    augmented_text = []\n\n    for i in range(len(data)):\n        tmp = aug.augment(data[i][0], n=n)\n        for j in range(len(tmp)):\n            augmented_text.append([tmp[j], data[i][1]])\n        augmented_text.append(data[i].tolist())\n    return augmented_text\n\naugmented_data = augment_data(np.column_stack((X_train, y_train)), n=3)\n\nshuffle(augmented_data)\naugmented_data = np.array(augmented_data)\nX_train, y_train = augmented_data[:,0], augmented_data[:,1]"

In [None]:
import nltk

# Download the NLTK resources named below; packages already present are
# reported as up to date.
# NOTE(review): the active cells visible here do not appear to use these
# resources - confirm whether they are still needed.
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/hamza/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/hamza/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/hamza/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import wordnet

normalizer_train = Normalizer()

# scikit-learn documents `stop_words` as 'english', a list, or None, and
# recent releases reject a set during parameter validation, so the set is
# converted here. sorted() makes the list order deterministic.
tf_idf_vectorizer = TfidfVectorizer(stop_words=sorted(stop_words))

# Learn the vocabulary and IDF weights from the training split only, then
# reuse them for the test and validation comments.
vectors_train_idf = tf_idf_vectorizer.fit_transform(X_train)
vectors_test_idf = tf_idf_vectorizer.transform(X_test)
vectors_val_idf = tf_idf_vectorizer.transform(X_val)

# Row-wise normalisation (Normalizer is stateless, so no fit is needed).
# NOTE(review): TfidfVectorizer already L2-normalises rows by default, so this
# pass presumably leaves the matrices unchanged - confirm before removing it.
vectors_train_idf = normalizer_train.transform(vectors_train_idf)
vectors_test_idf = normalizer_train.transform(vectors_test_idf)
vectors_val_idf = normalizer_train.transform(vectors_val_idf)

In [None]:
# (number of training comments, vocabulary size) of the TF-IDF matrix.
print(vectors_train_idf.shape)

(10423, 34524)


In [None]:
# Keep the 10,000 TF-IDF features with the highest chi-squared score against
# the subreddit labels. `k` must be passed by keyword: it is keyword-only in
# scikit-learn >= 1.0, where the positional form raises TypeError.
chi2_selector = SelectKBest(chi2, k=10000)

# Scores are computed on the training split only; the same feature mask is
# then applied to the test and validation matrices.
vectors_train_idf_chi = chi2_selector.fit_transform(vectors_train_idf, y_train)
vectors_test_idf_chi = chi2_selector.transform(vectors_test_idf)
vectors_val_idf_chi = chi2_selector.transform(vectors_val_idf)

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial naive Bayes on the chi2-selected TF-IDF features.
clf = MultinomialNB(alpha=.019)

# Mean 5-fold cross-validated accuracy on the training split.
pred_acc = cross_val_score(clf, vectors_train_idf_chi, y_train, cv=5)

# fit() returns the estimator itself, so the hold-out accuracy can be chained.
val = clf.fit(vectors_train_idf_chi, y_train).score(vectors_val_idf_chi, y_val)

print(pred_acc.mean(), val, sep="\n")

0.9142288625913089
0.8886971527178602


In [None]:
# Predict a subreddit for every test comment with the classifier fitted above.
prediction = clf.predict(vectors_test_idf_chi)

In [None]:
# Build the submission table: every test column except the comment text,
# followed by the predicted subreddit, written without the index column.
subm = test.drop(columns='body').assign(subreddit=prediction)
subm.to_csv('submission.csv', index=False)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from collections import Counter
from collections import defaultdict
from collections import OrderedDict

# Map every subreddit name to an integer id. Keras' Tokenizer numbers tokens
# from 1, so id 0 (and therefore one-hot column 0) is never assigned.
label_tokenizer = Tokenizer()
label_tokenizer.fit_on_texts(np.array(y_datatrain))

y_train_encoded = np.array(label_tokenizer.texts_to_sequences(np.array(y_train)))
y_val_encoded = np.array(label_tokenizer.texts_to_sequences(np.array(y_val)))

# Fix the one-hot width explicitly so the train and validation targets always
# have the same number of columns, even if a split lacks the highest id.
num_label_columns = len(label_tokenizer.word_index) + 1
y_train_encoded = to_categorical(y_train_encoded, num_classes=num_label_columns)
y_val_encoded = to_categorical(y_val_encoded, num_classes=num_label_columns)


def get_class_weights(y):
    """Return ``{label: majority_count / label_count}`` for the labels in ``y``.

    The most frequent label gets weight 1.0; rarer labels get larger weights.
    """
    counter = Counter(y)
    majority = max(counter.values())
    return  {cls: float(majority/count) for cls, count in counter.items()}

label_weights = get_class_weights(y_train)
# Here this maps label -> id; later cells rebuild it as id -> label.
reverse_word_map = dict(label_tokenizer.word_index.items())

# Key the weights by the one-hot column the network is trained on, i.e. the
# Tokenizer's 1-based id. The earlier `id - 1` keys were shifted by one class
# against the targets. Column 0 is unused and gets a neutral weight so the
# keys cover every output unit.
class_weight = {0: 1.0}
for label, weight in label_weights.items():
    class_weight[reverse_word_map[label]] = weight

class_weight = dict(OrderedDict(sorted(class_weight.items())))
print(class_weight)

{0: 1.0, 1: 1.1545503500269252, 2: 1.198434879821129, 3: 1.4735395189003437, 4: 1.8262350936967633, 5: 2.3457330415754925, 6: 3.036827195467422, 7: 5.583333333333333}


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input, Activation, Dropout, GaussianNoise
from tensorflow.keras import regularizers

# Feed-forward classifier over the full (sparse) TF-IDF vectors:
# 128 -> 64 -> one softmax unit per one-hot label column.
model = Sequential([
    Input(shape=(vectors_train_idf.shape[1],), sparse=True),
    Dense(128),
    Activation('relu'),
    Dropout(0.5),
    GaussianNoise(2),
    Dense(64),
    Activation('relu'),
    Dropout(0.2),
    Dense(y_train_encoded.shape[1]),
    Activation('softmax'),
])

optimizer = tf.keras.optimizers.Adam()

model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

# Sort the sparse matrices' indices in place before handing them to Keras.
# NOTE(review): presumably required for the sparse-input conversion - confirm.
for sparse_matrix in (vectors_train_idf, vectors_val_idf):
    sparse_matrix.sort_indices()

# Class weighting is currently not passed to fit().
history = model.fit(
    vectors_train_idf,
    y_train_encoded,
    batch_size=32,
    epochs=50,
    verbose=1,
    validation_data=(vectors_val_idf, y_val_encoded),
)


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Predict the test set with the neural network and write a submission file.
vectors_test_idf.sort_indices()

# id -> subreddit name, the inverse of the tokenizer's label -> id mapping.
reverse_word_map = {index: label for label, index in label_tokenizer.word_index.items()}

# The most probable output unit per comment, translated back to its name.
prediction = np.argmax(model.predict(vectors_test_idf), axis=1)
prediction = np.vectorize(reverse_word_map.get)(prediction)

# Every test column except the comment text, followed by the prediction.
subm = test.drop(columns='body').assign(subreddit=prediction)
subm.to_csv('submission4.csv', index=False)

In [None]:
def confusion_matrix(ypred, y_test):
    """Plot a confusion-matrix heat map, print a classification report and return the accuracy.

    Note the argument order: predictions first, true labels second.
    """
    # Imported locally: inside this function `confusion_matrix` refers to
    # scikit-learn's implementation, not to this wrapper of the same name.
    from sklearn.metrics import classification_report, confusion_matrix

    _, axis = plt.subplots(figsize=(12, 12))
    class_labels = np.unique(y_test)
    counts = confusion_matrix(y_test, ypred, labels=class_labels)
    sns.heatmap(
        counts,
        annot=True,
        fmt="d",
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=axis,
    )
    print(classification_report(y_test, ypred))
    return accuracy_score(y_test, ypred)


# Predict the validation split with the neural network and list every
# comment it got wrong as: text, true subreddit, predicted subreddit.
reverse_word_map = {index: label for label, index in label_tokenizer.word_index.items()}
prediction = np.vectorize(reverse_word_map.get)(
    np.argmax(model.predict(vectors_val_idf), axis=1)
)
# The confusion_matrix(prediction, y_val) helper can be called here for a
# full report; it is not invoked by default.

# Plain arrays so rows can be addressed by position rather than pandas index.
X_val_print = np.array(X_val)
y_val_print = np.array(y_val)

misclassified = np.where(prediction != y_val)

for row in misclassified[0]:
    print(X_val_print[row], ", ", y_val_print[row], ", ", prediction[row])
    print("\n")


dont think coverd anything new quite heard digital media security related communities really appreciate screen play entire documentary showing prey big boys data able influence lives ,  datascience ,  hardware


hear lot shadiness bitcoin well read lot literally convert earnings fiat generate quarterly reports saving price coin time purchase sales accounted taxed ,  gamedev ,  cars


one point another forum someone worked atlas teased scenario collection fs afaik hes working anymore wonder initial scenarios initially planned collection ,  rpg ,  cars


waves hand know besm decently deal besm end result want transformation end day effects primary thing knowing want transformation going critical applies everything else whole system youre looking create head ,  rpg ,  gamedev


really loved art direction direction general reboot big meh feels like theyre trying jump remake success even tho rayman wouldve best option kinda looks like one arabian king porn games advertised like found wife c

In [None]:
from sklearn.linear_model import SGDClassifier

# Linear classifier trained with SGD on the hinge loss, on the chi2-selected
# TF-IDF features.
clf = SGDClassifier(loss='hinge', alpha=1e-4)

# Mean 5-fold cross-validated accuracy on the training split.
pred_acc = cross_val_score(clf, vectors_train_idf_chi, y_train, cv=5)

# fit() returns the estimator itself, so the hold-out accuracy can be chained.
val = clf.fit(vectors_train_idf_chi, y_train).score(vectors_val_idf_chi, y_val)

print(pred_acc.mean(), val, sep="\n")

0.8977454462847465
0.903448275862069
