# Importing Libraries


In [1]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib
from xgboost import XGBClassifier
import pickle
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Defining The Flair Categories


In [2]:

flairs = ["AskIndia", "Non-Political", "[R]eddiquette", 
          "Scheduled", "Photography", "Science/Technology",
          "Politics", "Business/Finance", "Policy/Economy",
          "Sports", "Food", "AMA"]


# Fetching the Data

In [3]:
data = pd.read_csv('data/data.csv')

In [4]:
data.head()

Unnamed: 0,author,authors,body,comment,comms_num,created,flair,id,score,title,url,combined_features
0,dhavalcoholic,ICICIPruLifeIns,reposting lack activity r askindiahello last y...,dear policy holder dhavalcoholic request help ...,1,1386254000.0,AskIndia,1s57oi,1,need feedback insurance policy took xpost aski...,https://www.reddit.com/r/india/comments/1s57oi...,need feedback insurance policy took xpost aski...
1,amitkumarthakur,RAD-Business RAD-Business None barcam10 _snor...,24hrs local police station register case dont ...,calm downgo sp office town file grievance imme...,24,1554080000.0,AskIndia,b7pvwt,94,somebody want kill full family,https://www.reddit.com/r/india/comments/b7pvwt...,somebody want kill full familycalm downgo sp o...
2,FrustratedOCIHopeful,plshelpthedog ayyylmaaaoo Proper_Boysenberry ...,hello askindia first time poster long time lur...,honestly supervisor behaved exactly government...,27,1555361000.0,AskIndia,bdfid1,10,ambassador india takes back newly issued oci c...,https://www.reddit.com/r/india/comments/bdfid1...,ambassador india takes back newly issued oci c...
3,aloo_vs_bhaloo,vcdarklord tilismilis aloo_vs_bhaloo dogaa fo...,r tooafraidtoask india edition,modi control sex desires jerk someone else pro...,22,1566529000.0,AskIndia,cu1xn4,18,randians afraid ask,https://www.reddit.com/r/india/comments/cu1xn4...,randians afraid askmodi control sex desires je...
4,multubunu,,hello submitted r raskindia week ago got answe...,,0,1361085000.0,AskIndia,18ntue,0,askindia cingari cengar tzengar,https://www.reddit.com/r/india/comments/18ntue...,askindia cingari cengar tzengarhttps://www.red...


In [5]:
data.fillna("",inplace = True)

# Text Preprocessing

In [6]:
from gensim import utils
import gensim.parsing.preprocessing as gsp

filters = [
           gsp.strip_tags, 
           gsp.strip_punctuation,
           gsp.strip_multiple_whitespaces,
           gsp.strip_numeric,
           gsp.remove_stopwords, 
           gsp.strip_short, 
           gsp.stem_text
          ]

def clean_text(s):
    s = s.lower()
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return s

data.body = data.body.apply(lambda x:clean_text(x))


unable to import 'smart_open.gcs', disabling that module


# Trying different models

In [7]:
##logistic regression
def logisticreg(X_train, X_test, y_train, y_test):

  from sklearn.linear_model import LogisticRegression

  logreg = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', LogisticRegression(n_jobs=1, C=1e5)),
                 ])
  logreg.fit(X_train, y_train)

  y_pred = logreg.predict(X_test)

  print('accuracy %s' % accuracy_score(y_pred, y_test))
  print(classification_report(y_test, y_pred,target_names=flairs))

In [8]:
## naive bayes
def nb_classifier(X_train, X_test, y_train, y_test):
  nb = Pipeline([('vect', CountVectorizer()),
                 ('tfidf', TfidfTransformer()),
                 ('clf', MultinomialNB()),
                ])
  nb.fit(X_train, y_train)

  y_pred = nb.predict(X_test)

  print('accuracy %s' % accuracy_score(y_pred, y_test))
  print(classification_report(y_test, y_pred,target_names=flairs))

In [9]:
## SVM
def linear_svm(X_train, X_test, y_train, y_test):
  sgd = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
                 ])
  sgd.fit(X_train, y_train)

  y_pred = sgd.predict(X_test)

  print('accuracy %s' % accuracy_score(y_pred, y_test))
  print(classification_report(y_test, y_pred,target_names=flairs))

In [10]:
## random forest
def randomforest(X_train, X_test, y_train, y_test):
  ranfor = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', RandomForestClassifier(n_estimators = 1000, random_state = 42)),
                 ])
  ranfor.fit(X_train, y_train)

  y_pred = ranfor.predict(X_test)

  print('accuracy %s' % accuracy_score(y_pred, y_test))
  print(classification_report(y_test, y_pred,target_names=flairs))

In [11]:
## mlp
def mlpclassifier(X_train, X_test, y_train, y_test):  
  mlp = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', MLPClassifier(hidden_layer_sizes=(30,30,30))),
                 ])
  mlp.fit(X_train, y_train)

  y_pred = mlp.predict(X_test)

  print('accuracy %s' % accuracy_score(y_pred, y_test))
  print(classification_report(y_test, y_pred,target_names=flairs))

In [12]:
## xgboost
def xgbclassifier(X_train, X_test, y_train, y_test):  
    xgb_clf = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', XGBClassifier(random_state=42, seed=2, colsample_bytree=0.6, subsample=0.7,objective='multi:softmax')),
                 ])
    xgb_clf.fit(X_train, y_train)

    y_pred = xgb_clf.predict(X_test)

    print('accuracy %s' % accuracy_score(y_pred, y_test))
    print(classification_report(y_test, y_pred,target_names=flairs))

# Evaluation

In [13]:
def train_test(X,y):
 
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)
   
    print("Results of Naive Bayes Classifier")
    nb_classifier(X_train, X_test, y_train, y_test)
    print("Results of Linear Support Vector Machine")
    linear_svm(X_train, X_test, y_train, y_test)
    print("Results of Logistic Regression")
    logisticreg(X_train, X_test, y_train, y_test)
    print("Results of Random Forest")
    randomforest(X_train, X_test, y_train, y_test)
    print("Results of MLP Classifier")
    mlpclassifier(X_train, X_test, y_train, y_test)
    print("Results of XGB Classifier")
    xgbclassifier(X_train, X_test, y_train, y_test)


We will be checking the accuracy with title,URL,body,comments and combination of them as features.

In [14]:
cat = data.flair

V = data.combined_features
W = data.comment
X = data.title
Y = data.body
Z = data.url

print("Flair Detection using Title as Feature")
train_test(X,cat)
print("Flair Detection using Body as Feature")
train_test(Y,cat)
print("Flair Detection using URL as Feature")
train_test(Z,cat)
print("Flair Detection using Comments as Feature")
train_test(W,cat)
print("Flair Detection using Combined Features")
train_test(V,cat)

Flair Detection using Title as Feature
Results of Naive Bayes Classifier
accuracy 0.8571428571428571
                    precision    recall  f1-score   support

          AskIndia       0.85      0.85      0.85        34
     Non-Political       0.61      0.93      0.74        30
     [R]eddiquette       0.82      0.75      0.79        44
         Scheduled       0.97      0.87      0.92        38
       Photography       0.88      0.93      0.90        45
Science/Technology       0.94      0.96      0.95        47
          Politics       0.82      0.94      0.88        34
  Business/Finance       0.81      0.81      0.81        37
    Policy/Economy       0.88      0.91      0.89        46
            Sports       0.95      0.79      0.86        48
              Food       0.94      0.86      0.90        37
               AMA       0.00      0.00      0.00         8

          accuracy                           0.86       448
         macro avg       0.79      0.80      0.79       4

# Saving the model

In [15]:
import xgboost as xgb
X_train, X_test, y_train, y_test = train_test_split(V, cat, test_size=0.2, random_state = 42)
model = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
                   ('clf', xgb.sklearn.XGBClassifier(random_state=42, seed=2, colsample_bytree=0.6, subsample=0.7,objective='multi:softmax')),
                  ])
XGB = model.fit(X_train, y_train)
# pickle.dump(XGB,open("xgb.bin",'wb'))
# y_pred = model.predict(X_test)

In [16]:
joblib.dump(XGB, open('xgb.bin', 'wb'))

# to load the saved model
bst = joblib.load(open('xgb.bin', 'rb'))

# Loading the model and checking the predictions

In [17]:
import pickle
import logging
import gensim
import praw
from praw.models import MoreComments
import os
import json
import joblib
from gensim import utils
import gensim.parsing.preprocessing as gsp
import xgboost as xgb
filters = [
           gsp.strip_tags, 
           gsp.strip_punctuation,
           gsp.strip_multiple_whitespaces,
           gsp.strip_numeric,
           gsp.remove_stopwords, 
           gsp.strip_short, 
           gsp.stem_text
          ]

def clean(s):
    s = s.lower()
    s = utils.to_unicode(s)
    for f in filters:
        s = f(s)
    return s

model = bst


reddit = praw.Reddit(client_id = "########",
					client_secret = "#######",
					user_agent = "#######",
					username = "######",
					password = "######")

def prediction(url):
	submission = reddit.submission(url = url)
	data = {}
	data["title"] = str(submission.title)
	data["url"] = str(submission.url)
	data["body"] = str(submission.selftext)

	submission.comments.replace_more(limit=None)
	comment = ''
	count = 0
	for top_level_comment in submission.comments:
		comment = comment + ' ' + top_level_comment.body
		count+=1
		if(count > 10):
		 	break
		
	data["comment"] = str(comment)

	data['title'] = clean(str(data['title']))
	data['body'] = clean(str(data['body']))
	data['comment'] = clean(str(data['comment']))
    
	combined_features = data["title"] + data["comment"] + data["body"] + data["url"]

	return str(model.predict([combined_features]))[2:-2]
 
prediction("https://www.reddit.com/r/india/comments/d1m9ld/iran_removes_antiindia_banners_from_pak_consulate/")


'Politics'