# Part III - Building a Flare Detector

Posts in ​ r/india can be corresponding to multiple topics. Each post is tagged for filtering purposes. These tags are called a flares in the reddit world. ​ r/india has flairs like Politics,AskIndia, Science/Technology etc.

To build a classifier which can predict the flare of a reddit post, Using data collected in Part I as training and validation data. 


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib
import pickle



In [4]:

flairs = ["AskIndia", "Non-Political", "[R]eddiquette", "Scheduled", "Photography", "Science/Technology", "Politics", "Business/Finance", "Policy/Economy", "Sports", "Food", "AMA"]



data = pd.read_csv('sample_update_final.csv')

In [79]:

#Dependent and independent variables
y = data.flair
X = data.feature_combine
print(int(len(y)))

1200


In [63]:
#Setting training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.09,random_state = 42)


In [64]:
#Naive Bayes
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])


In [65]:
NB = nb.fit(X_train, y_train)
#pickle.dump(NB,open("model_NB.sav",'wb'))
y_pred = nb.predict(X_test)


print(f"NB accuracy {accuracy_score(y_pred, y_test)}")
print(classification_report(y_test, y_pred,target_names=flairs))

NB accuracy 0.5092592592592593
                    precision    recall  f1-score   support

          AskIndia       0.45      1.00      0.62         9
     Non-Political       0.50      0.17      0.25        12
     [R]eddiquette       0.50      0.20      0.29         5
         Scheduled       0.33      0.17      0.22         6
       Photography       0.50      0.08      0.13        13
Science/Technology       0.88      0.78      0.82         9
          Politics       0.33      0.11      0.17         9
  Business/Finance       0.32      0.86      0.46         7
    Policy/Economy       0.56      1.00      0.71        15
            Sports       1.00      0.22      0.36         9
              Food       1.00      0.71      0.83         7
               AMA       0.38      0.71      0.50         7

          accuracy                           0.51       108
         macro avg       0.56      0.50      0.45       108
      weighted avg       0.57      0.51      0.45       108



In [66]:
#SGD
sgd = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),
               ])
SGD = sgd.fit(X_train, y_train)
#pickle.dump(SGD,open("model_SGC.sav",'wb'))
y_pred = sgd.predict(X_test)


print(f"SGD accuracy % {accuracy_score(y_pred, y_test)}")
print(classification_report(y_test, y_pred,target_names=flairs))

SGD accuracy % 0.6574074074074074
                    precision    recall  f1-score   support

          AskIndia       0.69      1.00      0.82         9
     Non-Political       0.50      0.33      0.40        12
     [R]eddiquette       0.43      0.60      0.50         5
         Scheduled       0.62      0.83      0.71         6
       Photography       0.60      0.23      0.33        13
Science/Technology       0.69      1.00      0.82         9
          Politics       0.75      0.33      0.46         9
  Business/Finance       0.33      0.86      0.48         7
    Policy/Economy       0.88      1.00      0.94        15
            Sports       1.00      0.56      0.71         9
              Food       1.00      0.86      0.92         7
               AMA       0.75      0.43      0.55         7

          accuracy                           0.66       108
         macro avg       0.69      0.67      0.64       108
      weighted avg       0.70      0.66      0.64       108



In [67]:
#Logistic Regression

logreg = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=1,max_iter=1300, C=1e5)),
               ])

In [68]:

LOGREG = logreg.fit(X_train, y_train)
#pickle.dump(LOGREG,open("model_LOGREG.sav",'wb'))
y_pred = logreg.predict(X_test)

print(f"LOG accuracy % {accuracy_score(y_pred, y_test)}")
print(classification_report(y_test, y_pred,target_names=flairs))

LOG accuracy % 0.6851851851851852
                    precision    recall  f1-score   support

          AskIndia       0.90      1.00      0.95         9
     Non-Political       0.43      0.25      0.32        12
     [R]eddiquette       0.44      0.80      0.57         5
         Scheduled       0.67      0.67      0.67         6
       Photography       0.45      0.38      0.42        13
Science/Technology       0.80      0.89      0.84         9
          Politics       0.83      0.56      0.67         9
  Business/Finance       0.45      0.71      0.56         7
    Policy/Economy       0.94      1.00      0.97        15
            Sports       0.86      0.67      0.75         9
              Food       0.86      0.86      0.86         7
               AMA       0.50      0.57      0.53         7

          accuracy                           0.69       108
         macro avg       0.68      0.70      0.67       108
      weighted avg       0.69      0.69      0.68       108



In [69]:
#Random Forest
ranfor = Pipeline([('vect', CountVectorizer()),
                   ('tfidf', TfidfTransformer()),
                   ('clf', RandomForestClassifier(n_estimators = 1000, random_state = 42)),
                  ])
RM = ranfor.fit(X_train, y_train)
#pickle.dump(RM,open("model_RM.sav",'wb'))
y_pred = ranfor.predict(X_test)

print(f"RM accuracy % {accuracy_score(y_pred, y_test)}")
print(classification_report(y_test, y_pred,target_names=flairs))

RM accuracy % 0.6018518518518519
                    precision    recall  f1-score   support

          AskIndia       0.90      1.00      0.95         9
     Non-Political       0.67      0.50      0.57        12
     [R]eddiquette       0.33      0.40      0.36         5
         Scheduled       0.60      0.50      0.55         6
       Photography       0.67      0.15      0.25        13
Science/Technology       0.57      0.89      0.70         9
          Politics       0.50      0.33      0.40         9
  Business/Finance       0.33      0.71      0.45         7
    Policy/Economy       0.94      1.00      0.97        15
            Sports       0.40      0.44      0.42         9
              Food       0.58      1.00      0.74         7
               AMA       0.50      0.14      0.22         7

          accuracy                           0.60       108
         macro avg       0.58      0.59      0.55       108
      weighted avg       0.62      0.60      0.57       108



In [52]:


#mlp
mlp = Pipeline([('vect', CountVectorizer()),
                  ('tfidf', TfidfTransformer()),
                  ('clf', MLPClassifier(hidden_layer_sizes=(30,30,30))),
                 ])
MLP = mlp.fit(X_train, y_train)
#pickle.dump(MLP,open("model_MLP.sav",'wb'))
y_pred = mlp.predict(X_test)

print(f"MLP accuracy % {accuracy_score(y_pred, y_test)}")
print(classification_report(y_test, y_pred,target_names=flairs))

MLP accuracy % 0.5916666666666667
                    precision    recall  f1-score   support

          AskIndia       0.91      1.00      0.95        10
     Non-Political       0.25      0.42      0.31        12
     [R]eddiquette       0.40      0.33      0.36         6
         Scheduled       0.62      0.71      0.67         7
       Photography       0.00      0.00      0.00        16
Science/Technology       0.82      1.00      0.90         9
          Politics       0.56      0.50      0.53        10
  Business/Finance       0.67      0.25      0.36         8
    Policy/Economy       1.00      0.94      0.97        16
            Sports       1.00      0.44      0.62         9
              Food       1.00      0.78      0.88         9
               AMA       0.27      0.88      0.41         8

          accuracy                           0.59       120
         macro avg       0.62      0.60      0.58       120
      weighted avg       0.61      0.59      0.57       120



In [5]:
import itertools
import os

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils

Using TensorFlow backend.


In [6]:
pip install keras

Note: you may need to restart the kernel to use updated packages.


In [7]:




train_posts = data.feature_combine
train_tags = data.flair

test_posts = train_posts[540:]
test_tags = train_tags[540:]

In [8]:


max_words = 1200
tokenize = text.Tokenizer(num_words=max_words, char_level=False)
tokenize.fit_on_texts(train_posts) # only fit on train

x_train = tokenize.texts_to_matrix(train_posts)
x_test = tokenize.texts_to_matrix(test_posts)

encoder = LabelEncoder()
encoder.fit(train_tags)
y_train = encoder.transform(train_tags)
y_test = encoder.transform(test_tags)

num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)
y_test = utils.to_categorical(y_test, num_classes)

batch_size = 32
epochs = 2

# Build the model
model = Sequential()
model.add(Dense(512, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
              
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1)

Train on 1080 samples, validate on 120 samples
Epoch 1/2
Epoch 2/2


In [9]:
score = model.evaluate(x_test, y_test,
                       batch_size=batch_size, verbose=1)
print('Test accuracy:', score[1])

Test accuracy: 0.668181836605072
