In [1]:
#Importing Libraries


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
from sklearn.metrics import classification_report,accuracy_score,confusion_matrix
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib
import pickle
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [7]:
#defining Flairs

In [8]:
flairs = ["AskIndia", "Non-Political", "[R]eddiquette", 
          "Scheduled", "Photography", "Science/Technology",
          "Politics", "Business/Finance", "Policy/Economy",
          "Sports", "Food", "AMA"]

In [9]:
#fetching the data


In [10]:
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,title,score,id,url,comms_num,created,body,author,flair,comment,authors,combined_features
0,4 days ago pending orders 100 million hydroxyc...,92,fwjdqr,https://www.reddit.com/r/india/comments/fwjdqr...,6,1586290000.0,getting frantic calls pharma customers delayed...,india_ko_vanakkam,AskIndia,modi stockholm syndrome,Meraxes373,4 days ago pending orders 100 million hydroxyc...
1,randians big time users dating apps like tinde...,21,fizkkk,https://www.reddit.com/r/india/comments/fizkkk...,19,1584298000.0,id stint apps couple months one point didnt fe...,__knockknockturnal__,AskIndia,someone matched tell im fat,JagdishBhagat12khadi,randians big time users dating apps like tinde...
2,r india thinks flat earthers,6,f25vx0,https://www.reddit.com/r/india/comments/f25vx0...,31,1581441000.0,encountered foreigner ig says round earth hoax...,Dev1003,AskIndia,havent found indian yet believes earth flat,sudhanshu_sharma,r india thinks flat earthershavent found india...
3,people left 9 5 jobs pursue career music art f...,43,dtvliq,https://www.reddit.com/r/india/comments/dtvliq...,34,1573333000.0,couldnt add askindia flair mobile browser,c0mrade34,AskIndia,engineer advertisement shoots since last 1year...,dslrbhai,people left 9 5 jobs pursue career music art f...
4,somebody want kill full family,97,b7pvwt,https://www.reddit.com/r/india/comments/b7pvwt...,24,1554080000.0,24hrs local police station register case dont ...,amitkumarthakur,AskIndia,calm downgo sp office town file grievance imme...,RAD-Business,somebody want kill full familycalm downgo sp o...


In [11]:
#filling NAs with ""
df.fillna("",inplace=True)

In [12]:
#trying different models

In [24]:
def log_reg(x_train,x_test,y_train,y_test):
    from sklearn.linear_model import LogisticRegression
    
    logreg = Pipeline([('vect',CountVectorizer()),('tfidf',TfidfTransformer()),('clf',LogisticRegression(n_jobs =1,C=1e5)),])
    
    logreg.fit(x_train,y_train)
    
    y_pred = logreg.predict(x_test)
    
    print('accuracy %s', round(accuracy_score(y_pred,y_test),3))
    print(classification_report(y_test, y_pred, target_names = flairs))

In [26]:
def nb_classifier(x_train,x_test,y_train,y_test):
    nb = Pipeline([('vect',CountVectorizer()),('tfidf',TfidfTransformer()),('clf',MultinomialNB()),])
    nb.fit(x_train,y_train)
    
    y_pred = nb.predict(x_test)
    
    print('accuracy %s', round(accuracy_score(y_pred,y_test),3))
    print(classification_report(y_test, y_pred, target_names = flairs))

In [27]:
def lin_svm(x_train,x_test,y_train,y_test):
    sdg = Pipeline([('vect',CountVectorizer()),('tfidf',TfidfTransformer()),('clf',SGDClassifier(loss = 'hinge', penalty ='l2',alpha=1e-3, random_state=42, max_iter=5, tol=None)),])
    
    sdg.fit(x_train,y_train)
    y_pred = sdg.predict(x_test)
    
    print('accuracy %s', round(accuracy_score(y_pred,y_test),3))
    print(classification_report(y_test, y_pred, target_names = flairs))

In [28]:
def rand_forest(x_train,x_test,y_train,y_test):
    rand_for = Pipeline([('vect',CountVectorizer()),('tfidf',TfidfTransformer()),('clf',RandomForestClassifier(n_estimators = 1000, random_state = 42)),])
    
    rand_for.fit(x_train,y_train)
    
    y_pred = rand_for.predict(x_test)
    
    print('accuracy %s', round(accuracy_score(y_pred,y_test),3))
    print(classification_report(y_test, y_pred, target_names = flairs))

In [29]:
def mlp_classifier(x_train,x_test,y_train,y_test):
    mlp = Pipeline([('vect',CountVectorizer()),('tfidf',TfidfTransformer()),('clf',MLPClassifier(hidden_layer_sizes =(30,30,30))),])
    
    mlp.fit(x_train,y_train)
    
    y_pred = mlp.predict(x_test)
    print('accuracy %s', round(accuracy_score(y_pred,y_test),3))
    print(classification_report(y_test, y_pred, target_names = flairs))

In [30]:
def train_test_run(x,y):
    x_train,x_test,y_train,y_test = train_test_split(x,y,test_size =0.2, random_state = 42)
    
    print("\nResults of Naive Bayes Classifier")
    nb_classifier(x_train,x_test,y_train,y_test)
    print("\nResults of Linear SVM")
    lin_svm(x_train,x_test,y_train,y_test)
    print("\nResults of Logistic Regression")
    log_reg(x_train,x_test,y_train,y_test)
    print("\nResults of Random Forest")
    rand_forest(x_train,x_test,y_train,y_test)
    print("\nResults of MLP Classifier")
    mlp_classifier(x_train,x_test,y_train,y_test)
    
    

In [31]:
print('Flair Detection using "title" as a Feature')
train_test_run(df.title,df.flair)
print('Flair Detection using "comment" as a Feature')
train_test_run(df.comment,df.flair)
print('Flair Detection using "URL" as a Feature')
train_test_run(df.url,df.flair)
print('Flair Detection using "body" as a Feature')
train_test_run(df.body,df.flair)
print('Flair Detection using "combined" as a Feature')
train_test_run(df.combined_features,df.flair)

Flair Detection using "title" as a Feature

Results of Naive Bayes Classifier
accuracy %s 0.713
                    precision    recall  f1-score   support

          AskIndia       0.71      0.62      0.67        16
     Non-Political       0.71      0.96      0.82        47
     [R]eddiquette       1.00      0.09      0.17        22
         Scheduled       1.00      0.83      0.91        12
       Photography       0.67      1.00      0.80        43
Science/Technology       0.81      0.98      0.89        44
          Politics       0.60      0.30      0.40        20
  Business/Finance       1.00      0.12      0.21        17
    Policy/Economy       0.67      0.71      0.69        41
            Sports       0.63      1.00      0.78        40
              Food       1.00      0.20      0.33        20
               AMA       0.00      0.00      0.00         6

          accuracy                           0.71       328
         macro avg       0.74      0.57      0.56       328
  

accuracy %s 0.567
                    precision    recall  f1-score   support

          AskIndia       0.33      0.19      0.24        16
     Non-Political       0.56      0.70      0.62        47
     [R]eddiquette       0.50      0.18      0.27        22
         Scheduled       0.54      0.58      0.56        12
       Photography       0.77      0.70      0.73        43
Science/Technology       0.72      0.75      0.73        44
          Politics       0.45      0.25      0.32        20
  Business/Finance       0.50      0.24      0.32        17
    Policy/Economy       0.51      0.56      0.53        41
            Sports       0.46      0.95      0.62        40
              Food       0.60      0.15      0.24        20
               AMA       1.00      0.50      0.67         6

          accuracy                           0.57       328
         macro avg       0.58      0.48      0.49       328
      weighted avg       0.58      0.57      0.54       328


Results of MLP Cla

accuracy %s 0.521
                    precision    recall  f1-score   support

          AskIndia       0.43      0.38      0.40        16
     Non-Political       0.82      0.77      0.79        47
     [R]eddiquette       0.62      0.36      0.46        22
         Scheduled       0.60      0.25      0.35        12
       Photography       0.28      0.95      0.44        43
Science/Technology       0.92      0.50      0.65        44
          Politics       0.40      0.50      0.44        20
  Business/Finance       0.00      0.00      0.00        17
    Policy/Economy       0.75      0.44      0.55        41
            Sports       0.83      0.50      0.62        40
              Food       0.57      0.20      0.30        20
               AMA       0.75      0.50      0.60         6

          accuracy                           0.52       328
         macro avg       0.58      0.45      0.47       328
      weighted avg       0.63      0.52      0.52       328


Results of Random 

In [32]:
#Random forest gave the best result with combined features

In [35]:
x_train,x_test,y_train,y_test = train_test_split(df.combined_features,df.flair,test_size =0.2, random_state = 42)
rand_for = Pipeline([('vect',CountVectorizer()),('tfidf',TfidfTransformer()),('clf',RandomForestClassifier(n_estimators = 1000, random_state = 42)),])
RF = rand_for.fit(x_train,y_train)

pickle.dump(RF,open('RF.pkl','wb'))
y_pred = rand_for.predict(x_test)