In [40]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from yellowbrick.classifier import ClassPredictionError
from sklearn.naive_bayes import BernoulliNB
import warnings
warnings.filterwarnings("ignore")

# from yellowbrick.classifier import ClassPredictionError
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report, roc_curve, auc, f1_score, accuracy_score, confusion_matrix, recall_score, precision_score

In [3]:
df = pd.read_csv('p3_reddit.csv')[['text', 'label']]
print(df.shape)
df = df.drop_duplicates()
df

(1511, 2)


Unnamed: 0,text,label
0,alway taught peopl like jordan peterson matt w...,0
1,don’t actual know qanon hundr leagu past stand...,0
2,f becom concern cousin’ f well relationship cl...,0
3,marri narcissist man littl decad final let go ...,0
4,f brotherinlaw alway total jackass marri siste...,0
...,...,...
1506,cmv uswestern media larg neglect long histori ...,2
1507,agre support idea uk elect head state current ...,2
1508,disagre unit state democrat nation defin const...,2
1509,disagre clear cut side line drawn sand seen pa...,2


In [9]:
0 -  human
1 - gpt3
2 - instruct_gpt

NameError: name 'human' is not defined

In [10]:
df.dropna(inplace = True)

In [None]:
df.info()

### Splitting the data

In [11]:
X_train, X_test, y_train, y_test = train_test_split(df['text'],
                                                    df['label'],
                                                    stratify = df['label'],
                                                    test_size = 0.2,
                                                    random_state = 1234)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1194,) (299,) (1194,) (299,)


In [12]:
y_train.value_counts()

1    401
2    400
0    393
Name: label, dtype: int64

In [13]:
v = TfidfVectorizer()

X_train = v.fit_transform(X_train)
X_test = v.transform(X_test)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(1194, 9541) (299, 9541) (1194,) (299,)


In [14]:
X_train

<1194x9541 sparse matrix of type '<class 'numpy.float64'>'
	with 86095 stored elements in Compressed Sparse Row format>

### Model Training

In [15]:
labels = ['ctrl', 'gpt', 'gpt2', 'grover', 'xlm', 'xlnet', 'pplm', 'human', 'fair', 'gpt3', 'instruct_gpt']
    
def visualizer(clf):
    visualizer = ClassPredictionError(clf, classes= labels)

    # Fit the training data to the visualizer
    visualizer.fit(X_train, y_train)

    # Evaluate the model on the test data
    visualizer.score(X_test, y_test)

    # Draw visualization
    visualizer.show()
    
from yellowbrick.classifier import ConfusionMatrix

def get_confusion_matrix(model):
   
    cm = ConfusionMatrix(
        model, classes=labels,
        percent=True
        #label_encoder={0: 'Adelie', 1: 'Chinstrap', 2: 'Gentoo'}
    )
    cm.fit(X_train, y_train)
    cm.score(X_test, y_test)
    cm.show();

from sklearn.metrics import roc_curve, auc, roc_auc_score
    
def metrics(pred):
    matrix = confusion_matrix(y_test, pred, labels = labels)
    mat = matrix.diagonal()/matrix.sum(axis=1)
    print(classification_report(y_test, pred, labels = labels,
                                digits=4))

    print('confusion matrix: ', mat)

    Accuracy = accuracy_score(y_test,pred)
    F1 = f1_score(y_test, pred, average='macro')
    print("Accuracy:", Accuracy)

    rec = recall_score(y_test, pred, average='macro')
    print('Recall: ', rec)
    prec = precision_score(y_test, pred, average='macro')
    print('Precision: ', prec)

    print('F1:', F1)
    
def get_predictions(tpr, fpr, threshold, ypred):
    #If tpr is hight & fpr is low : (tpr*(1-fpr)) is maximum
    actual_ypred = []

    thres = threshold[np.argmax(tpr * (1 - fpr))]
    for value in ypred:
        if value < thres:
            actual_ypred.append(0)
        else:
            actual_ypred.append(1)
    return actual_ypred
    
# def get_roc_curve(model_name, classifier):
from yellowbrick.classifier import ROCAUC

def plot_ROC_curve(model):
    # Creating visualization with the readable labels
    visualizer = ROCAUC(model, classes = labels)
                                        
    # Fitting to the training data first then scoring with the test data                                    
    visualizer.fit(X_train, y_train)
    visualizer.score(X_test, y_test)
    visualizer.show()
    
    return visualizer


def get_confusion_matrix(model_name, roc_results):
    actual_ypred_train = get_predictions(roc_results['tpr_train'],
                                         roc_results['fpr_train'],
                                         roc_results['threshold_train'],
                                         roc_results['ypred_train'])

    matrix_train = confusion_matrix(y_train, actual_ypred_train)

    actual_ypred_test = get_predictions(roc_results['tpr_test'],
                                        roc_results['fpr_test'],
                                        roc_results['threshold_test'],
                                        roc_results['ypred_test'])

    matrix_test = confusion_matrix(y_test, actual_ypred_test)

    plt.rcParams["figure.figsize"] = [12, 5]
    plt.rcParams["figure.autolayout"] = True
    f, axes = plt.subplots(1, 2)
    
    axes[0].title.set_text(f'{model_name} Training Confusion Matrix')
    sns.heatmap(matrix_train, annot = True, ax = axes[0], fmt = "d")


    sns.heatmap(matrix_test, annot = True, ax = axes[1], fmt = "d")
    axes[1].title.set_text(f'{model_name} Testing Confusion Matrix')

    return actual_ypred_test, actual_ypred_train

### Random Forest Classifier

In [16]:
randomforest_model = RandomForestClassifier()

parameters = { 'max_depth' : [10, 20, 30],
               'n_estimators' : [90, 150, 180],
               'max_samples' : [0.6, 0.8]
 }

cross_validation = 3
scoring_metric = "f1"
randomforest_model_cv = GridSearchCV(randomforest_model, 
                                     parameters,
                                     cv = cross_validation,
                                     scoring = scoring_metric,
                                     return_train_score=True)

randomforest_model_cv.fit(X_train, y_train)
print('Best Params ', randomforest_model_cv.best_params_)

Best Params  {'max_depth': 10, 'max_samples': 0.6, 'n_estimators': 90}


In [17]:
parameters = {
     'max_depth' : 10,
     'n_estimators' : 90,
     'max_samples' : 0.6
}
randomforest_model = RandomForestClassifier(max_depth = parameters['max_depth'],
                                            max_samples = parameters['max_samples'],
                                            n_estimators = parameters['n_estimators'])

randomforest_model.fit(X_train, y_train)
pred = randomforest_model.predict(X_test)

In [20]:
from sklearn.metrics import classification_report, roc_curve, auc, f1_score, accuracy_score, confusion_matrix
matrix = confusion_matrix(y_test, pred, labels = [0,1,2])
mat = matrix.diagonal()/matrix.sum(axis=1)
print(classification_report(y_test, pred, labels = [0,1,2],
                            digits=4))
print('confusion matrix: ', mat)

Accuracy = accuracy_score(y_test,pred)
F1 = f1_score(y_test, pred, average='macro')
print("Accuracy:", Accuracy)

rec = recall_score(y_test, pred, average='macro')
print('Recall: ', rec)
prec = precision_score(y_test, pred, average='macro')
print('Precision: ', prec)

print('F1:', F1)

              precision    recall  f1-score   support

           0     1.0000    0.9082    0.9519        98
           1     0.6899    0.8812    0.7739       101
           2     0.8272    0.6700    0.7403       100

    accuracy                         0.8194       299
   macro avg     0.8390    0.8198    0.8220       299
weighted avg     0.8375    0.8194    0.8210       299

confusion matrix:  [0.90816327 0.88118812 0.67      ]
Accuracy: 0.8193979933110368
Recall:  0.8197837947060012
Precision:  0.8390276581491052
F1: 0.8220387309816596


### Logistic Regression

In [21]:
logistic_model = LogisticRegression()
parameters = { 'C' : [0.01, 0.1, 1, 3, 10],
               'penalty' : ['l2', 'elasticnet']
             }

cross_validation = 3
scoring_metric = "f1"

logistic_model_cv = GridSearchCV(logistic_model,
                                 parameters,
                                 cv = cross_validation,
                                 scoring = scoring_metric,
                                 return_train_score=True)

logistic_model_cv.fit(X_train, y_train)
print('Best Params ', logistic_model_cv.best_params_)

Best Params  {'C': 0.01, 'penalty': 'l2'}


In [26]:
parameters = { 'C' : 1,
               'penalty' : 'l2'
 }

logistic_model = LogisticRegression(C = parameters['C'], penalty = parameters['penalty'])
logistic_model.fit(X_train, y_train)
pred = logistic_model.predict(X_test)

In [27]:
matrix = confusion_matrix(y_test, pred, labels = [0,1,2])
mat = matrix.diagonal()/matrix.sum(axis=1)
print(classification_report(y_test, pred, labels = [0,1,2],
                            digits=4))
print('confusion matrix: ', mat)

Accuracy = accuracy_score(y_test,pred)
F1 = f1_score(y_test, pred, average='macro')
print("Accuracy:", Accuracy)

rec = recall_score(y_test, pred, average='macro')
print('Recall: ', rec)
prec = precision_score(y_test, pred, average='macro')
print('Precision: ', prec)

print('F1:', F1)

              precision    recall  f1-score   support

           0     0.9368    0.9082    0.9223        98
           1     0.7374    0.7228    0.7300       101
           2     0.7429    0.7800    0.7610       100

    accuracy                         0.8027       299
   macro avg     0.8057    0.8036    0.8044       299
weighted avg     0.8046    0.8027    0.8034       299

confusion matrix:  [0.90816327 0.72277228 0.78      ]
Accuracy: 0.802675585284281
Recall:  0.8036451808446151
Precision:  0.8056909951646793
F1: 0.8044184675007373


### XG Boost

In [28]:
import xgboost

xgboost_model = xgboost.XGBClassifier()

parameters = { 'max_depth' : [10, 20, 30],
               'n_estimators' : [90, 150, 180],
               'min_child_weight' : [1, 5, 10 ]
 }

cross_validation = 3
scoring_metric = "f1"
xgboost_model_cv = GridSearchCV(xgboost_model, 
                                parameters,
                                cv = cross_validation,
                                scoring = scoring_metric,
                                return_train_score=True)

xgboost_model_cv.fit(X_train, y_train)
print('Best Params ', xgboost_model_cv.best_params_)

Best Params  {'max_depth': 10, 'min_child_weight': 1, 'n_estimators': 90}


In [34]:
parameters = { 'max_depth' : 10,
               'min_child_weight' : 1,
               'n_estimators' : 150
 }

xgboost_model = xgboost.XGBClassifier(max_depth = parameters['max_depth'],
                                      min_child_weight = parameters['min_child_weight'],
                                      n_estimators = parameters['n_estimators'])
xgboost_model.fit(X_train, y_train)
pred = xgboost_model.predict(X_test)

In [35]:
matrix = confusion_matrix(y_test, pred, labels = [0,1,2])
mat = matrix.diagonal()/matrix.sum(axis=1)
print(classification_report(y_test, pred, labels = [0,1,2],
                            digits=4))
print('confusion matrix: ', mat)

Accuracy = accuracy_score(y_test,pred)
F1 = f1_score(y_test, pred, average='macro')
print("Accuracy:", Accuracy)

rec = recall_score(y_test, pred, average='macro')
print('Recall: ', rec)
prec = precision_score(y_test, pred, average='macro')
print('Precision: ', prec)

print('F1:', F1)

              precision    recall  f1-score   support

           0     0.9394    0.9490    0.9442        98
           1     0.7434    0.8317    0.7850       101
           2     0.8276    0.7200    0.7701       100

    accuracy                         0.8328       299
   macro avg     0.8368    0.8336    0.8331       299
weighted avg     0.8358    0.8328    0.8322       299

confusion matrix:  [0.94897959 0.83168317 0.72      ]
Accuracy: 0.8327759197324415
Recall:  0.833554253384522
Precision:  0.8367809927162995
F1: 0.833087547152005


### SVM

In [36]:
svm_model = SVC()

parameters = { 'kernel' : ['poly', 'rbf', 'sigmoid'],
               'max_iter' : [20, 50, 100]
 }

cross_validation = 3
scoring_metric = "f1"
svm_model_cv = GridSearchCV(svm_model, 
                            parameters,
                            cv = cross_validation,
                            scoring = scoring_metric,
                            return_train_score=True)

svm_model_cv.fit(X_train, y_train)
print('Best Params ', svm_model_cv.best_params_)

Best Params  {'kernel': 'poly', 'max_iter': 20}


In [37]:
parameters = {
         'kernel' : 'poly',
         'max_iter' : 20,
        }

svm_model = SVC(kernel = parameters['kernel'],
max_iter = parameters['max_iter'], probability = True)

svm_model.fit(X_train, y_train)
pred = svm_model.predict(X_test)

In [38]:
from sklearn.metrics import classification_report, roc_curve, auc, f1_score, accuracy_score, confusion_matrix
matrix = confusion_matrix(y_test, pred, labels = [0,1,2])
mat = matrix.diagonal()/matrix.sum(axis=1)
print(classification_report(y_test, pred, labels = [0,1,2],
                            digits=4))
print('confusion matrix: ', mat)

Accuracy = accuracy_score(y_test,pred)
F1 = f1_score(y_test, pred, average='macro')
print("Accuracy:", Accuracy)

rec = recall_score(y_test, pred, average='macro')
print('Recall: ', rec)
prec = precision_score(y_test, pred, average='macro')
print('Precision: ', prec)

print('F1:', F1)

              precision    recall  f1-score   support

           0     0.8889    0.2449    0.3840        98
           1     0.3483    0.9208    0.5054       101
           2     0.0000    0.0000    0.0000       100

    accuracy                         0.3913       299
   macro avg     0.4124    0.3886    0.2965       299
weighted avg     0.4090    0.3913    0.2966       299

confusion matrix:  [0.24489796 0.92079208 0.        ]
Accuracy: 0.391304347826087
Recall:  0.3885633461305314
Precision:  0.41240116521015396
F1: 0.29647826086956525


### Binomial Naive Bayes 

In [41]:
bnb_model = BernoulliNB()
bnb_model.fit(X_train, y_train)
pred = bnb_model.predict(X_test)

In [43]:
matrix = confusion_matrix(y_test, pred, labels = [0,1,2])
mat = matrix.diagonal()/matrix.sum(axis=1)
print(classification_report(y_test, pred, labels = [0,1,2],
                            digits=4))
print('confusion matrix: ', mat)

Accuracy = accuracy_score(y_test,pred)
F1 = f1_score(y_test, pred, average='macro')
print("Accuracy:", Accuracy)

rec = recall_score(y_test, pred, average='macro')
print('Recall: ', rec)
prec = precision_score(y_test, pred, average='macro')
print('Precision: ', prec)

print('F1:', F1)

              precision    recall  f1-score   support

           0     1.0000    0.8265    0.9050        98
           1     0.7179    0.8317    0.7706       101
           2     0.7228    0.7300    0.7264       100

    accuracy                         0.7960       299
   macro avg     0.8136    0.7961    0.8007       299
weighted avg     0.8120    0.7960    0.7999       299

confusion matrix:  [0.82653061 0.83168317 0.73      ]
Accuracy: 0.7959866220735786
Recall:  0.7960712601872432
Precision:  0.8135736650588136
F1: 0.8006794313332454


In [45]:
import tabulate
print('Reddit Task P3 : TF vectors')
conclusion = [['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score', 'Train AUC', 'Test AUC', ],
              ['Logistic Regression', 80.2, 80.4, 80.3, 80.4],
              ['XGBoost Classifier', 83.2, 83.6, 83.3, 83.3],
              ['Random Forest Classifier', 81.9, 83.9, 81.9, 82.2],
              ['SVM Classifier', 39.1, 41.2, 38.8, 30.7],
              ['Naive Bayes Classifier',  79.5, 81.3, 79.6, 80.0],   
             ]
print(tabulate.tabulate(conclusion, tablefmt='fancy_grid'))

Reddit Task P3 : TF vectors
╒══════════════════════════╤══════════╤═══════════╤════════╤══════════╤═══════════╤══════════╕
│ Model                    │ Accuracy │ Precision │ Recall │ F1 Score │ Train AUC │ Test AUC │
├──────────────────────────┼──────────┼───────────┼────────┼──────────┼───────────┼──────────┤
│ Logistic Regression      │ 80.2     │ 80.4      │ 80.3   │ 80.4     │           │          │
├──────────────────────────┼──────────┼───────────┼────────┼──────────┼───────────┼──────────┤
│ XGBoost Classifier       │ 83.2     │ 83.6      │ 83.3   │ 83.3     │           │          │
├──────────────────────────┼──────────┼───────────┼────────┼──────────┼───────────┼──────────┤
│ Random Forest Classifier │ 81.9     │ 83.9      │ 81.9   │ 82.2     │           │          │
├──────────────────────────┼──────────┼───────────┼────────┼──────────┼───────────┼──────────┤
│ SVM Classifier           │ 39.1     │ 41.2      │ 38.8   │ 30.7     │           │          │
├─────────────────────