In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [8]:
# Load the article dataset from a path relative to the notebook's working directory.
# NOTE(review): the preview below shows "Unnamed: 0.1" / "Unnamed: 0" columns, which
# look like index columns written out by earlier to_csv calls — confirm, and drop them if unused.
df = pd.read_csv("data/cleaned_test.csv")

In [9]:
# Preview the first rows to see which columns are available.
df.head()

Unnamed: 0.1,Unnamed: 0,article,label,no_contract,article_str,tokenized,lower,no_punc,stopwords_removed,pos_tags,wordnet_pos,lemmatized
0,0,The president has yet to clarify what victory ...,right,"['The', 'president', 'has', 'yet', 'to', 'clar...",The president has yet to clarify what victory ...,"['The', 'president', 'has', 'yet', 'to', 'clar...","['the', 'president', 'has', 'yet', 'to', 'clar...","['the', 'president', 'has', 'yet', 'to', 'clar...","['president', 'yet', 'clarify', 'victory', 'pa...","[('president', 'NN'), ('yet', 'RB'), ('clarify...","[('president', 'n'), ('yet', 'r'), ('clarify',...","['president', 'yet', 'clarify', 'victory', 'pa..."
1,1,"To hear President Joe Biden tell it, the Amer...",right,"['To', 'hear', 'President', 'Joe', 'Biden', 't...","To hear President Joe Biden tell it, the Ameri...","['To', 'hear', 'President', 'Joe', 'Biden', 't...","['to', 'hear', 'president', 'joe', 'biden', 't...","['to', 'hear', 'president', 'joe', 'biden', 't...","['hear', 'president', 'joe', 'biden', 'tell', ...","[('hear', 'JJ'), ('president', 'NN'), ('joe', ...","[('hear', 'a'), ('president', 'n'), ('joe', 'n...","['hear', 'president', 'joe', 'biden', 'tell', ..."
2,2,The mainstream media's honeymoon with Preside...,right,"['The', 'mainstream', ""media's"", 'honeymoon', ...",The mainstream media's honeymoon with Presiden...,"['The', 'mainstream', 'media', ""'s"", 'honeymoo...","['the', 'mainstream', 'media', ""'s"", 'honeymoo...","['the', 'mainstream', 'media', ""'s"", 'honeymoo...","['mainstream', 'media', ""'s"", 'honeymoon', 'pr...","[('mainstream', 'JJ'), ('media', 'NNS'), (""'s""...","[('mainstream', 'a'), ('media', 'n'), (""'s"", '...","['mainstream', 'medium', ""'s"", 'honeymoon', 'p..."
3,3,"The ""Squad"" makes a demand for which there is ...",right,"['The', '""Squad""', 'makes', 'a', 'demand', 'fo...","The ""Squad"" makes a demand for which there is ...","['The', '``', 'Squad', ""''"", 'makes', 'a', 'de...","['the', '``', 'squad', ""''"", 'makes', 'a', 'de...","['the', '``', 'squad', ""''"", 'makes', 'a', 'de...","['``', 'squad', ""''"", 'makes', 'demand', 'mili...","[('``', '``'), ('squad', 'NN'), (""''"", ""''""), ...","[('``', 'n'), ('squad', 'n'), (""''"", 'n'), ('m...","['``', 'squad', ""''"", 'make', 'demand', 'milit..."
4,4,The restraint crowd delivers America's humilia...,right,"['The', 'restraint', 'crowd', 'delivers', ""Ame...",The restraint crowd delivers America's humilia...,"['The', 'restraint', 'crowd', 'delivers', 'Ame...","['the', 'restraint', 'crowd', 'delivers', 'ame...","['the', 'restraint', 'crowd', 'delivers', 'ame...","['restraint', 'crowd', 'delivers', 'america', ...","[('restraint', 'NN'), ('crowd', 'NN'), ('deliv...","[('restraint', 'n'), ('crowd', 'n'), ('deliver...","['restraint', 'crowd', 'delivers', 'america', ..."


In [10]:
def modify(string):
    """Turn the text form of a token list into a plain space-separated string.

    The first and last characters are dropped unconditionally. After that,
    every single quote, double quote, square bracket, comma, backtick and
    typographic apostrophe is deleted. Spaces are left untouched, so a token
    made up only of deleted characters leaves a double space behind.

    Parameters
    ----------
    string : str
        Text such as "['president', 'yet']".

    Returns
    -------
    str
        The remaining characters, e.g. "president yet".
    """
    # One deletion table replaces the chain of per-character replace() calls.
    unwanted = str.maketrans("", "", "'\"[],`’")
    inner = string[1:-1]
    return inner.translate(unwanted)

In [12]:
# Features are the lemmatized text column; targets are the label column.
X = df["lemmatized"]
y = df["label"]

In [13]:
# Build X straight from the raw column instead of from the previous X.
# modify() always drops the first and last character, so a cell that feeds X
# back into itself would eat into already-cleaned text every time it is re-run.
# Reading from df keeps this cell idempotent.
X = df.lemmatized.apply(modify)

In [14]:
# Spot-check the cleaned text before it is split and vectorized.
X.head()

0    president yet clarify victory pandemic look li...
1    hear president joe biden tell american evacuat...
2    mainstream medium s honeymoon president joe bi...
3     squad  make demand military solution well par...
4    restraint crowd delivers america s humiliation...
Name: lemmatized, dtype: object

In [15]:
# Hold out a test set. No test_size is passed, so scikit-learn's default split ratio applies;
# random_state is fixed so the same rows land in each part on every run.
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
# Learn the vocabulary from the training texts only and encode them as a
# document-term matrix of raw token counts.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

In [17]:
# (number of training documents, size of the learned vocabulary)
X_train_counts.shape

(2619, 27869)

In [18]:
from sklearn.feature_extraction.text import TfidfTransformer
# Re-weight the raw counts with TF-IDF, again fitted on the training matrix only.
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [19]:
# The transform only re-weights entries, so the shape matches the count matrix.
X_train_tfidf.shape

(2619, 27869)

In [20]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier on the TF-IDF features.
# NOTE(review): `clf` is not referenced again in the visible cells; the pipeline
# defined afterwards rebuilds and refits the same three stages.
clf = MultinomialNB().fit(X_train_tfidf, y_train)

# BUILDING PIPELINE

In [21]:
from sklearn.pipeline import Pipeline

In [22]:
# Chain vectorizer -> TF-IDF weighting -> Naive Bayes into a single estimator,
# so raw text goes in and a predicted label comes out.
text_clf = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)

In [23]:
# Fit every pipeline stage on the raw training texts. fit() returns the pipeline
# itself, so the name keeps pointing at the same, now fitted, object.
text_clf = text_clf.fit(X_train, y_train)

In [24]:
import numpy as np
# Predicted labels of the Naive Bayes pipeline for the held-out texts.
preds = text_clf.predict(X_test)

In [25]:
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

In [30]:
def print_scores(clf, X_test, y_true):
    """Print a short evaluation report for a fitted classifier.

    Predictions are obtained once from ``clf.predict(X_test)``. The report
    then prints the confusion matrix followed by the F1, precision and recall
    scores, each preceded by a line of 50 asterisks. The three scores use
    ``average='weighted'``.

    Parameters
    ----------
    clf : object exposing ``predict``
        Fitted estimator (or pipeline / search object) to evaluate.
    X_test : inputs accepted by ``clf.predict``
        Held-out samples.
    y_true : array-like
        Ground-truth labels aligned with ``X_test``.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    y_preds = clf.predict(X_test)
    divider = "*" * 50

    def weighted(metric):
        # All three scalar scores share the same averaging mode.
        return metric(y_true, y_preds, average='weighted')

    # Each metric is wrapped in a lambda so it is evaluated right before it is
    # printed, in the order listed here.
    report = (
        ("confusion matrix:\n", lambda: confusion_matrix(y_true, y_preds)),
        ("f1:\n", lambda: weighted(f1_score)),
        ("precision:\n", lambda: weighted(precision_score)),
        ("recall:\n", lambda: weighted(recall_score)),
    )
    for heading, compute in report:
        print(divider)
        print(heading, compute(), "\n")
    

In [31]:
# Evaluation report for the Naive Bayes pipeline on the held-out set.
print_scores(text_clf, X_test, y_test)

**************************************************
confusion matrix:
 [[291   5  10]
 [ 60  99  60]
 [  0   2 346]] 

**************************************************
f1:
 0.8243920046894603 

**************************************************
precision:
 0.8564402648724766 

**************************************************
recall:
 0.843069873997709 



In [21]:
# Plain accuracy: the fraction of held-out samples whose prediction equals the true label.
np.mean(preds == y_test)

0.843069873997709

In [32]:
from sklearn.linear_model import SGDClassifier

In [33]:
# Same text front end as the Naive Bayes pipeline, but the final stage is a
# linear model trained with SGD. Hinge loss with an L2 penalty makes it a
# linear SVM.
text_clf_svm = Pipeline(
    steps=[
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        (
            'clf_svm',
            SGDClassifier(
                **{
                    'loss': 'hinge',
                    'penalty': 'l2',
                    'alpha': 1e-3,            # regularization strength
                    'n_iter_no_change': 5,    # early-stopping patience
                    'n_jobs': -1,             # use all available cores
                    'random_state': 42,       # reproducible shuffling
                }
            ),
        ),
    ]
)

In [34]:
# Fit the SVM pipeline on the training texts and score it on the held-out set.
text_clf_svm = text_clf_svm.fit(X_train, y_train)
# NOTE: this rebinds `preds`, which previously held the Naive Bayes predictions.
preds = text_clf_svm.predict(X_test)
# Plain accuracy of the SVM pipeline.
np.mean(preds == y_test)

0.9484536082474226

In [36]:
# Evaluation report for the SVM pipeline on the held-out set.
print_scores(text_clf_svm, X_test, y_test)

**************************************************
confusion matrix:
 [[289  17   0]
 [  6 207   6]
 [  0  16 332]] 

**************************************************
f1:
 0.949207994709462 

**************************************************
precision:
 0.9513015537153967 

**************************************************
recall:
 0.9484536082474226 



In [37]:
from sklearn.model_selection import GridSearchCV

In [38]:
# Search grid for the Naive Bayes pipeline. Each key is "<step name>__<parameter>",
# so the prefixes must match the step names used in `text_clf`.
parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2)],   # unigrams only vs. unigrams + bigrams
    tfidf__use_idf=(True, False),         # with / without inverse-document-frequency weighting
    clf__alpha=(1e-2, 1e-3),              # smoothing strength of the classifier step
)

In [39]:
# Exhaustive search over `parameters` for the Naive Bayes pipeline, using all
# available cores. No `cv` or `scoring` is passed, so scikit-learn's defaults apply.
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
# The search is run on the training split only; the held-out set stays untouched.
gs_clf = gs_clf.fit(X_train, y_train)

In [40]:
# Mean cross-validated score of the best parameter combination (measured on training folds).
gs_clf.best_score_

0.9286040605432545

In [41]:
# Parameter combination that achieved the best cross-validated score.
gs_clf.best_params_

{'clf__alpha': 0.001, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}

In [42]:
# Evaluation report on the held-out set; predict() on the search object uses the
# refitted best estimator (refit is left at its default).
print_scores(gs_clf, X_test, y_test)

**************************************************
confusion matrix:
 [[279  27   0]
 [ 13 206   0]
 [  0  21 327]] 

**************************************************
f1:
 0.9318006608757446 

**************************************************
precision:
 0.9369884141547693 

**************************************************
recall:
 0.9301260022909508 

