In [36]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [37]:
df = pd.read_csv("data/test.csv")

In [38]:
df.head()

Unnamed: 0,article,label
0,The president has yet to clarify what victory ...,right
1,"SHARETo hear President Joe Biden tell it, the ...",right
2,SHAREThe mainstream media's honeymoon with Pre...,right
3,"The ""Squad"" makes a demand for which there is ...",right
4,The restraint crowd delivers America's humilia...,right


In [39]:
X, y = df.article, df.label

In [40]:
def modify(string):
    string = string[1:-1]
    for i in ["'",'"',"[","]",",","`","’"]:
        string = string.replace(i,"")
    return string

In [41]:
X.head()

0    The president has yet to clarify what victory ...
1    SHARETo hear President Joe Biden tell it, the ...
2    SHAREThe mainstream media's honeymoon with Pre...
3    The "Squad" makes a demand for which there is ...
4    The restraint crowd delivers America's humilia...
Name: article, dtype: object

In [42]:
#X = X.apply(lambda x: modify(x))

In [43]:
X.head()

0    The president has yet to clarify what victory ...
1    SHARETo hear President Joe Biden tell it, the ...
2    SHAREThe mainstream media's honeymoon with Pre...
3    The "Squad" makes a demand for which there is ...
4    The restraint crowd delivers America's humilia...
Name: article, dtype: object

In [44]:
X_train, X_test, y_train, y_test = train_test_split(X,y, random_state=42)

In [45]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)

In [46]:
X_train_counts.shape

(2619, 31949)

In [47]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

In [48]:
X_train_tfidf.shape

(2619, 31949)

In [49]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, y_train)

# BUILDING PIPELINE

In [50]:
from sklearn.pipeline import Pipeline

In [51]:
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                    ])

In [52]:
text_clf = text_clf.fit(X_train, y_train)

In [53]:
import numpy as np
preds = text_clf.predict(X_test)

In [54]:
np.mean(preds == y_test)

0.8247422680412371

In [55]:
from sklearn.linear_model import SGDClassifier

In [56]:
text_clf_svm = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf_svm', SGDClassifier(loss='hinge',
                              penalty='l2',
                              alpha=1e-3,
                              n_iter_no_change=5,
                              n_jobs=-1,
                              random_state=42)),
])

In [57]:
text_clf_svm = text_clf_svm.fit(X_train, y_train)
preds = text_clf_svm.predict(X_test)
np.mean(preds == y_test)

0.9381443298969072

In [58]:
from sklearn.model_selection import GridSearchCV

In [59]:
parameters = {
    'vect__ngram_range': [(1,1),(1,2)],
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3),
}

In [60]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)

In [61]:
gs_clf.best_score_

0.921726533650548

In [62]:
gs_clf.best_params_

{'clf__alpha': 0.001, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}

In [65]:
parameters_svm = {
    'vect__ngram_range': [(1,1), (1,2)],
    'tfidf__use_idf': (True, False),
    'clf_svm__alpha':(1e-2, 1e-3),
}

In [66]:
gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X_train, y_train)

In [68]:
print(gs_clf_svm.best_score_)
gs_clf_svm.best_params_

0.9381555325266738


{'clf_svm__alpha': 0.001, 'tfidf__use_idf': True, 'vect__ngram_range': (1, 1)}

In [70]:
text_clf = Pipeline([('vect', CountVectorizer(stop_words='english')),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
                    ])
text_clf = text_clf.fit(X_train, y_train)
preds = text_clf.predict(X_test)
np.mean(preds == y_test)

0.8579610538373424

In [71]:
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)

In [72]:
gs_clf.best_score_

0.9286011413892254

In [73]:
gs_clf.best_params_

{'clf__alpha': 0.001, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}

In [74]:
text_clf_svm = Pipeline([
    ('vect', CountVectorizer(stop_words='english')),
    ('tfidf', TfidfTransformer()),
    ('clf_svm', SGDClassifier(loss='hinge',
                              penalty='l2',
                              alpha=1e-3,
                              n_iter_no_change=5,
                              n_jobs=-1,
                              random_state=42)),
])

In [75]:
text_clf_svm = text_clf_svm.fit(X_train, y_train)
preds = text_clf_svm.predict(X_test)
np.mean(preds == y_test)

0.9495990836197021

In [76]:
gs_clf_svm = GridSearchCV(text_clf_svm, parameters_svm, n_jobs=-1)
gs_clf_svm = gs_clf_svm.fit(X_train, y_train)

In [77]:
print(gs_clf_svm.best_score_)
gs_clf_svm.best_params_

0.9599156364485572


{'clf_svm__alpha': 0.001, 'tfidf__use_idf': False, 'vect__ngram_range': (1, 2)}

In [78]:
import nltk
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [79]:
from nltk.stem.snowball import SnowballStemmer

In [80]:
stemmer = SnowballStemmer("english", ignore_stopwords=True)

In [81]:
class StemmerCountVectorizer(CountVectorizer):
    def build_analyzer(self):
        analyzer = super(StemmerCountVectorizer, self).build_analyzer()
        return lambda doc: ([stemmer.stem(w) for w in analyzer(doc)])

In [82]:
stemmed_count_vect = StemmerCountVectorizer(stop_words='english')

In [91]:
text_mnb_stemmed = Pipeline([
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
    ('mnb', MultinomialNB(fit_prior=False)),
])

In [92]:
text_mnb_stemmed = text_mnb_stemmed.fit(X_train, y_train)

In [93]:
preds_mnb_stemmed = text_mnb_stemmed.predict(X_test)

In [94]:
np.mean((preds_mnb_stemmed == y_test))

0.861397479954181

In [96]:
text_svm_stemmed = Pipeline([
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
    ('svm', SGDClassifier(loss='hinge',
                              penalty='l2',
                              alpha=1e-3,
                              n_iter_no_change=5,
                              n_jobs=-1,
                              random_state=42)),
])

In [97]:
text_svm_stemmed = text_svm_stemmed.fit(X_train, y_train)

In [98]:
preds_svm_stemmed = text_svm_stemmed.predict(X_test)

In [99]:
np.mean((preds_svm_stemmed == y_test))

0.9461626575028637

In [100]:
text = "Arizona’s highly controversial election audit wrapped up Friday as the GOP-backed company conducting it announced the investigation had actually increased President Joe Biden’s margin of victory—but Republicans in multiple other states are already underway on other probes that could sow new doubts in the presidential vote count. Three different election investigations are underway in Wisconsin: an investigation initiated by Republican Assembly Speaker Robin Vos, a separate so-called forensic audit modeled after Arizona by GOP Rep. Janel Brandtjen and an investigation by the state’s nonpartisan Legislative Audit Bureau, which the Republican-led legislature asked it to conduct.While Brandtjen’s audit has stalled after Vos refused to sign subpoenas for counties’ election equipment, Vos’ investigation is moving forward, led by a former state Supreme Court justice who has expressed openness to MyPillow CEO Mike Lindell’s baseless claims of election fraud.Ex-justice Michael Gableman sent emails to local election clerks asking them to preserve election-related records—but the email was flagged as a security risk or junk mail for many of the counties because it came from a Gmail account seemingly unconnected to Gableman, the Associated Press reports, and many clerks are ignoring his requests. Pennsylvania’s election audit is now underway after a contentious power struggle over which Republican lawmaker would lead it, as GOP state senators started hearings in the investigation and controversially subpoenaed the state for personal information on all of Pennsylvania’s 6.9 million voters.Democrats in the state are pushing back on the subpoenas, with both state senators and Pennsylvania Attorney General Josh Shapiro suing to block them.Former President Donald Trump asked Texas on Thursday to conduct an audit into the state’s election results—despite winning the state with 52% of the vote—and the Texas Secretary of State’s office complied, launching investigations into four counties’ vote counts.99. That’s the number of additional votes for Biden in Maricopa County that Cyber Ninjas—the controversial private company leading the Arizona audit—counted as compared with the state’s official tally. The audit also found 261 fewer votes for Trump than the state.“Why do we audit everything in this world, but people raise their hands in concern when we audit elections, which is fundamental to our democracy?” Texas Gov. Greg Abbott said on Fox News Sunday in defense of the state’s election audit. The governor also claimed the election investigations “began months ago” and “were already underway,” but host Chris Wallace noted the counties’ investigations were only publicly announced hours after Trump made his request to the state. (Abbott did not offer any clarification on that timeline.)The chairman of Wisconsin’s Republican Party of Fond du Lac County said on Twitter after the Arizona audit results came out that he hoped the findings mean “my fellow [Republicans will] stop with this nonsense, and begin to focus on 2022 elections.” “Right now [the] GOP effort is hacking off Republican clerks, and alienating soft GOP voters who think we’re nuts and wasting their tax dollars,” Rohn W. Bishop said.While proponents of GOP election investigations typically refer to their efforts as “forensic audits,” that term has been disputed as inaccurate. Election experts cited by Georgia Public Broadcasting explained the term typically refers to financial investigations that examine “minute details” that can be traced to individual accounts, which is impossible with election ballots that are submitted anonymously. The Associated Press also notes that “forensic” is traditionally used to describe investigations into a crime, but there is no evidence to support that any such crimes occurred in the 2020 elections, including the baseless allegations of fraud pushed by Trump and his supporters.The Arizona audit results released Friday brought to an end a months-long investigation into the vote count that had drawn heavy criticism for issues like a lack of transparency and privacy and concerns about how it was being funded. The House Oversight Committee is now investigating the audit, which cost nearly $6 million, though that hasn’t stopped the election probe from becoming a source of inspiration to other GOP lawmakers across the country. The Arizona audit and its copycats will not have any actual effect on the official election results, which have already been certified. The probes have been criticized for helping to undermine faith in the vote count and further stoke baseless allegations that the election was “rigged,” however, which there is no evidence to support. Numerous official audits conducted by state and local governments have confirmed the election results’ accuracy and have not found any widespread issues. The Biden administration has also warned the right-wing election audits may violate federal law by taking election materials out of election officials’ possession."

In [102]:
pr_text = text_svm_stemmed.predict([text])

In [105]:
text_svm_stemmed.classes_

array(['center', 'left', 'right'], dtype='<U6')

In [106]:
text2 = """
However, Biden made it clear that he does not believe the “full promise of equality” has been reached yet, citing “anti-LGBTQ+ bills” which he believes are proliferating state legislatures.
“Bullying and harassment — particularly of young transgender Americans and LGBTQ+ people of color — still abounds, diminishing our national character,” he claimed, touting the necessity of advancing the Equality act — a bill which effectively replaces the fundamental concept of biological sex in favor of gender identity. The radical piece of legislation would severely affect women, specifically, as it would destroy women’s sports and eliminate the traditional right to privacy for women in public facilities because gender identity, rather than biological sex, would be prioritized.
“To LGBTQ+ people across the country, and especially those who are contemplating coming out: know that you are loved for who you are, you are admired for your courage, and you will have a community — and a nation — to welcome you,” Biden concluded, promising to “always” have their back.
Biden’s statement follows a bout of absurdity displayed by Canadian Prime Minister Justin Trudeau last week, using the phrase “2SLGBTQQIA+ people” in a social media post honoring “Indigenous women, girls, and 2SLGBTQQIA+ people who are missing or have been murdered”:
"""

In [113]:
text_svm_stemmed.predict([text, text2, text3, text4])

array(['left', 'left', 'left', 'left'], dtype='<U6')

In [110]:
text3 = """
North Carolina Lt. Gov. Mark Robinson (R) has made clear that he will not succumb to the leftist calls for his resignation for labeling the promotion of “trangenderism” and “homosexuality” in schools as “filth.”
He referred to the promotion of transgenderism and homosexuality as ‘filth’ in a clip that was recorded in June at Asbury Baptist Church, in Seagrove North Carolina, according to CBS 17. The clip has recently made its way around social media.
“There’s no reason anybody anywhere in America should be telling any child about transgenderism, homosexuality, any of that filth,” Robinson says in the video. “And yes I called it filth. And if you don’t like it that I called it filth come see me, and I’ll explain it to you.”
The video has sparked outrage on the left. Deputy White House press secretary Andrew Bates referred to the clip as “repugnant and offensive,” according to Fox News.
“For several days now, I have been viciously attacked because of a clip video where I talk about removing the sexualization of children from the classrooms in our public education system,” Robinson states at the beginning of the video.
“Well let me tell you plainly right here and right now: I will not back down. I will not be silenced, and I will not be bullied into submission,” Robinson added. “I will continue to fight for the rights of our children to receive an education, that is free from sexual concepts that do not belong in the classroom, and I don’t care who doesn’t like it.”
Robinson highlighted how the left has aimed to make his comments about the LGBTQ community rather than a conversation regarding explicit content in the classroom.
“Of course, the media and those on the left have tried to change the focus from education to the LGBTQ community, specifically that I hate them,” Robinson states. “Let me be clear, I will fight for and protect the rights of all citizens, including those in the LGBTQ community to express themselves however they want. That is their right as Americans and I don’t think that government has any role in telling them otherwise.”
“However the idea that our children should be taught about concepts of transgenderism and be exposed to sexually explicit materials in the classroom is abhorrent,” he stated. 
"""

In [112]:
text4 = """
On Sunday, Fox News Channel “Life, Liberty & Levin” host Mark Levin warned that the First Amendment is under attack by Attorney General Merrick Garland and the Biden administration.
According to Levin, the Biden administration and the National School Boards Association “conspired” and “colluded” to silence parents who are expressing their First Amendment rights by recently announcing that the Department of Justice would intervene to address perceived threats against educators and school boards.
“The First Amendment of the Constitution of the United States, the Bill of Rights, is under attack by the Attorney General of the United States, Merrick Garland, and his staff,” Levin opened his Sunday show. “And now we have learned it is also under attack by Joe Biden’s staff, as well as others at the White House in a coordinated attack to try to silence parents and taxpayers, the citizens of this nation who elect their school boards, who send their children into these classrooms. Why? Because they are challenging the poison, the rot, the radical Marxist propaganda that is being taught to your children from kindergarten through 12th grade, and apparently that is too much for them to tolerate.”
He continued, “Ladies and gentlemen, this is the memo that will go down in history as one of the most egregious violations of your liberty. It is a memo that is signed by Merrick Garland. … And make no mistake about it — parents in this country, they’re going to spy on you, they’re going to gather intelligence on you on, they’re going to track you and organizations that you belong to, and a special phone number is set up so if a teacher or a bureaucrat or union or whomever thinks if you are threatening or harassing them, they will set loose the FBI to come to your home and to interview you. Among other things in this memo, loose language like they’re going to look at efforts to intimidate individuals based on their views. They’re going to be committed to using their authority and their resources. They have a dedicated line of communication for threat reporting. Now, there’s a problem here: The federal government has absolutely no authority whatsoever in the classroom, in the school district, at board meetings, period. There is no federal nexus of any kind whatsoever, and they know it.”
"""

In [115]:
X_test[:10], y[:10]

(295     Here’s a line to amuse your liberal pals: When...
 718     We’ve been hearing a lot of speculation that t...
 952     Physicians spreading medical misinformation, p...
 2561    WASHINGTON, D.C. -- Eighty-two percent of Amer...
 2547    WASHINGTON, D.C. -- Fifty-six percent of Ameri...
 2310    WASHINGTON, D.C. -- According to the latest We...
 1084    Physicians spreading medical misinformation, p...
 144     It is becoming so painfully clear that those i...
 1588    MyPillow CEO Mike Lindell has a new date for a...
 2162    Former National Security Agency contractor Rea...
 Name: article, dtype: object,
 0    right
 1    right
 2    right
 3    right
 4    right
 5    right
 6    right
 7    right
 8    right
 9    right
 Name: label, dtype: object)

In [116]:
text_svm_stemmed.predict(X_test[:10])

array(['right', 'right', 'right', 'center', 'center', 'center', 'right',
       'right', 'left', 'left'], dtype='<U6')

In [121]:
from sklearn.metrics import f1_score, confusion_matrix

In [120]:
f1_score(y_test, preds_svm_stemmed, average='micro')

0.9461626575028637

In [122]:
confusion_matrix(y_test, preds_svm_stemmed)

array([[288,  18,   0],
       [  8, 204,   7],
       [  0,  14, 334]])