### Cleaning the data

In [136]:
import re
import string
import pandas as pd

In [137]:
# Load the pickled `left` and `right` frames and show their shapes.
# NOTE(review): unpickling can execute arbitrary code - only load trusted files.
left, right = (
    pd.read_pickle(pkl_path)
    for pkl_path in ("../data/left.pkl", "../data/right.pkl")
)
(left.shape, right.shape)

((10, 2), (19, 2))

In [138]:
def clean_text_round1(text):
    '''First cleaning pass over one document.

    Lowercases the text, then deletes HTML-style tags (``<...>``), tab
    characters and any parenthesised span (``(...)``, parentheses included).

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text.
    '''
    text = text.lower()
    # Raw strings keep the regex escapes (\( \) \t) out of Python's own
    # string-escape processing, which warns on unknown escapes such as "\(".
    text = re.sub(r'<[^>]*>', '', text)
    text = re.sub(r'\t', '', text)
    text = re.sub(r'\([^\)]*\)', '', text)
    return text

In [139]:
def round1(x):
    '''Apply the first cleaning pass to a single document.'''
    return clean_text_round1(x)

# Overwrite the `content` column of both frames with the round-1 cleaned text.
left["content"] = left["content"].apply(round1)
right["content"] = right["content"].apply(round1)

In [142]:
def clean_text_round2(text):
    '''Second cleaning pass over one document.

    Removes, in order: text in square brackets, ASCII punctuation
    (``string.punctuation``), words containing a digit, and finally
    non-breaking spaces, newlines and curly double quotes.

    This function does not change case.

    Parameters
    ----------
    text : str
        Document text.

    Returns
    -------
    str
        The cleaned text.
    '''
    # Bracketed spans must go first: the punctuation pass below would
    # otherwise strip the brackets and leave their contents behind.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # NOTE(review): non-breaking spaces and newlines are deleted, not replaced
    # by a space, so words separated only by one of them are joined together.
    text = re.sub(r'[\xa0\n“”]', '', text)

    return text


def round2(x):
    '''Apply the second cleaning pass to a single document.'''
    return clean_text_round2(x)

In [143]:
# Overwrite the `content` column of both frames with the round-2 cleaned text.
left["content"] = left["content"].apply(round2)
right["content"] = right["content"].apply(round2)

##### Combine text together

In [144]:
def combine_text(list_of_text):
    '''Join an iterable of strings into a single string, separated by single spaces.'''
    separator = ' '
    return separator.join(list_of_text)

In [145]:
# One combined document per frame, in the order left, right.
data = [combine_text(frame.content.values) for frame in (left, right)]

In [146]:
# Wrap the two combined documents in a frame indexed by wing name.
data = pd.DataFrame({"content": data}, index=["left", "right"])
data

Unnamed: 0,content
left,hey we’ve got our first political sex scandal ...
right,her names nicolle and shes quite a troll nicol...


In [147]:
# Add an explicit label column alongside the index.
data = data.assign(wing=["left", "right"])
data

Unnamed: 0,content,wing
left,hey we’ve got our first political sex scandal ...,left
right,her names nicolle and shes quite a troll nicol...,right


In [148]:
# Save as corpus: persist the two-row frame (one combined document per wing,
# plus its `wing` label) as a pickle for later use.
data.to_pickle("../data/corpus.pkl")

### Document-term matrix (bag of words)

In [149]:
from sklearn.feature_extraction.text import CountVectorizer
import pickle

In [150]:
# Count word occurrences per document, ignoring scikit-learn's built-in
# English stop-word list. `fit_transform` learns the vocabulary from the two
# combined documents and returns their count matrix in one step.
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data.content)

In [157]:
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)

# Document-term matrix: one row per wing, one column per vocabulary term.
data_dtm = pd.DataFrame(data_cv.toarray(),
                        columns=feature_names,
                        index=["left", "right"])
data_dtm

Unnamed: 0,aapi,aba,abandon,abandoned,abel,abest,abilities,ability,able,abortion,...,yorker,yorkers,youd,youll,young,youre,youve,zeal,zero,zucker
left,0,1,1,0,1,0,0,2,2,0,...,0,0,0,0,3,0,0,0,0,0
right,2,0,0,1,0,1,1,0,3,10,...,1,3,1,4,6,2,2,1,4,1


In [153]:
# Persist the document-term matrix and the fitted vectorizer.
data_dtm.to_pickle("../data/docterm_matrix.pkl")
# Use a context manager so the file handle is flushed and closed
# deterministically instead of being left to the garbage collector.
with open("../data/cv.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)

### Word vectors

In [159]:
import spacy

In [160]:
# Load the spaCy pipeline "en_core_web_md"; the `.vector` of each processed
# document is used below as the feature vector for classification.
nlp = spacy.load("en_core_web_md")

In [167]:
# Label every row with its source frame. This adds a `wing` column to `right`
# and `left` in place.
right['wing'] = 'right'
left['wing'] = 'left'
# Stack both frames (right first) under a fresh 0..n-1 index.
# NOTE: this rebinds `data`, which previously held the two-row corpus frame.
data = pd.concat([right, left], ignore_index=True)
data.shape

In [170]:
# Features are the cleaned texts; targets are the wing labels.
train_x, train_y = data["content"].values, data["wing"].values

In [182]:
# Run every training text through the spaCy pipeline and keep one
# document-level vector per text.
docs = list(map(nlp, train_x))
train_x_vecs = [doc.vector for doc in docs]
len(train_x_vecs)

In [184]:
# Train a SVM for classifying left/right wing
from sklearn import svm

In [185]:
# Linear-kernel SVM trained on the document vectors to predict the wing label.
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(X=train_x_vecs, y=train_y)

SVC(kernel='linear')

In [191]:
test_x = ["I went to the bank and wrote a check", 
          "let me check that out", 
          "last week president biden introduced a trill",
          "along with many others i have long argued that the reason so few police officers are ever charged in their killings of unarmed black people  is that our legal system has effectively rendered those killings legal this is the case regardless of how horrendous the killings are or how much evidence including video makes clear what took place the defense in the trial of derek chauvin in the death of george floyd raised this very concept wednesday when questioning sgt jody stiger a los angeles police department useofforce expert who was a witness for the prosecution eric nelson an attorney for chauvin asked if sergeant stiger had ever had anything to do with a training called awful but lawful or lawful but awful he said that he had nelson continued his questioning the general concept is that sometimes the use of force it looks really bad right and sometimes it may be so it may be caught on video right and it looks bad right sergeant stiger responds yes nelson then says but it is still lawful the officer concludes yes based on that department’s policies or based on that state’s law this concept seems on its face morally depraved the bar for actions and in this case use of lethal force isn’t propriety or decency but the likelihood of legal exposure and jeopardy but the very existence of awful but lawful training reminds us that this concept isn’t new as chuck wexler the executive director of the police executive research forum and j scott thomson the chief operating officer of holtec security international and former president of the forum wrote in the new york times in  just because the police can legally use deadly force doesn’t always mean they should and the goal is to prevent lawfulbutawful outcomes while increasing officer safety they explained that the legal standard used in police shootings allows prosecutors and grand juries to conclude that although an officer’s shooting of a suspect may be questionable it isn’t criminal they went on to trace the 
origins of the standard the standard came from a  supreme court decision graham v connor the justices ruled that an officer’s use of force must be ‘objectively reasonable’ but the court went on to caution that ‘police officers are often forced to make splitsecond judgments — in circumstances that are tense uncertain and rapidly evolving — about the amount of force that is necessary in a particular situation’ police officers operate in the field — and enter courtrooms if it ever comes to that — with a staggering amount of blue privilege a benefitofthedoubt shield of protection that it is incredibly difficult to penetrate this creates that bizarre legal phenomenon of faultless killings the taking of life without the taking of responsibility a cain and abel scenario in which blood cries out from the soil but in these cases no one is seriously punished the standard implies that officers must be allowed to make mistakes even deadly ones because their jobs are dangerous policing is one of the few dangerous professions in which people can be killed and written off as collateral damage this standard allows callous wanton behavior the reckless and willful taking of life any action can be excused as a reasonable response to fear as the american bar association pointed out in february of  the awful but lawful concept creates a high burden for prosecution of bad police actions the group was summarizing the sentiments of a panel of legal experts at an aba meeting one person on the panel ronald a norwood a defense lawyer in st louis who has served as counsel to the st louis metropolitan police department put it this way officials should not be held liable for bad guesses but these bad guesses are not benign in many cases they result in someone being killed as kalfani ture an assistant professor of criminal justice at quinnipiac university in connecticut and a former police officer in the atlanta metropolitan area told reporters in june about the killing by the police of rayshard 
brooks in atlanta would i have shot rayshard brooks my answer is no’’ but he continued it’s a questionable use of force but there are many officers who may find this a lawful use of force so it’s one of those things we call in law enforcement ‘lawful but awful’ meaning that the officer could have taken alternative action that did not result in the civilian’s death’’ deadly use of force by police officers is highly discretionary but these police officers are humans who bring to their jobs biases both conscious and subconscious where one person may be shown patience and leniency another will be rewarded with violence and harm and often all of it is legal we have a legal system that has shirked its judicial responsibility allowing for extrajudicial killings without consequence — curbside capital sentences in this system it is too often the case that police officers are judge jury and executioner the times is committed to publishing a diversity of letters to the editor we’d like to hear what you think about this or any of our articles here are some tips and here’s our email lettersnytimescom follow the new york times opinion section on facebook and twitter  and instagram"]
# Vectorise the sample texts the same way as the training data, then classify.
test_docs = list(map(nlp, test_x))
test_x_word_vectors = [doc.vector for doc in test_docs]

clf_svm.predict(test_x_word_vectors)

array(['right', 'right', 'right', 'right'], dtype=object)

### Stemming

In [192]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [196]:
stemmer = PorterStemmer()
# Join each frame's documents into one string and tokenise it.
# `str.join` accepts the array directly, so no intermediate list copy is needed.
words_left = word_tokenize(' '.join(left.content.values))
words_right = word_tokenize(' '.join(right.content.values))

In [198]:
# Number of tokens in the left and right corpora, respectively.
len(words_left), len(words_right)

(10626, 17791)

In [200]:
# Stem every left-corpus token and re-join the stems into one
# space-separated string.
stemmed_left = " ".join([stemmer.stem(token) for token in words_left])