### Cleaning the data

In [136]:
import re
import string
import pandas as pd

In [137]:
# Read data
left = pd.read_pickle("../data/left.pkl")
right = pd.read_pickle("../data/right.pkl")
left.shape, right.shape

((10, 2), (19, 2))

In [138]:
def clean_text_round1(text):
    text = text.lower()
    text = re.sub('<[^>]*>', '', text)
    text = re.sub('\t', '', text)
    text = re.sub('\([^\)]*\)', '', text)
    return text

In [139]:
round1 = lambda x: clean_text_round1(x)
left.content = left.content.apply(round1)
right.content = right.content.apply(round1)

In [142]:
def clean_text_round2(text):
    '''Make text lowercase, remove text in square brackets, remove punctuation and remove words containing numbers.'''

    text = re.sub('\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = re.sub('\\xa0', '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\“', '', text)
    text = re.sub('\”', '', text)
    
    return text

round2 = lambda x: clean_text_round2(x)

In [143]:
left.content = left.content.apply(round2)
right.content = right.content.apply(round2)

##### Combine text together

In [144]:
def combine_text(list_of_text):
    '''Takes a list of text and combines them into one large chunk of text.'''
    combined_text = ' '.join(list_of_text)
    return combined_text

In [145]:
data = [combine_text(left.content.values), 
        combine_text(right.content.values)]

In [146]:
data = pd.DataFrame(data, columns=["content"], index=["left","right"])
data

Unnamed: 0,content
left,hey we’ve got our first political sex scandal ...
right,her names nicolle and shes quite a troll nicol...


In [147]:
data["wing"] = ["left", "right"]
data

Unnamed: 0,content,wing
left,hey we’ve got our first political sex scandal ...,left
right,her names nicolle and shes quite a troll nicol...,right


In [148]:
# Save as corpus
data.to_pickle("../data/corpus.pkl")

### Document-term matrix or Bags of words

In [149]:
from sklearn.feature_extraction.text import CountVectorizer
import pickle

In [150]:
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data.content)

In [157]:
data_dtm = pd.DataFrame(data_cv.toarray(),
                        columns=cv.get_feature_names(),
                        index=["left", "right"])
data_dtm

Unnamed: 0,aapi,aba,abandon,abandoned,abel,abest,abilities,ability,able,abortion,...,yorker,yorkers,youd,youll,young,youre,youve,zeal,zero,zucker
left,0,1,1,0,1,0,0,2,2,0,...,0,0,0,0,3,0,0,0,0,0
right,2,0,0,1,0,1,1,0,3,10,...,1,3,1,4,6,2,2,1,4,1


In [153]:
data_dtm.to_pickle("../data/docterm_matrix.pkl")
pickle.dump(cv, open("../data/cv.pkl", "wb"))

### Word vectors

In [159]:
import spacy

In [160]:
nlp = spacy.load("en_core_web_md")

In [167]:
right['wing'] = 'right'
left['wing'] = 'left'
data = pd.concat([right, left], ignore_index=True)
data.shape

In [170]:
train_x = data.content.values
train_y = data.wing.values

In [182]:
docs = [nlp(text) for text in train_x]
train_x_vecs = [x.vector for x in docs]
len(train_x_vecs)

In [184]:
# Train a SVM for classifying left/right wing
from sklearn import svm

In [185]:
clf_svm = svm.SVC(kernel='linear')
clf_svm.fit(train_x_vecs, train_y)

SVC(kernel='linear')

In [210]:
# Test again
test_x = train_x
test_docs = [nlp(text) for text in test_x]
test_x_word_vectors =  [x.vector for x in test_docs]

clf_svm.predict(test_x_word_vectors)

array(['right', 'right', 'right', 'right', 'right', 'right', 'right',
       'right', 'right', 'right', 'right', 'right', 'right', 'right',
       'right', 'right', 'right', 'right', 'right', 'right', 'right',
       'right', 'right', 'right', 'right', 'right', 'right', 'right',
       'right'], dtype=object)

### Stemming

In [192]:
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

In [203]:
stemmer = PorterStemmer()
words_left = word_tokenize(' '.join([s for s in left.content.values]))
len(words_left)

10626

In [205]:
stemmed_left = []
for word in words_left:
    stemmed_left.append(stemmer.stem(word))
stemmed_left = " ".join(stemmed_left)

### Lemmatizing

In [206]:
from nltk.stem import WordNetLemmatizer

In [209]:
phrase = "reading the books"
words = word_tokenize(phrase)
lemmatizer = WordNetLemmatizer()
lemmatized_words = []
for word in words:
  lemmatized_words.append(lemmatizer.lemmatize(word, pos='v'))

" ".join(lemmatized_words)

'read the book'