### Cleaning the data

In [2]:
import pandas as pd

In [19]:
# Load the two pickled frames in one pass (left first, then right).
# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
left, right = (
    pd.read_pickle(pickle_path)
    for pickle_path in ("../data/left.pkl", "../data/right.pkl")
)
# Show both shapes as a quick sanity check on what was loaded.
(left.shape, right.shape)

((10, 2), (30, 2))

In [38]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''First-pass normalisation of a single document.

    Steps, in order:
      1. lowercase the text;
      2. drop anything enclosed in square brackets (non-greedy, single line);
      3. strip every ASCII punctuation character (string.punctuation);
      4. drop every token that contains a digit;
      5. turn newlines and non-breaking spaces into a regular space;
      6. strip curly double quotes.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The cleaned text. Runs of spaces left behind by removals are not
        collapsed.
    '''
    text = text.lower()
    # Raw strings keep the regex escapes out of Python's own string-escape
    # processing (non-raw '\[' / '\w' are invalid escapes and trigger warnings).
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    # Newlines and non-breaking spaces separate words; replacing them with ''
    # would glue the neighbouring words into one token, so use a space instead.
    text = re.sub(r'[\xa0\n]', ' ', text)
    # Curly quotes are not part of string.punctuation, so remove them explicitly.
    text = re.sub(r'[“”]', '', text)

    return text

def round1(x):
    '''Run the first-round cleaner on one text value (used as the callable for Series.apply).'''
    # A def instead of a name-bound lambda: same call signature, but the
    # function gets a real name in tracebacks.
    return clean_text_round1(x)

In [39]:
# Run the round-one cleaner over every row of the `content` column in both frames.
left["content"] = left["content"].apply(round1)
right["content"] = right["content"].apply(round1)

In [40]:
def combine_text(list_of_text):
    '''Join an iterable of strings into one string, separated by single spaces.

    An empty iterable yields the empty string.
    '''
    return ' '.join(list_of_text)

In [45]:
# One combined document per frame, in the order [left, right].
data = [combine_text(frame.content.values) for frame in (left, right)]

In [49]:
# Wrap the two combined documents in a single-column frame, one labelled row each.
data = pd.DataFrame(
    data=data,
    index=["left", "right"],
    columns=["content"],
)
data

Unnamed: 0,content
left,trouble for the florida fanboy our legal syste...
right,tucker carlson tonight host tucker carlson unp...


In [50]:
# Add an explicit label column, one value per row in row order.
data.loc[:, "wing"] = ["left", "right"]
data

Unnamed: 0,content,wing
left,trouble for the florida fanboy our legal syste...,left
right,tucker carlson tonight host tucker carlson unp...,right


In [53]:
# Persist the corpus frame (content + wing columns) as a pickle
data.to_pickle(path="../data/corpus.pkl")

### Document-term matrix

In [60]:
from sklearn.feature_extraction.text import CountVectorizer
import pickle

In [61]:
# Count word occurrences per document, ignoring scikit-learn's built-in English stop-word list.
cv = CountVectorizer(stop_words="english")
# Fit the vocabulary and build the count matrix in one step.
data_cv = cv.fit_transform(data["content"])

In [62]:
# Build the document-term matrix as a DataFrame with one column per vocabulary term.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back only on old releases.
# NOTE(review): rows are labelled 0/1 rather than by data.index — confirm whether
# downstream code expects the default integer index before changing it.
data_dtm = pd.DataFrame(
    data_cv.toarray(),
    columns=(
        cv.get_feature_names_out()
        if hasattr(cv, "get_feature_names_out")
        else cv.get_feature_names()
    ),
)
data_dtm

Unnamed: 0,abest,acosta,acting,administration,affects,ahmed,airlines,align,allows,ambitions,...,welloff,white,whoops,wilfred,willingness,woke,world,year,york,zero
0,0,0,0,0,1,0,0,0,1,1,...,1,0,0,0,1,0,0,0,0,0
1,1,1,2,3,0,1,2,1,0,0,...,0,2,1,1,0,1,1,3,1,1


In [63]:
# Persist the document-term matrix.
# NOTE(review): the ".plk" extension looks like a typo for ".pkl"; it is kept
# unchanged because code outside this notebook may load the file by this name — confirm.
data_dtm.to_pickle("../data/docterm_matrix.plk")
# Persist the fitted vectorizer; the context manager guarantees the file handle
# is flushed and closed instead of being left to garbage collection.
with open("../data/cv.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)