# Vectorizing and Preparing for Training

Date: 01/30/2021

## About this Notebook
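This notebook builds the training and test data for the model: it converts the cleaned article text into integer word indices, splits the articles into train and test sets, and pads each sequence to a fixed length.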

## Administrative Activity

### Import Packages

In [1]:
import os, json, sys
from packaging import version

import pandas as pd
import numpy as np
import datetime

import re, string #Text cleaning

# TensorFlow and tf.keras
import tensorflow as tf
from tensorflow import keras

from sklearn.model_selection import train_test_split #train_test_split

#Custom Code
from bin.text_cleaner import text_cleaner
from bin.html_functions import ez_display as d
from bin.plot_performance_graph import plot_performance
from bin.word_vectorizer import word_vectorizer_columns

### Python Version

In [2]:
# Report the interpreter and library versions in use, then enforce the
# TensorFlow 2.x minimum this notebook relies on.
python_version = sys.version.split('(')[0].strip()
d(f"<b>Current Python Version Used:</b> Python {python_version}")
d("This notebook requires TensorFlow 2.0 or above")
d(f"<b>Current TensorFlow version: </b>{tf.__version__}")
d(f"<b>Current Keras version: </b>{keras.__version__}")

# First component of the parsed release tuple is the major version.
tf_major = version.parse(tf.__version__).release[0]
assert tf_major >= 2

### Establish Experiment Reproducibility

In [3]:
# Reset Keras' global state, then pin the NumPy and TensorFlow random
# generators to the same fixed seed so repeated runs are comparable.
seed_value = 45
keras.backend.clear_session()
for set_seed in (np.random.seed, tf.random.set_seed):
    set_seed(seed_value)

### Variables

In [4]:
# --- Top-level project folders ---
data_folder = "data"
images_folder = "images"
model_folder = "models"

# --- Folders underneath the data folder ---
raw_data_folder = os.path.join(data_folder, 'RAW')
cleaned_data_folder = os.path.join(data_folder, 'cleaned')
word_freq_folder = os.path.join(data_folder, "word_frequency")
padded_folder = os.path.join(data_folder, "padded")

# --- Folders underneath the images folder ---
model_summary_folder = os.path.join(images_folder, "model_summary")
graph_folder = os.path.join(images_folder, "graphs")

# --- Input files ---
article_filenames = ['articles1.csv', 'articles2.csv', 'articles3.csv']
cleaned_data_filename = "articles.feather"
cleaned_data_filepath = os.path.join(cleaned_data_folder, cleaned_data_filename)
publication_scorecard_filename = "publication_scorecard.json"
publication_scorecard_filepath = os.path.join(data_folder, publication_scorecard_filename)

# --- Text columns processed throughout the notebook ---
text_cols = ['content', 'simple_clean', 'stopwords_clean', 'lemming_clean']

In [5]:
# Ensure the output folder for the padded arrays exists.
# makedirs(exist_ok=True) does nothing when the folder is already present and
# also creates any missing parent folders, so this cell is safe to re-run and
# does not depend on a separate existence check.
os.makedirs(padded_folder, exist_ok=True)

## Pulling Data

In [6]:
%%time
# Load the feather file at cleaned_data_filepath into the working DataFrame.
df = pd.read_feather(cleaned_data_filepath)

Wall time: 1.98 s


In [7]:
# Load the publication scorecard JSON file.
# The encoding is pinned to UTF-8 so the file is decoded the same way on every
# platform instead of falling back to the locale's default encoding.
with open(publication_scorecard_filepath, encoding="utf-8") as fp:
    scorecard = json.load(fp)

## Organizing Data for Model

#### Splitting text

In [10]:
# Replace each text column's strings with whitespace-separated token lists.
# NOTE: this overwrites the columns in place, so the cell should only be run
# once per freshly loaded DataFrame.
for col in text_cols:
    token_lists = df[col].str.split()
    df[col] = token_lists

In [11]:
# Show the frame's dimensions, then preview its first rows
# (the trailing expression is rendered as the cell output).
d(f'<b>Dataframe Shape:</b> {df.shape}')
df.head()

Unnamed: 0,id,title,publication,author,date,year,month,url,content,simple_clean,stopwords_clean,lemming_clean,political_score
0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,"[WASHINGTON, —, Congressional, Republicans, ha...","[WASHINGTON, Congressional, Republicans, fear,...","[WASHINGTON, Congressional, Republicans, fear,...","[WASHINGTON, Congressional, Republicans, fear,...",-0.5
1,17284,Rift Between Officers and Residents as Killing...,New York Times,Benjamin Mueller and Al Baker,2017-06-19,2017.0,6.0,,"[After, the, bullet, shells, get, counted,, th...","[After, bullet, shells, counted, blood, dries,...","[After, bullet, shell, counted, blood, dry, vo...","[After, bullet, shell, counted, blood, dry, vo...",-0.5
2,17285,"Tyrus Wong, ‘Bambi’ Artist Thwarted by Racial ...",New York Times,Margalit Fox,2017-01-06,2017.0,1.0,,"[When, Walt, Disney’s, “Bambi”, opened, in, 19...","[When, Walt, Disneys, Bambi, opened, 1942, cri...","[When, Walt, Disneys, Bambi, opened, 1942, cri...","[When, Walt, Disneys, Bambi, opened, 1942, cri...",-0.5
3,17286,"Among Deaths in 2016, a Heavy Toll in Pop Musi...",New York Times,William McDonald,2017-04-10,2017.0,4.0,,"[Death, may, be, the, great, equalizer,, but, ...","[Death, great, equalizer, necessarily, evenhan...","[Death, great, equalizer, necessarily, evenhan...","[Death, great, equalizer, necessarily, evenhan...",-0.5
4,17287,Kim Jong-un Says North Korea Is Preparing to T...,New York Times,Choe Sang-Hun,2017-01-02,2017.0,1.0,,"[SEOUL,, South, Korea, —, North, Korea’s, lead...","[SEOUL, South, Korea, North, Koreas, leader, s...","[SEOUL, South, Korea, North, Koreas, leader, s...","[SEOUL, South, Korea, North, Koreas, leader, s...",-0.5


## Top 200k words

In [12]:
# Build, for every text column, a {word: integer index} lookup restricted to
# the first `vocab_size` rows of that column's word-frequency table.
vocab_size = 200000
word_map = {}
for col in text_cols:
    freq_path = os.path.join(word_freq_folder, col + "_word_frequency.feather")
    freq_table = pd.read_feather(freq_path)

    # Shift the row labels up by one: index 0 is reserved for padding.
    freq_table.index = freq_table.index + 1

    # Keep only the leading rows (presumably the most frequent words;
    # TODO confirm the frequency file is sorted by descending count).
    top_words = freq_table.iloc[:vocab_size]

    # {index: word} from the 'Words' column, inverted to {word: index}.
    index_to_word = top_words['Words'].to_dict()
    word_map[col] = {word: idx for idx, word in index_to_word.items()}

In [13]:
#Saving Word Map
# Write each column's word-to-index mapping to its own JSON file
# inside the word-frequency folder.
for col_name, mapping in word_map.items():
    map_path = os.path.join(word_freq_folder, f"{col_name}_word_map_dict.json")
    with open(map_path, 'w') as out_file:
        out_file.write(json.dumps(mapping))

## Vectorizing Words

In [14]:
%%time
# Run every token list through the project's word_vectorizer_columns helper,
# passing the column name and the word maps built earlier. The helper's output
# replaces the column (the preview below shows lists of integer indices).
for col in text_cols:
    def _vectorize(tokens, column=col):
        return word_vectorizer_columns(tokens, column, word_map)

    df[col] = df[col].apply(_vectorize)

Wall time: 47.4 s


In [15]:
# Preview a single row to check what the text columns hold after vectorization.
df.head(1)

Unnamed: 0,id,title,publication,author,date,year,month,url,content,simple_clean,stopwords_clean,lemming_clean,political_score
0,17283,House Republicans Fret About Winning Their Hea...,New York Times,Carl Hulse,2016-12-31,2016.0,12.0,,"[2542, 19, 2842, 257, 18, 4, 89, 1059, 57, 16,...","[2181, 2560, 102, 736, 271, 119, 206, 1077, 37...","[2124, 2448, 127, 535, 45, 150, 218, 835, 49, ...","[2124, 2448, 127, 535, 45, 150, 218, 835, 49, ...",-0.5


In [16]:
# Report, per text column, how many rows hold a zero-length sequence.
for col in text_cols:
    empty_count = len(df.loc[df[col].str.len() == 0])
    d(f'{col}: {empty_count}')

In [17]:
# Remove, in place, every row whose sequence is empty in any text column.
# The mask is recomputed per column because earlier iterations shrink the frame.
for col in text_cols:
    is_empty = df[col].str.len() == 0
    df.drop(df.loc[is_empty].index, inplace=True)
d(f'<b>Size of Dataframe after removing empty rows:</b> {len(df)}')

## Saving Vectorized DF

In [18]:
# Persist the vectorized articles as a feather file in the cleaned-data folder.
# The index is reset first (rows were dropped above, leaving gaps); without
# drop=True the old index is kept as a regular column in the saved frame.
df.reset_index(inplace=True)
vectorized_path = os.path.join(cleaned_data_folder, "articles_vectorized.feather")
df.to_feather(vectorized_path)

## Train-Test Split

In [19]:
# 80/20 split with a fixed random_state. The full DataFrame is passed as X,
# so X_train / X_test still carry every column (including 'political_score');
# y_* hold the 'political_score' target alone.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    df['political_score'],
    test_size=0.2,
    random_state=45,
)

shape_gap = '&nbsp;' * 6
d(f'<b> Train Shape:</b> {X_train.shape}{shape_gap}<b> Test Shape:</b> {X_test.shape}')

### Analyzing Training/Test Data

In [20]:
# Collect summary statistics for each split and each text column:
#   'Word Length'  -> describe() table of per-article sequence lengths
#   'Unique Words' -> number of distinct tokens across the split
#   'Total Words'  -> total number of tokens across the split
frame_storage = {}
for name, data in (('X_train', X_train), ('X_test', X_test)):
    frame_storage[name] = {}
    for col in text_cols:
        # Per-article lengths, computed once and reused for both the
        # describe() table and the total.
        lengths = data[col].str.len()

        # Accumulate distinct tokens straight into a set; this avoids first
        # materializing one flat list containing every token in the split.
        distinct_tokens = set()
        for article in data[col]:
            distinct_tokens.update(article)

        frame_storage[name][col] = {
            'Word Length': lengths.describe().to_frame().round(2),
            'Unique Words': len(distinct_tokens),
            'Total Words': lengths.sum(),
        }

### Training

In [21]:
# Render the training-split statistics gathered in frame_storage.
d("<h3>Training Data</h3>")
name = 'X_train'
train_stats = frame_storage[name]

# One describe() column per text variant, placed side by side.
frames = [train_stats[col]['Word Length'] for col in text_cols]
train_length = pd.concat(frames, axis=1, sort=False)
d('<h4>Article Length</h4>')
d(train_length.to_html())

# Rows: 'Unique Words' / 'Total Words'; columns: the text variants.
train_word_counts = pd.DataFrame({
    col: {metric: train_stats[col][metric] for metric in ('Unique Words', 'Total Words')}
    for col in text_cols
})
d('<h4>Word Counts</h4>')
d(train_word_counts.to_html())

Unnamed: 0,content,simple_clean,stopwords_clean,lemming_clean
count,113969.0,113969.0,113969.0,113969.0
mean,728.93,379.59,379.49,379.49
std,745.96,373.2,373.02,373.02
min,1.0,1.0,1.0,1.0
25%,334.0,177.0,177.0,177.0
50%,582.0,306.0,306.0,306.0
75%,907.0,477.0,476.0,476.0
max,49707.0,22792.0,22735.0,22735.0


Unnamed: 0,content,simple_clean,stopwords_clean,lemming_clean
Unique Words,199590,194035,193045,193045
Total Words,83075100,43261496,43250321,43250321


### Test

In [22]:
# Render the test-split statistics gathered in frame_storage.
d("<h3>Test Data</h3>")
name = 'X_test'
test_stats = frame_storage[name]

# One describe() column per text variant, placed side by side.
frames = [test_stats[col]['Word Length'] for col in text_cols]
test_length = pd.concat(frames, axis=1, sort=False)
d('<h4>Article Length</h4>')
d(test_length.to_html())

# Rows: 'Unique Words' / 'Total Words'; columns: the text variants.
test_word_counts = pd.DataFrame({
    col: {metric: test_stats[col][metric] for metric in ('Unique Words', 'Total Words')}
    for col in text_cols
})
d('<h4>Word Counts</h4>')
d(test_word_counts.to_html())

Unnamed: 0,content,simple_clean,stopwords_clean,lemming_clean
count,28493.0,28493.0,28493.0,28493.0
mean,731.07,380.45,380.35,380.35
std,722.41,363.3,363.13,363.13
min,1.0,1.0,1.0,1.0
25%,337.0,178.0,178.0,178.0
50%,585.0,308.0,307.0,307.0
75%,908.0,476.0,476.0,476.0
max,28344.0,13233.0,13220.0,13220.0


Unnamed: 0,content,simple_clean,stopwords_clean,lemming_clean
Unique Words,187994,137993,131746,131746
Total Words,20830268,10840061,10837250,10837250


## Padding Input Vector

In [23]:
# Containers for the padded arrays, keyed by text column.
padded_train, padded_test = {}, {}

# Sequence length each text column is padded/truncated to.
padding = dict(
    content=700,
    simple_clean=350,
    stopwords_clean=350,
    lemming_clean=350,
)

In [24]:
# Pad (with trailing zeros) or truncate every sequence to the per-column
# length configured in `padding`, for both splits.
# NOTE(review): `truncating` is left at the Keras default, which per the Keras
# docs is 'pre' - over-long articles would then lose their beginning, not their
# end. Confirm this is intended.
pad = tf.keras.preprocessing.sequence.pad_sequences
for col in text_cols:
    target_len = padding[col]
    padded_train[col] = pad(X_train[col], padding='post', maxlen=target_len)
    padded_test[col] = pad(X_test[col], padding='post', maxlen=target_len)

## Saving Padded and Vectorized Data

In [26]:
# Output folders for the padded train and test arrays.
padded_train_folder = os.path.join(padded_folder, 'train')
padded_test_folder = os.path.join(padded_folder, 'test')

# makedirs(exist_ok=True) is a no-op for folders that already exist and also
# creates missing parents, so the cell can be re-run safely.
for split_folder in (padded_train_folder, padded_test_folder):
    os.makedirs(split_folder, exist_ok=True)

In [29]:
# Save one .npy file per text column for the training split.
# Each file holds two arrays written back to back on the same handle:
# first the padded sequences, then y_train.
for col_name, sequences in padded_train.items():
    train_path = os.path.join(padded_train_folder, f"{col_name}_train_padded.npy")
    # Remove any previous output before rewriting it.
    if os.path.isfile(train_path):
        os.remove(train_path)
    with open(train_path, 'wb') as out_file:
        for array in (sequences, y_train):
            np.save(out_file, array)

In [30]:
# Save one .npy file per text column for the test split.
# Each file holds two arrays written back to back on the same handle:
# first the padded sequences, then y_test.
for col_name, sequences in padded_test.items():
    test_path = os.path.join(padded_test_folder, f"{col_name}_test_padded.npy")
    # Remove any previous output before rewriting it.
    if os.path.isfile(test_path):
        os.remove(test_path)
    with open(test_path, 'wb') as out_file:
        for array in (sequences, y_test):
            np.save(out_file, array)