In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('bmh')
import re
from tqdm import tqdm
import spacy
from spacy.lang.en import English
import pickle
import string
from datetime import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD

In [2]:
# Load the spaCy 'en_core_web_md' pipeline once; `nlp` is reused by later
# cells to lemmatise the cleaned text and to compute document vectors.
nlp = spacy.load('en_core_web_md')

In [3]:
# create functions for preprocessing text 

def remove_nums(s):
    """Return ``s`` with every run of digits deleted.

    Digits are dropped outright (substituted with the empty string), not
    replaced by a placeholder, so ``"10am"`` becomes ``"am"``.
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', s)
    
def remove_punct(s):
    """Return ``s`` without ASCII punctuation and without newline characters.

    Both are deleted rather than replaced by a space, so text on either
    side of a removed character is joined together
    (``"good,bad"`` becomes ``"goodbad"``).
    """
    # Deletion-only translation table: every character of
    # string.punctuation maps to "nothing".
    without_punct = s.translate(str.maketrans('', '', string.punctuation))
    return without_punct.replace('\n', '')

def remove_nonenglish(s):
    """Return ``s`` with every character outside the ASCII range removed.

    Only code points ``\\x00``-``\\x7f`` are kept, so accented letters and
    typographic punctuation are dropped along with other scripts.
    """
    non_ascii = re.compile(r'[^\x00-\x7f]')
    return non_ascii.sub(r'', s)

    

In [14]:
# Read in the data to be processed.
# NOTE(review): going by the file name this is presumably a sample of Yelp
# reviews merged with business data — confirm.
# Unpickling can execute arbitrary code, so only load pickle files that come
# from a trusted source.
df = pd.read_pickle('data/yelp_review1pct_business_merge.pkl')

In [15]:
# Build the "clean_text" column from "text": strip digits, then punctuation
# and newlines, then non-ASCII characters, and finally lowercase the result.
df['clean_text'] = (
    df['text']
    .apply(remove_nums)
    .apply(remove_punct)
    .apply(remove_nonenglish)
    .str.lower()
)

In [16]:
# Separate the target before the train-test split (which itself happens
# before any text vectorizing). pop() removes the 'sentiment' column from
# `df` in place and returns it, so `df` holds only the remaining columns
# afterwards. Re-running this cell on the same `df` raises KeyError because
# the column is already gone.
labels = df.pop('sentiment')


In [25]:
# Hold out 10% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    df,
    labels,
    test_size=0.1,
    random_state=20,
)

In [61]:
# Persist the train and test labels to disk.
for label_split, pickle_path in (
    (y_train, 'data/sentiment_y_train.pkl'),
    (y_test, 'data/sentiment_y_test.pkl'),
):
    label_split.to_pickle(pickle_path)

In [9]:
# Lazy lemma generators: each item is the list of token lemmas spaCy produces
# for one cleaned review. Nothing is run through `nlp` until the generator is
# iterated, and each generator can be iterated only once.

# lemmas for the TRAINING dataset
textgen_train = (
    [token.lemma_ for token in nlp(review)]
    for review in x_train.clean_text
)

# lemmas for the TEST dataset
textgen_test = (
    [token.lemma_ for token in nlp(review)]
    for review in x_test.clean_text
)

In [10]:
# TF-IDF followed by a 100-component truncated SVD.
# tokenizer=list: the documents fed to this pipeline are already lists of
# lemmas, so "tokenizing" just turns each document into a list of its items.
# lowercase=False: no lowercasing is applied by the vectorizer (the text was
# lowercased when "clean_text" was built).
# min_df / max_df are floats, i.e. document-frequency proportions.
bow_pipe = Pipeline(
    steps=[
        (
            'tfidf',
            TfidfVectorizer(
                tokenizer=list,
                lowercase=False,
                min_df=0.01,
                max_df=0.2,
            ),
        ),
        (
            'svd',
            TruncatedSVD(n_components=100, random_state=12),
        ),
    ]
)

In [11]:
# Fit the TF-IDF + SVD pipeline on the training lemmas and transform them in
# one pass. This iterates (and therefore exhausts) the textgen_train generator.
x_train_svd = bow_pipe.fit_transform(textgen_train)

In [12]:
# Transform the test lemmas with the already-fitted pipeline (no refitting on
# test data). This iterates (and therefore exhausts) the textgen_test generator.
x_test_svd = bow_pipe.transform(textgen_test)

In [35]:
# Wrap the SVD output in DataFrames that carry the same row index as the
# corresponding split, then join the split's original columns on that index.
x_train_svd_df = pd.DataFrame(x_train_svd, index=x_train.index)
x_train_svd_all = x_train_svd_df.join(x_train)

x_test_svd_df = pd.DataFrame(x_test_svd, index=x_test.index)
x_test_svd_all = x_test_svd_df.join(x_test)

In [62]:
# Persist the SVD feature frames (train and test) to disk.
svd_outputs = {
    'data/review_1pct_bus_svd_x_train.pkl': x_train_svd_all,
    'data/review_1pct_bus_svd_x_test.pkl': x_test_svd_all,
}
for svd_path, svd_frame in svd_outputs.items():
    svd_frame.to_pickle(svd_path)

In [36]:
# The lemma generators are rebuilt here because a generator can be iterated
# only once and the earlier ones were consumed by bow_pipe.

# lemmas for the TRAINING dataset
textgen_train = (
    [token.lemma_ for token in nlp(review)]
    for review in x_train.clean_text
)

# lemmas for the TEST dataset
textgen_test = (
    [token.lemma_ for token in nlp(review)]
    for review in x_test.clean_text
)

# TF-IDF only (no SVD step), with the same vectorizer settings as bow_pipe.
tfidf_pipe = Pipeline(
    steps=[
        (
            'tfidf',
            TfidfVectorizer(
                tokenizer=list,
                lowercase=False,
                min_df=0.01,
                max_df=0.2,
            ),
        ),
    ]
)

# Fit on the training lemmas only; the test lemmas are just transformed.
x_train_bow = tfidf_pipe.fit_transform(textgen_train)
x_test_bow = tfidf_pipe.transform(textgen_test)

In [45]:
# Column names for the bag-of-words frames are the terms kept by the fitted
# TF-IDF step. get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back to the old method on older releases.
tfidf_step = tfidf_pipe.named_steps['tfidf']
if hasattr(tfidf_step, 'get_feature_names_out'):
    bow_terms = list(tfidf_step.get_feature_names_out())
else:
    bow_terms = list(tfidf_step.get_feature_names())

# Densify the sparse TF-IDF matrices (toarray() gives a plain ndarray rather
# than the np.matrix returned by todense()), label the columns with the terms
# and reuse the row index of the corresponding split. In the join, a term
# column whose name collides with an original column gets the '_tok' suffix.
x_train_bow_df = pd.DataFrame(x_train_bow.toarray(), columns=bow_terms, index=x_train.index)
x_train_bow_all = x_train_bow_df.join(x_train, lsuffix='_tok')

x_test_bow_df = pd.DataFrame(x_test_bow.toarray(), columns=bow_terms, index=x_test.index)
x_test_bow_all = x_test_bow_df.join(x_test, lsuffix='_tok')


In [46]:
# Persist the bag-of-words feature frames (train and test) to disk.
for bow_frame, bow_path in zip(
    (x_train_bow_all, x_test_bow_all),
    ('data/review_1pct_bus_bow_x_train.pkl', 'data/review_1pct_bus_bow_x_test.pkl'),
):
    bow_frame.to_pickle(bow_path)

In [47]:
# using word embeddings
from sklearn.base import BaseEstimator, TransformerMixin
class SpacyVectorTransformer(BaseEstimator, TransformerMixin):
    """scikit-learn transformer that turns each text into its spaCy document vector.

    Parameters
    ----------
    nlp : callable
        Object that is called on every text; the ``.vector`` attribute of
        what it returns becomes that text's feature vector.
    """

    def __init__(self, nlp):
        self.nlp = nlp
        # Hard-coded vector width; it is not read anywhere in this class.
        # NOTE(review): presumably the vector size of the loaded model — confirm.
        self.dim = 300

    def fit(self, X, y=None):
        """Do nothing and return ``self``; this transformer learns no state.

        ``y`` is optional so that ``fit(X)`` and the inherited
        ``fit_transform(X)`` can be called without labels; it is ignored.
        """
        return self

    def transform(self, X):
        """Return a list holding one document vector per text in ``X``, in order."""
        # Doc.vector defaults to an average of the token vectors.
        # https://spacy.io/api/doc#vector
        return [self.nlp(text).vector for text in X]

In [49]:
# Single-step pipeline that maps each text to its spaCy document vector.
embed_pipe = Pipeline(
    steps=[
        ("mean_embeddings", SpacyVectorTransformer(nlp)),
    ]
)

# The vectors are computed from the raw "text" column, not "clean_text".
# transform() is called without a prior fit(); the transformer's fit is a no-op.
# NOTE(review): recent scikit-learn releases may warn when transform() is
# called on an unfitted Pipeline — confirm against the installed version.
x_train_em = embed_pipe.transform(x_train['text'])
x_test_em = embed_pipe.transform(x_test['text'])

In [51]:
# One row per text, one column per vector component; rows reuse the index of
# the corresponding split so the original columns can be joined back on it.
x_train_em_df = pd.DataFrame(x_train_em, index=x_train.index)
x_train_em_all = x_train_em_df.join(x_train, lsuffix='_tok')

x_test_em_df = pd.DataFrame(x_test_em, index=x_test.index)
x_test_em_all = x_test_em_df.join(x_test, lsuffix='_tok')

In [54]:
# Persist the embedding feature frames (train and test) to disk.
em_outputs = (
    ('data/review_1pct_bus_em_x_train.pkl', x_train_em_all),
    ('data/review_1pct_bus_em_x_test.pkl', x_test_em_all),
)
for em_path, em_frame in em_outputs:
    em_frame.to_pickle(em_path)