In [None]:
import pandas as pd
import pickle
import numpy as np
import re

In [None]:
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import Markdown, display

In [None]:
from tqdm.autonotebook import tqdm
# Notebook-wide pandas display settings: allow up to 300 characters per column
# before truncating, and render floats with thousands separators and two decimals.
DISPLAY_OPTIONS = {
    'display.max_colwidth': 300,
    'display.float_format': '{:,.2f}'.format,
}
for option_name, option_value in DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)

# Register tqdm with pandas so that `.progress_apply` is available later on.
tqdm.pandas()

In [None]:
from gensim.parsing import (strip_tags, strip_numeric, 
                            strip_multiple_whitespaces, 
                            stem_text, strip_punctuation, 
                            remove_stopwords, preprocess_string)

### Get the Amazon Dataset
You can find the full list of Amazon question/answer datasets [here](http://jmcauley.ucsd.edu/data/amazon/qa/). <br>
Download the Software dataset, unzip it so that `qa_Software.json` sits next to this notebook, and then follow the steps below.

In [None]:
def parse(path):
    """Yield the Python object encoded on each non-blank line of ``path``.

    Every line is expected to hold a single Python literal (presumably one
    dict per record -- confirm against the downloaded dataset).

    Lines are parsed with ``ast.literal_eval`` rather than ``eval``: the file
    is downloaded data, and ``eval`` would execute any expression found in it.
    NOTE(review): a line that is a valid expression but not a pure literal now
    raises ``ValueError`` instead of being executed.

    Args:
        path: Location of the line-delimited data file (read as UTF-8).

    Yields:
        The object described by each line, in file order.

    Raises:
        ValueError, SyntaxError: If a line is not a valid Python literal.
    """
    import ast  # local import keeps this cell self-contained

    # The context manager closes the file when iteration finishes, a line
    # fails to parse, or the generator is discarded; the original handle was
    # never closed.
    with open(path, encoding='utf-8') as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if not text:
                continue  # tolerate blank lines, e.g. a trailing newline
            yield ast.literal_eval(text)

In [None]:
def getDF(path):
    """Load every record produced by ``parse(path)`` into a DataFrame.

    Records are keyed by their 0-based position in the file; with
    ``orient='index'`` those positions become the row labels, so the frame
    holds one row per record.
    """
    indexed_records = dict(enumerate(parse(path)))
    return pd.DataFrame.from_dict(indexed_records, orient='index')

In [None]:
# Relative path: the unzipped dataset is expected in the working directory.
QA_DATA_PATH = 'qa_Software.json'

amazon_df = getDF(QA_DATA_PATH)
amazon_df.head()

### Create a simple preprocessing pipeline

In [None]:
# Compiled once; matches the four punctuation marks the pipeline drops.
_PUNCT_TO_DROP = re.compile(r'[:,;?]')


def transform_to_lower(s):
    """Return ``s`` converted to lower case."""
    return s.lower()


def remove_punct(s):
    """Return ``s`` with every ``:``, ``,``, ``;`` and ``?`` removed."""
    return _PUNCT_TO_DROP.sub('', s)

# Filters are applied to each document in this order: drop selected
# punctuation, collapse whitespace runs, lower-case, then remove stopwords.
CLEAN_FILTERS = [
    remove_punct,
    strip_multiple_whitespaces,
    transform_to_lower,
    remove_stopwords,
]


def cleaning_pipe(document):
    """Run ``document`` through ``CLEAN_FILTERS`` and return one string.

    The tokens produced by ``preprocess_string`` are joined back together
    with single spaces.
    """
    return ' '.join(preprocess_string(document, CLEAN_FILTERS))

In [None]:
# Clean each raw question (with a progress bar) and keep the result in a new
# column next to the original text.
cleaned_questions = amazon_df['question'].progress_apply(cleaning_pipe)
amazon_df['question_cleaned'] = cleaned_questions

### View the question length distribution (in words)

In [None]:
def get_len_funct(x):
    """Return the number of whitespace-separated tokens in the string ``x``."""
    return len(x.split())


# Element-wise version of ``get_len_funct``. ``otypes`` pins the output dtype,
# so numpy does not need to probe the first element to infer it; without it,
# np.vectorize raises ValueError when called on an empty input.
get_len = np.vectorize(get_len_funct, otypes=[int])
# Word count of each raw (uncleaned) question, stored as a new column.
question_lengths = get_len(amazon_df['question'])
amazon_df['question_len'] = question_lengths

# Plot the distribution of the counts; the trailing semicolon hides the
# text repr of the title object in the cell output.
sns.displot(amazon_df['question_len'])
plt.title('Question Length');

### Store the DataFrame
Store the data locally (as `amazon_clean_data.pkl`) so we can use it in the other notebooks.

In [None]:
# Output file for the cleaned frame, to be loaded by the other notebooks.
CLEAN_DATA_PATH = "amazon_clean_data.pkl"
amazon_df.to_pickle(CLEAN_DATA_PATH)